Skip to content

Commit ec00b46

Browse files
chinyeungli authored and JonoYang committed
Fixed #595 Add on-demand package data collection for gem
Signed-off-by: Chin Yeung Li <[email protected]>
1 parent 00b8826 commit ec00b46

File tree

6 files changed

+437
-0
lines changed

6 files changed

+437
-0
lines changed

minecode/collectors/rubygems.py

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/nexB/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import logging
11+
12+
import requests
13+
from packageurl import PackageURL
14+
15+
from minecode import priority_router
16+
from minecode.miners.rubygems import build_rubygem_packages_from_api_v2_data
17+
18+
"""
19+
Collect GEM packages from gem registries.
20+
"""
21+
22+
logger = logging.getLogger(__name__)
23+
handler = logging.StreamHandler()
24+
logger.addHandler(handler)
25+
logger.setLevel(logging.INFO)
26+
27+
28+
def get_package_json(name, version, timeout=30):
    """
    Return the decoded JSON document published by the RubyGems API v2 for
    the gem `name` at `version`, or None if the request fails.

    `timeout` is the number of seconds passed to ``requests.get`` so that a
    stalled connection cannot block the caller indefinitely.

    Request failures (HTTP error status, connection errors, timeouts) are
    logged and are not raised to the caller.
    """
    # Create the RubyGems API URL
    url = f"https://rubygems.org/api/v2/rubygems/{name}/versions/{version}.json"

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as err:
        logger.error(f"HTTP error occurred: {err}")
    except requests.exceptions.RequestException as err:
        # Connection errors and timeouts are not HTTPError instances; treat
        # them like a failed lookup instead of crashing the caller.
        logger.error(f"Request error occurred: {err}")
    return None
41+
42+
43+
def get_all_package_version(name, timeout=30):
    """
    Return a list of all version numbers for the package `name` as listed by
    the RubyGems API v1 versions endpoint.

    `timeout` is the number of seconds passed to ``requests.get``.

    Return an empty list if the request fails, so that callers can always
    iterate over the result. Request failures are logged and not raised.
    """
    url = f"https://rubygems.org/api/v1/versions/{name}.json"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.HTTPError as err:
        logger.error(f"HTTP error occurred: {err}")
        return []
    except requests.exceptions.RequestException as err:
        # Connection errors and timeouts are not HTTPError instances.
        logger.error(f"Request error occurred: {err}")
        return []

    # Get all available versions
    return [item["number"] for item in data]
59+
60+
61+
def map_gem_package(package_url, pipelines, priority=0):
    """
    Add a gem `package_url` to the PackageDB.

    The package metadata is fetched with `get_package_json` using the name
    and version of `package_url`, converted to packages with
    `build_rubygem_packages_from_api_v2_data`, and each package is passed to
    `merge_or_create_package`. The last package returned is then submitted to
    the scan queue with the given `pipelines` and `priority`.

    Return an error string if any errors are encountered during the process
    """
    # Imported here rather than at module level, as in the original code.
    from minecode.model_utils import add_package_to_scan_queue
    from minecode.model_utils import merge_or_create_package

    package_json = get_package_json(
        name=package_url.name,
        version=package_url.version,
    )

    if not package_json:
        error = f"Package does not exist on RubyGems: {package_url}"
        logger.error(error)
        return error

    packages = build_rubygem_packages_from_api_v2_data(package_json, package_url)

    error = ""
    # Stays None when no package is yielded, so the check below is always safe.
    db_package = None
    for package in packages:
        db_package, _, _, error = merge_or_create_package(package, visit_level=0)
        if error:
            break

    # Submit package for scanning
    if db_package:
        add_package_to_scan_queue(
            package=db_package, pipelines=pipelines, priority=priority
        )

    return error
97+
98+
99+
@priority_router.route("pkg:gem/.*")
def process_request(purl_str, **kwargs):
    """
    Process `priority_resource_uri` containing a gem Package URL (PURL) as a
    URI.

    This involves obtaining Package information for the PURL from RubyGems
    and using it to create a new PackageDB entry. The package is then added
    to the scan queue afterwards.

    Recognized keyword arguments:
    - ``addon_pipelines``: iterable of pipelines appended to
      ``DEFAULT_PIPELINES``.
    - ``priority``: priority passed to `map_gem_package` (default 0).

    If the PURL has no version, every version returned by
    `get_all_package_version` is processed in turn, stopping at the first
    error.

    Return the error message of the first failure, or None on success.
    """
    from minecode.model_utils import DEFAULT_PIPELINES

    addon_pipelines = kwargs.get("addon_pipelines", [])
    pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines)
    priority = kwargs.get("priority", 0)

    package_url = PackageURL.from_string(purl_str)

    if package_url.version:
        error_msg = map_gem_package(package_url, pipelines, priority)
        if error_msg:
            return error_msg
        return None

    # Guard against a failed lookup that yields no usable list of versions.
    versions = get_all_package_version(package_url.name) or []
    for version in versions:
        # PackageURL attributes cannot be assigned, so build a new PackageURL
        # from the parsed components. Concatenating "@version" to `purl_str`
        # would produce a malformed PURL when it carries qualifiers or a
        # subpath.
        versioned_package_url = PackageURL(
            type=package_url.type,
            namespace=package_url.namespace,
            name=package_url.name,
            version=version,
            qualifiers=package_url.qualifiers,
            subpath=package_url.subpath,
        )
        error_msg = map_gem_package(versioned_package_url, pipelines, priority)

        if error_msg:
            return error_msg
    return None

minecode/miners/rubygems.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,60 @@ def build_rubygem_packages_from_api_data(metadata, name, purl=None):
213213
yield package
214214

215215

216+
def build_rubygem_packages_from_api_v2_data(metadata_dict, purl):
    """
    Yield one package built from `metadata_dict`, the decoded JSON document
    of a gem version from the RubyGems API v2.

    `purl` is the package URL assigned to the package with ``set_purl``.

    The "name", "version" and "gem_uri" keys are required and raise a
    KeyError when absent. The other keys read here ("description",
    "homepage_uri", "project_uri", "version_created_at", "licenses",
    "authors", "sha") are optional and treated as empty when missing.
    """
    # The licenses value may be missing or null; normalize both to an empty list.
    extracted_license_statement = metadata_dict.get("licenses") or []

    package_data = dict(
        # NOTE(review): the original sets "gem_pkginfo" for from_data() and
        # replaces it afterwards; presumably from_data() needs this id --
        # confirm before simplifying.
        datasource_id="gem_pkginfo",
        type="gem",
        download_url=metadata_dict["gem_uri"],
        sha256=metadata_dict.get("sha"),
        name=metadata_dict["name"],
        version=metadata_dict["version"],
        description=metadata_dict.get("description"),
        homepage_url=metadata_dict.get("homepage_uri"),
        repository_homepage_url=metadata_dict.get("project_uri"),
        release_date=metadata_dict.get("version_created_at"),
        extracted_license_statement=extracted_license_statement,
    )

    # The "authors" value is used as-is as the name of a single author party.
    author = metadata_dict.get("authors")
    if author:
        package_data["parties"] = [scan_models.Party(name=author, role="author")]

    package = scan_models.PackageData.from_data(package_data)

    package.datasource_id = "gem_api_metadata"
    package.set_purl(purl)
    yield package
268+
269+
216270
@map_router.route(r"https?://rubygems.org/downloads/[\w\-\.]+.gem")
217271
class RubyGemsPackageArchiveMetadataMapper(Mapper):
218272
"""Mapper to build on e Package from the metadata file found inside a gem."""
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/nexB/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import json
11+
import os
12+
13+
from django.test import TestCase as DjangoTestCase
14+
15+
from packageurl import PackageURL
16+
17+
import packagedb
18+
from minecode.collectors import rubygems
19+
from minecode.tests import FIXTURES_REGEN
20+
from minecode.utils_test import JsonBasedTesting
21+
22+
23+
class RubyGemsPriorityQueueTests(JsonBasedTesting, DjangoTestCase):
    """
    Tests for the on-demand gem collector in `minecode.collectors.rubygems`.
    """

    test_data_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), "testfiles"
    )

    def setUp(self):
        """Load the stored RubyGems API v2 JSON document for rails 8.0.2."""
        super().setUp()
        self.expected_json_loc = self.get_test_loc("rubygems/apiv2/rails-8.0.2.json")
        with open(self.expected_json_loc) as f:
            self.expected_json_contents = json.load(f)

    def test_get_package_json(self, regen=FIXTURES_REGEN):
        # As certain fields, such as "downloads," "versions_downloads," and
        # "downloads_count," may vary over time, we cannot rely on
        # "assertEqual" for comparison. Instead, we will verify that the
        # response includes some essential data such as "name" and "version"
        # to make sure json data is collected.
        json_contents = rubygems.get_package_json(
            name="rails",
            version="8.0.2",
        )
        self.assertEqual(json_contents["name"], "rails")
        self.assertEqual(json_contents["version"], "8.0.2")

    def test_map_gem_package(self):
        package_count = packagedb.models.Package.objects.all().count()
        self.assertEqual(0, package_count)
        package_url = PackageURL.from_string("pkg:gem/rails@8.0.2")
        # The trailing comma makes this a one-element tuple of pipelines;
        # ("test_pipeline") without it is just a parenthesized string.
        rubygems.map_gem_package(package_url, ("test_pipeline",))
        package_count = packagedb.models.Package.objects.all().count()
        self.assertEqual(1, package_count)
        package = packagedb.models.Package.objects.all().first()
        expected_purl_str = "pkg:gem/rails@8.0.2"
        expected_download_url = "https://rubygems.org/gems/rails-8.0.2.gem"
        self.assertEqual(expected_purl_str, package.purl)
        self.assertEqual(expected_download_url, package.download_url)

minecode/tests/miners/test_rubygems.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from django.test import TestCase as DjangoTestCase
1717

1818
from commoncode.fileutils import file_name
19+
from packageurl import PackageURL
1920

2021
from minecode import miners
2122
from minecode import route
@@ -26,6 +27,7 @@
2627
from minecode.miners.rubygems import RubyGemsPackageArchiveMetadataMapper
2728
from minecode.miners.rubygems import RubyGemsPackageArchiveMetadataVisitor
2829
from minecode.miners.rubygems import build_rubygem_packages_from_api_data
30+
from minecode.miners.rubygems import build_rubygem_packages_from_api_v2_data
2931
from minecode.miners.rubygems import build_rubygem_packages_from_metadata
3032
from minecode.miners.rubygems import get_gem_metadata
3133
from minecode.models import ResourceURI
@@ -169,6 +171,18 @@ def test_RubyGemsApiVersionsJsonMapper(self):
169171
)
170172
self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)
171173

174+
def test_build_rubygem_packages_from_api_v2_data(self):
    """
    Build packages from the stored rails 8.0.2 API v2 document and compare
    their dict form with the expected JSON fixture.
    """
    with open(self.get_test_loc("rubygems/apiv2/rails-8.0.2.json")) as gem_meta:
        metadata = json.load(gem_meta)
    package_url = PackageURL.from_string("pkg:gem/rails@8.0.2")
    packages = build_rubygem_packages_from_api_v2_data(metadata, package_url)
    packages = [p.to_dict() for p in packages]
    expected_loc = self.get_test_loc("rubygems/apiv2/expected-rails-8.0.2.json")
    self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)
185+
172186

173187
class RubyGemsArchiveMapperTest(JsonBasedTesting):
174188
test_data_dir = os.path.join(
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
[
2+
{
3+
"type": "gem",
4+
"namespace": null,
5+
"name": "rails",
6+
"version": "8.0.2",
7+
"qualifiers": {},
8+
"subpath": null,
9+
"primary_language": null,
10+
"description": "Ruby on Rails is a full-stack web framework optimized for programmer happiness and sustainable productivity. It encourages beautiful code by favoring convention over configuration.",
11+
"release_date": "2025-03-12T03:09:11.097Z",
12+
"parties": [
13+
{
14+
"type": null,
15+
"role": "author",
16+
"name": "David Heinemeier Hansson",
17+
"email": null,
18+
"url": null
19+
}
20+
],
21+
"keywords": [],
22+
"homepage_url": "https://rubyonrails.org",
23+
"download_url": "https://rubygems.org/gems/rails-8.0.2.gem",
24+
"size": null,
25+
"sha1": null,
26+
"md5": null,
27+
"sha256": "fdfaa5a83ec0388e02864e88d515959caedc88053b5f701c4deb1652d8f164c6",
28+
"sha512": null,
29+
"bug_tracking_url": null,
30+
"code_view_url": null,
31+
"vcs_url": null,
32+
"copyright": null,
33+
"holder": null,
34+
"declared_license_expression": "mit",
35+
"declared_license_expression_spdx": "MIT",
36+
"license_detections": [
37+
{
38+
"license_expression": "mit",
39+
"license_expression_spdx": "MIT",
40+
"matches": [
41+
{
42+
"license_expression": "mit",
43+
"license_expression_spdx": "MIT",
44+
"from_file": null,
45+
"start_line": 1,
46+
"end_line": 1,
47+
"matcher": "1-spdx-id",
48+
"score": 100,
49+
"matched_length": 1,
50+
"match_coverage": 100,
51+
"rule_relevance": 100,
52+
"rule_identifier": "spdx-license-identifier-mit-5da48780aba670b0860c46d899ed42a0f243ff06",
53+
"rule_url": null,
54+
"matched_text": "MIT"
55+
}
56+
],
57+
"identifier": "mit-a822f434-d61f-f2b1-c792-8b8cb9e7b9bf"
58+
}
59+
],
60+
"other_license_expression": null,
61+
"other_license_expression_spdx": null,
62+
"other_license_detections": [],
63+
"extracted_license_statement": "- MIT\n",
64+
"notice_text": null,
65+
"source_packages": [],
66+
"file_references": [],
67+
"is_private": false,
68+
"is_virtual": false,
69+
"extra_data": {},
70+
"dependencies": [],
71+
"repository_homepage_url": "https://rubygems.org/gems/rails",
72+
"repository_download_url": null,
73+
"api_data_url": null,
74+
"datasource_id": "gem_api_metadata",
75+
"purl": "pkg:gem/rails@8.0.2"
76+
}
77+
]

0 commit comments

Comments
 (0)