Skip to content

Commit 57d5409

Browse files
authored
Merge pull request #603 from aboutcode-org/595-add_on-demand_package_data_collection_for_gem
Collect Rubygems PURL ondemand #595
2 parents 00b8826 + 31b4666 commit 57d5409

File tree

7 files changed

+434
-0
lines changed

7 files changed

+434
-0
lines changed

minecode/collectors/rubygems.py

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/nexB/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import logging
11+
12+
import requests
13+
from packageurl import PackageURL
14+
15+
from minecode import priority_router
16+
from minecode.miners.rubygems import build_rubygem_packages_from_api_v2_data
17+
from packagedb.models import PackageContentType
18+
19+
"""
20+
Collect GEM packages from gem registries.
21+
"""
22+
23+
# Module-level logger for the gem collector: records at INFO level and above
# are passed to a dedicated stream handler.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
logger.addHandler(handler)
27+
28+
29+
def get_package_json(name, version, timeout=30):
    """
    Return the JSON metadata of the gem `name` at `version`, as served by the
    rubygems.org API v2, or None if the request fails.

    `timeout` is the number of seconds to wait for rubygems.org before giving
    up, so that an unresponsive registry cannot block the caller forever.
    """
    # Create the RubyGems API URL
    url = f"https://rubygems.org/api/v2/rubygems/{name}/versions/{version}.json"

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as err:
        logger.error(f"HTTP error occurred: {err}")
    except requests.exceptions.RequestException as err:
        # Any other request failure, such as a timeout or a connection error:
        # report it the same way as an HTTP error instead of crashing.
        logger.error(f"Request error occurred: {err}")
    return None
42+
43+
44+
def get_all_package_version(name, timeout=30):
    """
    Return a list of all version numbers for the package name, as listed by
    the rubygems.org API v1, or None if the request fails.

    `timeout` is the number of seconds to wait for rubygems.org before giving
    up, so that an unresponsive registry cannot block the caller forever.
    """
    url = f"https://rubygems.org/api/v1/versions/{name}.json"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # Each item of the response describes one version of the gem.
        return [item["number"] for item in response.json()]
    except requests.exceptions.HTTPError as err:
        logger.error(f"HTTP error occurred: {err}")
    except requests.exceptions.RequestException as err:
        # Any other request failure, such as a timeout or a connection error:
        # report it the same way as an HTTP error instead of crashing.
        logger.error(f"Request error occurred: {err}")
    return None
60+
61+
62+
def map_gem_package(package_url, pipelines, priority=0):
    """
    Add a gem `package_url` to the PackageDB and queue it for scanning with
    `pipelines` at `priority`.

    Return an error string if any errors are encountered during the process,
    otherwise a falsy value.
    """
    from minecode.model_utils import add_package_to_scan_queue
    from minecode.model_utils import merge_or_create_package

    metadata = get_package_json(
        name=package_url.name,
        version=package_url.version,
    )
    if not metadata:
        error = f"Package does not exist on rubygems.org: {package_url}"
        logger.error(error)
        return error

    error = ""
    for package in build_rubygem_packages_from_api_v2_data(metadata, package_url):
        package.extra_data["package_content"] = PackageContentType.SOURCE_ARCHIVE
        db_package, _, _, error = merge_or_create_package(package, visit_level=0)
        if error:
            # Stop at the first failure and report it to the caller.
            break

        # Submit package for scanning
        if db_package:
            add_package_to_scan_queue(
                package=db_package, pipelines=pipelines, priority=priority
            )

    return error
98+
99+
100+
@priority_router.route("pkg:gem/.*")
def process_request(purl_str, **kwargs):
    """
    Process `purl_str`, a gem Package URL (PURL) string.

    This involves obtaining Package information for the PURL from rubygems.org
    and using it to create a new PackageDB entry. The package is then added to
    the scan queue afterwards. When the PURL has no version, every version
    listed by rubygems.org for the gem is processed.

    The optional `addon_pipelines` and `priority` keyword arguments select the
    extra scan pipelines and the scan queue priority.

    Return an error string as soon as an error is encountered, or None.
    """
    from minecode.model_utils import DEFAULT_PIPELINES

    addon_pipelines = kwargs.get("addon_pipelines", [])
    pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines)
    priority = kwargs.get("priority", 0)

    package_url = PackageURL.from_string(purl_str)

    if package_url.version:
        package_urls = [package_url]
    else:
        versions = get_all_package_version(package_url.name)
        if versions is None:
            # The version listing could not be fetched: report it rather than
            # failing on the iteration of None.
            error_msg = f"Unable to collect the versions of: {purl_str}"
            logger.error(error_msg)
            return error_msg

        # A PackageURL is immutable, so build a new one for each version.
        # Appending "@version" to purl_str would put the version after any
        # qualifiers or subpath and produce an invalid or mis-parsed PURL.
        package_urls = [
            PackageURL(
                type=package_url.type,
                namespace=package_url.namespace,
                name=package_url.name,
                version=version,
                qualifiers=package_url.qualifiers,
                subpath=package_url.subpath,
            )
            for version in versions
        ]

    for versioned_package_url in package_urls:
        error_msg = map_gem_package(versioned_package_url, pipelines, priority)
        if error_msg:
            return error_msg

    return None

minecode/miners/rubygems.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,58 @@ def build_rubygem_packages_from_api_data(metadata, name, purl=None):
213213
yield package
214214

215215

216+
def build_rubygem_packages_from_api_v2_data(metadata_dict, purl):
    """
    Yield one PackageData built from `metadata_dict`, the metadata mapping of
    a gem version obtained from the RubyGems API v2.

    `purl` is the package URL that is set on the yielded package.
    """
    # mapping of information that are common to all the downloads of a
    # version
    common_data = dict(
        name=metadata_dict["name"],
        version=metadata_dict["version"],
        description=metadata_dict["description"],
        homepage_url=metadata_dict["homepage_uri"],
        repository_homepage_url=metadata_dict["project_uri"],
        release_date=metadata_dict["version_created_at"],
        # Fall back to an empty list when no license is declared.
        extracted_license_statement=metadata_dict["licenses"] or [],
    )

    authors = metadata_dict["authors"]
    if authors:
        common_data["parties"] = [scan_models.Party(name=authors, role="author")]

    download_data = dict(
        datasource_id="gem_pkginfo",
        type="gem",
        download_url=metadata_dict["gem_uri"],
        sha256=metadata_dict["sha"],
        **common_data,
    )
    package = scan_models.PackageData.from_data(download_data)

    # The package is built with the "gem_pkginfo" datasource id and relabeled
    # afterwards as coming from the API metadata.
    package.datasource_id = "gem_api_metadata"
    package.set_purl(purl)
    yield package
266+
267+
216268
@map_router.route(r"https?://rubygems.org/downloads/[\w\-\.]+.gem")
217269
class RubyGemsPackageArchiveMetadataMapper(Mapper):
218270
"""Mapper to build one Package from the metadata file found inside a gem."""
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/nexB/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import json
11+
import os
12+
13+
from django.test import TestCase as DjangoTestCase
14+
15+
from packageurl import PackageURL
16+
17+
import packagedb
18+
from minecode.collectors import rubygems
19+
from minecode.tests import FIXTURES_REGEN
20+
from minecode.utils_test import JsonBasedTesting
21+
22+
23+
class RubyGemsPriorityQueueTests(JsonBasedTesting, DjangoTestCase):
    """
    Tests for the on-demand collection of gem packages: fetching the package
    JSON from rubygems.org and mapping a gem PURL to a PackageDB entry.
    """

    test_data_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), "testfiles"
    )

    def setUp(self):
        super().setUp()
        self.expected_json_loc = self.get_test_loc("rubygems/apiv2/rails-8.0.2.json")
        with open(self.expected_json_loc) as f:
            self.expected_json_contents = json.load(f)

    def test_get_package_json(self, regen=FIXTURES_REGEN):
        # As certain fields, such as "downloads," "versions_downloads," and
        # "downloads_count," may vary over time, we cannot rely on
        # "assertEqual" for comparison. Instead, we will verify that the
        # response includes some essential data such as "name" and "version"
        # to make sure json data is collected.
        json_contents = rubygems.get_package_json(
            name="rails",
            version="8.0.2",
        )
        self.assertEqual(json_contents["name"], "rails")
        self.assertEqual(json_contents["version"], "8.0.2")

    def test_map_gem_package(self):
        package_count = packagedb.models.Package.objects.all().count()
        self.assertEqual(0, package_count)
        package_url = PackageURL.from_string("pkg:gem/rails@8.0.2")
        # Pipelines are passed as a tuple: the trailing comma matters, as
        # ("test_pipeline") alone is just a parenthesized string.
        rubygems.map_gem_package(package_url, ("test_pipeline",))
        package_count = packagedb.models.Package.objects.all().count()
        self.assertEqual(1, package_count)
        package = packagedb.models.Package.objects.all().first()
        expected_purl_str = "pkg:gem/rails@8.0.2"
        expected_download_url = "https://rubygems.org/gems/rails-8.0.2.gem"
        self.assertEqual(expected_purl_str, package.purl)
        self.assertEqual(expected_download_url, package.download_url)

minecode/tests/miners/test_rubygems.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from django.test import TestCase as DjangoTestCase
1717

1818
from commoncode.fileutils import file_name
19+
from packageurl import PackageURL
1920

2021
from minecode import miners
2122
from minecode import route
@@ -26,6 +27,7 @@
2627
from minecode.miners.rubygems import RubyGemsPackageArchiveMetadataMapper
2728
from minecode.miners.rubygems import RubyGemsPackageArchiveMetadataVisitor
2829
from minecode.miners.rubygems import build_rubygem_packages_from_api_data
30+
from minecode.miners.rubygems import build_rubygem_packages_from_api_v2_data
2931
from minecode.miners.rubygems import build_rubygem_packages_from_metadata
3032
from minecode.miners.rubygems import get_gem_metadata
3133
from minecode.models import ResourceURI
@@ -169,6 +171,15 @@ def test_RubyGemsApiVersionsJsonMapper(self):
169171
)
170172
self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)
171173

174+
def test_build_rubygem_packages_from_api_v2_data(self):
    # Build packages from a stored RubyGems API v2 response and compare their
    # serialized form with the expected JSON fixture.
    with open(self.get_test_loc("rubygems/apiv2/rails-8.0.2.json")) as gem_data:
        metadata = json.load(gem_data)
    package_url = PackageURL.from_string("pkg:gem/rails@8.0.2")
    packages = build_rubygem_packages_from_api_v2_data(metadata, package_url)
    packages = [p.to_dict() for p in packages]
    expected_loc = self.get_test_loc("rubygems/apiv2/expected-rails-8.0.2.json")
    self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)
182+
172183

173184
class RubyGemsArchiveMapperTest(JsonBasedTesting):
174185
test_data_dir = os.path.join(
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
[
2+
{
3+
"type": "gem",
4+
"namespace": null,
5+
"name": "rails",
6+
"version": "8.0.2",
7+
"qualifiers": {},
8+
"subpath": null,
9+
"primary_language": null,
10+
"description": "Ruby on Rails is a full-stack web framework optimized for programmer happiness and sustainable productivity. It encourages beautiful code by favoring convention over configuration.",
11+
"release_date": "2025-03-12T03:09:11.097Z",
12+
"parties": [
13+
{
14+
"type": null,
15+
"role": "author",
16+
"name": "David Heinemeier Hansson",
17+
"email": null,
18+
"url": null
19+
}
20+
],
21+
"keywords": [],
22+
"homepage_url": "https://rubyonrails.org",
23+
"download_url": "https://rubygems.org/gems/rails-8.0.2.gem",
24+
"size": null,
25+
"sha1": null,
26+
"md5": null,
27+
"sha256": "fdfaa5a83ec0388e02864e88d515959caedc88053b5f701c4deb1652d8f164c6",
28+
"sha512": null,
29+
"bug_tracking_url": null,
30+
"code_view_url": null,
31+
"vcs_url": null,
32+
"copyright": null,
33+
"holder": null,
34+
"declared_license_expression": "mit",
35+
"declared_license_expression_spdx": "MIT",
36+
"license_detections": [
37+
{
38+
"license_expression": "mit",
39+
"license_expression_spdx": "MIT",
40+
"matches": [
41+
{
42+
"license_expression": "mit",
43+
"license_expression_spdx": "MIT",
44+
"from_file": null,
45+
"start_line": 1,
46+
"end_line": 1,
47+
"matcher": "1-spdx-id",
48+
"score": 100,
49+
"matched_length": 1,
50+
"match_coverage": 100,
51+
"rule_relevance": 100,
52+
"rule_identifier": "spdx-license-identifier-mit-5da48780aba670b0860c46d899ed42a0f243ff06",
53+
"rule_url": null,
54+
"matched_text": "MIT"
55+
}
56+
],
57+
"identifier": "mit-a822f434-d61f-f2b1-c792-8b8cb9e7b9bf"
58+
}
59+
],
60+
"other_license_expression": null,
61+
"other_license_expression_spdx": null,
62+
"other_license_detections": [],
63+
"extracted_license_statement": "- MIT\n",
64+
"notice_text": null,
65+
"source_packages": [],
66+
"file_references": [],
67+
"is_private": false,
68+
"is_virtual": false,
69+
"extra_data": {},
70+
"dependencies": [],
71+
"repository_homepage_url": "https://rubygems.org/gems/rails",
72+
"repository_download_url": null,
73+
"api_data_url": null,
74+
"datasource_id": "gem_api_metadata",
75+
"purl": "pkg:gem/rails@8.0.2"
76+
}
77+
]

0 commit comments

Comments
 (0)