Skip to content

Commit 4c463f2

Browse files
authored
Add CRAN collector (#690)
* Add CRAN collector Signed-off-by: Tushar Goel <[email protected]> * Fix linting issues Signed-off-by: Tushar Goel <[email protected]> --------- Signed-off-by: Tushar Goel <[email protected]>
1 parent 44c77b5 commit 4c463f2

File tree

4 files changed

+5331
-0
lines changed

4 files changed

+5331
-0
lines changed

minecode/collectors/cran.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/nexB/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import logging
11+
import requests
12+
from packageurl import PackageURL
13+
14+
from minecode import priority_router
15+
from packagedb.models import PackageContentType
16+
17+
# Module-level logger for the CRAN collector: INFO and above are emitted
# through a dedicated stream handler.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
logger.addHandler(handler)
21+
22+
23+
def get_cran_package_json(name):
    """
    Return the contents of the JSON file of the package `name` from the
    CRAN DB API, or None when the API answers with an HTTP error status.
    Example: https://crandb.r-pkg.org/dplyr/all

    Only HTTP status errors are logged and swallowed here; other request
    failures (connection errors, timeouts) propagate to the caller.
    """
    url = f"https://crandb.r-pkg.org/{name}/all"

    try:
        # Without a timeout, requests waits indefinitely on a stalled
        # connection; bound the wait so the caller cannot hang forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as err:
        logger.error("HTTP error occurred: %s", err)
        return None
36+
37+
38+
def map_cran_package(package_url, pipelines, priority=0):
    """
    Add a CRAN `package_url` to the PackageDB.

    Fetch the package metadata with get_cran_package_json(), build package
    data with build_packages(), pass each built package to
    merge_or_create_package() and submit every resulting database package to
    the scan queue with `pipelines` and `priority`.

    Return an error string when the package or the requested version cannot
    be found, or the error reported by merge_or_create_package(), if any.
    Processing stops at the first package that reports an error.
    """
    from minecode.miners.cran import build_packages
    from minecode.model_utils import add_package_to_scan_queue, merge_or_create_package

    package_json = get_cran_package_json(package_url.name)

    if not package_json:
        error = f"Package does not exist on CRAN: {package_url}"
        logger.error(error)
        return error

    error = None
    found = False
    for package in build_packages(package_json, package_url):
        found = True
        package.extra_data["package_content"] = PackageContentType.SOURCE_ARCHIVE
        db_package, _, _, error = merge_or_create_package(package, visit_level=0)
        if error:
            break

        if db_package:
            add_package_to_scan_queue(package=db_package, pipelines=pipelines, priority=priority)

    if not found:
        # build_packages() yields nothing when the version requested in the
        # purl is absent from the metadata; report that instead of silently
        # returning success with nothing added.
        error = f"Package version does not exist on CRAN: {package_url}"
        logger.error(error)

    return error
66+
67+
68+
@priority_router.route("pkg:cran/.*")
def process_request(purl_str, **kwargs):
    """
    Handle a CRAN Package URL (PURL) string routed by the priority router.

    Optional keyword arguments read here:
    - `addon_pipelines`: extra pipelines appended to DEFAULT_PIPELINES.
    - `priority`: priority forwarded to map_cran_package(); defaults to 0.

    Return the error message from map_cran_package() when there is one,
    otherwise None.
    """
    from minecode.model_utils import DEFAULT_PIPELINES

    pipelines = DEFAULT_PIPELINES + tuple(kwargs.get("addon_pipelines", []))
    priority = kwargs.get("priority", 0)

    purl = PackageURL.from_string(purl_str)
    # Falsy results (no error) are normalized to None.
    return map_cran_package(purl, pipelines, priority) or None

minecode/miners/cran.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,3 +194,57 @@ def build_packages_from_html(metadata, uri=None, purl=None):
194194
)
195195
package.set_purl(purl)
196196
yield package
197+
198+
199+
def build_packages(metadata_dict, purl):
    """
    Yield one PackageData per version listed in CRAN DB API metadata.

    `metadata_dict` is expected to look like:
        {
            "versions": {
                "1.0.0": { ... },
                "1.1.0": { ... }
            }
        }

    When `purl` carries a version, only the entry for that exact version is
    yielded; otherwise every listed version is yielded. The package name is
    taken from the top-level "Package" key, falling back to `purl.name`.
    """
    wanted_version = purl.version
    package_name = metadata_dict.get("Package") or purl.name

    for version, info in metadata_dict.get("versions", {}).items():
        # Skip every version other than the one pinned by the purl, if any.
        if wanted_version and wanted_version != version:
            continue

        declared_license = info.get("License")
        author = info.get("Author", "")
        parties = [scan_models.Party(name=author, role="author")] if author else []

        package_data = {
            "datasource_id": "cran_pkginfo",
            "type": "cran",
            # CRAN tarball download URL
            # NOTE(review): CRAN may serve superseded versions from a
            # different path than src/contrib/ -- confirm this URL resolves
            # for non-latest versions.
            "download_url": f"https://cran.r-project.org/src/contrib/{package_name}_{version}.tar.gz",
            "name": package_name,
            "version": version,
            "description": info.get("Description"),
            "homepage_url": info.get("URL"),
            "extracted_license_statement": [declared_license] if declared_license else [],
            "parties": parties,
        }

        package = scan_models.PackageData.from_data(package_data)
        # The datasource id is replaced after construction with the
        # API-specific identifier.
        package.datasource_id = "cran_api_metadata"
        package.set_purl(purl)
        yield package
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/nexB/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import json
11+
import os
12+
13+
from django.test import TestCase as DjangoTestCase
14+
from packageurl import PackageURL
15+
16+
import packagedb
17+
from minecode.collectors import cran
18+
from minecode.utils_test import JsonBasedTesting
19+
20+
21+
class CranPriorityQueueTests(JsonBasedTesting, DjangoTestCase):
    """
    Tests for the CRAN collector used by the priority queue.

    NOTE(review): neither test mocks the HTTP layer, so
    cran.get_cran_package_json() performs a real request to the CRAN DB API;
    these tests need network access.
    """

    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "testfiles")

    def setUp(self):
        super().setUp()
        # Load the stored dplyr JSON fixture.
        # NOTE(review): the loaded contents are not referenced by the tests
        # below -- confirm whether they were meant to replace the live call.
        self.expected_json_loc = self.get_test_loc("cran/dplyr.json")
        with open(self.expected_json_loc) as f:
            self.expected_json_contents = json.load(f)

    def test_get_package_json(self):
        """
        Verify get_cran_package_json() returns expected keys for CRAN package.
        """
        json_contents = cran.get_cran_package_json(name="dplyr")
        self.assertIn("versions", json_contents)
        # NOTE(review): with the "dplyr" default this check also passes when
        # the "Package" key is missing -- confirm that is intended.
        self.assertIn("dplyr", json_contents.get("Package", "dplyr"))

    def test_map_cran_package(self):
        """
        Verify map_cran_package() creates a Package in the DB with correct PURL
        and download URL.
        """
        package_count = packagedb.models.Package.objects.all().count()
        self.assertEqual(0, package_count)

        # Pin a single version so that exactly one package is created.
        package_url = PackageURL.from_string("pkg:cran/dplyr@1.1.0")
        cran.map_cran_package(package_url, ("test_pipeline",))

        package_count = packagedb.models.Package.objects.all().count()
        self.assertEqual(1, package_count)

        package = packagedb.models.Package.objects.all().first()
        expected_purl_str = "pkg:cran/dplyr@1.1.0"
        expected_download_url = "https://cran.r-project.org/src/contrib/dplyr_1.1.0.tar.gz"

        self.assertEqual(expected_purl_str, package.purl)
        self.assertEqual(expected_download_url, package.download_url)

0 commit comments

Comments
 (0)