Skip to content

Commit 7993df6

Browse files
authored
Merge pull request #598 from aboutcode-org/594_Add_on-demand_package_data_collection_for_cargo
Collect Cargo PURL ondemand #594
2 parents cc60e20 + 65f616c commit 7993df6

File tree

8 files changed

+832
-0
lines changed

8 files changed

+832
-0
lines changed

minecode/collectors/cargo.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/nexB/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import logging
11+
12+
import requests
13+
from packageurl import PackageURL
14+
15+
from minecode import priority_router
16+
from minecode.miners.cargo import build_packages
17+
from packagedb.models import PackageContentType
18+
19+
"""
20+
Collect Cargo packages from cargo registries.
21+
"""
22+
23+
# Module-level logger for the cargo collector: records at INFO level and above
# are passed to a stream handler that is attached when the module is imported.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
logger.addHandler(handler)
27+
28+
29+
def get_package_json(name):
    """
    Return the contents of the crates.io JSON metadata for the crate `name`.

    Return None when the request fails (HTTP error status, connection problem
    or timeout). The failure is logged as an error.
    """
    from urllib.parse import quote

    # The name is taken from a Package URL: quote it so that it cannot alter
    # the path of the API endpoint being requested.
    url = f"https://crates.io/api/v1/crates/{quote(name, safe='')}"

    try:
        # Bound the request so that an unresponsive registry cannot block the
        # caller forever (requests has no default timeout).
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as err:
        logger.error(f"HTTP error occurred: {err}")
    except requests.exceptions.RequestException as err:
        logger.error(f"Request to {url} failed: {err}")

    return None
42+
43+
44+
def map_cargo_package(package_url, pipelines, priority=0):
    """
    Add a cargo `package_url` to the PackageDB.

    The crate metadata is fetched from crates.io, a package is created or
    merged for each version built from that metadata, and each stored package
    is submitted to the scan queue with `pipelines` and `priority`.
    Processing stops at the first package that reports an error.

    Return an error string if any errors are encountered during the process,
    None otherwise.
    """
    from minecode.model_utils import add_package_to_scan_queue
    from minecode.model_utils import merge_or_create_package

    package_json = get_package_json(name=package_url.name)

    if not package_json:
        error = f"Package does not exist on crates.io: {package_url}"
        logger.error(error)
        return error

    # Bound before the loop: build_packages() yields nothing when the
    # requested version is not listed, and the final return must still work.
    error = None
    package_count = 0

    for package in build_packages(package_json, package_url):
        package_count += 1
        package.extra_data["package_content"] = PackageContentType.SOURCE_ARCHIVE
        db_package, _, _, error = merge_or_create_package(package, visit_level=0)
        if error:
            break

        # Submit package for scanning
        if db_package:
            add_package_to_scan_queue(
                package=db_package, pipelines=pipelines, priority=priority
            )

    if not package_count:
        error = f"No package version found on crates.io for: {package_url}"
        logger.error(error)

    return error
75+
76+
77+
@priority_router.route("pkg:cargo/.*")
def process_request(purl_str, **kwargs):
    """
    Handle a priority request for the cargo Package URL (PURL) `purl_str`.

    The PURL is parsed and handed to `map_cargo_package`, which collects the
    package information from crates.io, stores it in the PackageDB and adds
    the package to the scan queue.

    Two optional keyword arguments are honoured: `addon_pipelines`, extra
    pipelines appended to the default ones, and `priority`, the scan queue
    priority (0 when not given).

    Return the error message reported by `map_cargo_package`, if any.
    """
    from minecode.model_utils import DEFAULT_PIPELINES

    extra_pipelines = tuple(kwargs.get("addon_pipelines", []))
    pipelines = DEFAULT_PIPELINES + extra_pipelines
    scan_priority = kwargs.get("priority", 0)

    cargo_purl = PackageURL.from_string(purl_str)
    outcome = map_cargo_package(cargo_purl, pipelines, scan_priority)

    # A falsy outcome means that no error was reported.
    return outcome or None

minecode/miners/cargo.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import requests
11+
from packagedcode import models as scan_models
12+
13+
14+
def build_packages(metadata_dict, purl):
    """
    Yield PackageData objects built from crates.io API metadata.

    `metadata_dict` is the mapping returned by the crates.io API for a crate.
    Two of its entries are read here: "versions", a list with one mapping per
    published version, and "crate", whose "keywords" are attached to every
    package.

    `purl` is a Package URL object. When its `version` is set, only the
    version with that exact number is yielded; otherwise one package is
    yielded for every listed version.

    Note that one HTTP HEAD request is made for each yielded version in order
    to resolve its final download URL.
    """
    purl_version = purl.version

    for version_info in metadata_dict["versions"]:
        version = version_info["num"]
        if purl_version and purl_version != version:
            continue

        extracted_license_statement = []
        lic = version_info["license"]
        if lic and lic != "UNKNOWN":
            extracted_license_statement.append(lic)

        # mapping of information that are common to all the downloads of a
        # version
        common_data = dict(
            name=version_info["crate"],
            version=version,
            description=version_info["description"],
            homepage_url=version_info["homepage"],
            repository_homepage_url=version_info["repository"],
            extracted_license_statement=extracted_license_statement,
            keywords=metadata_dict["crate"]["keywords"],
        )

        # Reset for every version so that a version without publisher data
        # never reuses (or fails on) the author of another version.
        author = None
        published_by = version_info["published_by"]
        if published_by:
            # Prefer the display name and fall back to the login.
            author = published_by["name"] or published_by["login"]

        if author:
            common_data["parties"] = [
                scan_models.Party(name=author, role="author")
            ]

        # Reset for every version so that a version without a download path
        # never reuses (or fails on) the URL of another version.
        download_url = None
        download_path = version_info["dl_path"]
        if download_path:
            # As the path consistently ends with "/download" (e.g.,
            # "/api/v1/crates/purl/0.1.5/download"), we need to obtain the
            # redirected URL to ensure the filename is not simply
            # "download."
            # The leading slash is stripped to avoid a "//" in the URL.
            api_download_url = "https://crates.io/" + download_path.lstrip("/")
            response = requests.head(
                api_download_url, allow_redirects=True, timeout=30
            )
            download_url = response.url

        download_data = dict(
            datasource_id="cargo_pkginfo",
            type="cargo",
            download_url=download_url,
            size=version_info["crate_size"],
            sha256=version_info["checksum"],
        )
        download_data.update(common_data)
        package = scan_models.PackageData.from_data(download_data)

        # The datasource id used at construction time is replaced afterwards.
        package.datasource_id = "cargo_api_metadata"
        package.set_purl(purl)
        yield package
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/nexB/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import json
11+
import os
12+
13+
from django.test import TestCase as DjangoTestCase
14+
15+
from packageurl import PackageURL
16+
17+
import packagedb
18+
from minecode.collectors import cargo
19+
from minecode.utils_test import JsonBasedTesting
20+
21+
22+
class CargoPriorityQueueTests(JsonBasedTesting, DjangoTestCase):
    """
    Tests for the on-demand cargo collector (`minecode.collectors.cargo`).

    These tests call the collector functions directly, which send requests to
    crates.io.
    """

    test_data_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), "testfiles"
    )

    def setUp(self):
        super().setUp()
        self.expected_json_loc = self.get_test_loc("cargo/sam.json")
        with open(self.expected_json_loc) as f:
            self.expected_json_contents = json.load(f)

    def test_get_package_json(self):
        # As certain fields, such as "downloads," "recent_downloads," and
        # "num_versions," may vary over time when executing
        # "cargo.get_package_json(name="sam")", we cannot rely on
        # "assertEqual" for comparison. Instead, we will verify that the
        # response includes four primary components: crate, versions,
        # keywords, and categories, and that the "id" under crate is "sam"
        expected_list = ["crate", "versions", "keywords", "categories"]
        json_contents = cargo.get_package_json(name="sam")
        keys = json_contents.keys()
        self.assertListEqual(list(keys), expected_list)
        self.assertEqual(json_contents["crate"]["id"], "sam")

    def test_map_cargo_package(self):
        package_count = packagedb.models.Package.objects.all().count()
        self.assertEqual(0, package_count)
        package_url = PackageURL.from_string("pkg:cargo/sam@0.3.1")
        # The trailing comma matters: without it the argument is a plain
        # string instead of a one-element tuple of pipeline names.
        cargo.map_cargo_package(package_url, ("test_pipeline",))
        package_count = packagedb.models.Package.objects.all().count()
        self.assertEqual(1, package_count)
        package = packagedb.models.Package.objects.all().first()
        expected_purl_str = "pkg:cargo/sam@0.3.1"
        expected_download_url = "https://static.crates.io/crates/sam/sam-0.3.1.crate"
        self.assertEqual(expected_purl_str, package.purl)
        self.assertEqual(expected_download_url, package.download_url)
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
11+
import json
12+
import os
13+
14+
from django.test import TestCase as DjangoTestCase
15+
16+
from packageurl import PackageURL
17+
18+
from minecode import miners
19+
from minecode.tests import FIXTURES_REGEN
20+
from minecode.utils_test import JsonBasedTesting
21+
22+
23+
class TestCargoMap(JsonBasedTesting, DjangoTestCase):
    """
    Tests for `miners.cargo.build_packages` using the stored "cargo/sam.json"
    metadata and the matching expected JSON results.
    """

    test_data_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), "testfiles"
    )

    def test_build_packages_with_no_version(self):
        # A purl without a version is compared against "expected-sam.json".
        with open(self.get_test_loc("cargo/sam.json")) as cargo_meta:
            metadata = json.load(cargo_meta)
        package_url = PackageURL.from_string("pkg:cargo/sam")
        packages = miners.cargo.build_packages(metadata, package_url)
        packages = [p.to_dict() for p in packages]
        expected_loc = self.get_test_loc("cargo/expected-sam.json")
        self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)

    def test_build_packages_with_version(self):
        # A purl with a version is compared against "expected-sam-0.3.1.json".
        with open(self.get_test_loc("cargo/sam.json")) as cargo_meta:
            metadata = json.load(cargo_meta)
        package_url = PackageURL.from_string("pkg:cargo/sam@0.3.1")
        packages = miners.cargo.build_packages(metadata, package_url)
        packages = [p.to_dict() for p in packages]
        expected_loc = self.get_test_loc("cargo/expected-sam-0.3.1.json")
        self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
[
2+
{
3+
"type": "cargo",
4+
"namespace": null,
5+
"name": "sam",
6+
"version": "0.3.1",
7+
"qualifiers": {},
8+
"subpath": null,
9+
"primary_language": null,
10+
"description": "A compile time instruction assembler.",
11+
"release_date": null,
12+
"parties": [],
13+
"keywords": [],
14+
"homepage_url": null,
15+
"download_url": "https://static.crates.io/crates/sam/sam-0.3.1.crate",
16+
"size": 3769,
17+
"sha1": null,
18+
"md5": null,
19+
"sha256": "5fe15167489125b7403c9730d2c1a4c163466a327f7672444643259e8ad22178",
20+
"sha512": null,
21+
"bug_tracking_url": null,
22+
"code_view_url": null,
23+
"vcs_url": null,
24+
"copyright": null,
25+
"holder": null,
26+
"declared_license_expression": "mit",
27+
"declared_license_expression_spdx": "MIT",
28+
"license_detections": [
29+
{
30+
"license_expression": "mit",
31+
"license_expression_spdx": "MIT",
32+
"matches": [
33+
{
34+
"license_expression": "mit",
35+
"license_expression_spdx": "MIT",
36+
"from_file": null,
37+
"start_line": 1,
38+
"end_line": 1,
39+
"matcher": "1-spdx-id",
40+
"score": 100,
41+
"matched_length": 1,
42+
"match_coverage": 100,
43+
"rule_relevance": 100,
44+
"rule_identifier": "spdx-license-identifier-mit-5da48780aba670b0860c46d899ed42a0f243ff06",
45+
"rule_url": null,
46+
"matched_text": "MIT"
47+
}
48+
],
49+
"identifier": "mit-a822f434-d61f-f2b1-c792-8b8cb9e7b9bf"
50+
}
51+
],
52+
"other_license_expression": null,
53+
"other_license_expression_spdx": null,
54+
"other_license_detections": [],
55+
"extracted_license_statement": "- MIT\n",
56+
"notice_text": null,
57+
"source_packages": [],
58+
"file_references": [],
59+
"is_private": false,
60+
"is_virtual": false,
61+
"extra_data": {},
62+
"dependencies": [],
63+
"repository_homepage_url": "https://github.com/ioncodes/sam",
64+
"repository_download_url": null,
65+
"api_data_url": null,
66+
"datasource_id": "cargo_api_metadata",
67+
"purl": "pkg:cargo/sam@0.3.1"
68+
}
69+
]

0 commit comments

Comments
 (0)