Skip to content

Commit 2b0b155

Browse files
chinyeungli authored and JonoYang committed
#594 - Add on-demand package data collection for cargo
Signed-off-by: Chin Yeung Li <[email protected]>
1 parent cc60e20 commit 2b0b155

File tree

2 files changed

+191
-0
lines changed

2 files changed

+191
-0
lines changed

minecode/collectors/cargo.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/nexB/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import logging
11+
12+
import requests
13+
from packageurl import PackageURL
14+
15+
from minecode import priority_router
16+
from minecode.miners.cargo import build_packages
17+
from packagedb.models import PackageContentType
18+
19+
"""
20+
Collect Cargo packages from cargo registries.
21+
"""
22+
23+
# Module-level logger for this collector: a dedicated stream handler is
# attached and the threshold is set to INFO.
handler = logging.StreamHandler()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(handler)
27+
28+
29+
def get_package_json(name):
    """
    Return the crates.io API metadata for the crate `name`, decoded from
    JSON, or None if it could not be fetched.

    Request failures (HTTP error status, connection problems, timeouts or a
    response body that is not valid JSON) are logged and reported as None
    rather than raised, so callers only need to check for a falsy result.
    """
    url = f"https://crates.io/api/v1/crates/{name}"

    try:
        # Bound the wait so an unresponsive registry cannot block the caller
        # indefinitely.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as err:
        logger.error(f"HTTP error occurred: {err}")
    except (requests.exceptions.RequestException, ValueError) as err:
        # ValueError covers JSON decoding failures raised by response.json().
        logger.error(f"Error fetching {url}: {err}")

    return None
42+
43+
44+
def map_cargo_package(package_url, pipelines, priority=0):
    """
    Add a cargo `package_url` to the PackageDB.

    The crate metadata is fetched with `get_package_json`, converted to
    packages with `build_packages`, and each package is merged into the
    database and submitted to the scan queue with the given `pipelines` and
    `priority`. Processing stops at the first package whose merge reports an
    error.

    Return an error string if any errors are encountered during the process,
    including when the crate cannot be fetched or when no package could be
    built for `package_url` (for instance a version that crates.io does not
    list).
    """
    from minecode.model_utils import add_package_to_scan_queue
    from minecode.model_utils import merge_or_create_package

    package_json = get_package_json(name=package_url.name)

    if not package_json:
        error = f"Package does not exist on crates.io: {package_url}"
        logger.error(error)
        return error

    # `error` must be bound even when build_packages() yields nothing,
    # otherwise the final return would raise UnboundLocalError.
    error = None
    package_found = False

    for package in build_packages(package_json, package_url):
        package_found = True
        package.extra_data["package_content"] = PackageContentType.SOURCE_ARCHIVE
        db_package, _, _, error = merge_or_create_package(package, visit_level=0)
        if error:
            break

        # Submit package for scanning
        if db_package:
            add_package_to_scan_queue(
                package=db_package, pipelines=pipelines, priority=priority
            )

    if not package_found:
        error = f"Package version does not exist on crates.io: {package_url}"
        logger.error(error)

    return error
75+
76+
77+
@priority_router.route("pkg:cargo/.*")
def process_request(purl_str, **kwargs):
    """
    Handle a priority request for a cargo Package URL given as `purl_str`.

    The PURL is parsed and handed to `map_cargo_package` together with the
    default pipelines extended by any `addon_pipelines` keyword argument and
    the `priority` keyword argument (0 when absent).

    Return the error message reported by `map_cargo_package`, or None when
    it reports no error.
    """
    from minecode.model_utils import DEFAULT_PIPELINES

    pipelines = DEFAULT_PIPELINES + tuple(kwargs.get("addon_pipelines", []))

    outcome = map_cargo_package(
        PackageURL.from_string(purl_str),
        pipelines,
        kwargs.get("priority", 0),
    )

    # Only a non-empty error message is propagated to the router.
    return outcome or None

minecode/miners/cargo.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import requests
11+
12+
from packagedcode import models as scan_models
13+
14+
15+
def _get_publisher_name(version_info):
    """
    Return the name of the account recorded under `published_by` for this
    version, falling back to its login, or None when no publisher is
    recorded.
    """
    publisher = version_info.get("published_by") or {}
    return publisher.get("name") or publisher.get("login")


def build_packages(metadata_dict, purl):
    """
    Yield PackageData objects built from crates.io API metadata.

    `metadata_dict` is the mapping decoded from the crates.io API response
    for one crate. This function reads its `versions` list (one entry per
    published version) and the `keywords` of its `crate` entry.

    `purl` is the PackageURL object of the request. When it carries a
    version, only the matching entry of `versions` is yielded; otherwise one
    package is yielded per listed version.

    Resolving the download URL issues one HEAD request per yielded package;
    request errors from that call are not caught here.
    """
    purl_version = purl.version

    # Keywords are recorded once on the crate, not per version, so they are
    # looked up outside the loop.
    keywords = (metadata_dict.get("crate") or {}).get("keywords")

    for version_info in metadata_dict.get("versions") or []:
        version = version_info["num"]
        if purl_version and purl_version != version:
            continue

        extracted_license_statement = []
        lic = version_info.get("license")
        if lic and lic != "UNKNOWN":
            extracted_license_statement.append(lic)

        package_data = dict(
            datasource_id="cargo_pkginfo",
            type="cargo",
            name=version_info["crate"],
            version=version,
            description=version_info.get("description"),
            homepage_url=version_info.get("homepage"),
            repository_homepage_url=version_info.get("repository"),
            extracted_license_statement=extracted_license_statement,
            keywords=keywords,
            size=version_info.get("crate_size"),
            sha256=version_info.get("checksum"),
        )

        # Computed afresh for every version so that a version without a
        # publisher neither fails nor inherits the previous version's author.
        author = _get_publisher_name(version_info)
        if author:
            package_data["parties"] = [
                scan_models.Party(name=author, role="author")
            ]

        # Reset for every version for the same reason as `author`.
        download_url = None
        download_path = version_info.get("dl_path")
        if download_path:
            # As the download path consistently ends with "/download" (e.g.,
            # "/api/v1/crates/purl/0.1.5/download"), we need to obtain the
            # redirected URL to ensure the filename is not simply
            # "download".
            # The path starts with "/", so strip it to avoid a double slash.
            api_download_url = "https://crates.io/" + download_path.lstrip("/")
            response = requests.head(
                api_download_url, allow_redirects=True, timeout=30
            )
            download_url = response.url
        package_data["download_url"] = download_url

        package = scan_models.PackageData.from_data(package_data)

        # NOTE(review): the datasource id passed to from_data() is replaced
        # right after construction; presumably from_data() needs a known id
        # - confirm before simplifying.
        package.datasource_id = "cargo_api_metadata"
        package.set_purl(purl)
        yield package
93+

0 commit comments

Comments
 (0)