#594 - Add on-demand package data collection for cargo

chinyeungli · JonoYang · commit 2b0b155168a1 · 2025-04-07T12:50:55.000-07:00
Signed-off-by: Chin Yeung Li <tli@nexb.com>
diff --git a/minecode/collectors/cargo.py b/minecode/collectors/cargo.py
@@ -0,0 +1,98 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import logging
+
+import requests
+from packageurl import PackageURL
+
+from minecode import priority_router
+from minecode.miners.cargo import build_packages
+from packagedb.models import PackageContentType
+
+"""
+Collect Cargo packages from cargo registries.
+"""
+# Module logger: INFO and above, with a StreamHandler attached at import.
+logger = logging.getLogger(__name__)
+handler = logging.StreamHandler()
+logger.addHandler(handler)
+logger.setLevel(logging.INFO)
+
+
+def get_package_json(name):
+    """
+    Return the parsed crates.io API JSON for the crate `name`, or None
+    (after logging the failure) when crates.io answers with an error status.
+    """
+    url = f"https://crates.io/api/v1/crates/{name}"
+    try:
+        # Bounded wait: requests never times out unless told to.
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()
+        return response.json()
+    except requests.exceptions.HTTPError as err:
+        logger.error(f"HTTP error occurred: {err}")
+
+
+def map_cargo_package(package_url, pipelines, priority=0):
+    """
+    Add the cargo `package_url` to the PackageDB and queue every stored
+    package for scanning with `pipelines` at `priority`. Stop at the first
+    merge error. Return the error string, or a falsy value on success.
+    """
+    from minecode.model_utils import add_package_to_scan_queue
+    from minecode.model_utils import merge_or_create_package
+
+    package_json = get_package_json(name=package_url.name)
+
+    if not package_json:
+        error = f"Package does not exist on crates.io: {package_url}"
+        logger.error(error)
+        return error
+
+    # Pre-set: if no crate version matches the purl the loop never runs.
+    error = None
+    for package in build_packages(package_json, package_url):
+        package.extra_data["package_content"] = PackageContentType.SOURCE_ARCHIVE
+        db_package, _, _, error = merge_or_create_package(package, visit_level=0)
+        if error:
+            break
+
+        # Submit package for scanning
+        if db_package:
+            add_package_to_scan_queue(
+                package=db_package, pipelines=pipelines, priority=priority
+            )
+
+    return error
+
+
+@priority_router.route("pkg:cargo/.*")
+def process_request(purl_str, **kwargs):
+    """
+    Handle a priority request for the cargo Package URL string `purl_str`.
+
+    The purl is parsed and handed to `map_cargo_package` together with the
+    default pipelines extended by the optional `addon_pipelines` keyword
+    and the optional `priority` keyword (0 when absent).
+
+    Return the error message reported by the mapping, or None when there
+    is none.
+    """
+    from minecode.model_utils import DEFAULT_PIPELINES
+
+    priority = kwargs.get("priority", 0)
+    extra_pipelines = tuple(kwargs.get("addon_pipelines", []))
+    pipelines = DEFAULT_PIPELINES + extra_pipelines
+    purl = PackageURL.from_string(purl_str)
+
+    # A falsy outcome means nothing went wrong and is reported as None.
+    outcome = map_cargo_package(purl, pipelines, priority)
+    return outcome or None
diff --git a/minecode/miners/cargo.py b/minecode/miners/cargo.py
@@ -0,0 +1,93 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/aboutcode-org/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import requests
+
+from packagedcode import models as scan_models
+
+
+def build_packages(metadata_dict, purl):
+    """
+    Yield a PackageData for each crate version listed in `metadata_dict`.
+
+    `metadata_dict` is the crates.io API mapping for one crate. Its
+    "versions" list drives the output and the "keywords" of its "crate"
+    entry are copied onto every package.
+
+    `purl` is the requested package URL object, not a string: when its
+    `version` is set only the matching crate version is yielded, otherwise
+    every version is. Versions without a "dl_path" are skipped.
+
+    One HEAD request is sent per yielded package to resolve the final
+    download URL, with a 30 second timeout. Exceptions raised by `requests`
+    are not caught here.
+    """
+    purl_version = purl.version
+    for version_info in metadata_dict["versions"]:
+        version = version_info["num"]
+        # A versionless purl selects every listed version.
+        if purl_version and purl_version != version:
+            continue
+
+        # "UNKNOWN" and empty values count as no license statement.
+        extracted_license_statement = []
+        lic = version_info["license"]
+        if lic and lic != "UNKNOWN":
+            extracted_license_statement.append(lic)
+
+        # mapping of information that is common to all the downloads of a
+        # version
+        common_data = dict(
+            name=version_info["crate"],
+            version=version,
+            description=version_info["description"],
+            homepage_url=version_info["homepage"],
+            repository_homepage_url=version_info["repository"],
+            extracted_license_statement=extracted_license_statement,
+            keywords=metadata_dict["crate"]["keywords"],
+        )
+
+        publisher = version_info["published_by"]
+        if publisher:
+            # Prefer the display name and fall back to the account login.
+            author = publisher["name"] or publisher["login"]
+            if author:
+                common_data["parties"] = [
+                    scan_models.Party(name=author, role="author")
+                ]
+
+        download_path = version_info["dl_path"]
+        if not download_path:
+            continue
+
+        # The path is expected to start with "/" (e.g.,
+        # "/api/v1/crates/purl/0.1.5/download"): strip it so the host is
+        # not followed by "//".
+        download_url = "https://crates.io/" + download_path.lstrip("/")
+        # That URL always ends with "/download", so follow the redirect to
+        # get a URL whose filename is not simply "download". The timeout
+        # keeps a stalled registry from hanging the loop.
+        response = requests.head(download_url, allow_redirects=True, timeout=30)
+        download_url = response.url
+
+        download_data = dict(
+            datasource_id="cargo_pkginfo",
+            type="cargo",
+            download_url=download_url,
+            size=version_info["crate_size"],
+            sha256=version_info["checksum"],
+        )
+        download_data.update(common_data)
+        package = scan_models.PackageData.from_data(download_data)
+
+        # Overrides the "cargo_pkginfo" id passed to from_data() above.
+        package.datasource_id = "cargo_api_metadata"
+        package.set_purl(purl)
+        yield package
+