Skip to content

Commit 8336056

Browse files
committed
chore: pypi inspector link generation now lives in a dataclass in the pypi registry code
Signed-off-by: Carl Flottmann <[email protected]>
1 parent 5f998e0 commit 8336056

File tree

5 files changed

+192
-235
lines changed

5 files changed

+192
-235
lines changed

src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_absence.py

Lines changed: 7 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,10 @@
66
import logging
77

88
from macaron.errors import HeuristicAnalyzerValueError
9-
from macaron.json_tools import JsonType, json_extract
9+
from macaron.json_tools import JsonType
1010
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
1111
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
1212
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
13-
from macaron.util import send_head_http_raw
1413

1514
logger: logging.Logger = logging.getLogger(__name__)
1615

@@ -23,13 +22,6 @@ class WheelAbsenceAnalyzer(BaseHeuristicAnalyzer):
2322
heuristic fails.
2423
"""
2524

26-
WHEEL: str = "bdist_wheel"
27-
# as per https://github.com/pypi/inspector/blob/main/inspector/main.py line 125
28-
INSPECTOR_TEMPLATE = (
29-
"{inspector_url_scheme}://{inspector_url_netloc}/project/"
30-
"{name}/{version}/packages/{first}/{second}/{rest}/{filename}"
31-
)
32-
3325
def __init__(self) -> None:
3426
super().__init__(
3527
name="wheel_absence_analyzer",
@@ -53,83 +45,17 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
5345
Raises
5446
------
5547
HeuristicAnalyzerValueError
56-
If there is no release information, or has other missing package information.
48+
If there is missing package information.
5749
"""
58-
releases = pypi_package_json.get_releases()
59-
if releases is None: # no release information
60-
error_msg = "There is no information for any release of this package."
61-
logger.debug(error_msg)
62-
raise HeuristicAnalyzerValueError(error_msg)
63-
64-
version = pypi_package_json.component_version
65-
if version is None: # check latest release version
66-
version = pypi_package_json.get_latest_version()
67-
68-
if version is None:
69-
error_msg = "There is no latest version of this package."
70-
logger.debug(error_msg)
71-
raise HeuristicAnalyzerValueError(error_msg)
72-
73-
# Contains a boolean field identifying if the link is reachable by this Macaron instance or not.
74-
inspector_links: dict[str, JsonType] = {}
75-
wheel_present: bool = False
76-
77-
release_distributions = json_extract(releases, [version], list)
78-
if release_distributions is None:
79-
error_msg = f"The version {version} is not available as a release."
50+
if not pypi_package_json.get_inspector_links():
51+
error_msg = "Unable to retrieve PyPI inspector information about package"
8052
logger.debug(error_msg)
8153
raise HeuristicAnalyzerValueError(error_msg)
8254

83-
for distribution in release_distributions:
84-
# validate data
85-
package_type = json_extract(distribution, ["packagetype"], str)
86-
if package_type is None:
87-
error_msg = f"The version {version} has no 'package type' field in a distribution"
88-
logger.debug(error_msg)
89-
raise HeuristicAnalyzerValueError(error_msg)
90-
91-
name = json_extract(pypi_package_json.package_json, ["info", "name"], str)
92-
if name is None:
93-
error_msg = f"The version {version} has no 'name' field in a distribution"
94-
logger.debug(error_msg)
95-
raise HeuristicAnalyzerValueError(error_msg)
96-
97-
blake2b_256 = json_extract(distribution, ["digests", "blake2b_256"], str)
98-
if blake2b_256 is None:
99-
error_msg = f"The version {version} has no 'blake2b_256' field in a distribution"
100-
logger.debug(error_msg)
101-
raise HeuristicAnalyzerValueError(error_msg)
102-
103-
filename = json_extract(distribution, ["filename"], str)
104-
if filename is None:
105-
error_msg = f"The version {version} has no 'filename' field in a distribution"
106-
logger.debug(error_msg)
107-
raise HeuristicAnalyzerValueError(error_msg)
108-
109-
if package_type == self.WHEEL:
110-
wheel_present = True
111-
112-
inspector_link = self.INSPECTOR_TEMPLATE.format(
113-
inspector_url_scheme=pypi_package_json.pypi_registry.inspector_url_scheme,
114-
inspector_url_netloc=pypi_package_json.pypi_registry.inspector_url_netloc,
115-
name=name,
116-
version=version,
117-
first=blake2b_256[0:2],
118-
second=blake2b_256[2:4],
119-
rest=blake2b_256[4:],
120-
filename=filename,
121-
)
122-
123-
# use a head request because we don't care about the response contents
124-
inspector_links[inspector_link] = False
125-
if send_head_http_raw(inspector_link):
126-
inspector_links[inspector_link] = True # link was reachable
127-
128-
detail_info: dict[str, JsonType] = {
129-
"inspector_links": inspector_links,
130-
}
55+
detail_info: dict = {"inspector_links": pypi_package_json.inspector_asset.package_link_reachability}
13156

132-
if wheel_present:
57+
# At least one wheel file exists
58+
if len(pypi_package_json.inspector_asset.package_whl_links) > 0:
13359
return HeuristicResult.PASS, detail_info
13460

13561
return HeuristicResult.FAIL, detail_info

src/macaron/repo_finder/repo_finder_pypi.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,11 @@
99
from macaron.repo_finder.repo_finder_enums import RepoFinderInfo
1010
from macaron.repo_finder.repo_validator import find_valid_repository_url
1111
from macaron.slsa_analyzer.package_registry import PACKAGE_REGISTRIES, PyPIRegistry
12-
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset, find_or_create_pypi_asset
12+
from macaron.slsa_analyzer.package_registry.pypi_registry import (
13+
PyPIInspectorAsset,
14+
PyPIPackageJsonAsset,
15+
find_or_create_pypi_asset,
16+
)
1317
from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo
1418

1519
logger: logging.Logger = logging.getLogger(__name__)
@@ -58,7 +62,9 @@ def find_repo(
5862
pypi_registry = next((registry for registry in PACKAGE_REGISTRIES if isinstance(registry, PyPIRegistry)), None)
5963
if not pypi_registry:
6064
return "", RepoFinderInfo.PYPI_NO_REGISTRY
61-
pypi_asset = PyPIPackageJsonAsset(purl.name, purl.version, False, pypi_registry, {}, "")
65+
pypi_asset = PyPIPackageJsonAsset(
66+
purl.name, purl.version, False, pypi_registry, {}, "", PyPIInspectorAsset("", [], {})
67+
)
6268

6369
if not pypi_asset:
6470
# This should be unreachable, as the pypi_registry has already been confirmed to be of type PyPIRegistry.

src/macaron/slsa_analyzer/package_registry/pypi_registry.py

Lines changed: 119 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
can_download_file,
3131
download_file_with_size_limit,
3232
send_get_http_raw,
33+
send_head_http_raw,
3334
stream_file_with_size_limit,
3435
)
3536

@@ -465,6 +466,33 @@ def extract_attestation(attestation_data: dict) -> dict | None:
465466
return attestations[0]
466467

467468

469+
# URL layout of a PyPI inspector file page, as per
# https://github.com/pypi/inspector/blob/main/inspector/main.py line 125.
# The first/second/rest placeholders receive the leading two characters, the next two
# characters, and the remainder of the distribution's blake2b_256 digest.
INSPECTOR_TEMPLATE = (
    "{inspector_url_scheme}://{inspector_url_netloc}"
    "/project/{name}/{version}"
    "/packages/{first}/{second}/{rest}/{filename}"
)
474+
475+
476+
@dataclass
class PyPIInspectorAsset:
    """The package PyPI inspector information."""

    #: the pypi inspector link to the tarball
    package_sdist_link: str

    #: the pypi inspector link(s) to the wheel(s)
    package_whl_links: list[str]

    #: a mapping of inspector links to whether they are reachable
    package_link_reachability: dict[str, bool]

    def __bool__(self) -> bool:
        """Determine if this inspector object holds any inspector information.

        Returns
        -------
        bool
            True if at least one link (sdist or wheel) is recorded and the reachability
            mapping is not empty, False otherwise.
        """
        has_link = bool(self.package_sdist_link or self.package_whl_links)
        return has_link and bool(self.package_link_reachability)
494+
495+
468496
@dataclass
469497
class PyPIPackageJsonAsset:
470498
"""The package JSON hosted on the PyPI registry."""
@@ -487,6 +515,9 @@ class PyPIPackageJsonAsset:
487515
#: the source code temporary location name
488516
package_sourcecode_path: str
489517

518+
#: the pypi inspector information about this package
519+
inspector_asset: PyPIInspectorAsset
520+
490521
#: The size of the asset (in bytes). This attribute is added to match the AssetLocator
491522
#: protocol and is not used because pypi API registry does not provide it.
492523
@property
@@ -753,6 +784,91 @@ def get_sha256(self) -> str | None:
753784
logger.debug("Found sha256 hash: %s", artifact_hash)
754785
return artifact_hash
755786

787+
def get_inspector_links(self) -> bool:
    """Generate PyPI inspector links for this package version's distributions and fill in the inspector asset.

    If the inspector asset is already populated it is reused and nothing is regenerated.
    Otherwise a link is built for every sdist and wheel distribution of the analysed version
    (the component version, falling back to the latest version), and each link is probed with
    a HEAD request to record whether it is reachable.

    Returns
    -------
    bool
        True if the link generation was successful, False otherwise.
    """
    if self.inspector_asset:
        return True

    if not self.package_json and not self.download(""):
        logger.warning("No package metadata available, cannot get links")
        return False

    releases = self.get_releases()
    if releases is None:
        logger.warning("Package has no releases, cannot create inspector links.")
        return False

    version = self.component_version
    if version is None:
        version = self.get_latest_version()

    if version is None:
        logger.warning("No version set, and no latest version exists. cannot create inspector links.")
        return False

    distributions = json_extract(releases, [version], list)
    if not distributions:
        logger.warning("Package has no distributions for release version %s. Cannot create inspector links.", version)
        return False

    # The name is read from the package-level "info" object, so it is the same for every
    # distribution: look it up once and give up early when it is missing.
    name = json_extract(self.package_json, ["info", "name"], str)
    if name is None:
        logger.warning("The package metadata has no 'name' field, cannot create inspector links.")
        return False

    for distribution in distributions:
        package_type = json_extract(distribution, ["packagetype"], str)
        if package_type is None:
            logger.warning("The version %s has no 'package type' field in a distribution", version)
            continue

        # Only sdist and wheel links are stored, so skip anything else before spending a
        # network request on a link that would be thrown away.
        if package_type not in ("sdist", "bdist_wheel"):
            # no other package types exist, so this should never occur
            logger.debug("Unknown package distribution type: %s", package_type)
            continue

        blake2b_256 = json_extract(distribution, ["digests", "blake2b_256"], str)
        if blake2b_256 is None:
            logger.warning("The version %s has no 'blake2b_256' field in a distribution", version)
            continue

        filename = json_extract(distribution, ["filename"], str)
        if filename is None:
            logger.warning("The version %s has no 'filename' field in a distribution", version)
            continue

        link = INSPECTOR_TEMPLATE.format(
            inspector_url_scheme=self.pypi_registry.inspector_url_scheme,
            inspector_url_netloc=self.pypi_registry.inspector_url_netloc,
            name=name,
            version=version,
            first=blake2b_256[0:2],
            second=blake2b_256[2:4],
            rest=blake2b_256[4:],
            filename=filename,
        )

        # use a head request because we don't care about the response contents
        reachable = bool(send_head_http_raw(link))

        if package_type == "sdist":
            self.inspector_asset.package_sdist_link = link
        else:
            self.inspector_asset.package_whl_links.append(link)
        self.inspector_asset.package_link_reachability[link] = reachable

    # False when every distribution was skipped and nothing was recorded
    return bool(self.inspector_asset)
871+
756872

757873
def find_or_create_pypi_asset(
758874
asset_name: str, asset_version: str | None, pypi_registry_info: PackageRegistryInfo
@@ -790,6 +906,8 @@ def find_or_create_pypi_asset(
790906
logger.debug("Failed to create PyPIPackageJson asset.")
791907
return None
792908

793-
asset = PyPIPackageJsonAsset(asset_name, asset_version, False, package_registry, {}, "")
909+
asset = PyPIPackageJsonAsset(
910+
asset_name, asset_version, False, package_registry, {}, "", PyPIInspectorAsset("", [], {})
911+
)
794912
pypi_registry_info.metadata.append(asset)
795913
return asset

tests/malware_analyzer/pypi/conftest.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
1+
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
22
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
33

44
"""This module contains test configurations for malware analyzer."""
@@ -8,7 +8,7 @@
88
import pytest
99

1010
from macaron.database.table_definitions import Analysis, Component, RepoFinderMetadata
11-
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset, PyPIRegistry
11+
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIInspectorAsset, PyPIPackageJsonAsset, PyPIRegistry
1212

1313

1414
@pytest.fixture(autouse=True)
@@ -26,4 +26,5 @@ def pypi_package_json() -> MagicMock:
2626
pypi_package.component = Component(
2727
purl="pkg:pypi/package", analysis=Analysis(), repository=None, repo_finder_metadata=RepoFinderMetadata()
2828
)
29+
pypi_package.inspector_asset = MagicMock(spec=PyPIInspectorAsset)
2930
return pypi_package

0 commit comments

Comments
 (0)