Skip to content

Commit 0a0bb1f

Browse files
authored
feat: add support for PyPI purls in purl resolution
Signed-off-by: tdruez <[email protected]>
1 parent 1eecd60 commit 0a0bb1f

File tree

10 files changed

+129
-26
lines changed

10 files changed

+129
-26
lines changed

component_catalog/admin.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -831,7 +831,7 @@ class PackageAdmin(
831831
"version",
832832
"qualifiers",
833833
"subpath",
834-
"inferred_url",
834+
"inferred_repo_url",
835835
)
836836
},
837837
),
@@ -898,7 +898,7 @@ class PackageAdmin(
898898
]
899899
readonly_fields = DataspacedAdmin.readonly_fields + (
900900
"package_url",
901-
"inferred_url",
901+
"inferred_repo_url",
902902
)
903903
form = PackageAdminForm
904904
importer_class = PackageImporter
@@ -1071,7 +1071,7 @@ def components_links(self, obj):
10711071

10721072
@admin.display(description="Inferred URL")
10731073
def inferred_url(self, obj):
1074-
if inferred_url := obj.inferred_url:
1074+
if inferred_url := obj.inferred_repo_url:
10751075
return urlize_target_blank(inferred_url)
10761076
return ""
10771077

component_catalog/management/commands/componentfrompackage.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def create_component_from_package(self, package):
101101
# The proper policy will be set from the ``license_expression`` value
102102
component_data.pop("usage_policy", None)
103103

104-
if inferred_url := package.inferred_url:
104+
if inferred_url := package.inferred_repo_url:
105105
component_data["code_view_url"] = inferred_url
106106
component_data["homepage_url"] = inferred_url
107107

component_catalog/models.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,8 @@
5454
from component_catalog.license_expression_dje import get_license_objects
5555
from component_catalog.license_expression_dje import parse_expression
5656
from component_catalog.license_expression_dje import render_expression_as_html
57+
from dejacode_toolkit import download
5758
from dejacode_toolkit import spdx
58-
from dejacode_toolkit.download import DataCollectionException
59-
from dejacode_toolkit.download import collect_package_data
6059
from dejacode_toolkit.purldb import PurlDB
6160
from dejacode_toolkit.purldb import pick_purldb_entry
6261
from dejacode_toolkit.scancodeio import ScanCodeIO
@@ -2036,7 +2035,7 @@ def package_url_filename(self):
20362035
return get_valid_filename(cleaned_package_url)
20372036

20382037
@property
2039-
def inferred_url(self):
2038+
def inferred_repo_url(self):
20402039
"""Return the URL deduced from the information available in a Package URL (purl)."""
20412040
return purl2url.get_repo_url(self.package_url)
20422041

@@ -2122,8 +2121,8 @@ def collect_data(self, force_update=False, save=True):
21222121
return
21232122

21242123
try:
2125-
package_data = collect_package_data(self.download_url)
2126-
except DataCollectionException as e:
2124+
package_data = download.collect_package_data(self.download_url)
2125+
except download.DataCollectionException as e:
21272126
tasks_logger.info(e)
21282127
return
21292128
tasks_logger.info("Package data collected.")
@@ -2476,7 +2475,7 @@ def create_from_url(cls, url, user):
24762475
scoped_packages_qs = cls.objects.scope(user.dataspace)
24772476

24782477
if is_purl_str(url):
2479-
download_url = purl2url.get_download_url(url)
2478+
download_url = download.infer_download_url(url)
24802479
package_url = PackageURL.from_string(url)
24812480
existing_packages = scoped_packages_qs.for_package_url(url, exact_match=True)
24822481
else:
@@ -2504,7 +2503,7 @@ def create_from_url(cls, url, user):
25042503
package_data.update(purldb_data)
25052504

25062505
if download_url and not purldb_data:
2507-
package_data = collect_package_data(download_url)
2506+
package_data = download.collect_package_data(download_url)
25082507

25092508
# Check for existing package by hash fields with a single database query
25102509
hash_fields = ["sha512", "sha256", "sha1", "md5"]

component_catalog/tests/test_models.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1701,7 +1701,7 @@ def test_package_model_update_from_data(self):
17011701
package.refresh_from_db()
17021702
self.assertEqual("apache-2.0", package.declared_license_expression)
17031703

1704-
@mock.patch("component_catalog.models.collect_package_data")
1704+
@mock.patch("dejacode_toolkit.download.collect_package_data")
17051705
def test_package_model_create_from_url(self, mock_collect):
17061706
self.assertIsNone(Package.create_from_url(url=" ", user=self.user))
17071707

@@ -1729,6 +1729,15 @@ def test_package_model_create_from_url(self, mock_collect):
17291729
self.assertEqual(purl, package.package_url)
17301730
mock_collect.assert_called_with("https://registry.npmjs.org/is-npm/-/is-npm-1.0.0.tgz")
17311731

1732+
purl = "pkg:pypi/[email protected]"
1733+
download_url = "https://files.pythonhosted.org/packages/Django-5.2.tar.gz"
1734+
mock_collect.return_value = {}
1735+
with mock.patch("dejacode_toolkit.download.PyPIFetcher.get_download_url") as mock_pypi_get:
1736+
mock_pypi_get.return_value = download_url
1737+
package = Package.create_from_url(url=purl, user=self.user)
1738+
self.assertEqual(purl, package.package_url)
1739+
mock_collect.assert_called_with(download_url)
1740+
17321741
@mock.patch("component_catalog.models.Package.get_purldb_entries")
17331742
@mock.patch("dejacode_toolkit.purldb.PurlDB.is_configured")
17341743
def test_package_model_create_from_url_enable_purldb_access(
@@ -2554,18 +2563,18 @@ def test_package_model_where_used_property(self):
25542563
)
25552564
self.assertEqual("Product 0\nComponent 1\n", package1.where_used(user=basic_user))
25562565

2557-
def test_package_model_inferred_url_property(self):
2566+
def test_package_model_inferred_repo_url_property(self):
25582567
package1 = Package.objects.create(filename="package", dataspace=self.dataspace)
2559-
self.assertIsNone(package1.inferred_url)
2568+
self.assertIsNone(package1.inferred_repo_url)
25602569

25612570
package1.set_package_url("pkg:pypi/[email protected]")
25622571
package1.save()
2563-
self.assertEqual("https://pypi.org/project/toml/0.10.2/", package1.inferred_url)
2572+
self.assertEqual("https://pypi.org/project/toml/0.10.2/", package1.inferred_repo_url)
25642573

25652574
package1.set_package_url("pkg:github/package-url/[email protected]?version_prefix=v")
25662575
package1.save()
25672576
expected = "https://github.com/package-url/packageurl-python/tree/v0.10.4"
2568-
self.assertEqual(expected, package1.inferred_url)
2577+
self.assertEqual(expected, package1.inferred_repo_url)
25692578

25702579
@mock.patch("dejacode_toolkit.purldb.PurlDB.find_packages")
25712580
def test_package_model_get_purldb_entries(self, mock_find_packages):

component_catalog/tests/test_views.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1726,7 +1726,7 @@ def test_package_create_ajax_view(self):
17261726
"sha1": "5ba93c9db0cff93f52b521d7420e43f6eda2784f",
17271727
"md5": "93b885adfe0da089cdf634904fd59f71",
17281728
}
1729-
with mock.patch("component_catalog.models.collect_package_data") as collect:
1729+
with mock.patch("dejacode_toolkit.download.collect_package_data") as collect:
17301730
collect.return_value = collected_data
17311731
response = self.client.post(package_add_url, data)
17321732

@@ -1749,7 +1749,7 @@ def test_package_create_ajax_view(self):
17491749
# Different URL but sha1 match in the db
17501750
data = {"download_urls": "https://url.com/file.ext"}
17511751
collected_data["download_url"] = data["download_urls"]
1752-
with mock.patch("component_catalog.models.collect_package_data") as collect:
1752+
with mock.patch("dejacode_toolkit.download.collect_package_data") as collect:
17531753
collect.return_value = collected_data
17541754
response = self.client.post(package_add_url, data)
17551755

component_catalog/views.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@
4747
from crispy_forms.utils import render_crispy_form
4848
from natsort import natsorted
4949
from packageurl import PackageURL
50-
from packageurl.contrib import purl2url
5150

5251
from component_catalog.filters import ComponentFilterSet
5352
from component_catalog.filters import PackageFilterSet
@@ -72,6 +71,7 @@
7271
from component_catalog.models import PackageAlreadyExistsWarning
7372
from component_catalog.models import Subcomponent
7473
from dejacode_toolkit.download import DataCollectionException
74+
from dejacode_toolkit.download import infer_download_url
7575
from dejacode_toolkit.purldb import PurlDB
7676
from dejacode_toolkit.scancodeio import ScanCodeIO
7777
from dejacode_toolkit.scancodeio import ScanStatus
@@ -1083,7 +1083,7 @@ class PackageDetailsView(
10831083
"package_url",
10841084
"filename",
10851085
"download_url",
1086-
"inferred_url",
1086+
"inferred_repo_url",
10871087
"size",
10881088
"release_date",
10891089
"primary_language",
@@ -1943,7 +1943,7 @@ def get_initial(self):
19431943
purl = PackageURL.from_string(package_url)
19441944
package_url_dict = purl.to_dict(encode=True, empty="")
19451945
initial.update(package_url_dict)
1946-
if download_url := purl2url.get_download_url(package_url):
1946+
if download_url := infer_download_url(purl):
19471947
initial.update({"download_url": download_url})
19481948

19491949
return initial

dejacode/settings.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -468,7 +468,7 @@ def enable_rq_eager_mode():
468468
This function patches the django_rq.get_redis_connection to always return a fake
469469
redis connection using the `fakeredis` library.
470470
"""
471-
import django_rq.queues
471+
from django_rq import connection_utils
472472
from fakeredis import FakeRedis
473473
from fakeredis import FakeStrictRedis
474474

@@ -478,7 +478,7 @@ def enable_rq_eager_mode():
478478
def get_fake_redis_connection(config, use_strict_redis):
479479
return FakeStrictRedis() if use_strict_redis else FakeRedis()
480480

481-
django_rq.queues.get_redis_connection = get_fake_redis_connection
481+
connection_utils.get_redis_connection = get_fake_redis_connection
482482

483483

484484
DEJACODE_ASYNC = env.bool("DEJACODE_ASYNC", default=False)

dejacode_toolkit/download.py

Lines changed: 96 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
# See https://aboutcode.org for more information about AboutCode FOSS projects.
77
#
88

9+
from contextlib import suppress
910
from pathlib import Path
1011
from urllib.parse import unquote
1112
from urllib.parse import urlparse
@@ -14,13 +15,16 @@
1415
from django.utils.http import parse_header_parameters
1516

1617
import requests
18+
from packageurl import PackageURL
19+
from packageurl.contrib import purl2url
1720

1821
from dejacode_toolkit.utils import md5
1922
from dejacode_toolkit.utils import sha1
2023
from dejacode_toolkit.utils import sha256
2124
from dejacode_toolkit.utils import sha512
2225

2326
CONTENT_MAX_LENGTH = 536870912 # 512 MB
27+
DEFAULT_TIMEOUT = 5
2428

2529

2630
class DataCollectionException(Exception):
@@ -29,7 +33,7 @@ class DataCollectionException(Exception):
2933

3034
def collect_package_data(url):
3135
try:
32-
response = requests.get(url, timeout=5, stream=True)
36+
response = requests.get(url, timeout=DEFAULT_TIMEOUT, stream=True)
3337
except (TimeoutError, requests.RequestException) as e:
3438
raise DataCollectionException(e)
3539

@@ -73,3 +77,94 @@ def collect_package_data(url):
7377
}
7478

7579
return package_data
80+
81+
82+
class PyPIFetcher:
    """
    Handle PyPI Package URL (PURL) resolution and download URL retrieval.

    Adapted from fetchcode
    https://github.com/aboutcode-org/fetchcode/issues/190
    """

    purl_pattern = "pkg:pypi/.*"
    base_url = "https://pypi.org/pypi"

    @staticmethod
    def fetch_json_response(url):
        """
        Fetch a JSON response from the given URL and return the parsed JSON data.

        Raise an ``Exception`` when the response status is not 200 or when the body
        cannot be parsed as JSON. Errors raised by ``requests.get`` itself are not
        caught here and propagate to the caller.
        """
        response = requests.get(url, timeout=DEFAULT_TIMEOUT)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch {url}: {response.status_code} {response.reason}")

        try:
            return response.json()
        except ValueError as e:
            # Chain the original error so the parsing failure stays visible in tracebacks.
            raise Exception(f"Failed to parse JSON from {url}: {str(e)}") from e

    @classmethod
    def get_package_data(cls, purl):
        """
        Fetch package data from PyPI API.

        ``purl`` is a Package URL string. When it carries a version, the
        version-specific endpoint is queried, otherwise the package-level one.
        """
        parsed_purl = PackageURL.from_string(purl)

        if parsed_purl.version:
            api_url = f"{cls.base_url}/{parsed_purl.name}/{parsed_purl.version}/json"
        else:
            api_url = f"{cls.base_url}/{parsed_purl.name}/json"

        return cls.fetch_json_response(api_url)

    @classmethod
    def get_urls_info(cls, purl):
        """
        Collect URL info dicts from PyPI API.

        Always return a list: a missing or null ``urls`` entry yields ``[]``.
        """
        data = cls.get_package_data(purl)
        # ``or []`` also covers a payload where "urls" is present but null.
        return data.get("urls") or []

    @classmethod
    def get_download_url(cls, purl, preferred_type="sdist"):
        """
        Get a single download URL from PyPI API.
        If no version is specified in the PURL, fetches the latest version.

        Return the URL of the first entry whose ``packagetype`` equals
        ``preferred_type``, else the URL of the first usable entry, else ``None``.
        """
        # Ignore entries without a "url" key, the same way ``get_all_download_urls``
        # does, instead of failing with a ``KeyError`` on them.
        candidates = [url_info for url_info in cls.get_urls_info(purl) if "url" in url_info]

        if not candidates:
            return None

        for url_info in candidates:
            if url_info.get("packagetype") == preferred_type:
                return url_info["url"]

        return candidates[0]["url"]

    @classmethod
    def get_all_download_urls(cls, purl):
        """
        Get all download URLs from PyPI API.
        If no version is specified in the PURL, fetches the latest version.

        Entries without a ``url`` key are skipped.
        """
        urls_info = cls.get_urls_info(purl)
        return [url_info["url"] for url_info in urls_info if "url" in url_info]
148+
149+
150+
def infer_download_url(purl):
    """
    Return the download URL inferred from a Package URL (purl), or ``None``.

    ``purl`` may be a ``PackageURL`` instance or a purl string. The generic
    ``purl2url`` resolver is tried first. For ``pypi`` purls that it cannot resolve,
    ``PyPIFetcher`` is queried as a fallback (this may make HTTP requests), and any
    error raised by that fallback results in ``None``.
    """
    if isinstance(purl, PackageURL):
        parsed, text = purl, str(purl)
    else:
        parsed, text = PackageURL.from_string(purl), purl

    resolved = purl2url.get_download_url(text)
    if resolved:
        return resolved

    # PyPI is not supported by ``purl2url``, it requires an API call to resolve download URLs.
    if parsed.type != "pypi":
        return None

    try:
        return PyPIFetcher.get_download_url(text, preferred_type="sdist")
    except Exception:
        # Best-effort lookup: a failing PyPI call means "no URL could be inferred".
        return None

dje/views.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1071,7 +1071,7 @@ def show_usage_policy(value):
10711071
]
10721072
)
10731073

1074-
if inferred_url := package.inferred_url:
1074+
if inferred_url := package.inferred_repo_url:
10751075
inferred_url_help = (
10761076
"A URL deduced from the information available in a Package URL (purl)."
10771077
)

reporting/forms.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ def get_model_data_for_column_template(dataspace=None):
219219
"package_url",
220220
"short_package_url",
221221
"where_used",
222-
"inferred_url",
222+
"inferred_repo_url",
223223
"is_vulnerable",
224224
],
225225
},

0 commit comments

Comments
 (0)