Skip to content

Commit 863576c

Browse files
committed
Add support for PyPI PURLs in purl resolution
Signed-off-by: tdruez <[email protected]>
1 parent 01c50f6 commit 863576c

File tree

5 files changed

+115
-12
lines changed

5 files changed

+115
-12
lines changed

component_catalog/models.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,8 @@
5454
from component_catalog.license_expression_dje import get_license_objects
5555
from component_catalog.license_expression_dje import parse_expression
5656
from component_catalog.license_expression_dje import render_expression_as_html
57+
from dejacode_toolkit import download
5758
from dejacode_toolkit import spdx
58-
from dejacode_toolkit.download import DataCollectionException
59-
from dejacode_toolkit.download import collect_package_data
6059
from dejacode_toolkit.purldb import PurlDB
6160
from dejacode_toolkit.purldb import pick_purldb_entry
6261
from dejacode_toolkit.scancodeio import ScanCodeIO
@@ -2122,8 +2121,8 @@ def collect_data(self, force_update=False, save=True):
21222121
return
21232122

21242123
try:
2125-
package_data = collect_package_data(self.download_url)
2126-
except DataCollectionException as e:
2124+
package_data = download.collect_package_data(self.download_url)
2125+
except download.DataCollectionException as e:
21272126
tasks_logger.info(e)
21282127
return
21292128
tasks_logger.info("Package data collected.")
@@ -2476,7 +2475,7 @@ def create_from_url(cls, url, user):
24762475
scoped_packages_qs = cls.objects.scope(user.dataspace)
24772476

24782477
if is_purl_str(url):
2479-
download_url = purl2url.get_download_url(url)
2478+
download_url = download.infer_download_url(url)
24802479
package_url = PackageURL.from_string(url)
24812480
existing_packages = scoped_packages_qs.for_package_url(url, exact_match=True)
24822481
else:
@@ -2504,7 +2503,7 @@ def create_from_url(cls, url, user):
25042503
package_data.update(purldb_data)
25052504

25062505
if download_url and not purldb_data:
2507-
package_data = collect_package_data(download_url)
2506+
package_data = download.collect_package_data(download_url)
25082507

25092508
# Check for existing package by hash fields with a single database query
25102509
hash_fields = ["sha512", "sha256", "sha1", "md5"]

component_catalog/tests/test_models.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1701,7 +1701,7 @@ def test_package_model_update_from_data(self):
17011701
package.refresh_from_db()
17021702
self.assertEqual("apache-2.0", package.declared_license_expression)
17031703

1704-
@mock.patch("component_catalog.models.collect_package_data")
1704+
@mock.patch("dejacode_toolkit.download.collect_package_data")
17051705
def test_package_model_create_from_url(self, mock_collect):
17061706
self.assertIsNone(Package.create_from_url(url=" ", user=self.user))
17071707

@@ -1729,6 +1729,15 @@ def test_package_model_create_from_url(self, mock_collect):
17291729
self.assertEqual(purl, package.package_url)
17301730
mock_collect.assert_called_with("https://registry.npmjs.org/is-npm/-/is-npm-1.0.0.tgz")
17311731

1732+
purl = "pkg:pypi/django@5.2"
1733+
download_url = "https://files.pythonhosted.org/packages/Django-5.2.tar.gz"
1734+
mock_collect.return_value = {}
1735+
with mock.patch("dejacode_toolkit.download.PyPIFetcher.get_download_url") as mock_pypi_get:
1736+
mock_pypi_get.return_value = download_url
1737+
package = Package.create_from_url(url=purl, user=self.user)
1738+
self.assertEqual(purl, package.package_url)
1739+
mock_collect.assert_called_with(download_url)
1740+
17321741
@mock.patch("component_catalog.models.Package.get_purldb_entries")
17331742
@mock.patch("dejacode_toolkit.purldb.PurlDB.is_configured")
17341743
def test_package_model_create_from_url_enable_purldb_access(

component_catalog/tests/test_views.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1726,7 +1726,7 @@ def test_package_create_ajax_view(self):
17261726
"sha1": "5ba93c9db0cff93f52b521d7420e43f6eda2784f",
17271727
"md5": "93b885adfe0da089cdf634904fd59f71",
17281728
}
1729-
with mock.patch("component_catalog.models.collect_package_data") as collect:
1729+
with mock.patch("dejacode_toolkit.download.collect_package_data") as collect:
17301730
collect.return_value = collected_data
17311731
response = self.client.post(package_add_url, data)
17321732

@@ -1749,7 +1749,7 @@ def test_package_create_ajax_view(self):
17491749
# Different URL but sha1 match in the db
17501750
data = {"download_urls": "https://url.com/file.ext"}
17511751
collected_data["download_url"] = data["download_urls"]
1752-
with mock.patch("component_catalog.models.collect_package_data") as collect:
1752+
with mock.patch("dejacode_toolkit.download.collect_package_data") as collect:
17531753
collect.return_value = collected_data
17541754
response = self.client.post(package_add_url, data)
17551755

component_catalog/views.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@
4747
from crispy_forms.utils import render_crispy_form
4848
from natsort import natsorted
4949
from packageurl import PackageURL
50-
from packageurl.contrib import purl2url
5150

5251
from component_catalog.filters import ComponentFilterSet
5352
from component_catalog.filters import PackageFilterSet
@@ -72,6 +71,7 @@
7271
from component_catalog.models import PackageAlreadyExistsWarning
7372
from component_catalog.models import Subcomponent
7473
from dejacode_toolkit.download import DataCollectionException
74+
from dejacode_toolkit.download import infer_download_url
7575
from dejacode_toolkit.purldb import PurlDB
7676
from dejacode_toolkit.scancodeio import ScanCodeIO
7777
from dejacode_toolkit.scancodeio import ScanStatus
@@ -1943,7 +1943,7 @@ def get_initial(self):
19431943
purl = PackageURL.from_string(package_url)
19441944
package_url_dict = purl.to_dict(encode=True, empty="")
19451945
initial.update(package_url_dict)
1946-
if download_url := purl2url.get_download_url(package_url):
1946+
if download_url := infer_download_url(purl):
19471947
initial.update({"download_url": download_url})
19481948

19491949
return initial

dejacode_toolkit/download.py

Lines changed: 96 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
# See https://aboutcode.org for more information about AboutCode FOSS projects.
77
#
88

9+
from contextlib import suppress
910
from pathlib import Path
1011
from urllib.parse import unquote
1112
from urllib.parse import urlparse
@@ -14,13 +15,16 @@
1415
from django.utils.http import parse_header_parameters
1516

1617
import requests
18+
from packageurl import PackageURL
19+
from packageurl.contrib import purl2url
1720

1821
from dejacode_toolkit.utils import md5
1922
from dejacode_toolkit.utils import sha1
2023
from dejacode_toolkit.utils import sha256
2124
from dejacode_toolkit.utils import sha512
2225

2326
CONTENT_MAX_LENGTH = 536870912 # 512 MB
27+
DEFAULT_TIMEOUT = 5
2428

2529

2630
class DataCollectionException(Exception):
@@ -29,7 +33,7 @@ class DataCollectionException(Exception):
2933

3034
def collect_package_data(url):
3135
try:
32-
response = requests.get(url, timeout=5, stream=True)
36+
response = requests.get(url, timeout=DEFAULT_TIMEOUT, stream=True)
3337
except (TimeoutError, requests.RequestException) as e:
3438
raise DataCollectionException(e)
3539

@@ -73,3 +77,94 @@ def collect_package_data(url):
7377
}
7478

7579
return package_data
80+
81+
82+
class PyPIFetcher:
    """
    Handle PyPI Package URL (PURL) resolution and download URL retrieval.

    Adapted from fetchcode
    https://github.com/aboutcode-org/fetchcode/issues/190
    """

    # Shape of the PURLs this fetcher targets.
    purl_pattern = "pkg:pypi/.*"
    # Root of the PyPI JSON API endpoints built in ``get_package_data``.
    base_url = "https://pypi.org/pypi"

    @staticmethod
    def fetch_json_response(url):
        """
        Fetch a JSON response from the given URL and return the parsed JSON data.

        Raise an ``Exception`` when the response status is not 200 or when the
        body is not valid JSON. Errors raised by ``requests.get`` itself are
        not caught here and propagate to the caller.
        """
        response = requests.get(url, timeout=DEFAULT_TIMEOUT)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch {url}: {response.status_code} {response.reason}")

        try:
            return response.json()
        except ValueError as e:
            # Chain the original error so the JSON decoding cause stays in the traceback.
            raise Exception(f"Failed to parse JSON from {url}: {str(e)}") from e

    @classmethod
    def get_package_data(cls, purl):
        """
        Fetch package data from PyPI API.

        ``purl`` may be a PURL string or an already parsed ``PackageURL``.
        The version-specific endpoint is used when the PURL carries a version,
        the package-level endpoint otherwise.
        """
        if isinstance(purl, PackageURL):
            parsed_purl = purl
        else:
            parsed_purl = PackageURL.from_string(purl)

        if parsed_purl.version:
            api_url = f"{cls.base_url}/{parsed_purl.name}/{parsed_purl.version}/json"
        else:
            api_url = f"{cls.base_url}/{parsed_purl.name}/json"

        return cls.fetch_json_response(api_url)

    @classmethod
    def get_urls_info(cls, purl):
        """
        Collect URL info dicts from PyPI API.

        Always return a list: a missing or null ``urls`` entry yields ``[]``.
        """
        data = cls.get_package_data(purl)
        # ``or []`` also covers an explicit ``"urls": null`` in the payload.
        return data.get("urls") or []

    @classmethod
    def get_download_url(cls, purl, preferred_type="sdist"):
        """
        Get a single download URL from PyPI API.
        If no version is specified in the PURL, fetches the latest version.

        Return the URL of the first entry whose ``packagetype`` equals
        ``preferred_type``, else the URL of the first entry that has one,
        else ``None``.
        """
        # Entries without a usable "url" cannot be downloaded; skip them rather
        # than failing with a KeyError.
        urls_info = [info for info in cls.get_urls_info(purl) if info.get("url")]

        if not urls_info:
            return None

        for url_info in urls_info:
            if url_info.get("packagetype") == preferred_type:
                return url_info["url"]

        return urls_info[0]["url"]

    @classmethod
    def get_all_download_urls(cls, purl):
        """
        Get all download URLs from PyPI API.
        If no version is specified in the PURL, fetches the latest version.
        """
        urls_info = cls.get_urls_info(purl)
        return [url_info["url"] for url_info in urls_info if "url" in url_info]
148+
149+
150+
def infer_download_url(purl):
    """
    Return the download URL inferred from a Package URL, or ``None``.

    ``purl`` can be a ``PackageURL`` instance or its string form.
    ``purl2url`` is tried first; when it yields nothing and the package type is
    ``pypi``, ``PyPIFetcher`` is used as a fallback (which may make HTTP
    requests). Any error raised by that fallback is suppressed and results in
    ``None``.
    """
    already_parsed = isinstance(purl, PackageURL)
    package_url = purl if already_parsed else PackageURL.from_string(purl)
    purl_string = str(purl) if already_parsed else purl

    resolved_url = purl2url.get_download_url(purl_string)
    if resolved_url:
        return resolved_url

    # PyPI is not supported by ``purl2url``, it requires an API call to resolve download URLs.
    if package_url.type != "pypi":
        return None

    with suppress(Exception):
        return PyPIFetcher.get_download_url(purl_string, preferred_type="sdist")
    return None

0 commit comments

Comments
 (0)