Skip to content

Commit 7891773

Browse files
authored
Merge pull request #592 from aboutcode-org/468_data_collection_for_PyPI
Collect pypi PURL ondemand #468
2 parents acb8249 + 40ed549 commit 7891773

File tree

4 files changed

+267
-2
lines changed

4 files changed

+267
-2
lines changed

minecode/collectors/pypi.py

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/nexB/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import logging
11+
12+
import requests
13+
from packageurl import PackageURL
14+
15+
from minecode import priority_router
16+
from minecode.miners.pypi import build_packages
17+
18+
"""
19+
Collect PyPI packages from pypi registries.
20+
"""
21+
22+
# Module-level logging setup: this module's logger gets its own stream
# handler and reports records at INFO level and above.
handler = logging.StreamHandler()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(handler)
26+
27+
28+
def get_package_json(name, version, timeout=30):
    """
    Return the parsed JSON metadata published by PyPI for the release
    `version` of the package `name`, or None if it could not be fetched.

    `timeout` is the number of seconds passed to `requests.get` before the
    request is abandoned.

    HTTP error statuses, network failures, timeouts and response bodies that
    are not valid JSON are logged and reported as None instead of being
    raised.
    """
    # Create URLs using purl fields
    url = f"https://pypi.org/pypi/{name}/{version}/json"

    try:
        # Always bound the request: without a timeout a stalled connection
        # would block the caller indefinitely.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as err:
        logger.error(f"HTTP error occurred: {err}")
    except (requests.exceptions.RequestException, ValueError) as err:
        # ValueError covers JSON decoding failures raised by older versions
        # of requests, which do not wrap them in a RequestException.
        logger.error(f"Error fetching {url}: {err}")

    return None
42+
43+
44+
def get_all_package_version(name, timeout=30):
    """
    Return a list of all version numbers for the package name.

    The versions are the keys of the "releases" mapping of the PyPI JSON
    document for `name`. `timeout` is the number of seconds passed to
    `requests.get` before the request is abandoned.

    Return an empty list when the document cannot be fetched or decoded, or
    when it lists no releases; the failure is logged rather than raised so
    that callers can always iterate over the result.
    """
    url = f"https://pypi.org/pypi/{name}/json"
    try:
        # Always bound the request: without a timeout a stalled connection
        # would block the caller indefinitely.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.HTTPError as err:
        logger.error(f"HTTP error occurred: {err}")
        return []
    except (requests.exceptions.RequestException, ValueError) as err:
        # ValueError covers JSON decoding failures raised by older versions
        # of requests, which do not wrap them in a RequestException.
        logger.error(f"Error fetching {url}: {err}")
        return []

    # Get all available versions; tolerate a missing or null "releases" entry
    return list(data.get("releases") or {})
58+
59+
60+
def map_pypi_package(package_url, pipelines, priority=0):
    """
    Add a pypi `package_url` to the PackageDB.

    Fetch the PyPI JSON metadata for the name and version of `package_url`,
    build package data from it with `build_packages`, then merge or create
    each resulting package and submit it to the scan queue with `pipelines`
    and `priority`.

    Return an error string if any errors are encountered during the process.
    Processing stops at the first package that reports an error.
    """
    from minecode.model_utils import add_package_to_scan_queue
    from minecode.model_utils import merge_or_create_package

    package_json = get_package_json(
        name=package_url.name,
        version=package_url.version,
    )

    if not package_json:
        error = f"Package does not exist on PyPI: {package_url}"
        logger.error(error)
        return error

    # Stays empty when build_packages yields nothing, so the function never
    # reads a loop variable that was not assigned.
    error = ""
    for package in build_packages(package_json, package_url):
        db_package, _, _, error = merge_or_create_package(package, visit_level=0)
        if error:
            break

        # Submit package for scanning. This happens per package so that every
        # package merged or created above is queued, not only the last one.
        if db_package:
            add_package_to_scan_queue(
                package=db_package, pipelines=pipelines, priority=priority
            )

    return error
94+
95+
96+
@priority_router.route("pkg:pypi/.*")
def process_request(purl_str, **kwargs):
    """
    Process `priority_resource_uri` containing a pypi Package URL (PURL) as a
    URI.

    This involves obtaining Package information for the PURL from pypi and
    using it to create a new PackageDB entry. The package is then added to the
    scan queue afterwards.

    When the PURL has no version, every version listed by PyPI for the package
    name is processed in turn.

    Supported keyword arguments:
    - `addon_pipelines`: iterable of pipeline names appended to
      DEFAULT_PIPELINES.
    - `priority`: passed through to the scan queue, defaults to 0.

    Return the first error message encountered, or None when there is none.
    """
    from minecode.model_utils import DEFAULT_PIPELINES

    # `or ()` also covers callers that pass addon_pipelines=None explicitly.
    addon_pipelines = kwargs.get("addon_pipelines") or ()
    pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines)
    priority = kwargs.get("priority", 0)

    package_url = PackageURL.from_string(purl_str)

    if package_url.version:
        package_urls = [package_url]
    else:
        # The version lookup may report a failure as None or an empty list.
        versions = get_all_package_version(package_url.name) or []
        if not versions:
            error_msg = f"No versions found on PyPI for: {package_url}"
            logger.error(error_msg)
            return error_msg

        # PackageURL fields cannot be reassigned, so build a new PackageURL
        # for each version. Building it from fields rather than appending
        # "@version" to `purl_str` keeps the result valid when the input PURL
        # carries qualifiers or a subpath.
        package_urls = [
            PackageURL(
                type=package_url.type,
                namespace=package_url.namespace,
                name=package_url.name,
                version=version,
                qualifiers=package_url.qualifiers,
                subpath=package_url.subpath,
            )
            for version in versions
        ]

    for versioned_package_url in package_urls:
        error_msg = map_pypi_package(versioned_package_url, pipelines, priority)
        if error_msg:
            return error_msg

minecode/miners/pypi.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -259,17 +259,33 @@ def build_packages(metadata, purl=None):
259259
if not url:
260260
continue
261261

262+
packagetype = None
263+
if download.get("packagetype") == "sdist":
264+
packagetype = "pypi_sdist_pkginfo"
265+
else:
266+
packagetype = "pypi_bdist_pkginfo"
267+
262268
download_data = dict(
263269
download_url=url,
264270
size=download.get("size"),
265271
release_date=parse_date(download.get("upload_time")),
266-
datasource_id="pypi_sdist_pkginfo",
272+
datasource_id=packagetype,
267273
type="pypi",
268274
)
269275
# TODO: Check for other checksums
270276
download_data["md5"] = download.get("md5_digest")
271277
download_data.update(common_data)
272278
package = scan_models.PackageData.from_data(download_data)
273279
package.datasource_id = "pypi_api_metadata"
274-
package.set_purl(purl)
280+
281+
if purl:
282+
purl_str = purl.to_string()
283+
purl_filename_qualifiers = (
284+
purl_str + "?file_name=" + download.get("filename")
285+
)
286+
updated_purl = PackageURL.from_string(purl_filename_qualifiers)
287+
package.set_purl(updated_purl)
288+
else:
289+
package.set_purl(purl)
290+
275291
yield package
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/nexB/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import json
11+
import os
12+
13+
from django.test import TestCase as DjangoTestCase
14+
15+
from packageurl import PackageURL
16+
17+
import packagedb
18+
from minecode.collectors import pypi
19+
from minecode.utils_test import JsonBasedTesting
20+
21+
22+
class PypiPriorityQueueTests(JsonBasedTesting, DjangoTestCase):
    """
    Tests for the on-demand PyPI collector in `minecode.collectors.pypi`.

    NOTE(review): no mocking is set up in this class, so these tests appear
    to query the live PyPI JSON API through the collector -- confirm that
    network access is available where they run.
    """

    test_data_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), "testfiles"
    )

    def setUp(self):
        super().setUp()
        # Expected PyPI JSON document for the "cage" 1.1.4 release.
        self.expected_json_loc = self.get_test_loc("pypi/cage_1.1.4.json")
        with open(self.expected_json_loc) as f:
            self.expected_json_contents = json.load(f)

    def test_get_package_json(self):
        json_contents = pypi.get_package_json(
            name="cage",
            version="1.1.4",
        )
        self.assertEqual(self.expected_json_contents, json_contents)

    def test_get_all_package_version(self):
        releases_list = pypi.get_all_package_version("cage")
        expected = ["1.1.2", "1.1.3", "1.1.4"]
        # At the time of creating this test, the CAGE project has three
        # releases. There may be additional releases in the future.
        # Therefore, we will verify that the number of releases is three
        # or greater and that it includes the expected release versions.
        self.assertGreaterEqual(len(releases_list), 3)
        for version in expected:
            self.assertIn(version, releases_list)

    def test_map_pypi_package(self):
        package_count = packagedb.models.Package.objects.all().count()
        self.assertEqual(0, package_count)
        package_url = PackageURL.from_string("pkg:pypi/cage@1.1.4")
        # The trailing comma matters: without it the argument is a plain
        # string rather than a one-element tuple of pipeline names.
        pypi.map_pypi_package(package_url, ("test_pipeline",))
        package_count = packagedb.models.Package.objects.all().count()
        self.assertEqual(1, package_count)
        package = packagedb.models.Package.objects.all().first()
        expected_purl_str = "pkg:pypi/cage@1.1.4"
        expected_download_url = (
            "http://www.alcyone.com/software/cage/cage-latest.tar.gz"
        )
        self.assertEqual(expected_purl_str, package.purl)
        self.assertEqual(expected_download_url, package.download_url)
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
{
2+
"info": {
3+
"author": "Erik Max Francis",
4+
"author_email": "[email protected]",
5+
"bugtrack_url": null,
6+
"classifiers": [
7+
"Development Status :: 6 - Mature",
8+
"Intended Audience :: Developers",
9+
"Intended Audience :: End Users/Desktop",
10+
"Intended Audience :: Science/Research",
11+
"License :: OSI Approved :: GNU General Public License (GPL)",
12+
"Operating System :: OS Independent",
13+
"Programming Language :: Python",
14+
"Topic :: Games/Entertainment",
15+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
16+
"Topic :: Scientific/Engineering :: Mathematics"
17+
],
18+
"description": "CAGE is a fairy generic and complete cellular automaton simulation\r\n engine in Python. It supports both 1D and 2D automata, a variety\r\n of prepackaged rules, and the concept of \"agents\" which can move\r\n about independently on the map for implementing agent behavior.\r\n\r\n CAGE comes with numerous examples of fully-functional CA systems,\r\n including Conway's Game of Life, Langton's self-reproducing\r\n automaton, Langton's \"vants,\" and 1D automata rule explorers. It\r\n also comes with simple displayers (including a curses interface\r\n for 2D automata). Also included is a unique implementation of a\r\n finite state machine (ant.py).",
19+
"description_content_type": null,
20+
"docs_url": null,
21+
"download_url": "http://www.alcyone.com/software/cage/cage-latest.tar.gz",
22+
"downloads": {
23+
"last_day": -1,
24+
"last_month": -1,
25+
"last_week": -1
26+
},
27+
"dynamic": null,
28+
"home_page": "http://www.alcyone.com/software/cage/",
29+
"keywords": "cellular automata, Turing machines, Langton vants, self-organizing systems, finite state machines, finite state automata",
30+
"license": "GPL",
31+
"license_expression": null,
32+
"license_files": null,
33+
"maintainer": "",
34+
"maintainer_email": "",
35+
"name": "CAGE",
36+
"package_url": "https://pypi.org/project/CAGE/",
37+
"platform": "any; Unix for curses frontend",
38+
"project_url": "https://pypi.org/project/CAGE/",
39+
"project_urls": {
40+
"Download": "http://www.alcyone.com/software/cage/cage-latest.tar.gz",
41+
"Homepage": "http://www.alcyone.com/software/cage/"
42+
},
43+
"provides_extra": null,
44+
"release_url": "https://pypi.org/project/CAGE/1.1.4/",
45+
"requires_dist": null,
46+
"requires_python": null,
47+
"summary": "A generic and fairly complete cellular automata simulation engine.",
48+
"version": "1.1.4",
49+
"yanked": false,
50+
"yanked_reason": null
51+
},
52+
"last_serial": 944145,
53+
"urls": [],
54+
"vulnerabilities": []
55+
}

0 commit comments

Comments
 (0)