Skip to content

Commit 7819e4f

Browse files
committed
Support on-demand content in repair_metadata
1 parent 134afaa commit 7819e4f

File tree

4 files changed

+193
-5
lines changed

4 files changed

+193
-5
lines changed

pulp_python/app/tasks/repair.py

Lines changed: 79 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,18 @@
22
import uuid
33
from gettext import gettext as _
44

5+
from requests_cache import CachedSession
6+
from requests.exceptions import RequestException
57
from django.db.models.query import QuerySet
68
from pulpcore.plugin.models import ProgressReport
79
from pulpcore.plugin.util import get_domain
810

911
from pulp_python.app.models import PythonPackageContent, PythonRepository
10-
from pulp_python.app.utils import artifact_to_python_content_data
12+
from pulp_python.app.utils import (
13+
artifact_to_python_content_data,
14+
fetch_json_release_metadata,
15+
parse_metadata,
16+
)
1117

1218
log = logging.getLogger(__name__)
1319

@@ -47,23 +53,32 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
4753
Returns:
4854
int: The number of packages that were repaired.
4955
"""
50-
# TODO: Add on_demand content repair
51-
immediate_content = content.filter(contentartifact__artifact__isnull=False)
56+
immediate_content = (
57+
content.filter(contentartifact__artifact__isnull=False)
58+
.distinct()
59+
.prefetch_related("_artifacts")
60+
)
61+
on_demand_content = (
62+
content.filter(contentartifact__artifact__isnull=True)
63+
.distinct()
64+
.prefetch_related("contentartifact_set__remoteartifact_set")
65+
)
5266
domain = get_domain()
5367

5468
batch = []
5569
set_of_update_fields = set()
5670
total_repaired = 0
71+
session = None
5772

5873
progress_report = ProgressReport(
5974
message="Repairing packages' metadata",
6075
code="repair.metadata",
61-
total=immediate_content.count(),
76+
total=content.count(),
6277
)
6378
progress_report.save()
6479
with progress_report:
6580
for package in progress_report.iter(
66-
immediate_content.prefetch_related("_artifacts").iterator(chunk_size=1000)
81+
immediate_content.iterator(chunk_size=1000)
6782
):
6883
new_data = artifact_to_python_content_data(
6984
package.filename, package._artifacts.get(), domain
@@ -82,8 +97,67 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
8297
batch = []
8398
set_of_update_fields.clear()
8499

100+
for package in progress_report.iter(
101+
on_demand_content.iterator(chunk_size=1000)
102+
):
103+
remote_artifacts = (
104+
package.contentartifact_set.get().remoteartifact_set.all()
105+
)
106+
session = CachedSession(
107+
"repair_endpoint_cache", backend="sqlite", use_temp=True
108+
)
109+
# We expect that PythonPackageContent always has correct name and version
110+
try:
111+
json_data = fetch_json_release_metadata(
112+
package.name,
113+
package.version,
114+
remote_artifacts.get().remote,
115+
session,
116+
)
117+
except RequestException as exc:
118+
log.warning(
119+
_("Could not fetch metadata for {} {} from PyPI. Error: {}").format(
120+
package.name, package.version, exc
121+
)
122+
)
123+
continue
124+
# Extract data only for the specific distribution we are currently checking
125+
# We expect that RemoteArtifact always has correct sha256
126+
dist_data = next(
127+
(
128+
dist
129+
for ra in remote_artifacts
130+
for dist in json_data["urls"]
131+
if ra.sha256 == dist["digests"]["sha256"]
132+
),
133+
None,
134+
)
135+
if not dist_data:
136+
log.warning(
137+
_("No matching distribution for {} was found.").format(package.name)
138+
)
139+
continue
140+
141+
new_data = parse_metadata(json_data["info"], package.version, dist_data)
142+
new_data.pop("url") # belongs to RemoteArtifact, not PythonPackageContent
143+
changed = False
144+
for field, value in new_data.items():
145+
if getattr(package, field) != value:
146+
setattr(package, field, value)
147+
set_of_update_fields.add(field)
148+
changed = True
149+
if changed:
150+
batch.append(package)
151+
if len(batch) == 1000:
152+
total_repaired += len(batch)
153+
PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)
154+
batch = []
155+
set_of_update_fields.clear()
156+
85157
if batch:
86158
total_repaired += len(batch)
87159
PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)
160+
if session:
161+
session.cache.clear()
88162

89163
return total_repaired

pulp_python/app/utils.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
from packaging.utils import canonicalize_name
1010
from packaging.requirements import Requirement
1111
from packaging.version import parse, InvalidVersion
12+
from requests import Session
13+
14+
from pulpcore.plugin.models import Remote
1215

1316

1417
PYPI_LAST_SERIAL = "X-PYPI-LAST-SERIAL"
@@ -189,6 +192,21 @@ def artifact_to_python_content_data(filename, artifact, domain=None):
189192
return data
190193

191194

195+
def fetch_json_release_metadata(
    name: str, version: str, remote: Remote, session_obj: Session
) -> dict:
    """
    Fetches metadata for a specific release from PyPI's JSON API. A release can contain
    multiple distributions. See https://docs.pypi.org/api/json/#get-a-release for more details.

    Args:
        name: Project name of the release to look up.
        version: Version of the release to look up.
        remote: Remote whose ``url`` is used as the base of the JSON API endpoint.
        session_obj: Session used to perform the GET request.

    Returns:
        dict containing "info", "last_serial", "urls", and "vulnerabilities" keys.

    Raises:
        Whatever ``session_obj.get`` raises on transport problems, and whatever
        ``response.raise_for_status()`` raises for an error status code.
    """
    # Remotes may be configured with or without a trailing slash; normalize the base so
    # both "https://host" and "https://host/" produce "https://host/pypi/...".
    base_url = remote.url.rstrip("/")
    url = f"{base_url}/pypi/{name}/{version}/json"
    response = session_obj.get(url, timeout=10)
    response.raise_for_status()
    return response.json()
208+
209+
192210
def python_content_to_json(base_path, content_query, version=None, domain=None):
193211
"""
194212
Converts a QuerySet of PythonPackageContent into the PyPi JSON format

pulp_python/tests/functional/api/test_repair.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,49 @@ def _create(artifact_filename, filename, content_data):
3232
return _create
3333

3434

35+
@pytest.fixture
def create_content_remote(python_bindings):
    """
    Factory fixture that creates a PythonPackageContent backed only by a RemoteArtifact.

    The returned callable runs a `pulpcore-manager shell` snippet that saves the content
    unit, a ContentArtifact with no artifact, and a RemoteArtifact pointing at the given
    remote, then reads the new content back through the bindings.
    """

    def _create(filename, ra_url, ra_sha256, content_data, remote):
        # Statements are joined with "; " so that they form a single `shell -c` payload.
        statements = [
            "from pulpcore.plugin.models import ContentArtifact, RemoteArtifact",
            "from pulpcore.plugin.util import extract_pk, get_url",
            "from pulp_python.app.models import PythonPackageContent, PythonRemote",
            f"c = PythonPackageContent(filename={filename!r}, **{content_data!r})",
            "c.save()",
            f"ca = ContentArtifact(artifact=None, content=c, relative_path={filename!r})",
            "ca.save()",
            f"r = PythonRemote.objects.get(pk=extract_pk({remote.pulp_href!r}))",
            f"ra = RemoteArtifact(content_artifact=ca, remote=r, sha256={ra_sha256!r}, url={ra_url!r})",  # noqa: E501
            "ra.save()",
            "print(get_url(c))",
        ]
        completed = subprocess.run(
            ["pulpcore-manager", "shell", "-c", "; ".join(statements)],
            capture_output=True,
        )

        assert completed.returncode == 0
        # The snippet prints the href of the new content unit as its only output.
        href = completed.stdout.decode().strip()
        return python_bindings.ContentPackagesApi.read(href)

    return _create
60+
61+
62+
@pytest.fixture
def delete_content():
    """
    Factory fixture that removes a PythonPackageContent unit directly through the ORM.

    The returned callable takes the content's href, deletes its repository version
    memberships, the content itself, and the artifacts that were attached to it.

    NOTE: no `pytest.mark.django_db` here - pytest documents that marks applied to
    fixtures have no effect, and doing so is deprecated.
    """

    def _delete(content_href):
        from pulpcore.plugin.util import extract_pk
        from pulp_python.app.models import PythonPackageContent

        content = PythonPackageContent.objects.get(pk=extract_pk(content_href))
        content.version_memberships.all().delete()
        # Materialize the artifacts *before* deleting the content: the relation is a lazy
        # queryset, and once the content is gone (presumably taking its ContentArtifact
        # rows with it - verify against the pulpcore schema) it would no longer match them.
        artifacts = list(content._artifacts.all())
        content.delete()
        for artifact in artifacts:
            artifact.delete()

    return _delete
76+
77+
3578
@pytest.fixture
3679
def move_to_repository(python_bindings, monitor_task):
3780
def _move(repo_href, content_hrefs):
@@ -84,6 +127,7 @@ def test_metadata_repair_command(
84127

85128
def test_metadata_repair_endpoint(
86129
create_content_direct,
130+
delete_content,
87131
download_python_file,
88132
monitor_task,
89133
move_to_repository,
@@ -124,3 +168,54 @@ def test_metadata_repair_endpoint(
124168
assert content.packagetype == "sdist"
125169
assert content.requires_python == ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*"
126170
assert content.author == ""
171+
delete_content(content.pulp_href)
172+
173+
174+
def test_metadata_repair_endpoint_on_demand(
    create_content_remote,
    delete_content,
    monitor_task,
    move_to_repository,
    python_bindings,
    python_remote_factory,
    python_repo_factory,
):
    """
    Test repairing of package metadata via `Repositories.repair_metadata` endpoint
    when only RemoteArtifact is present.
    """
    python_egg_filename = "scipy-1.1.0.tar.gz"
    python_egg_url = urljoin(
        urljoin(PYTHON_FIXTURES_URL, "packages/"), python_egg_filename
    )
    python_egg_sha256 = (
        "878352408424dffaa695ffedf2f9f92844e116686923ed9aa8626fc30d32cfd1"
    )
    data = {
        "name": "scipy",
        "version": "1.1.0",
        # Wrong metadata
        "author": "ME",
        "packagetype": "bdist",
        "requires_python": ">=3.8",
    }
    remote = python_remote_factory(includes=["scipy"])
    repo = python_repo_factory(remote=remote)

    content = create_content_remote(
        python_egg_filename, python_egg_url, python_egg_sha256, data, remote
    )
    # The content was created by hand, so make sure it is removed even when an
    # assertion below fails; otherwise it would leak into later tests.
    try:
        # Sanity check: the content really starts out with the wrong metadata
        for field, test_value in data.items():
            assert getattr(content, field) == test_value
        move_to_repository(repo.pulp_href, [content.pulp_href])

        response = python_bindings.RepositoriesPythonApi.repair_metadata(repo.pulp_href)
        monitor_task(response.task)

        # name and version are expected to stay untouched; the rest should be repaired
        new_content = python_bindings.ContentPackagesApi.read(content.pulp_href)
        assert new_content.author == ""
        assert new_content.name == "scipy"
        assert new_content.packagetype == "sdist"
        assert new_content.requires_python == ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*"
        assert new_content.version == "1.1.0"
    finally:
        delete_content(content.pulp_href)

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
requests_cache

0 commit comments

Comments
 (0)