Skip to content

Commit ecd45d5

Browse files
committed
Create and expose metadata file
fixes #1047
1 parent ed89e4e commit ecd45d5

File tree

13 files changed

+467
-16
lines changed

13 files changed

+467
-16
lines changed

CHANGES/1047.feature

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Added exposure of metadata file to Simple API (PEP 658)

pulp_python/app/management/commands/repair-python-metadata.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,14 @@ def repair_metadata(content):
2424
set_of_update_fields = set()
2525
total_repaired = 0
2626
for package in immediate_content.prefetch_related("_artifacts").iterator(chunk_size=1000):
27+
# Get the main artifact
28+
main_artifact = (
29+
package.contentartifact_set.exclude(relative_path__endswith=".metadata")
30+
.first()
31+
.artifact
32+
)
2733
new_data = artifact_to_python_content_data(
28-
package.filename, package._artifacts.get(), package.pulp_domain
34+
package.filename, main_artifact, package.pulp_domain
2935
)
3036
changed = False
3137
for field, value in new_data.items():
Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
# Generated manually on 2025-12-15 14:00 for creating missing metadata artifacts
2+
3+
from django.db import migrations
4+
5+
BATCH_SIZE = 1000
6+
7+
8+
def pulp_hashlib_new(name, *args, **kwargs):
    """
    Copied and updated (to comply with migrations) from pulpcore.

    Return a new hasher for *name*, or None when the algorithm is not an
    allowed content checksum in this deployment.
    """
    # Imports live inside the function so the migration stays self-contained.
    import hashlib as the_real_hashlib
    from django.conf import settings

    if name in settings.ALLOWED_CONTENT_CHECKSUMS:
        return the_real_hashlib.new(name, *args, **kwargs)
    return None
19+
20+
21+
def init_and_validate(file, artifact_model, expected_digests):
    """
    Copied and updated (to comply with migrations) from pulpcore.

    Hash *file* (a path string, or a file-like object exposing ``size`` and
    ``hashers``) and build an unsaved *artifact_model* instance carrying the
    size and every allowed digest.

    Returns ``(artifact, mismatched_sha256)`` where ``mismatched_sha256`` is
    the actual digest when it differs from the expected one (else ``None``),
    or ``None`` when no allowed hashers exist or an expected algorithm is
    unavailable.
    """
    from django.conf import settings

    # Only digests the deployment allows are computed and stored.
    digest_fields = [
        alg
        for alg in ("sha512", "sha384", "sha256", "sha224", "sha1", "md5")
        if alg in settings.ALLOWED_CONTENT_CHECKSUMS
    ]

    if isinstance(file, str):
        # Path on disk: stream the content through every allowed hasher.
        with open(file, "rb") as fp:
            hashers = {}
            for alg in digest_fields:
                hasher = pulp_hashlib_new(alg)
                if hasher is not None:
                    hashers[alg] = hasher
            if not hashers:
                return None

            size = 0
            # Read in 1 MiB chunks until EOF.
            for chunk in iter(lambda: fp.read(1048576), b""):
                for hasher in hashers.values():
                    hasher.update(chunk)
                size += len(chunk)
    else:
        # File-like object that already carries its size and hashers.
        size = file.size
        hashers = file.hashers

    mismatched_sha256 = None
    for alg, expected in expected_digests.items():
        if alg not in hashers:
            return None
        actual = hashers[alg].hexdigest()
        if expected != actual:
            # Store the actual value for later fixing if it differs from the package value
            mismatched_sha256 = actual

    attributes = {"size": size, "file": file}
    for alg in digest_fields:
        attributes[alg] = hashers[alg].hexdigest()

    return artifact_model(**attributes), mismatched_sha256
66+
67+
68+
def extract_wheel_metadata(filename):
    """
    Extract the metadata file content from a wheel file.

    Return the raw METADATA content as bytes for the first
    ``*.dist-info/METADATA`` entry found, or None if metadata cannot be
    extracted (bad archive, missing entry, or I/O failure).
    """
    import zipfile

    try:
        with zipfile.ZipFile(filename, "r") as wheel:
            metadata_name = next(
                (name for name in wheel.namelist() if name.endswith(".dist-info/METADATA")),
                None,
            )
            if metadata_name is not None:
                return wheel.read(metadata_name)
    except (zipfile.BadZipFile, KeyError, OSError):
        # Treat any archive problem as "no metadata available".
        pass
    return None
83+
84+
85+
def artifact_to_metadata_artifact(filename, artifact, md_digests, tmp_dir, artifact_model):
    """
    Create artifact for metadata from the provided wheel artifact.
    Return (artifact, mismatched_sha256) on success, None on any failure.
    """
    import shutil
    import tempfile

    # Dump the wheel bytes to disk so the zip reader can open them by path.
    # delete=False keeps the file usable after the `with`; tmp_dir is cleaned
    # up wholesale by the caller.
    with tempfile.NamedTemporaryFile(
        "wb", dir=tmp_dir, suffix=filename, delete=False
    ) as wheel_tmp:
        wheel_path = wheel_tmp.name
        artifact.file.seek(0)
        shutil.copyfileobj(artifact.file, wheel_tmp)
        wheel_tmp.flush()

    extracted = extract_wheel_metadata(wheel_path)
    if not extracted:
        return None

    # Persist the METADATA bytes so init_and_validate can hash them from disk.
    with tempfile.NamedTemporaryFile(
        "wb", dir=tmp_dir, suffix=".metadata", delete=False
    ) as md_tmp:
        metadata_path = md_tmp.name
        md_tmp.write(extracted)
        md_tmp.flush()

    return init_and_validate(metadata_path, artifact_model, md_digests)
111+
112+
113+
def create_missing_metadata_artifacts(apps, schema_editor):
    """
    Create metadata artifacts for PythonPackageContent instances that have metadata_sha256
    but are missing the corresponding metadata artifact.

    For each wheel package with a recorded metadata_sha256, extract the
    METADATA file from its main artifact and store it as a new Artifact plus a
    ContentArtifact at ``<filename>.metadata`` (PEP 658). Packages whose
    metadata cannot be extracted get metadata_sha256 unset; packages whose
    recorded digest differs from the actual one get it corrected.
    """
    import tempfile
    from django.conf import settings
    from django.db import models

    PythonPackageContent = apps.get_model("python", "PythonPackageContent")
    ContentArtifact = apps.get_model("core", "ContentArtifact")
    Artifact = apps.get_model("core", "Artifact")

    # Wheels with a non-empty metadata_sha256 whose main (non-metadata)
    # artifact is present on disk.
    packages = (
        PythonPackageContent.objects.filter(
            metadata_sha256__isnull=False,
            filename__endswith=".whl",
            contentartifact__artifact__isnull=False,
            contentartifact__relative_path=models.F("filename"),
        )
        .exclude(metadata_sha256="")
        .prefetch_related("_artifacts")
        .only("filename", "metadata_sha256")
    )
    artifact_batch = []
    contentartifact_batch = []
    packages_batch = []

    with tempfile.TemporaryDirectory(dir=settings.WORKING_DIRECTORY) as temp_dir:
        for package in packages:
            # Get the main artifact for package.
            # NOTE(review): .get() assumes exactly one artifact per package at
            # migration time; a package that already has a metadata artifact
            # would raise MultipleObjectsReturned — confirm this cannot occur.
            main_artifact = package._artifacts.get()

            filename = package.filename
            metadata_digests = {"sha256": package.metadata_sha256}
            result = artifact_to_metadata_artifact(
                filename, main_artifact, metadata_digests, temp_dir, Artifact
            )
            if result is None:
                # Unset metadata_sha256 when extraction or validation fails
                package.metadata_sha256 = None
                packages_batch.append(package)
                continue
            metadata_artifact, mismatched_sha256 = result
            if mismatched_sha256:
                # Fix the package if its metadata_sha256 differs from the actual value
                package.metadata_sha256 = mismatched_sha256
                packages_batch.append(package)

            contentartifact = ContentArtifact(
                artifact=metadata_artifact,
                content=package,
                # PEP 658: the metadata file lives next to the wheel as
                # "<filename>.metadata" (same convention used by the sync and
                # upload code paths).
                relative_path=f"{filename}.metadata",
            )
            artifact_batch.append(metadata_artifact)
            contentartifact_batch.append(contentartifact)

            # Flush batches to keep memory bounded; each loop iteration adds
            # at most one entry per batch, so equality checks are reached.
            if len(artifact_batch) == BATCH_SIZE:
                Artifact.objects.bulk_create(artifact_batch, batch_size=BATCH_SIZE)
                ContentArtifact.objects.bulk_create(contentartifact_batch, batch_size=BATCH_SIZE)
                artifact_batch.clear()
                contentartifact_batch.clear()
            if len(packages_batch) == BATCH_SIZE:
                PythonPackageContent.objects.bulk_update(
                    packages_batch, ["metadata_sha256"], batch_size=BATCH_SIZE
                )
                packages_batch.clear()

        # Final partial batches.
        if artifact_batch:
            Artifact.objects.bulk_create(artifact_batch, batch_size=BATCH_SIZE)
            ContentArtifact.objects.bulk_create(contentartifact_batch, batch_size=BATCH_SIZE)
        if packages_batch:
            PythonPackageContent.objects.bulk_update(
                packages_batch, ["metadata_sha256"], batch_size=BATCH_SIZE
            )
188+
189+
190+
class Migration(migrations.Migration):
    # Data migration: backfill ".metadata" artifacts (PEP 658) for wheel
    # packages that already record a metadata_sha256.

    dependencies = [
        ("python", "0018_packageprovenance"),
    ]

    operations = [
        migrations.RunPython(
            create_missing_metadata_artifacts,
            # Forward-only: rolling back leaves the created artifacts in place.
            reverse_code=migrations.RunPython.noop,
        ),
    ]

pulp_python/app/serializers.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import logging
22
import os
3+
import tempfile
34
from gettext import gettext as _
45
from django.conf import settings
56
from django.db.utils import IntegrityError
@@ -22,6 +23,7 @@
2223
)
2324
from pulp_python.app.utils import (
2425
DIST_EXTENSIONS,
26+
artifact_to_metadata_artifact,
2527
artifact_to_python_content_data,
2628
get_project_metadata_from_file,
2729
parse_project_metadata,
@@ -93,11 +95,31 @@ class Meta:
9395
model = python_models.PythonDistribution
9496

9597

98+
class PythonSingleContentArtifactField(core_serializers.SingleContentArtifactField):
99+
"""
100+
Custom field with overridden get_attribute method. Meant to be used only in
101+
PythonPackageContentSerializer to handle possible existence of metadata artifact.
102+
"""
103+
104+
def get_attribute(self, instance):
105+
# When content has multiple artifacts (wheel + metadata), return the main one
106+
if instance._artifacts.count() > 1:
107+
for ca in instance.contentartifact_set.all():
108+
if not ca.relative_path.endswith(".metadata"):
109+
return ca.artifact
110+
111+
return super().get_attribute(instance)
112+
113+
96114
class PythonPackageContentSerializer(core_serializers.SingleArtifactContentUploadSerializer):
97115
"""
98116
A Serializer for PythonPackageContent.
99117
"""
100118

119+
artifact = PythonSingleContentArtifactField(
120+
help_text=_("Artifact file representing the physical content"),
121+
)
122+
101123
# Core metadata
102124
# Version 1.0
103125
author = serializers.CharField(
@@ -386,8 +408,21 @@ def deferred_validate(self, data):
386408
if attestations := data.pop("attestations", None):
387409
data["provenance"] = self.handle_attestations(filename, data["sha256"], attestations)
388410

411+
# Create metadata artifact for wheel files
412+
if filename.endswith(".whl"):
413+
if metadata_artifact := artifact_to_metadata_artifact(filename, artifact):
414+
data["metadata_artifact"] = metadata_artifact
415+
data["metadata_sha256"] = metadata_artifact.sha256
416+
389417
return data
390418

419+
def get_artifacts(self, validated_data):
420+
artifacts = super().get_artifacts(validated_data)
421+
if metadata_artifact := validated_data.pop("metadata_artifact", None):
422+
relative_path = f"{validated_data['filename']}.metadata"
423+
artifacts[relative_path] = metadata_artifact
424+
return artifacts
425+
391426
def retrieve(self, validated_data):
392427
content = python_models.PythonPackageContent.objects.filter(
393428
sha256=validated_data["sha256"], _pulp_domain=get_domain()
@@ -419,6 +454,7 @@ def create(self, validated_data):
419454

420455
class Meta:
421456
fields = core_serializers.SingleArtifactContentUploadSerializer.Meta.fields + (
457+
"artifact",
422458
"author",
423459
"author_email",
424460
"description",
@@ -514,6 +550,15 @@ def validate(self, data):
514550
data["provenance"] = self.handle_attestations(
515551
filename, data["sha256"], attestations, offline=True
516552
)
553+
# Create metadata artifact for wheel files
554+
if filename.endswith(".whl"):
555+
with tempfile.TemporaryDirectory(dir=settings.WORKING_DIRECTORY) as temp_dir:
556+
if metadata_artifact := artifact_to_metadata_artifact(
557+
filename, artifact, tmp_dir=temp_dir
558+
):
559+
data["metadata_artifact"] = metadata_artifact
560+
data["metadata_sha256"] = metadata_artifact.sha256
561+
517562
return data
518563

519564
class Meta(PythonPackageContentSerializer.Meta):

pulp_python/app/tasks/repair.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,13 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
9595
progress_report.save()
9696
with progress_report:
9797
for package in progress_report.iter(immediate_content.iterator(chunk_size=BULK_SIZE)):
98-
new_data = artifact_to_python_content_data(
99-
package.filename, package._artifacts.get(), domain
98+
# Get the main artifact
99+
main_artifact = (
100+
package.contentartifact_set.exclude(relative_path__endswith=".metadata")
101+
.first()
102+
.artifact
100103
)
104+
new_data = artifact_to_python_content_data(package.filename, main_artifact, domain)
101105
total_repaired += update_package_if_needed(
102106
package, new_data, batch, set_of_update_fields
103107
)
@@ -113,7 +117,11 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
113117
grouped_by_url = defaultdict(list)
114118

115119
for package in group_set:
116-
for ra in package.contentartifact_set.get().remoteartifact_set.all():
120+
for ra in (
121+
package.contentartifact_set.exclude(relative_path__endswith=".metadata")
122+
.first()
123+
.remoteartifact_set.all()
124+
):
117125
grouped_by_url[ra.remote.url].append((package, ra))
118126

119127
# Prioritize the URL that can serve the most packages

pulp_python/app/tasks/sync.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,11 +229,15 @@ async def create_content(self, pkg):
229229
create a Content Unit to put into the pipeline
230230
"""
231231
declared_contents = {}
232+
page = await aget_remote_simple_page(pkg.name, self.remote)
233+
upstream_pkgs = {pkg.filename: pkg for pkg in page.packages}
234+
232235
for version, dists in pkg.releases.items():
233236
for package in dists:
234237
entry = parse_metadata(pkg.info, version, package)
235238
url = entry.pop("url")
236239
size = package["size"] or None
240+
d_artifacts = []
237241

238242
artifact = Artifact(sha256=entry["sha256"], size=size)
239243
package = PythonPackageContent(**entry)
@@ -245,11 +249,29 @@ async def create_content(self, pkg):
245249
remote=self.remote,
246250
deferred_download=self.deferred_download,
247251
)
248-
dc = DeclarativeContent(content=package, d_artifacts=[da])
252+
d_artifacts.append(da)
253+
254+
if upstream_pkg := upstream_pkgs.get(entry["filename"]):
255+
if upstream_pkg.has_metadata:
256+
url = upstream_pkg.metadata_url
257+
md_sha256 = upstream_pkg.metadata_digests.get("sha256")
258+
package.metadata_sha256 = md_sha256
259+
artifact = Artifact(sha256=md_sha256)
260+
261+
metadata_artifact = DeclarativeArtifact(
262+
artifact=artifact,
263+
url=url,
264+
relative_path=f"{entry['filename']}.metadata",
265+
remote=self.remote,
266+
deferred_download=self.deferred_download,
267+
)
268+
d_artifacts.append(metadata_artifact)
269+
270+
dc = DeclarativeContent(content=package, d_artifacts=d_artifacts)
249271
declared_contents[entry["filename"]] = dc
250272
await self.python_stage.put(dc)
251273

252-
if pkg.releases and (page := await aget_remote_simple_page(pkg.name, self.remote)):
274+
if pkg.releases and page:
253275
if self.remote.provenance:
254276
await self.sync_provenance(page, declared_contents)
255277

0 commit comments

Comments
 (0)