Skip to content

Commit 01e1016

Browse files
authored
Merge pull request #541 from jcristau/fetch-content-gecko-sync
Bug 1796139 - Pull in fetch-content changes from mozilla-central
2 parents 9358d3e + 5a3d0b6 commit 01e1016

File tree

2 files changed

+72
-32
lines changed

2 files changed

+72
-32
lines changed

src/taskgraph/run-task/fetch-content

Lines changed: 66 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import contextlib
1010
import datetime
1111
import gzip
1212
import hashlib
13+
import io
1314
import json
1415
import lzma
1516
import multiprocessing
@@ -194,6 +195,25 @@ def stream_download(url, sha256=None, size=None, headers=None):
194195
) if certifi else urllib.request.urlopen(req, timeout=60) as fh:
195196
if not url.endswith(".gz") and fh.info().get("Content-Encoding") == "gzip":
196197
fh = gzip.GzipFile(fileobj=fh)
198+
else:
199+
# when using gzip we can't compare size or length (inflated) against content-length (compressed)
200+
content_length = fh.getheader("content-length")
201+
if content_length:
202+
try:
203+
content_length = int(content_length)
204+
except ValueError:
205+
raise IntegrityError(
206+
"content-length header for %s is not an integer; got %s"
207+
% (url, content_length)
208+
)
209+
if size:
210+
if size != content_length:
211+
raise IntegrityError(
212+
"size mismatch on %s: wanted %d; content-length is %d"
213+
% (url, size, content_length)
214+
)
215+
else:
216+
size = content_length
197217

198218
while True:
199219
chunk = fh.read(65536)
@@ -252,8 +272,6 @@ def download_to_path(url, path, sha256=None, size=None, headers=None):
252272
fh.write(chunk)
253273

254274
return
255-
except IntegrityError:
256-
raise
257275
except Exception as e:
258276
log("Download failed: {}".format(e))
259277
continue
@@ -264,17 +282,15 @@ def download_to_path(url, path, sha256=None, size=None, headers=None):
264282
def download_to_memory(url, sha256=None, size=None):
265283
"""Download a URL to memory, possibly with verification."""
266284

267-
data = b""
268285
for _ in retrier(attempts=5, sleeptime=60):
269-
try:
270-
log("Downloading %s" % (url))
286+
data = b""
287+
log("Downloading %s" % (url))
271288

289+
try:
272290
for chunk in stream_download(url, sha256=sha256, size=size):
273291
data += chunk
274292

275293
return data
276-
except IntegrityError:
277-
raise
278294
except Exception as e:
279295
log("Download failed: {}".format(e))
280296
continue
@@ -317,21 +333,41 @@ def gpg_verify_path(path: pathlib.Path, public_key_data: bytes, signature_data:
317333
subprocess.run(["gpgconf", "--kill", "gpg-agent"], env=env)
318334

319335

320-
def open_tar_stream(path: pathlib.Path):
    """Open a tar file for binary reading, choosing a decompressor by suffix.

    ``.tar`` is opened as-is; ``.bz2``, ``.gz``/``.tgz``, ``.xz`` and ``.zst``
    are wrapped in the matching decompressing reader.

    Raises:
        ValueError: if the suffix is not one of the recognized ones.
    """
    suffix = path.suffix

    # Cases that operate on an already-open binary file object.
    if suffix == ".tar":
        return path.open("rb")
    if suffix == ".zst":
        decompressor = ZstdDecompressor()
        return decompressor.stream_reader(path.open("rb"))

    # Stdlib codecs that all share the ``open(filename, mode)`` signature.
    openers = {
        ".bz2": bz2.open,
        ".gz": gzip.open,
        ".tgz": gzip.open,
        ".xz": lzma.open,
    }
    opener = openers.get(suffix)
    if opener is None:
        raise ValueError("unknown archive format for tar file: %s" % path)
    return opener(str(path), "rb")
336+
class ArchiveTypeNotSupported(Exception):
    """Raised when a file's content is not a recognized archive format.

    The offending path is kept on the instance as ``path`` so callers can
    report or act on it without parsing the message.
    """

    def __init__(self, path: pathlib.Path):
        # Zero-argument super() resolves relative to this class. The previous
        # ``super(Exception, self)`` named the wrong class and started the
        # lookup *after* Exception in the MRO.
        super().__init__("Archive type not supported for %s" % path)
        self.path = path
339+
340+
341+
def open_stream(path: pathlib.Path):
    """Identify an archive by its content and open it for reading.

    The leading magic bytes select between a zip file and the supported
    compression wrappers (gzip, bzip2, xz, zstandard). The possibly
    decompressed stream is then probed for a tar header.

    Args:
        path: file to inspect.

    Returns:
        A ``(kind, fileobj)`` tuple. ``kind`` is ``"zip"`` (``fileobj`` is the
        raw binary file, positioned at offset 0) or ``"tar"`` (``fileobj`` is
        a buffered reader over the decompressed tar stream). The caller owns
        ``fileobj`` and is responsible for closing it.

    Raises:
        ArchiveTypeNotSupported: if the content is neither a zip nor a tar
            stream. The file opened for probing is closed before raising.
    """
    raw = path.open(mode="rb")
    try:
        magic = raw.read(6)
        raw.seek(0)
        if magic[:2] == b"PK":
            return "zip", raw

        if magic[:2] == b"\x1f\x8b":
            fh = gzip.GzipFile(fileobj=raw)
        elif magic[:3] == b"BZh":
            fh = bz2.BZ2File(raw)
        elif magic == b"\xfd7zXZ\x00":
            fh = lzma.LZMAFile(raw)
        elif magic[:4] == b"\x28\xb5\x2f\xfd":
            fh = ZstdDecompressor().stream_reader(raw)
        else:
            fh = raw
        # BufferedReader gives us peek(), so the tar header can be inspected
        # without consuming it from the stream handed back to the caller.
        fh = io.BufferedReader(fh)

        probe_error = None
        try:
            # A full tar info header is 512 bytes.
            headers = fh.peek(512)
            # 257 is the offset of the ustar magic.
            tar_magic = headers[257 : 257 + 8]
            # For older unix tar, rely on TarInfo.frombuf's checksum check;
            # it raises on anything that is not a valid header.
            if tar_magic in (
                b"ustar\x0000",
                b"ustar \x00",
            ) or tarfile.TarInfo.frombuf(
                headers[:512], tarfile.ENCODING, "surrogateescape"
            ):
                return "tar", fh
        except Exception as exc:
            # Deliberately broad: a corrupt compressed stream or an invalid
            # tar header both just mean "not an archive we can handle". The
            # error is kept as the cause rather than silently dropped.
            probe_error = exc
        raise ArchiveTypeNotSupported(path) from probe_error
    except BaseException:
        # Nothing is handed back to the caller on failure, so release the
        # underlying file here instead of leaking the descriptor.
        raw.close()
        raise
335371

336372

337373
def archive_type(path: pathlib.Path):
@@ -344,7 +380,7 @@ def archive_type(path: pathlib.Path):
344380
return None
345381

346382

347-
def extract_archive(path, dest_dir, typ):
383+
def extract_archive(path, dest_dir):
348384
"""Extract an archive to a destination directory."""
349385

350386
# Resolve paths to absolute variants.
@@ -356,8 +392,8 @@ def extract_archive(path, dest_dir, typ):
356392

357393
# We pipe input to the decompressor program so that we can apply
358394
# custom decompressors that the program may not know about.
395+
typ, ifh = open_stream(path)
359396
if typ == "tar":
360-
ifh = open_tar_stream(path)
361397
# On Windows, the tar program doesn't support things like symbolic
362398
# links, while Windows actually support them. The tarfile module in
363399
# python does. So use that. But since it's significantly slower than
@@ -404,10 +440,8 @@ def repack_archive(
404440
):
405441
assert orig != dest
406442
log("Repacking as %s" % dest)
407-
orig_typ = archive_type(orig)
443+
orig_typ, ifh = open_stream(orig)
408444
typ = archive_type(dest)
409-
if not orig_typ:
410-
raise Exception("Archive type not supported for %s" % orig.name)
411445
if not typ:
412446
raise Exception("Archive type not supported for %s" % dest.name)
413447

@@ -433,7 +467,7 @@ def repack_archive(
433467
ctx = ZstdCompressor()
434468
if orig_typ == "zip":
435469
assert typ == "tar"
436-
zip = zipfile.ZipFile(orig)
470+
zip = zipfile.ZipFile(ifh)
437471
# Convert the zip stream to a tar on the fly.
438472
with ctx.stream_writer(fh) as compressor, tarfile.open(
439473
fileobj=compressor, mode="w:"
@@ -475,7 +509,6 @@ def repack_archive(
475509
raise Exception("Repacking a tar to zip is not supported")
476510
assert typ == "tar"
477511

478-
ifh = open_tar_stream(orig)
479512
if filter:
480513
# To apply the filter, we need to open the tar stream and
481514
# tweak it.
@@ -518,11 +551,12 @@ def fetch_and_extract(url, dest_dir, extract=True, sha256=None, size=None):
518551
if not extract:
519552
return
520553

521-
typ = archive_type(dest_path)
522-
if typ:
523-
extract_archive(dest_path, dest_dir, typ)
554+
try:
555+
extract_archive(dest_path, dest_dir)
524556
log("Removing %s" % dest_path)
525557
dest_path.unlink()
558+
except ArchiveTypeNotSupported:
559+
pass
526560

527561

528562
def fetch_urls(downloads):

test/test_scripts_fetch_content.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,13 @@ def mock_urlopen(req, timeout=None, *, cafile=None):
7373
# create a mock context manager
7474
cm = MagicMock()
7575
cm.getcode.return_value = 200
76+
77+
def getheader(field):
78+
if field.lower() == "content-length":
79+
return size
80+
7681
# simulates chunking
82+
cm.getheader = getheader
7783
cm.read.side_effect = [b"foo", b"bar", None]
7884
cm.__enter__.return_value = cm
7985
return cm

0 commit comments

Comments
 (0)