Skip to content

Commit 02f110d

Browse files
glandium authored and jcristau committed
Bug 1853082 - Don't rely on file extension to guess archive type in fetch-content. r=taskgraph-reviewers,bhearsum
I want to add a fetch for a rust crate archive from crates.io, and those don't come with an extension in the url, so we can't rely on that. Content sniffing is easy enough to put in place. Differential Revision: https://phabricator.services.mozilla.com/D188152
1 parent 9358d3e commit 02f110d

File tree

1 file changed

+44
-25
lines changed

1 file changed

+44
-25
lines changed

src/taskgraph/run-task/fetch-content

Lines changed: 44 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@ import contextlib
1010
import datetime
1111
import gzip
1212
import hashlib
13+
import io
1314
import json
1415
import lzma
1516
import multiprocessing
@@ -317,21 +318,41 @@ def gpg_verify_path(path: pathlib.Path, public_key_data: bytes, signature_data:
317318
subprocess.run(["gpgconf", "--kill", "gpg-agent"], env=env)
318319

319320

def open_tar_stream(path: pathlib.Path):
    """Open a tar archive for reading, choosing the decompressor by suffix.

    The decision is made purely from ``path.suffix``; the file content is
    never inspected.  The returned object is a binary reader over the
    (decompressed) tar data.

    Raises:
        ValueError: if the suffix is not one of the handled tar flavours.
    """
    suffix = path.suffix
    filename = str(path)

    # Plain tarballs need no decompression layer at all.
    if suffix == ".tar":
        return path.open("rb")
    if suffix == ".bz2":
        return bz2.open(filename, "rb")
    if suffix in (".gz", ".tgz"):
        return gzip.open(filename, "rb")
    if suffix == ".xz":
        return lzma.open(filename, "rb")
    if suffix == ".zst":
        # zstd has no ``open`` helper here; wrap a raw binary handle instead.
        return ZstdDecompressor().stream_reader(path.open("rb"))

    raise ValueError("unknown archive format for tar file: %s" % path)
class ArchiveTypeNotSupported(Exception):
    """Error signalling that a path is not an archive type that can be handled.

    The exception message embeds the offending path.
    """

    def __init__(self, path: pathlib.Path):
        # Zero-argument super() walks the MRO from this class.  The previous
        # ``super(Exception, self)`` named the wrong class and therefore
        # skipped ``Exception`` itself when initialising.
        super().__init__("Archive type not supported for %s" % path)
324+
325+
326+
def open_stream(path: pathlib.Path):
327+
"""Attempt to identify a path as an extractable archive by looking at its
328+
content."""
329+
fh = path.open(mode="rb")
330+
magic = fh.read(6)
331+
fh.seek(0)
332+
if magic[:2] == b"PK":
333+
return "zip", fh
334+
if magic[:2] == b"\x1f\x8b":
335+
fh = gzip.GzipFile(fileobj=fh)
336+
elif magic[:3] == b"BZh":
337+
fh = bz2.BZ2File(fh)
338+
elif magic == b"\xfd7zXZ\x00":
339+
fh = lzma.LZMAFile(fh)
340+
elif magic[:4] == b"\x28\xb5\x2f\xfd":
341+
fh = ZstdDecompressor().stream_reader(fh)
342+
fh = io.BufferedReader(fh)
343+
try:
344+
# A full tar info header is 512 bytes.
345+
headers = fh.peek(512)
346+
# 257 is the offset of the ustar magic.
347+
magic = headers[257 : 257 + 8]
348+
# For older unix tar, rely on TarInfo.frombuf's checksum check
349+
if magic in (b"ustar\x0000", b"ustar \x00") or tarfile.TarInfo.frombuf(
350+
headers[:512], tarfile.ENCODING, "surrogateescape"
351+
):
352+
return "tar", fh
353+
except Exception as e:
354+
pass
355+
raise ArchiveTypeNotSupported(path)
335356

336357

337358
def archive_type(path: pathlib.Path):
@@ -344,7 +365,7 @@ def archive_type(path: pathlib.Path):
344365
return None
345366

346367

347-
def extract_archive(path, dest_dir, typ):
368+
def extract_archive(path, dest_dir):
348369
"""Extract an archive to a destination directory."""
349370

350371
# Resolve paths to absolute variants.
@@ -356,8 +377,8 @@ def extract_archive(path, dest_dir, typ):
356377

357378
# We pipe input to the decompressor program so that we can apply
358379
# custom decompressors that the program may not know about.
380+
typ, ifh = open_stream(path)
359381
if typ == "tar":
360-
ifh = open_tar_stream(path)
361382
# On Windows, the tar program doesn't support things like symbolic
362383
# links, while Windows actually support them. The tarfile module in
363384
# python does. So use that. But since it's significantly slower than
@@ -404,10 +425,8 @@ def repack_archive(
404425
):
405426
assert orig != dest
406427
log("Repacking as %s" % dest)
407-
orig_typ = archive_type(orig)
428+
orig_typ, ifh = open_stream(orig)
408429
typ = archive_type(dest)
409-
if not orig_typ:
410-
raise Exception("Archive type not supported for %s" % orig.name)
411430
if not typ:
412431
raise Exception("Archive type not supported for %s" % dest.name)
413432

@@ -433,7 +452,7 @@ def repack_archive(
433452
ctx = ZstdCompressor()
434453
if orig_typ == "zip":
435454
assert typ == "tar"
436-
zip = zipfile.ZipFile(orig)
455+
zip = zipfile.ZipFile(ifh)
437456
# Convert the zip stream to a tar on the fly.
438457
with ctx.stream_writer(fh) as compressor, tarfile.open(
439458
fileobj=compressor, mode="w:"
@@ -475,7 +494,6 @@ def repack_archive(
475494
raise Exception("Repacking a tar to zip is not supported")
476495
assert typ == "tar"
477496

478-
ifh = open_tar_stream(orig)
479497
if filter:
480498
# To apply the filter, we need to open the tar stream and
481499
# tweak it.
@@ -518,11 +536,12 @@ def fetch_and_extract(url, dest_dir, extract=True, sha256=None, size=None):
518536
if not extract:
519537
return
520538

521-
typ = archive_type(dest_path)
522-
if typ:
523-
extract_archive(dest_path, dest_dir, typ)
539+
try:
540+
extract_archive(dest_path, dest_dir)
524541
log("Removing %s" % dest_path)
525542
dest_path.unlink()
543+
except ArchiveTypeNotSupported:
544+
pass
526545

527546

528547
def fetch_urls(downloads):

0 commit comments

Comments
 (0)