@@ -10,6 +10,7 @@ import contextlib
1010import datetime
1111import gzip
1212import hashlib
13+ import io
1314import json
1415import lzma
1516import multiprocessing
@@ -194,6 +195,25 @@ def stream_download(url, sha256=None, size=None, headers=None):
194195 ) if certifi else urllib .request .urlopen (req , timeout = 60 ) as fh :
195196 if not url .endswith (".gz" ) and fh .info ().get ("Content-Encoding" ) == "gzip" :
196197 fh = gzip .GzipFile (fileobj = fh )
198+ else :
199+ # when using gzip we can't compare size or length (inflated) against content-length (compressed)
200+ content_length = fh .getheader ("content-length" )
201+ if content_length :
202+ try :
203+ content_length = int (content_length )
204+ except ValueError :
205+ raise IntegrityError (
206+ "content-length header for %s is not an integer; got %s"
207+ % (url , content_length )
208+ )
209+ if size :
210+ if size != content_length :
211+ raise IntegrityError (
212+ "size mismatch on %s: wanted %d; content-length is %d"
213+ % (url , size , content_length )
214+ )
215+ else :
216+ size = content_length
197217
198218 while True :
199219 chunk = fh .read (65536 )
@@ -252,8 +272,6 @@ def download_to_path(url, path, sha256=None, size=None, headers=None):
252272 fh .write (chunk )
253273
254274 return
255- except IntegrityError :
256- raise
257275 except Exception as e :
258276 log ("Download failed: {}" .format (e ))
259277 continue
@@ -264,17 +282,15 @@ def download_to_path(url, path, sha256=None, size=None, headers=None):
264282def download_to_memory (url , sha256 = None , size = None ):
265283 """Download a URL to memory, possibly with verification."""
266284
267- data = b""
268285 for _ in retrier (attempts = 5 , sleeptime = 60 ):
269- try :
270- log ("Downloading %s" % (url ))
286+ data = b""
287+ log ("Downloading %s" % (url ))
271288
289+ try :
272290 for chunk in stream_download (url , sha256 = sha256 , size = size ):
273291 data += chunk
274292
275293 return data
276- except IntegrityError :
277- raise
278294 except Exception as e :
279295 log ("Download failed: {}" .format (e ))
280296 continue
@@ -317,21 +333,41 @@ def gpg_verify_path(path: pathlib.Path, public_key_data: bytes, signature_data:
317333 subprocess .run (["gpgconf" , "--kill" , "gpg-agent" ], env = env )
318334
319335
320- def open_tar_stream (path : pathlib .Path ):
321- """"""
322- if path .suffix == ".bz2" :
323- return bz2 .open (str (path ), "rb" )
324- elif path .suffix in (".gz" , ".tgz" ) :
325- return gzip .open (str (path ), "rb" )
326- elif path .suffix == ".xz" :
327- return lzma .open (str (path ), "rb" )
328- elif path .suffix == ".zst" :
329- dctx = ZstdDecompressor ()
330- return dctx .stream_reader (path .open ("rb" ))
331- elif path .suffix == ".tar" :
332- return path .open ("rb" )
333- else :
334- raise ValueError ("unknown archive format for tar file: %s" % path )
class ArchiveTypeNotSupported(Exception):
    """Raised when a file's content is not a recognised archive format.

    Attributes:
        path: The path that could not be identified as an archive.
    """

    def __init__(self, path: pathlib.Path):
        # Zero-argument super() walks the MRO from this class; the previous
        # super(Exception, self) form started the lookup *after* Exception.
        super().__init__("Archive type not supported for %s" % path)
        # Keep the offending path so handlers can inspect it without having
        # to parse the message.
        self.path = path
340+
def open_stream(path: pathlib.Path):
    """Identify an archive by its content and open it for reading.

    The leading bytes of ``path`` are compared against known magic numbers
    instead of relying on the file name.

    Args:
        path: Location of the candidate archive on disk.

    Returns:
        A ``(kind, stream)`` tuple. For a zip file ``kind`` is ``"zip"`` and
        ``stream`` is the binary file handle rewound to offset 0. For a tar
        file ``kind`` is ``"tar"`` and ``stream`` is an ``io.BufferedReader``
        over the tar data, wrapped in a gzip, bzip2, xz or zstandard
        decompressor when the matching magic number is found.

    Raises:
        ArchiveTypeNotSupported: The content is neither a zip file nor a
            (possibly compressed) tar file. Every handle opened here is
            closed before the exception propagates.
    """
    raw = path.open(mode="rb")
    magic = raw.read(6)
    raw.seek(0)

    if magic[:2] == b"PK":
        return "zip", raw

    if magic[:2] == b"\x1f\x8b":
        stream = gzip.GzipFile(fileobj=raw)
    elif magic[:3] == b"BZh":
        stream = bz2.BZ2File(raw)
    elif magic == b"\xfd7zXZ\x00":
        stream = lzma.LZMAFile(raw)
    elif magic[:4] == b"\x28\xb5\x2f\xfd":
        # ZstdDecompressor is defined outside this block (presumably the
        # zstandard package -- confirm against the file's imports).
        stream = ZstdDecompressor().stream_reader(raw)
    else:
        # No known compression wrapper: probe the raw bytes directly.
        stream = raw

    # BufferedReader provides peek(), letting us inspect the first tar header
    # without consuming it from the stream handed back to the caller.
    stream = io.BufferedReader(stream)

    try:
        # A full tar info header is 512 bytes.
        headers = stream.peek(512)
        # 257 is the offset of the ustar magic.
        ustar_magic = headers[257 : 257 + 8]
        # For older unix tar there is no magic; rely on TarInfo.frombuf's
        # checksum check, which raises when the header is not valid.
        is_tar = ustar_magic in (b"ustar\x0000", b"ustar  \x00") or bool(
            tarfile.TarInfo.frombuf(
                headers[:512], tarfile.ENCODING, "surrogateescape"
            )
        )
    except Exception:
        # Deliberately broad: a decompression error or an invalid/truncated
        # header both simply mean "this is not a tar archive we can read".
        is_tar = False

    if is_tar:
        return "tar", stream

    # Not an archive: release the handles instead of leaking them. The
    # decompressor wrappers do not necessarily close the file object they
    # were given, so close the underlying handle explicitly as well.
    try:
        stream.close()
    finally:
        raw.close()
    raise ArchiveTypeNotSupported(path)
335371
336372
337373def archive_type (path : pathlib .Path ):
@@ -344,7 +380,7 @@ def archive_type(path: pathlib.Path):
344380 return None
345381
346382
347- def extract_archive (path , dest_dir , typ ):
383+ def extract_archive (path , dest_dir ):
348384 """Extract an archive to a destination directory."""
349385
350386 # Resolve paths to absolute variants.
@@ -356,8 +392,8 @@ def extract_archive(path, dest_dir, typ):
356392
357393 # We pipe input to the decompressor program so that we can apply
358394 # custom decompressors that the program may not know about.
395+ typ , ifh = open_stream (path )
359396 if typ == "tar" :
360- ifh = open_tar_stream (path )
361397 # On Windows, the tar program doesn't support things like symbolic
362398 # links, while Windows actually support them. The tarfile module in
363399 # python does. So use that. But since it's significantly slower than
@@ -404,10 +440,8 @@ def repack_archive(
404440):
405441 assert orig != dest
406442 log ("Repacking as %s" % dest )
407- orig_typ = archive_type (orig )
443+ orig_typ , ifh = open_stream (orig )
408444 typ = archive_type (dest )
409- if not orig_typ :
410- raise Exception ("Archive type not supported for %s" % orig .name )
411445 if not typ :
412446 raise Exception ("Archive type not supported for %s" % dest .name )
413447
@@ -433,7 +467,7 @@ def repack_archive(
433467 ctx = ZstdCompressor ()
434468 if orig_typ == "zip" :
435469 assert typ == "tar"
436- zip = zipfile .ZipFile (orig )
470+ zip = zipfile .ZipFile (ifh )
437471 # Convert the zip stream to a tar on the fly.
438472 with ctx .stream_writer (fh ) as compressor , tarfile .open (
439473 fileobj = compressor , mode = "w:"
@@ -475,7 +509,6 @@ def repack_archive(
475509 raise Exception ("Repacking a tar to zip is not supported" )
476510 assert typ == "tar"
477511
478- ifh = open_tar_stream (orig )
479512 if filter :
480513 # To apply the filter, we need to open the tar stream and
481514 # tweak it.
@@ -518,11 +551,12 @@ def fetch_and_extract(url, dest_dir, extract=True, sha256=None, size=None):
518551 if not extract :
519552 return
520553
521- typ = archive_type (dest_path )
522- if typ :
523- extract_archive (dest_path , dest_dir , typ )
554+ try :
555+ extract_archive (dest_path , dest_dir )
524556 log ("Removing %s" % dest_path )
525557 dest_path .unlink ()
558+ except ArchiveTypeNotSupported :
559+ pass
526560
527561
528562def fetch_urls (downloads ):
0 commit comments