@@ -236,6 +236,71 @@ def detect_bgzip(header: bytes) -> bool:
236
236
)
237
237
238
238
239
def _read_exact(fp, n):
    '''Read exactly *n* bytes from `fp`.

    A single ``fp.read(n)`` is not sufficient because `fp` may be
    unbuffered, i.e. it may return short reads. Reading is repeated until
    *n* bytes have been collected.

    Returns the bytes that were read. Raises EOFError when `fp` stops
    delivering data before *n* bytes were collected.
    '''
    data = fp.read(n)
    if len(data) >= n:
        # Common case: the first read already delivered everything.
        return data
    # Short read: gather the remaining pieces and join them once, rather
    # than re-copying the accumulated bytes on every iteration.
    chunks = [data]
    received = len(data)
    while received < n:
        chunk = fp.read(n - received)
        if not chunk:
            raise EOFError("Compressed file ended before the "
                           "end-of-stream marker was reached")
        chunks.append(chunk)
        received += len(chunk)
    return b"".join(chunks)
253
+
254
+
255
def _read_gzip_header(fp):
    '''Read a gzip header from `fp` and progress to the end of the header.

    Returns last mtime if header was present or None otherwise.

    Raises BadGzipFile when the magic bytes or the compression method are
    wrong, or when the optional header checksum does not match. An
    EOFError from _read_exact propagates when the header is truncated.
    '''
    # Do not use _read_exact for the magic because a header may not be
    # present at all. Read twice since fp might be unbuffered.
    magic = fp.read(1) + fp.read(1)
    if magic == b'':
        return None

    if magic != b'\037\213':
        raise BadGzipFile('Not a gzipped file (%r)' % magic)

    common_fields = _read_exact(fp, 8)
    method, flag, last_mtime = struct.unpack("<BBIxx", common_fields)
    if method != 8:
        raise BadGzipFile('Unknown compression method')

    # Every header byte is kept so the optional header CRC can be checked.
    # The pieces are joined once at the end instead of growing a bytes
    # object one byte at a time.
    header_parts = [magic, common_fields]
    if flag & FEXTRA:
        # Read & discard the extra field, if present
        encoded_length = _read_exact(fp, 2)
        extra_len, = struct.unpack("<H", encoded_length)
        header_parts.append(encoded_length)
        header_parts.append(_read_exact(fp, extra_len))
    # The file name and the comment are both null-terminated strings and
    # appear in this order; read and discard whichever is present.
    for string_flag in (FNAME, FCOMMENT):
        if flag & string_flag:
            while True:
                s = _read_exact(fp, 1)
                header_parts.append(s)
                if s == b'\000':
                    break
    if flag & FHCRC:
        header_crc, = struct.unpack("<H", _read_exact(fp, 2))
        # The stored value is compared with the low 16 bits of the crc32
        # over all preceding header bytes.
        crc = isal_zlib.crc32(b"".join(header_parts)) & 0xFFFF
        if header_crc != crc:
            raise BadGzipFile(f"Corrupted gzip header. Checksums do not "
                              f"match: {crc:04x} != {header_crc:04x}")
    return last_mtime
302
+
303
+
239
304
class _PaddedFile (gzip ._PaddedFile ):
240
305
# Overwrite _PaddedFile from gzip as its prepend method assumes that
241
306
# the prepended data is always read from its _buffer. Unfortunately in
@@ -275,6 +340,13 @@ def __init__(self, fp):
275
340
# efficiently but this is outside scope for python-isal.
276
341
self ._read_buffer_size = 16 * 1024
277
342
343
def _read_gzip_header(self):
    '''Read the next gzip member header from the underlying file.

    Delegates to the module-level _read_gzip_header using self._fp. When
    a header was found its mtime is stored in self._last_mtime and True is
    returned; when no header was present False is returned and
    self._last_mtime is left untouched.
    '''
    mtime = _read_gzip_header(self._fp)
    header_found = mtime is not None
    if header_found:
        self._last_mtime = mtime
    return header_found
349
+
278
350
def read (self , size = - 1 ):
279
351
if size < 0 :
280
352
return self .readall ()
@@ -300,7 +372,8 @@ def read(self, size=-1):
300
372
# If the _new_member flag is set, we have to
301
373
# jump to the next member, if there is one.
302
374
self ._crc = isal_zlib .crc32 (b"" )
303
- self ._stream_size = 0 # Decompressed size of unconcatenated stream
375
+ # Decompressed size of unconcatenated stream
376
+ self ._stream_size = 0
304
377
if not self ._read_gzip_header ():
305
378
self ._size = self ._pos
306
379
return b""
@@ -364,52 +437,6 @@ def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
364
437
return header + compressed
365
438
366
439
367
def _gzip_header_end(data: bytes) -> int:
    """
    Find the start of the raw deflate block in a gzip file.
    :param data: Compressed data that starts with a gzip header.
    :return: The end of the header / start of the raw deflate block.
    :raises EOFError: When data ends before the header is complete.
    :raises BadGzipFile: When the magic bytes, the compression method or
                         the optional header checksum are invalid.
    """
    eof_error = EOFError("Compressed file ended before the end-of-stream "
                         "marker was reached")
    if len(data) < 10:
        raise eof_error
    # We are not interested in mtime, xfl and os flags.
    magic, method, flags = struct.unpack("<HBB", data[:4])
    if magic != 0x8b1f:
        raise BadGzipFile(f"Not a gzipped file ({repr(data[:2])})")
    if method != 8:
        raise BadGzipFile("Unknown compression method")
    if not flags:  # Likely when data compressed in memory
        return 10
    pos = 10
    if flags & FEXTRA:
        if len(data) < pos + 2:
            raise eof_error
        xlen, = struct.unpack("<H", data[pos: pos + 2])
        pos += 2 + xlen
        # The extra field itself must be present as well; otherwise an
        # offset beyond the end of data would be returned.
        if len(data) < pos:
            raise eof_error
    if flags & FNAME:
        pos = data.find(b"\x00", pos) + 1
        # pos will be -1 + 1 when null byte not found.
        if not pos:
            raise eof_error
    if flags & FCOMMENT:
        pos = data.find(b"\x00", pos) + 1
        if not pos:
            raise eof_error
    if flags & FHCRC:
        if len(data) < pos + 2:
            raise eof_error
        header_crc, = struct.unpack("<H", data[pos: pos + 2])
        # CRC is stored as a 16-bit integer by taking last bits of crc32.
        crc = isal_zlib.crc32(data[:pos]) & 0xFFFF
        if header_crc != crc:
            raise BadGzipFile(f"Corrupted gzip header. Checksums do not "
                              f"match: {crc:04x} != {header_crc:04x}")
        pos += 2
    return pos
411
-
412
-
413
440
def decompress (data ):
414
441
"""Decompress a gzip compressed string in one shot.
415
442
Return the decompressed string.
@@ -418,7 +445,10 @@ def decompress(data):
418
445
while True :
419
446
if not data : # Empty data returns empty bytestring
420
447
return b"" .join (decompressed_members )
421
- header_end = _gzip_header_end (data )
448
+ fp = io .BytesIO (data )
449
+ if _read_gzip_header (fp ) is None :
450
+ return b"" .join (decompressed_members )
451
+ header_end = fp .tell ()
422
452
# Use a zlib raw deflate compressor
423
453
do = isal_zlib .decompressobj (wbits = - isal_zlib .MAX_WBITS )
424
454
# Read all the data except the header
0 commit comments