@@ -220,20 +220,80 @@ def write(self, data):
220220 return length
221221
222222
def detect_bgzip(header: bytes) -> bool:
    """Detect whether a gzip header is a BGZF (bgzip) block header.

    :param header: At least the first 18 bytes of a gzip stream.
    :return: True if the header matches the BGZF specification,
        False otherwise (including when fewer than 18 bytes are given).
    """
    if len(header) < 18:
        # Too short to contain the fixed header plus the 6-byte extra field.
        return False
    # mtime, xfl, the OS byte and the block size are irrelevant for
    # detection; underscore names also avoid shadowing the os module.
    magic, method, flags, _mtime, _xfl, _os, xlen, si1, si2, slen, _bsize = \
        struct.unpack("<HBBIBBHBBHH", header[:18])
    # bool() guards the `flags & 4` term, which would otherwise make the
    # function return the int 0 instead of False.
    return bool(
        magic == 0x8b1f and  # gzip magic bytes (previously unpacked but
                             # never checked, misdetecting non-gzip data)
        method == 8 and      # Deflate method used
        flags & 4 and        # FEXTRA: there are extra fields
        xlen == 6 and        # The extra field should be of length 6
        si1 == 66 and        # BGZF magic number one ("B")
        si2 == 67 and        # BGZF magic number two ("C")
        slen == 2            # The length of the 16 bit integer that stores
                             # the size of the block
    )
223+ def _read_exact (fp , n ):
224+ '''Read exactly *n* bytes from `fp`
225+
226+ This method is required because fp may be unbuffered,
227+ i.e. return short reads.
228+ '''
229+ data = fp .read (n )
230+ while len (data ) < n :
231+ b = fp .read (n - len (data ))
232+ if not b :
233+ raise EOFError ("Compressed file ended before the "
234+ "end-of-stream marker was reached" )
235+ data += b
236+ return data
237+
238+
def _read_gzip_header(fp):
    """Parse one gzip member header from `fp`, leaving the file position
    at the start of the deflate data.

    Returns None when `fp` is already at EOF (no header present).
    Otherwise returns a (mtime, block_size) tuple; block_size is the BGZF
    block size when the member carries the BGZF extra field, else None.

    Raises BadGzipFile on a wrong magic number, an unknown compression
    method or a header-CRC mismatch.
    """
    # A header may legitimately be absent at EOF, so plain read() is used
    # instead of _read_exact. Two single-byte reads because fp may be
    # unbuffered and return short reads.
    magic = fp.read(1) + fp.read(1)
    if magic == b'':
        return None
    if magic != b'\037\213':
        raise BadGzipFile('Not a gzipped file (%r)' % magic)

    fixed_part = _read_exact(fp, 8)
    # XFL and OS trail the fixed header; "xx" skips them.
    method, flag, last_mtime = struct.unpack("<BBIxx", fixed_part)
    if method != 8:
        raise BadGzipFile('Unknown compression method')

    block_size = None
    if not flag:
        # Common for data compressed in memory: no optional fields at all.
        return last_mtime, block_size

    # Every header byte is accumulated in case FHCRC is set and the
    # header checksum has to be verified at the end.
    raw_header = magic + fixed_part

    def _null_terminated():
        # Consume bytes up to and including the terminating NUL and
        # return everything read (needed for the header CRC).
        consumed = b''
        while True:
            byte = _read_exact(fp, 1)
            consumed += byte
            if byte == b'\000':
                return consumed

    if flag & FEXTRA:
        xlen_bytes = _read_exact(fp, 2)
        xlen, = struct.unpack("<H", xlen_bytes)
        extra = _read_exact(fp, xlen)
        # BGZF detection: a 6-byte extra field with subfield id "BC"
        # (66, 67) and a 2-byte payload holding the total block size.
        if xlen == 6:
            si1, si2, slen, bsize = struct.unpack("<BBHH", extra)
            if si1 == 66 and si2 == 67 and slen == 2:
                block_size = bsize
        raw_header += xlen_bytes + extra
    if flag & FNAME:
        # Original file name, NUL-terminated; the content is discarded.
        raw_header += _null_terminated()
    if flag & FCOMMENT:
        # Free-form comment, NUL-terminated; the content is discarded.
        raw_header += _null_terminated()
    if flag & FHCRC:
        crc_bytes = _read_exact(fp, 2)
        stored_crc, = struct.unpack("<H", crc_bytes)
        # The header CRC16 is the low 16 bits of the CRC32 of all header
        # bytes preceding it.
        computed_crc = isal_zlib.crc32(raw_header) & 0xFFFF
        if stored_crc != computed_crc:
            raise BadGzipFile(f"Corrupted gzip header. Checksums do not "
                              f"match: {computed_crc:04x} != {stored_crc:04x}")
    return last_mtime, block_size
237297
238298
239299class _PaddedFile (gzip ._PaddedFile ):
@@ -266,14 +326,21 @@ def __init__(self, fp):
266326 self ._new_member = True
267327 self ._last_mtime = None
268328 self ._read_buffer_size = READ_BUFFER_SIZE
269- if hasattr (fp , "peek" ) and detect_bgzip (fp .peek (18 )):
270- # bgzip consists of puny little blocks of max 64K uncompressed data
271- # so in practice probably more around 16K in compressed size. A
272- # 128K buffer is a massive overshoot and slows down the
273- # decompression.
274- # bgzip stores the block size, so it can be unpacked more
275- # efficiently but this is outside scope for python-isal.
276- self ._read_buffer_size = 16 * 1024
329+
330+ def _read_gzip_header (self ):
331+ header_info = _read_gzip_header (self ._fp )
332+ if header_info is None :
333+ return False
334+ # Get the BGZF block size from the header if present. If the read
335+ # buffer size is set to exactly the block size, there will be less
336+ # overhead as reading the file will stop right before the gzip trailer.
337+ # On normal gzip files nothing happens and this optimization is not
338+ # detrimental.
339+ last_mtime , block_size = header_info
340+ self ._last_mtime = last_mtime
341+ self ._read_buffer_size = (block_size if block_size is not None
342+ else READ_BUFFER_SIZE )
343+ return True
277344
278345 def read (self , size = - 1 ):
279346 if size < 0 :
@@ -299,7 +366,9 @@ def read(self, size=-1):
299366 if self ._new_member :
300367 # If the _new_member flag is set, we have to
301368 # jump to the next member, if there is one.
302- self ._init_read ()
369+ self ._crc = isal_zlib .crc32 (b"" )
370+ # Decompressed size of unconcatenated stream
371+ self ._stream_size = 0
303372 if not self ._read_gzip_header ():
304373 self ._size = self ._pos
305374 return b""
@@ -363,61 +432,22 @@ def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
363432 return header + compressed
364433
365434
366- def _gzip_header_end (data : bytes ) -> int :
367- """
368- Find the start of the raw deflate block in a gzip file.
369- :param data: Compressed data that starts with a gzip header.
370- :return: The end of the header / start of the raw deflate block.
371- """
372- eof_error = EOFError ("Compressed file ended before the end-of-stream "
373- "marker was reached" )
374- if len (data ) < 10 :
375- raise eof_error
376- # We are not interested in mtime, xfl and os flags.
377- magic , method , flags = struct .unpack ("<HBB" , data [:4 ])
378- if magic != 0x8b1f :
379- raise BadGzipFile (f"Not a gzipped file ({ repr (data [:2 ])} )" )
380- if method != 8 :
381- raise BadGzipFile ("Unknown compression method" )
382- if not flags : # Likely when data compressed in memory
383- return 10
384- pos = 10
385- if flags & FEXTRA :
386- if len (data ) < pos + 2 :
387- raise eof_error
388- xlen , = struct .unpack ("<H" , data [pos : pos + 2 ])
389- pos += 2 + xlen
390- if flags & FNAME :
391- pos = data .find (b"\x00 " , pos ) + 1
392- # pos will be -1 + 1 when null byte not found.
393- if not pos :
394- raise eof_error
395- if flags & FCOMMENT :
396- pos = data .find (b"\x00 " , pos ) + 1
397- if not pos :
398- raise eof_error
399- if flags & FHCRC :
400- if len (data ) < pos + 2 :
401- raise eof_error
402- header_crc , = struct .unpack ("<H" , data [pos : pos + 2 ])
403- # CRC is stored as a 16-bit integer by taking last bits of crc32.
404- crc = isal_zlib .crc32 (data [:pos ]) & 0xFFFF
405- if header_crc != crc :
406- raise BadGzipFile (f"Corrupted gzip header. Checksums do not "
407- f"match: { crc :04x} != { header_crc :04x} " )
408- pos += 2
409- return pos
410-
411-
412435def decompress (data ):
413436 """Decompress a gzip compressed string in one shot.
414437 Return the decompressed string.
438+
439+ This function checks for extra gzip members. Using
440+ isal_zlib.decompress(data, wbits=31) is faster in cases where only one
441+ gzip member is guaranteed to be present.
415442 """
416443 decompressed_members = []
417444 while True :
418445 if not data : # Empty data returns empty bytestring
419446 return b"" .join (decompressed_members )
420- header_end = _gzip_header_end (data )
447+ fp = io .BytesIO (data )
448+ if _read_gzip_header (fp ) is None :
449+ return b"" .join (decompressed_members )
450+ header_end = fp .tell ()
421451 # Use a zlib raw deflate compressor
422452 do = isal_zlib .decompressobj (wbits = - isal_zlib .MAX_WBITS )
423453 # Read all the data except the header
0 commit comments