@@ -220,20 +220,80 @@ def write(self, data):
220
220
return length
221
221
222
222
223
def detect_bgzip(header: bytes) -> bool:
    """Detect whether *header* starts a BGZF (blocked gzip) member.

    BGZF files are gzip files whose members carry a 6-byte extra field
    ("BC" subfield) storing the compressed block size.

    :param header: At least the first 18 bytes of the file.
    :return: True when the header matches the BGZF layout, else False.
    """
    if len(header) < 18:
        # Too short to contain the fixed gzip header plus the BC field.
        return False
    (magic, method, flags, mtime, xfl, operating_system,
     xlen, si1, si2, slen, bsize) = struct.unpack("<HBBIBBHBBHH",
                                                  header[:18])
    # bool() guards against returning a raw int from the `flags & 4` term.
    return bool(
        magic == 0x8b1f and  # gzip magic bytes \x1f\x8b (fix: was unchecked)
        method == 8 and      # Deflate method used
        flags & 4 and        # There are extra fields
        xlen == 6 and        # The extra field should be of length 6
        si1 == 66 and        # BGZIP magic number one ("B")
        si2 == 67 and        # BGZIP magic number two ("C")
        slen == 2            # Length of the 16-bit integer that stores
                             # the size of the block
    )
223
+ def _read_exact (fp , n ):
224
+ '''Read exactly *n* bytes from `fp`
225
+
226
+ This method is required because fp may be unbuffered,
227
+ i.e. return short reads.
228
+ '''
229
+ data = fp .read (n )
230
+ while len (data ) < n :
231
+ b = fp .read (n - len (data ))
232
+ if not b :
233
+ raise EOFError ("Compressed file ended before the "
234
+ "end-of-stream marker was reached" )
235
+ data += b
236
+ return data
237
+
238
+
239
+ def _read_gzip_header (fp ):
240
+ '''Read a gzip header from `fp` and progress to the end of the header.
241
+
242
+ Returns None if header not present. Parses mtime from the header, looks
243
+ for BGZF format blocks and parses the block size, setting it to None if
244
+ not present. Returns a tuple of mtime, block_size if a header was present.
245
+ '''
246
+ # Do not use read_exact because a header may not be present. Read twice
247
+ # since fp might be unbuffered.
248
+ magic = fp .read (1 ) + fp .read (1 )
249
+ if magic == b'' :
250
+ return None
251
+
252
+ if magic != b'\037 \213 ' :
253
+ raise BadGzipFile ('Not a gzipped file (%r)' % magic )
254
+
255
+ common_fields = _read_exact (fp , 8 )
256
+ (method , flag , last_mtime ) = struct .unpack ("<BBIxx" , common_fields )
257
+ if method != 8 :
258
+ raise BadGzipFile ('Unknown compression method' )
259
+ block_size = None
260
+ if not flag : # Likely when data compressed in memory
261
+ return last_mtime , block_size
262
+ header = magic + common_fields
263
+ if flag & FEXTRA :
264
+ # Read & discard the extra field, if present
265
+ encoded_length = _read_exact (fp , 2 )
266
+ extra_len , = struct .unpack ("<H" , encoded_length )
267
+ extra_field = _read_exact (fp , extra_len )
268
+ # Bgzip file detection
269
+ if extra_len == 6 :
270
+ s1 , s2 , slen , bsize = struct .unpack ("<BBHH" , extra_field )
271
+ if s1 == 66 and s2 == 67 and slen == 2 :
272
+ # Bgzip magic and correct slen.
273
+ block_size = bsize
274
+ header = header + encoded_length + extra_field
275
+ if flag & FNAME :
276
+ # Read and discard a null-terminated string containing the filename
277
+ while True :
278
+ s = _read_exact (fp , 1 )
279
+ header += s
280
+ if s == b'\000 ' :
281
+ break
282
+ if flag & FCOMMENT :
283
+ # Read and discard a null-terminated string containing a comment
284
+ while True :
285
+ s = _read_exact (fp , 1 )
286
+ header += s
287
+ if s == b'\000 ' :
288
+ break
289
+ if flag & FHCRC :
290
+ header_crc_encoded = _read_exact (fp , 2 )
291
+ header_crc , = struct .unpack ("<H" , header_crc_encoded )
292
+ crc = isal_zlib .crc32 (header ) & 0xFFFF
293
+ if header_crc != crc :
294
+ raise BadGzipFile (f"Corrupted gzip header. Checksums do not "
295
+ f"match: { crc :04x} != { header_crc :04x} " )
296
+ return last_mtime , block_size
237
297
238
298
239
299
class _PaddedFile (gzip ._PaddedFile ):
@@ -266,14 +326,21 @@ def __init__(self, fp):
266
326
self ._new_member = True
267
327
self ._last_mtime = None
268
328
self ._read_buffer_size = READ_BUFFER_SIZE
269
- if hasattr (fp , "peek" ) and detect_bgzip (fp .peek (18 )):
270
- # bgzip consists of puny little blocks of max 64K uncompressed data
271
- # so in practice probably more around 16K in compressed size. A
272
- # 128K buffer is a massive overshoot and slows down the
273
- # decompression.
274
- # bgzip stores the block size, so it can be unpacked more
275
- # efficiently but this is outside scope for python-isal.
276
- self ._read_buffer_size = 16 * 1024
329
+
330
+ def _read_gzip_header (self ):
331
+ header_info = _read_gzip_header (self ._fp )
332
+ if header_info is None :
333
+ return False
334
+ # Get the BGZF block size from the header if present. If the read
335
+ # buffer size is set to exactly the block size, there will be less
336
+ # overhead as reading the file will stop right before the gzip trailer.
337
+ # On normal gzip files nothing happens and this optimization is not
338
+ # detrimental.
339
+ last_mtime , block_size = header_info
340
+ self ._last_mtime = last_mtime
341
+ self ._read_buffer_size = (block_size if block_size is not None
342
+ else READ_BUFFER_SIZE )
343
+ return True
277
344
278
345
def read (self , size = - 1 ):
279
346
if size < 0 :
@@ -299,7 +366,9 @@ def read(self, size=-1):
299
366
if self ._new_member :
300
367
# If the _new_member flag is set, we have to
301
368
# jump to the next member, if there is one.
302
- self ._init_read ()
369
+ self ._crc = isal_zlib .crc32 (b"" )
370
+ # Decompressed size of unconcatenated stream
371
+ self ._stream_size = 0
303
372
if not self ._read_gzip_header ():
304
373
self ._size = self ._pos
305
374
return b""
@@ -363,61 +432,22 @@ def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
363
432
return header + compressed
364
433
365
434
366
- def _gzip_header_end (data : bytes ) -> int :
367
- """
368
- Find the start of the raw deflate block in a gzip file.
369
- :param data: Compressed data that starts with a gzip header.
370
- :return: The end of the header / start of the raw deflate block.
371
- """
372
- eof_error = EOFError ("Compressed file ended before the end-of-stream "
373
- "marker was reached" )
374
- if len (data ) < 10 :
375
- raise eof_error
376
- # We are not interested in mtime, xfl and os flags.
377
- magic , method , flags = struct .unpack ("<HBB" , data [:4 ])
378
- if magic != 0x8b1f :
379
- raise BadGzipFile (f"Not a gzipped file ({ repr (data [:2 ])} )" )
380
- if method != 8 :
381
- raise BadGzipFile ("Unknown compression method" )
382
- if not flags : # Likely when data compressed in memory
383
- return 10
384
- pos = 10
385
- if flags & FEXTRA :
386
- if len (data ) < pos + 2 :
387
- raise eof_error
388
- xlen , = struct .unpack ("<H" , data [pos : pos + 2 ])
389
- pos += 2 + xlen
390
- if flags & FNAME :
391
- pos = data .find (b"\x00 " , pos ) + 1
392
- # pos will be -1 + 1 when null byte not found.
393
- if not pos :
394
- raise eof_error
395
- if flags & FCOMMENT :
396
- pos = data .find (b"\x00 " , pos ) + 1
397
- if not pos :
398
- raise eof_error
399
- if flags & FHCRC :
400
- if len (data ) < pos + 2 :
401
- raise eof_error
402
- header_crc , = struct .unpack ("<H" , data [pos : pos + 2 ])
403
- # CRC is stored as a 16-bit integer by taking last bits of crc32.
404
- crc = isal_zlib .crc32 (data [:pos ]) & 0xFFFF
405
- if header_crc != crc :
406
- raise BadGzipFile (f"Corrupted gzip header. Checksums do not "
407
- f"match: { crc :04x} != { header_crc :04x} " )
408
- pos += 2
409
- return pos
410
-
411
-
412
435
def decompress (data ):
413
436
"""Decompress a gzip compressed string in one shot.
414
437
Return the decompressed string.
438
+
439
+ This function checks for extra gzip members. Using
440
+ isal_zlib.decompress(data, wbits=31) is faster in cases where only one
441
+ gzip member is guaranteed to be present.
415
442
"""
416
443
decompressed_members = []
417
444
while True :
418
445
if not data : # Empty data returns empty bytestring
419
446
return b"" .join (decompressed_members )
420
- header_end = _gzip_header_end (data )
447
+ fp = io .BytesIO (data )
448
+ if _read_gzip_header (fp ) is None :
449
+ return b"" .join (decompressed_members )
450
+ header_end = fp .tell ()
421
451
# Use a zlib raw deflate compressor
422
452
do = isal_zlib .decompressobj (wbits = - isal_zlib .MAX_WBITS )
423
453
# Read all the data except the header
0 commit comments