@@ -220,22 +220,6 @@ def write(self, data):
220
220
return length
221
221
222
222
223
def detect_bgzip(header: bytes) -> bool:
    """Report whether *header* begins with a BGZF (bgzip) block header.

    A BGZF block is a gzip member whose header carries exactly one extra
    subfield, identified by the bytes ``B`` ``C``, that holds the block size
    as a 16-bit integer.

    :param header: The first bytes of the stream. At least 18 bytes are
                   needed; anything beyond the first 18 is ignored.
    :return: ``True`` if the first 18 bytes form a BGZF block header,
             ``False`` otherwise (including when *header* is too short).
    """
    if len(header) < 18:
        return False
    # Fields that play no part in the detection get a leading underscore;
    # this also avoids shadowing the ``os`` module name.
    magic, method, flags, _mtime, _xfl, _os, xlen, si1, si2, slen, _bsize = \
        struct.unpack("<HBBIBBHBBHH", header[:18])
    return (
        magic == 0x8B1F  # gzip ID1, ID2 (0x1f 0x8b) read as little-endian
        and method == 8  # Deflate method used
        # bool() so that a missing flag yields False instead of the int 0.
        and bool(flags & 4)  # There are extra fields
        and xlen == 6  # The extra field should be of length 6
        and si1 == 66  # BGZIP magic number one
        and si2 == 67  # BGZIP magic number two
        and slen == 2  # The length of the 16 bit integer that stores
                       # the size of the block
    )
237
-
238
-
239
223
def _read_exact (fp , n ):
240
224
'''Read exactly *n* bytes from `fp`
241
225
@@ -255,7 +239,9 @@ def _read_exact(fp, n):
255
239
def _read_gzip_header (fp ):
256
240
'''Read a gzip header from `fp` and progress to the end of the header.
257
241
258
- Returns last mtime if header was present or None otherwise.
242
+ Returns None if header not present. Parses mtime from the header, looks
243
+ for BGZF format blocks and parses the block size, setting it to None if
244
+ not present. Returns a tuple of mtime, block_size if a header was present.
259
245
'''
260
246
# Do not use read_exact because a header may not be present. Read twice
261
247
# since fp might be unbuffered.
@@ -340,20 +326,16 @@ def __init__(self, fp):
340
326
self ._new_member = True
341
327
self ._last_mtime = None
342
328
self ._read_buffer_size = READ_BUFFER_SIZE
343
- if hasattr (fp , "peek" ) and detect_bgzip (fp .peek (18 )):
344
- # bgzip consists of puny little blocks of max 64K uncompressed data
345
- # so in practice probably more around 16K in compressed size. A
346
- # 128K buffer is a massive overshoot and slows down the
347
- # decompression.
348
- # bgzip stores the block size, so it can be unpacked more
349
- # efficiently but this is outside scope for python-isal.
350
- self ._read_buffer_size = 16 * 1024
351
329
352
330
def _read_gzip_header (self ):
353
331
header_info = _read_gzip_header (self ._fp )
354
332
if header_info is None :
355
333
return False
356
- # Get the BGZF block size from the header if present
334
+ # Get the BGZF block size from the header if present. If the read
335
+ # buffer size is set to exactly the block size, there will be less
336
+ # overhead as reading the file will stop right before the gzip trailer.
337
+ # On normal gzip files nothing happens and this optimization is not
338
+ # detrimental.
357
339
last_mtime , block_size = header_info
358
340
self ._last_mtime = last_mtime
359
341
self ._read_buffer_size = (block_size if block_size is not None
0 commit comments