@@ -220,22 +220,6 @@ def write(self, data):
220220 return length
221221
222222
def detect_bgzip(header: bytes) -> bool:
    '''Tell whether *header* looks like the start of a BGZF (bgzip) block.

    Only the first 18 bytes of *header* are inspected: the fixed 10-byte
    gzip member header, the 2-byte XLEN field and a single 6-byte extra
    subfield.

    :param header: the leading bytes of the stream, e.g. from ``peek(18)``.
    :return: True if the bytes form a gzip header carrying exactly one
             'BC' extra subfield with a 2-byte payload, False otherwise
             (including when fewer than 18 bytes are supplied).
    '''
    if len(header) < 18:
        # Not enough data for a gzip header plus the BGZF extra subfield.
        return False
    # Little-endian layout: ID1+ID2, CM, FLG, MTIME, XFL, OS, XLEN, SI1, SI2,
    # SLEN, BSIZE. Fields prefixed with an underscore are not needed here.
    (magic, method, flags, _mtime, _xfl, _os_id, xlen, si1, si2, slen,
     _bsize) = struct.unpack_from("<HBBIBBHBBHH", header)
    return (
        magic == 0x8B1F and  # gzip ID bytes 0x1f 0x8b, read little-endian
        method == 8 and  # Deflate method used
        bool(flags & 4) and  # There are extra fields (FEXTRA); bool() keeps
                             # the result a real bool, not the masked int
        xlen == 6 and  # The extra field should be of length 6
        si1 == 66 and  # BGZIP magic number one ('B')
        si2 == 67 and  # BGZIP magic number two ('C')
        slen == 2  # The length of the 16 bit integer that stores
                   # the size of the block
    )
237-
238-
239223def _read_exact (fp , n ):
240224 '''Read exactly *n* bytes from `fp`
241225
@@ -255,7 +239,9 @@ def _read_exact(fp, n):
255239def _read_gzip_header (fp ):
256240 '''Read a gzip header from `fp` and progress to the end of the header.
257241
258- Returns last mtime if header was present or None otherwise.
242+ Returns None if header not present. Parses mtime from the header, looks
243+ for BGZF format blocks and parses the block size, setting it to None if
244+ not present. Returns a tuple of mtime, block_size if a header was present.
259245 '''
260246 # Do not use read_exact because a header may not be present. Read twice
261247 # since fp might be unbuffered.
@@ -340,20 +326,16 @@ def __init__(self, fp):
340326 self ._new_member = True
341327 self ._last_mtime = None
342328 self ._read_buffer_size = READ_BUFFER_SIZE
343- if hasattr (fp , "peek" ) and detect_bgzip (fp .peek (18 )):
344- # bgzip consists of puny little blocks of max 64K uncompressed data
345- # so in practice probably more around 16K in compressed size. A
346- # 128K buffer is a massive overshoot and slows down the
347- # decompression.
348- # bgzip stores the block size, so it can be unpacked more
349- # efficiently but this is outside scope for python-isal.
350- self ._read_buffer_size = 16 * 1024
351329
352330 def _read_gzip_header (self ):
353331 header_info = _read_gzip_header (self ._fp )
354332 if header_info is None :
355333 return False
356- # Get the BGZF block size from the header if present
334+ # Get the BGZF block size from the header if present. If the read
335+ # buffer size is set to exactly the block size, there will be less
336+ # overhead as reading the file will stop right before the gzip trailer.
337+ # On normal gzip files nothing happens and this optimization is not
338+ # detrimental.
357339 last_mtime , block_size = header_info
358340 self ._last_mtime = last_mtime
359341 self ._read_buffer_size = (block_size if block_size is not None
0 commit comments