Skip to content

Commit c7e8af4

Browse files
committed
Add bgzip detection and adjust decompression strategy accordingly
1 parent 791d517 commit c7e8af4

File tree

1 file changed

+24
-0
lines changed

1 file changed

+24
-0
lines changed

src/isal/igzip.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,22 @@ def write(self, data):
220220
return length
221221

222222

223+
def detect_bgzip(header: bytes) -> bool:
    """Return True if *header* looks like the start of a bgzip (BGZF) block.

    The first 18 bytes are interpreted as a gzip member header followed by
    a single 6-byte extra field: the "BC" subfield in which bgzip stores the
    block size.

    :param header: The first bytes of the stream. Anything shorter than 18
                   bytes can not hold the complete bgzip header and yields
                   False. Bytes past the 18th are ignored.
    :return: True when all header fields match the bgzip layout, False
             otherwise. The result is always a real bool.
    """
    if len(header) < 18:
        return False
    # Fields with a leading underscore are unpacked only to keep the layout
    # explicit; they play no role in the detection.
    magic, method, flags, _mtime, _xfl, _os, xlen, si1, si2, slen, _bsize = \
        struct.unpack_from("<HBBIBBHBBHH", header)
    # bool() is needed because "flags & 4" is an int: without it the "and"
    # chain would return 0 rather than False when the flag is not set.
    return bool(
        magic == 0x8B1F and  # gzip magic bytes 0x1f 0x8b, read little-endian
        method == 8 and  # Deflate method used
        flags & 4 and  # There are extra fields
        xlen == 6 and  # The extra field should be of length 6
        si1 == 66 and  # BGZIP magic number one
        si2 == 67 and  # BGZIP magic number two
        slen == 2  # The length of the 16 bit integer that stores
        # the size of the block
    )
237+
238+
223239
class _PaddedFile(gzip._PaddedFile):
224240
# Overwrite _PaddedFile from gzip as its prepend method assumes that
225241
# the prepended data is always read from its _buffer. Unfortunately in
@@ -250,6 +266,14 @@ def __init__(self, fp):
250266
self._new_member = True
251267
self._last_mtime = None
252268
self._read_buffer_size = READ_BUFFER_SIZE
269+
if detect_bgzip(fp.peek(18)):
270+
# bgzip consists of puny little blocks of max 64K uncompressed data
271+
# so in practice probably more around 16K in compressed size. A
272+
# 128K buffer is a massive overshoot and slows down the
273+
# decompression.
274+
# bgzip stores the block size, so it can be unpacked more
275+
# efficiently but this is outside scope for python-isal.
276+
self._read_buffer_size = 16 * 1024
253277

254278
def read(self, size=-1):
255279
if size < 0:

0 commit comments

Comments
 (0)