Commit 4ad8e12

Merge pull request #145 from pycompression/bgzip
Increase BGZip streaming decompression performance
2 parents fd66618 + a76c459 commit 4ad8e12

4 files changed: 40 additions, 1 deletion

CHANGELOG.rst

Lines changed: 4 additions & 0 deletions
@@ -9,6 +9,10 @@ Changelog
 
 version 1.2.0-dev
 -----------------
++ Bgzip files are now detected and a smaller reading buffer is used to
+  accommodate the fact that bgzip blocks are typically less than 64K. (Unlike
+  normal gzip files that consist of one block that spans the entire file.)
+  This has reduced decompression time for bgzip files by roughly 12%.
 + Speed-up source build by using ISA-L Unix-specific makefile rather than the
   autotools build.
 + Simplify build setup. ISA-L release flags are now used and not
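
Since bgzip output is an ordinary (multi-member) gzip stream, nothing changes on the caller's side: the smaller read buffer is selected automatically when the BGZF header is detected. A minimal usage sketch (the file name here is hypothetical):

    from isal import igzip

    # Opening works exactly as for any gzip file; BGZF detection happens internally.
    with igzip.open("reads.fastq.bgzip.gz", "rb") as fh:
        data = fh.read()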

src/isal/igzip.py

Lines changed: 26 additions & 1 deletion
@@ -220,6 +220,22 @@ def write(self, data):
         return length
 
 
+def detect_bgzip(header: bytes) -> bool:
+    if len(header) < 18:
+        return False
+    magic, method, flags, mtime, xfl, os, xlen, si1, si2, slen, bsize = \
+        struct.unpack("<HBBIBBHBBHH", header[:18])
+    return (
+        method == 8 and  # Deflate method used
+        flags & 4 and  # There are extra fields
+        xlen == 6 and  # The extra field should be of length 6
+        si1 == 66 and  # BGZIP magic number one
+        si2 == 67 and  # BGZIP magic number two
+        slen == 2  # The length of the 16 bit integer that stores
+                   # the size of the block
+    )
+
+
 class _PaddedFile(gzip._PaddedFile):
     # Overwrite _PaddedFile from gzip as its prepend method assumes that
     # the prepended data is always read from its _buffer. Unfortunately in
@@ -249,6 +265,15 @@ def __init__(self, fp):
         # Set flag indicating start of a new member
         self._new_member = True
         self._last_mtime = None
+        self._read_buffer_size = READ_BUFFER_SIZE
+        if hasattr(fp, "peek") and detect_bgzip(fp.peek(18)):
+            # bgzip consists of puny little blocks of max 64K uncompressed data
+            # so in practice probably more around 16K in compressed size. A
+            # 128K buffer is a massive overshoot and slows down the
+            # decompression.
+            # bgzip stores the block size, so it can be unpacked more
+            # efficiently but this is outside scope for python-isal.
+            self._read_buffer_size = 16 * 1024
 
     def read(self, size=-1):
         if size < 0:
@@ -282,7 +307,7 @@ def read(self, size=-1):
 
             # Read a chunk of data from the file
             if self._decompressor.needs_input:
-                buf = self._fp.read(READ_BUFFER_SIZE)
+                buf = self._fp.read(self._read_buffer_size)
                 uncompress = self._decompressor.decompress(buf, size)
             else:
                 uncompress = self._decompressor.decompress(b"", size)
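
For reference, detect_bgzip relies on the BGZF convention that every block starts with a fixed-layout 18-byte gzip header whose FEXTRA field carries a "BC" subfield holding the block size. The sketch below is not part of the commit: it assumes detect_bgzip is importable from isal.igzip after this change, and make_bgzf_header / read_one_bgzf_block are illustrative helpers. It builds a synthetic header with the same "<HBBIBBHBBHH" layout and also shows how the BSIZE field could, in principle, be used to pull out one whole block at a time, the optimisation the code comment leaves out of scope.

    import gzip
    import struct

    from isal.igzip import detect_bgzip  # assumed importable after this change

    def make_bgzf_header(block_size: int) -> bytes:
        # Synthetic 18-byte BGZF header, same struct layout as detect_bgzip uses.
        return struct.pack(
            "<HBBIBBHBBHH",
            0x8b1f,          # gzip magic bytes 1f 8b, read as little-endian 16-bit
            8,               # CM: deflate
            4,               # FLG: FEXTRA set
            0,               # MTIME
            0,               # XFL
            255,             # OS: unknown
            6,               # XLEN: extra field is 6 bytes
            66, 67,          # SI1, SI2: "B", "C"
            2,               # SLEN: subfield payload is one 16-bit integer
            block_size - 1,  # BSIZE: total block size minus one
        )

    assert detect_bgzip(make_bgzf_header(16 * 1024))
    assert not detect_bgzip(b"\x1f\x8b\x08\x00")  # plain gzip header: too short, no BC subfield

    def read_one_bgzf_block(fp) -> bytes:
        # BSIZE gives the total block length minus one, so a reader can fetch
        # exactly one complete gzip member and decompress it in a single call.
        header = fp.read(18)
        if not detect_bgzip(header):
            raise ValueError("not a BGZF block")
        bsize = struct.unpack("<H", header[16:18])[0]
        return gzip.decompress(header + fp.read(bsize + 1 - 18))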

tests/data/test.fastq.bgzip.gz

1.48 MB (binary file not shown)

tests/test_igzip.py

Lines changed: 10 additions & 0 deletions
@@ -443,3 +443,13 @@ def test_concatenated_gzip():
     with igzip.open(concat, "rb") as igzip_h:
         result = igzip_h.read()
     assert data == result
+
+
+def test_bgzip():
+    bgzip_file = Path(__file__).parent / "data" / "test.fastq.bgzip.gz"
+    gzip_file = Path(__file__).parent / "data" / "test.fastq.gz"
+    with igzip.open(bgzip_file, "rb") as bgz:
+        bgz_data = bgz.read()
+    with igzip.open(gzip_file, "rb") as gz:
+        gz_data = gz.read()
+    assert bgz_data == gz_data
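
Because each BGZF block is itself a complete gzip member, the standard library can read the new test file as well, which gives an independent way to sanity-check the test data outside python-isal. A minimal sketch, reusing the data paths from the test above:

    import gzip
    from pathlib import Path

    data_dir = Path("tests") / "data"  # same data directory the test uses
    with gzip.open(data_dir / "test.fastq.bgzip.gz", "rb") as fh:
        stdlib_data = fh.read()
    with gzip.open(data_dir / "test.fastq.gz", "rb") as fh:
        assert stdlib_data == fh.read()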
