Commit 4ad8e12

Merge pull request #145 from pycompression/bgzip
Increase BGZip streaming decompression performance
2 parents fd66618 + a76c459 commit 4ad8e12

4 files changed: 40 additions, 1 deletion

CHANGELOG.rst

Lines changed: 4 additions & 0 deletions
@@ -9,6 +9,10 @@ Changelog
 
 version 1.2.0-dev
 -----------------
++ Bgzip files are now detected and a smaller reading buffer is used to
+  accommodate the fact that bgzip blocks are typically less than 64K. (Unlike
+  normal gzip files that consist of one block that spans the entire file.)
+  This has reduced decompression time for bgzip files by roughly 12%.
 + Speed-up source build by using ISA-L Unix-specific makefile rather than the
   autotools build.
 + Simplify build setup. ISA-L release flags are now used and not
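
Since bgzip output is an ordinary (multi-member) gzip stream, nothing changes on the caller's side: the smaller read buffer is selected automatically when the BGZF header is detected. A minimal usage sketch (the file name here is hypothetical):

    from isal import igzip

    # Opening works exactly as for any gzip file; BGZF detection happens internally.
    with igzip.open("reads.fastq.bgzip.gz", "rb") as fh:
        data = fh.read()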

src/isal/igzip.py

Lines changed: 26 additions & 1 deletion
@@ -220,6 +220,22 @@ def write(self, data):
         return length
 
 
+def detect_bgzip(header: bytes) -> bool:
+    if len(header) < 18:
+        return False
+    magic, method, flags, mtime, xfl, os, xlen, si1, si2, slen, bsize = \
+        struct.unpack("<HBBIBBHBBHH", header[:18])
+    return (
+        method == 8 and  # Deflate method used
+        flags & 4 and  # There are extra fields
+        xlen == 6 and  # The extra field should be of length 6
+        si1 == 66 and  # BGZIP magic number one
+        si2 == 67 and  # BGZIP magic number two
+        slen == 2  # The length of the 16 bit integer that stores
+                   # the size of the block
+    )
+
+
 class _PaddedFile(gzip._PaddedFile):
     # Overwrite _PaddedFile from gzip as its prepend method assumes that
     # the prepended data is always read from its _buffer. Unfortunately in
@@ -249,6 +265,15 @@ def __init__(self, fp):
         # Set flag indicating start of a new member
         self._new_member = True
         self._last_mtime = None
+        self._read_buffer_size = READ_BUFFER_SIZE
+        if hasattr(fp, "peek") and detect_bgzip(fp.peek(18)):
+            # bgzip consists of puny little blocks of max 64K uncompressed data
+            # so in practice probably more around 16K in compressed size. A
+            # 128K buffer is a massive overshoot and slows down the
+            # decompression.
+            # bgzip stores the block size, so it can be unpacked more
+            # efficiently but this is outside scope for python-isal.
+            self._read_buffer_size = 16 * 1024
 
     def read(self, size=-1):
         if size < 0:
@@ -282,7 +307,7 @@ def read(self, size=-1):
 
             # Read a chunk of data from the file
             if self._decompressor.needs_input:
-                buf = self._fp.read(READ_BUFFER_SIZE)
+                buf = self._fp.read(self._read_buffer_size)
                 uncompress = self._decompressor.decompress(buf, size)
             else:
                 uncompress = self._decompressor.decompress(b"", size)
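
For reference, detect_bgzip relies on the BGZF convention that every block starts with a fixed-layout 18-byte gzip header whose FEXTRA field carries a "BC" subfield holding the block size. The sketch below is not part of the commit: it assumes detect_bgzip is importable from isal.igzip after this change, and make_bgzf_header / read_one_bgzf_block are illustrative helpers. It builds a synthetic header with the same "<HBBIBBHBBHH" layout and also shows how the BSIZE field could, in principle, be used to pull out one whole block at a time, the optimisation the code comment leaves out of scope.

    import gzip
    import struct

    from isal.igzip import detect_bgzip  # assumed importable after this change

    def make_bgzf_header(block_size: int) -> bytes:
        # Synthetic 18-byte BGZF header, same struct layout as detect_bgzip uses.
        return struct.pack(
            "<HBBIBBHBBHH",
            0x8b1f,          # gzip magic bytes 1f 8b, read as little-endian 16-bit
            8,               # CM: deflate
            4,               # FLG: FEXTRA set
            0,               # MTIME
            0,               # XFL
            255,             # OS: unknown
            6,               # XLEN: extra field is 6 bytes
            66, 67,          # SI1, SI2: "B", "C"
            2,               # SLEN: subfield payload is one 16-bit integer
            block_size - 1,  # BSIZE: total block size minus one
        )

    assert detect_bgzip(make_bgzf_header(16 * 1024))
    assert not detect_bgzip(b"\x1f\x8b\x08\x00")  # plain gzip header: too short, no BC subfield

    def read_one_bgzf_block(fp) -> bytes:
        # BSIZE gives the total block length minus one, so a reader can fetch
        # exactly one complete gzip member and decompress it in a single call.
        header = fp.read(18)
        if not detect_bgzip(header):
            raise ValueError("not a BGZF block")
        bsize = struct.unpack("<H", header[16:18])[0]
        return gzip.decompress(header + fp.read(bsize + 1 - 18))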

tests/data/test.fastq.bgzip.gz

1.48 MB (binary file not shown)

tests/test_igzip.py

Lines changed: 10 additions & 0 deletions
@@ -443,3 +443,13 @@ def test_concatenated_gzip():
     with igzip.open(concat, "rb") as igzip_h:
         result = igzip_h.read()
     assert data == result
+
+
+def test_bgzip():
+    bgzip_file = Path(__file__).parent / "data" / "test.fastq.bgzip.gz"
+    gzip_file = Path(__file__).parent / "data" / "test.fastq.gz"
+    with igzip.open(bgzip_file, "rb") as bgz:
+        bgz_data = bgz.read()
+    with igzip.open(gzip_file, "rb") as gz:
+        gz_data = gz.read()
+    assert bgz_data == gz_data
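
Because each BGZF block is itself a complete gzip member, the standard library can read the new test file as well, which gives an independent way to sanity-check the test data outside python-isal. A minimal sketch, reusing the data paths from the test above:

    import gzip
    from pathlib import Path

    data_dir = Path("tests") / "data"  # same data directory the test uses
    with gzip.open(data_dir / "test.fastq.bgzip.gz", "rb") as fh:
        stdlib_data = fh.read()
    with gzip.open(data_dir / "test.fastq.gz", "rb") as fh:
        assert stdlib_data == fh.read()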
