Merge pull request #151 from pycompression/cgzipmodule

rhpvorderman · web-flow · commit a75d160df60f · 2023-10-04T07:00:18.000+02:00
Rewrite _IGzipReader in C for even less overhead.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -19,10 +19,10 @@ jobs:
       - uses: actions/checkout@v2.3.4
         with:
           submodules: recursive
-      - name: Set up Python 3.7
+      - name: Set up Python 3.8
         uses: actions/setup-python@v2.2.1
         with:
-          python-version: 3.7
+          python-version: 3.8
       - name: Install tox
         run: pip install tox
       - name: Lint
@@ -39,10 +39,10 @@ jobs:
       - uses: actions/checkout@v2.3.4
         with:
           submodules: recursive
-      - name: Set up Python 3.7
+      - name: Set up Python 3.8
         uses: actions/setup-python@v2.2.1
         with:
-          python-version: 3.7
+          python-version: 3.8
       - name: Install isal
         run: sudo apt-get install libisal-dev
       - name: Install tox and upgrade setuptools and pip
@@ -57,20 +57,18 @@ jobs:
     strategy:
       matrix:
         python-version:
-          - "3.7"
           - "3.8"
           - "3.9"
           - "3.10"
           - "3.11"
-          - "pypy-3.7"
           - "pypy-3.8"
           - "pypy-3.9"
         os: ["ubuntu-latest"]
         include:
           - os: "macos-latest"
-            python-version: 3.7
+            python-version: 3.8
           - os: "windows-latest"
-            python-version: 3.7
+            python-version: 3.8
     steps:
       - uses: actions/checkout@v2.3.4
         with:
@@ -106,7 +104,7 @@ jobs:
     strategy:
       matrix:
         python_version:
-          - "3.7"
+          - "3.8"
     steps:
       - uses: actions/checkout@v2.3.4
         with:
diff --git a/benchmark_scripts/benchmark_cgzipreader.py b/benchmark_scripts/benchmark_cgzipreader.py
@@ -0,0 +1,11 @@
+import sys
+
+from isal.isal_zlib import _GzipReader
+
+if __name__ == "__main__":
+    with open(sys.argv[1], "rb") as f:
+        reader = _GzipReader(f, 512 * 1024)
+        while True:
+            block = reader.read(128 * 1024)
+            if not block:
+                break
diff --git a/setup.cfg b/setup.cfg
@@ -1,2 +1,2 @@
 [metadata]
-license_file=LICENSE
+license_files=LICENSE
diff --git a/setup.py b/setup.py
@@ -158,7 +158,6 @@ def build_isa_l():
     classifiers=[
         "Programming Language :: Python :: 3 :: Only",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
@@ -173,6 +172,6 @@ def build_isa_l():
         "Operating System :: MacOS",
         "Operating System :: Microsoft :: Windows",
     ],
-    python_requires=">=3.7",  # We use METH_FASTCALL
+    python_requires=">=3.8",  # BadGzipFile imported
     ext_modules=EXTENSIONS
 )
diff --git a/src/isal/igzip.py b/src/isal/igzip.py
@@ -36,9 +36,9 @@
 import sys
 import time
 from typing import Optional, SupportsInt
-import _compression  # noqa: I201  # Not third-party
 
 from . import igzip_lib, isal_zlib
+from .isal_zlib import _GzipReader
 
 __all__ = ["IGzipFile", "open", "compress", "decompress", "BadGzipFile",
            "READ_BUFFER_SIZE"]
@@ -47,19 +47,16 @@
 _COMPRESS_LEVEL_TRADEOFF = isal_zlib.ISAL_DEFAULT_COMPRESSION
 _COMPRESS_LEVEL_BEST = isal_zlib.ISAL_BEST_COMPRESSION
 
-#: The amount of data that is read in at once when decompressing a file.
-#: Increasing this value may increase performance.
-#: 128K is also the size used by pigz and cat to read files from the
-# filesystem.
-READ_BUFFER_SIZE = 128 * 1024
+# The amount of data that is read in at once when decompressing a file.
+# Increasing this value may increase performance.
+# After 512K the performance does not increase anymore on a Ryzen 5 3600 test
+# system.
+READ_BUFFER_SIZE = 512 * 1024
 
 FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
 READ, WRITE = 1, 2
 
-try:
-    BadGzipFile = gzip.BadGzipFile  # type: ignore
-except AttributeError:  # Versions lower than 3.8 do not have BadGzipFile
-    BadGzipFile = OSError  # type: ignore
+BadGzipFile = gzip.BadGzipFile  # type: ignore
 
 
 # The open method was copied from the CPython source with minor adjustments.
@@ -166,7 +163,7 @@ def __init__(self, filename=None, mode=None,
                                                   isal_zlib.DEF_MEM_LEVEL,
                                                   0)
         if self.mode == READ:
-            raw = _IGzipReader(self.fileobj)
+            raw = _GzipReader(self.fileobj, READ_BUFFER_SIZE)
             self._buffer = io.BufferedReader(raw)
 
     def __repr__(self):
@@ -220,186 +217,9 @@ def write(self, data):
         return length
 
 
-def _read_exact(fp, n):
-    '''Read exactly *n* bytes from `fp`
-
-    This method is required because fp may be unbuffered,
-    i.e. return short reads.
-    '''
-    data = fp.read(n)
-    while len(data) < n:
-        b = fp.read(n - len(data))
-        if not b:
-            raise EOFError("Compressed file ended before the "
-                           "end-of-stream marker was reached")
-        data += b
-    return data
-
-
-def _read_gzip_header(fp):
-    '''Read a gzip header from `fp` and progress to the end of the header.
-
-    Returns None if header not present. Parses mtime from the header, looks
-    for BGZF format blocks and parses the block size, setting it to None if
-    not present. Returns a tuple of mtime, block_size if a header was present.
-    '''
-    # Do not use read_exact because a header may not be present. Read twice
-    # since fp might be unbuffered.
-    magic = fp.read(1) + fp.read(1)
-    if magic == b'':
-        return None
-
-    if magic != b'\037\213':
-        raise BadGzipFile('Not a gzipped file (%r)' % magic)
-
-    common_fields = _read_exact(fp, 8)
-    (method, flag, last_mtime) = struct.unpack("<BBIxx", common_fields)
-    if method != 8:
-        raise BadGzipFile('Unknown compression method')
-    block_size = None
-    if not flag:  # Likely when data compressed in memory
-        return last_mtime, block_size
-    header = magic + common_fields
-    if flag & FEXTRA:
-        # Read & discard the extra field, if present
-        encoded_length = _read_exact(fp, 2)
-        extra_len, = struct.unpack("<H", encoded_length)
-        extra_field = _read_exact(fp, extra_len)
-        # Bgzip file detection
-        if extra_len == 6:
-            s1, s2, slen, bsize = struct.unpack("<BBHH", extra_field)
-            if s1 == 66 and s2 == 67 and slen == 2:
-                # Bgzip magic and correct slen.
-                block_size = bsize
-        header = header + encoded_length + extra_field
-    if flag & FNAME:
-        # Read and discard a null-terminated string containing the filename
-        while True:
-            s = _read_exact(fp, 1)
-            header += s
-            if s == b'\000':
-                break
-    if flag & FCOMMENT:
-        # Read and discard a null-terminated string containing a comment
-        while True:
-            s = _read_exact(fp, 1)
-            header += s
-            if s == b'\000':
-                break
-    if flag & FHCRC:
-        header_crc_encoded = _read_exact(fp, 2)
-        header_crc, = struct.unpack("<H", header_crc_encoded)
-        crc = isal_zlib.crc32(header) & 0xFFFF
-        if header_crc != crc:
-            raise BadGzipFile(f"Corrupted gzip header. Checksums do not "
-                              f"match: {crc:04x} != {header_crc:04x}")
-    return last_mtime, block_size
-
-
-class _PaddedFile(gzip._PaddedFile):
-    # Overwrite _PaddedFile from gzip as its prepend method assumes that
-    # the prepended data is always read from its _buffer. Unfortunately in
-    # isal_zlib.decompressobj there is a bitbuffer as well which may be added.
-    # So an extra check is added to prepend to ensure no extra data in front
-    # of the buffer was present. (Negative self._read).
-    def prepend(self, prepend=b''):
-        if self._read is not None:
-            # Assume data was read since the last prepend() call
-            self._read -= len(prepend)
-            if self._read >= 0:
-                return
-            # If self._read is negative the data was read further back and
-            # the buffer needs to be reset.
-        self._buffer = prepend
-        self._length = len(self._buffer)
-        self._read = 0
-
-
-class _IGzipReader(gzip._GzipReader):
-    def __init__(self, fp):
-        # Call the init method of gzip._GzipReader's parent here.
-        # It is not very invasive and allows us to override _PaddedFile
-        _compression.DecompressReader.__init__(
-            self, _PaddedFile(fp), igzip_lib.IgzipDecompressor,
-            hist_bits=igzip_lib.MAX_HIST_BITS, flag=igzip_lib.DECOMP_DEFLATE)
-        # Set flag indicating start of a new member
-        self._new_member = True
-        self._last_mtime = None
-        self._read_buffer_size = READ_BUFFER_SIZE
-
-    def _read_gzip_header(self):
-        header_info = _read_gzip_header(self._fp)
-        if header_info is None:
-            return False
-        # Get the BGZF block size from the header if present. If the read
-        # buffer size is set to exactly the block size, there will be less
-        # overhead as reading the file will stop right before the gzip trailer.
-        # On normal gzip files nothing happens and this optimization is not
-        # detrimental.
-        last_mtime, block_size = header_info
-        self._last_mtime = last_mtime
-        self._read_buffer_size = (block_size if block_size is not None
-                                  else READ_BUFFER_SIZE)
-        return True
-
-    def read(self, size=-1):
-        if size < 0:
-            return self.readall()
-        # size=0 is special because decompress(max_length=0) is not supported
-        if not size:
-            return b""
-
-        # For certain input data, a single
-        # call to decompress() may not return
-        # any data. In this case, retry until we get some data or reach EOF.
-        while True:
-            if self._decompressor.eof:
-                # Ending case: we've come to the end of a member in the file,
-                # so finish up this member, and read a new gzip header.
-                # Check the CRC and file size, and set the flag so we read
-                # a new member
-                self._read_eof()
-                self._new_member = True
-                self._decompressor = self._decomp_factory(
-                    **self._decomp_args)
-
-            if self._new_member:
-                # If the _new_member flag is set, we have to
-                # jump to the next member, if there is one.
-                self._crc = isal_zlib.crc32(b"")
-                # Decompressed size of unconcatenated stream
-                self._stream_size = 0
-                if not self._read_gzip_header():
-                    self._size = self._pos
-                    return b""
-                self._new_member = False
-
-            # Read a chunk of data from the file
-            if self._decompressor.needs_input:
-                buf = self._fp.read(self._read_buffer_size)
-                uncompress = self._decompressor.decompress(buf, size)
-            else:
-                uncompress = self._decompressor.decompress(b"", size)
-            if self._decompressor.unused_data != b"":
-                # Prepend the already read bytes to the fileobj so they can
-                # be seen by _read_eof() and _read_gzip_header()
-                self._fp.prepend(self._decompressor.unused_data)
-
-            if uncompress != b"":
-                break
-            if buf == b"":
-                raise EOFError("Compressed file ended before the "
-                               "end-of-stream marker was reached")
-
-        self._crc = isal_zlib.crc32(uncompress, self._crc)
-        self._stream_size += len(uncompress)
-        self._pos += len(uncompress)
-        return uncompress
-
-
 # Aliases for improved compatibility with CPython gzip module.
 GzipFile = IGzipFile
-_GzipReader = _IGzipReader
+_IGzipReader = _GzipReader
 
 
 def _create_simple_gzip_header(compresslevel: int,
@@ -440,28 +260,9 @@ def decompress(data):
     isal_zlib.decompress(data, wbits=31) is faster in cases where only one
     gzip member is guaranteed to be present.
     """
-    decompressed_members = []
-    while True:
-        if not data:  # Empty data returns empty bytestring
-            return b"".join(decompressed_members)
-        fp = io.BytesIO(data)
-        if _read_gzip_header(fp) is None:
-            return b"".join(decompressed_members)
-        header_end = fp.tell()
-        # Use a zlib raw deflate compressor
-        do = isal_zlib.decompressobj(wbits=-isal_zlib.MAX_WBITS)
-        # Read all the data except the header
-        decompressed = do.decompress(data[header_end:])
-        if not do.eof or len(do.unused_data) < 8:
-            raise EOFError("Compressed file ended before the end-of-stream "
-                           "marker was reached")
-        crc, length = struct.unpack("<II", do.unused_data[:8])
-        if crc != isal_zlib.crc32(decompressed):
-            raise BadGzipFile("CRC check failed")
-        if length != (len(decompressed) & 0xffffffff):
-            raise BadGzipFile("Incorrect length of data produced")
-        decompressed_members.append(decompressed)
-        data = do.unused_data[8:].lstrip(b"\x00")
+    fp = io.BytesIO(data)
+    reader = _GzipReader(fp, max(len(data), 16))
+    return reader.readall()
 
 
 def _argument_parser():
@@ -563,8 +364,6 @@ def main():
         else:
             out_file = sys.stdout.buffer
 
-    global READ_BUFFER_SIZE
-    READ_BUFFER_SIZE = args.buffer_size
     try:
         while True:
             block = in_file.read(args.buffer_size)
diff --git a/src/isal/isal_zlib.pyi b/src/isal/isal_zlib.pyi
@@ -5,6 +5,8 @@
 # This file is part of python-isal which is distributed under the
 # PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2.
 
+import typing
+
 ISAL_BEST_SPEED: int
 ISAL_BEST_COMPRESSION: int
 ISAL_DEFAULT_COMPRESSION: int
@@ -61,3 +63,16 @@ def compressobj(level: int = ISAL_DEFAULT_COMPRESSION,
                 strategy: int = Z_DEFAULT_STRATEGY,
                 zdict = None) -> Compress: ...
 def decompressobj(wbits: int = MAX_WBITS, zdict = None) -> Decompress: ...
+
+class _GzipReader:
+    def __init__(self, fp: typing.BinaryIO, buffersize: int = 32 * 1024): ...
+    def readinto(self, obj) -> int: ...
+    def readable(self) -> bool: ...
+    def writable(self) -> bool: ...
+    def seekable(self) -> bool: ...
+    def tell(self) -> int: ...
+    def seek(self, offset: int, whence: int): ...
+    def close(self): ...
+    def readall(self) -> bytes: ...
+    def read(self, __size: int): ...
+    def flush(self): ...
diff --git a/src/isal/isal_zlibmodule.c b/src/isal/isal_zlibmodule.c
diff --git a/tests/test_gzip_compliance.py b/tests/test_gzip_compliance.py
diff --git a/tests/test_igzip.py b/tests/test_igzip.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1,2 +1,2 @@` |
| `1` | `1` | `[metadata]` |
| `2` | | `-license_file=LICENSE` |
| | `2` | `+license_files=LICENSE` |