Make code more in line with CPython header reading

rhpvorderman · rhpvorderman · commit d6f00a5dd7a8 · 2023-09-20T08:55:01.000+02:00
diff --git a/src/isal/igzip.py b/src/isal/igzip.py
@@ -236,6 +236,71 @@ def detect_bgzip(header: bytes) -> bool:
     )
 
 
+def _read_exact(fp, n):
+    '''Read exactly *n* bytes from `fp`
+
+    This function is required because fp may be unbuffered,
+    i.e. a single read() call may return fewer than *n* bytes.
+    '''
+    data = fp.read(n)
+    while len(data) < n:
+        b = fp.read(n - len(data))
+        if not b:
+            raise EOFError("Compressed file ended before the "
+                           "end-of-stream marker was reached")
+        data += b
+    return data
+
+
+def _read_gzip_header(fp):
+    '''Read a gzip header from `fp` and progress to the end of the header.
+
+    Returns the header's mtime, or None if `fp` yields no more data.
+    '''
+    # Do not use _read_exact because a header may not be present. Read
+    # twice since fp might be unbuffered and return a short read.
+    magic = fp.read(1) + fp.read(1)
+    if magic == b'':
+        return None
+
+    if magic != b'\037\213':
+        raise BadGzipFile('Not a gzipped file (%r)' % magic)
+
+    common_fields = _read_exact(fp, 8)
+    (method, flag, last_mtime) = struct.unpack("<BBIxx", common_fields)
+    if method != 8:
+        raise BadGzipFile('Unknown compression method')
+    header = magic + common_fields
+    if flag & FEXTRA:
+        # Read the extra field; it is kept only for the header CRC check
+        encoded_length = _read_exact(fp, 2)
+        extra_len, = struct.unpack("<H", encoded_length)
+        extra_field = _read_exact(fp, extra_len)
+        header = header + encoded_length + extra_field
+    if flag & FNAME:
+        # Read the null-terminated filename; kept only for the header CRC
+        while True:
+            s = _read_exact(fp, 1)
+            header += s
+            if s == b'\000':
+                break
+    if flag & FCOMMENT:
+        # Read the null-terminated comment; kept only for the header CRC
+        while True:
+            s = _read_exact(fp, 1)
+            header += s
+            if s == b'\000':
+                break
+    if flag & FHCRC:
+        header_crc_encoded = _read_exact(fp, 2)
+        header_crc, = struct.unpack("<H", header_crc_encoded)
+        crc = isal_zlib.crc32(header) & 0xFFFF
+        if header_crc != crc:
+            raise BadGzipFile(f"Corrupted gzip header. Checksums do not "
+                              f"match: {crc:04x} != {header_crc:04x}")
+    return last_mtime
+
+
 class _PaddedFile(gzip._PaddedFile):
     # Overwrite _PaddedFile from gzip as its prepend method assumes that
     # the prepended data is always read from its _buffer. Unfortunately in
@@ -275,6 +340,13 @@ def __init__(self, fp):
             # efficiently but this is outside scope for python-isal.
             self._read_buffer_size = 16 * 1024
 
+    def _read_gzip_header(self):
+        last_mtime = _read_gzip_header(self._fp)
+        if last_mtime is None:
+            return False
+        self._last_mtime = last_mtime
+        return True
+
     def read(self, size=-1):
         if size < 0:
             return self.readall()
@@ -300,7 +372,8 @@ def read(self, size=-1):
                 # If the _new_member flag is set, we have to
                 # jump to the next member, if there is one.
                 self._crc = isal_zlib.crc32(b"")
-                self._stream_size = 0  # Decompressed size of unconcatenated stream
+                # Decompressed size of unconcatenated stream
+                self._stream_size = 0
                 if not self._read_gzip_header():
                     self._size = self._pos
                     return b""
@@ -364,52 +437,6 @@ def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
     return header + compressed
 
 
-def _gzip_header_end(data: bytes) -> int:
-    """
-    Find the start of the raw deflate block in a gzip file.
-    :param data: Compressed data that starts with a gzip header.
-    :return: The end of the header / start of the raw deflate block.
-    """
-    eof_error = EOFError("Compressed file ended before the end-of-stream "
-                         "marker was reached")
-    if len(data) < 10:
-        raise eof_error
-    # We are not interested in mtime, xfl and os flags.
-    magic, method, flags = struct.unpack("<HBB", data[:4])
-    if magic != 0x8b1f:
-        raise BadGzipFile(f"Not a gzipped file ({repr(data[:2])})")
-    if method != 8:
-        raise BadGzipFile("Unknown compression method")
-    if not flags:  # Likely when data compressed in memory
-        return 10
-    pos = 10
-    if flags & FEXTRA:
-        if len(data) < pos + 2:
-            raise eof_error
-        xlen, = struct.unpack("<H", data[pos: pos+2])
-        pos += 2 + xlen
-    if flags & FNAME:
-        pos = data.find(b"\x00", pos) + 1
-        # pos will be -1 + 1 when null byte not found.
-        if not pos:
-            raise eof_error
-    if flags & FCOMMENT:
-        pos = data.find(b"\x00", pos) + 1
-        if not pos:
-            raise eof_error
-    if flags & FHCRC:
-        if len(data) < pos + 2:
-            raise eof_error
-        header_crc, = struct.unpack("<H", data[pos: pos+2])
-        # CRC is stored as a 16-bit integer by taking last bits of crc32.
-        crc = isal_zlib.crc32(data[:pos]) & 0xFFFF
-        if header_crc != crc:
-            raise BadGzipFile(f"Corrupted gzip header. Checksums do not "
-                              f"match: {crc:04x} != {header_crc:04x}")
-        pos += 2
-    return pos
-
-
 def decompress(data):
     """Decompress a gzip compressed string in one shot.
     Return the decompressed string.
@@ -418,7 +445,10 @@ def decompress(data):
     while True:
         if not data:  # Empty data returns empty bytestring
             return b"".join(decompressed_members)
-        header_end = _gzip_header_end(data)
+        fp = io.BytesIO(data)
+        if _read_gzip_header(fp) is None:
+            return b"".join(decompressed_members)
+        header_end = fp.tell()
         # Use a zlib raw deflate compressor
         do = isal_zlib.decompressobj(wbits=-isal_zlib.MAX_WBITS)
         # Read all the data except the header
diff --git a/tests/test_igzip.py b/tests/test_igzip.py
@@ -392,8 +392,10 @@ def headers():
 
 
 @pytest.mark.parametrize("header", list(headers()))
-def test_gzip_header_end(header):
-    assert igzip._gzip_header_end(header) == len(header)
+def test_read_gzip_header_position(header):
+    fp = io.BytesIO(header)
+    igzip._read_gzip_header(fp)
+    assert fp.tell() == len(header)
 
 
 def test_header_too_short():