Skip to content

Commit d6f00a5

Browse files
committed
Make code more in line with CPython header reading
1 parent 49b5b31 commit d6f00a5

File tree

2 files changed

+82
-50
lines changed

2 files changed

+82
-50
lines changed

src/isal/igzip.py

Lines changed: 78 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,71 @@ def detect_bgzip(header: bytes) -> bool:
236236
)
237237

238238

239+
def _read_exact(fp, n):
    """Return exactly *n* bytes read from `fp`.

    A single ``fp.read`` call may return fewer bytes than requested (for
    instance when `fp` is unbuffered), so reading continues until the
    request has been satisfied.

    Raises EOFError if `fp` is exhausted before *n* bytes were collected.
    """
    buf = fp.read(n)
    missing = n - len(buf)
    while missing > 0:
        chunk = fp.read(missing)
        if not chunk:
            # An empty read means end of file: the request cannot be met.
            raise EOFError("Compressed file ended before the "
                           "end-of-stream marker was reached")
        buf += chunk
        missing = n - len(buf)
    return buf
253+
254+
255+
def _read_gzip_header(fp):
    """Consume one gzip member header from `fp`.

    On return `fp` is positioned directly after the header.

    Returns the mtime stored in the header, or None when `fp` yields no
    data at all. Raises BadGzipFile on a wrong magic number, an unknown
    compression method or a header checksum mismatch. A truncated header
    surfaces as the EOFError raised by _read_exact.
    """
    # _read_exact is not suitable for the magic number: running into EOF
    # here simply means no header is present. Two one-byte reads are issued
    # since fp might be unbuffered.
    first_byte = fp.read(1)
    second_byte = fp.read(1)
    magic = first_byte + second_byte
    if magic == b'':
        return None

    if magic != b'\037\213':
        raise BadGzipFile('Not a gzipped file (%r)' % magic)

    fixed_fields = _read_exact(fp, 8)
    method, flag, last_mtime = struct.unpack("<BBIxx", fixed_fields)
    if method != 8:
        raise BadGzipFile('Unknown compression method')

    # Every header byte is retained so that the optional header checksum
    # can be verified at the end.
    pieces = [magic, fixed_fields]

    if flag & FEXTRA:
        # Extra field: a 2-byte little-endian length followed by the payload.
        length_bytes = _read_exact(fp, 2)
        (extra_len,) = struct.unpack("<H", length_bytes)
        pieces.append(length_bytes)
        pieces.append(_read_exact(fp, extra_len))

    # The filename and the comment are both stored as null-terminated
    # strings, in that order, so they are consumed the same way.
    for string_flag in (FNAME, FCOMMENT):
        if flag & string_flag:
            byte = _read_exact(fp, 1)
            pieces.append(byte)
            while byte != b'\000':
                byte = _read_exact(fp, 1)
                pieces.append(byte)

    if flag & FHCRC:
        (header_crc,) = struct.unpack("<H", _read_exact(fp, 2))
        # The stored value is compared to the low 16 bits of the crc32
        # over all header bytes that precede it.
        crc = isal_zlib.crc32(b"".join(pieces)) & 0xFFFF
        if header_crc != crc:
            raise BadGzipFile(f"Corrupted gzip header. Checksums do not "
                              f"match: {crc:04x} != {header_crc:04x}")
    return last_mtime
302+
303+
239304
class _PaddedFile(gzip._PaddedFile):
240305
# Overwrite _PaddedFile from gzip as its prepend method assumes that
241306
# the prepended data is always read from its _buffer. Unfortunately in
@@ -275,6 +340,13 @@ def __init__(self, fp):
275340
# efficiently but this is outside scope for python-isal.
276341
self._read_buffer_size = 16 * 1024
277342

343+
def _read_gzip_header(self):
    """Read the next member header from the underlying file object.

    Delegates to the module-level _read_gzip_header. When a header was
    found its mtime is stored in self._last_mtime and True is returned;
    otherwise False is returned and self._last_mtime is left untouched.
    """
    mtime = _read_gzip_header(self._fp)
    header_found = mtime is not None
    if header_found:
        self._last_mtime = mtime
    return header_found
349+
278350
def read(self, size=-1):
279351
if size < 0:
280352
return self.readall()
@@ -300,7 +372,8 @@ def read(self, size=-1):
300372
# If the _new_member flag is set, we have to
301373
# jump to the next member, if there is one.
302374
self._crc = isal_zlib.crc32(b"")
303-
self._stream_size = 0 # Decompressed size of unconcatenated stream
375+
# Decompressed size of unconcatenated stream
376+
self._stream_size = 0
304377
if not self._read_gzip_header():
305378
self._size = self._pos
306379
return b""
@@ -364,52 +437,6 @@ def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
364437
return header + compressed
365438

366439

367-
def _gzip_header_end(data: bytes) -> int:
    """
    Find the start of the raw deflate block in a gzip file.

    :param data: Compressed data that starts with a gzip header.
    :return: The end of the header / start of the raw deflate block.
    :raises EOFError: When data ends before the header is complete. This
                      includes an extra field that is shorter than its
                      declared length.
    :raises BadGzipFile: On a wrong magic number, an unknown compression
                         method or a header checksum mismatch.
    """
    eof_message = ("Compressed file ended before the end-of-stream "
                   "marker was reached")
    if len(data) < 10:
        raise EOFError(eof_message)
    # We are not interested in mtime, xfl and os flags.
    magic, method, flags = struct.unpack("<HBB", data[:4])
    if magic != 0x8b1f:
        raise BadGzipFile(f"Not a gzipped file ({repr(data[:2])})")
    if method != 8:
        raise BadGzipFile("Unknown compression method")
    pos = 10
    if not flags:  # Likely when data compressed in memory
        return pos
    if flags & FEXTRA:
        if len(data) < pos + 2:
            raise EOFError(eof_message)
        xlen, = struct.unpack("<H", data[pos: pos+2])
        pos += 2 + xlen
        # The declared extra field must be entirely present. Without this
        # check a truncated extra field would yield an offset past the end
        # of data.
        if len(data) < pos:
            raise EOFError(eof_message)
    if flags & FNAME:
        pos = data.find(b"\x00", pos) + 1
        # pos will be -1 + 1 when null byte not found.
        if not pos:
            raise EOFError(eof_message)
    if flags & FCOMMENT:
        pos = data.find(b"\x00", pos) + 1
        if not pos:
            raise EOFError(eof_message)
    if flags & FHCRC:
        if len(data) < pos + 2:
            raise EOFError(eof_message)
        header_crc, = struct.unpack("<H", data[pos: pos+2])
        # CRC is stored as a 16-bit integer by taking last bits of crc32.
        crc = isal_zlib.crc32(data[:pos]) & 0xFFFF
        if header_crc != crc:
            raise BadGzipFile(f"Corrupted gzip header. Checksums do not "
                              f"match: {crc:04x} != {header_crc:04x}")
        pos += 2
    return pos
411-
412-
413440
def decompress(data):
414441
"""Decompress a gzip compressed string in one shot.
415442
Return the decompressed string.
@@ -418,7 +445,10 @@ def decompress(data):
418445
while True:
419446
if not data: # Empty data returns empty bytestring
420447
return b"".join(decompressed_members)
421-
header_end = _gzip_header_end(data)
448+
fp = io.BytesIO(data)
449+
if _read_gzip_header(fp) is None:
450+
return b"".join(decompressed_members)
451+
header_end = fp.tell()
422452
# Use a zlib raw deflate compressor
423453
do = isal_zlib.decompressobj(wbits=-isal_zlib.MAX_WBITS)
424454
# Read all the data except the header

tests/test_igzip.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -392,8 +392,10 @@ def headers():
392392

393393

394394
@pytest.mark.parametrize("header", list(headers()))
395-
def test_gzip_header_end(header):
    """The reported header end must equal the length of a bare header."""
    header_end = igzip._gzip_header_end(header)
    assert header_end == len(header)
395+
def test_read_gzip_header_position(header):
    """Reading a bare header must leave the stream exactly at its end."""
    stream = io.BytesIO(header)
    igzip._read_gzip_header(stream)
    position = stream.tell()
    assert position == len(header)
397399

398400

399401
def test_header_too_short():

0 commit comments

Comments
 (0)