Skip to content

Commit ecf9117

Browse files
authored
Merge pull request #148 from pycompression/bgzipperformance
Better BGZF format streaming performance.
2 parents 92e4cf2 + 45c0a9d commit ecf9117

File tree

3 files changed

+115
-72
lines changed

3 files changed

+115
-72
lines changed

CHANGELOG.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,17 @@ Changelog
77
.. This document is user facing. Please word the changes in such a way
88
.. that users understand how the changes affect the new version.
99
10+
version 1.3.0-dev
11+
-----------------
12+
+ Gzip headers are now actively checked for a BGZF extra field. If found, the
13+
block size is taken into account when decompressing. This has further
14+
improved bgzf decompression speed by 5% on some files compared to the
15+
more generic solution of 1.2.0.
16+
+ Integrated CPython 3.11 code for reading gzip headers. This leads to more
17+
commonality between the python-isal code and the upstream gzip.py code.
18+
This has enabled the change above. It comes at the cost of a slight increase
19+
in overhead at the ``gzip.decompress`` function.
20+
1021
version 1.2.0
1122
-----------------
1223
+ Bgzip files are now detected and a smaller reading buffer is used to

src/isal/igzip.py

Lines changed: 100 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -220,20 +220,80 @@ def write(self, data):
220220
return length
221221

222222

223-
def detect_bgzip(header: bytes) -> bool:
224-
if len(header) < 18:
225-
return False
226-
magic, method, flags, mtime, xfl, os, xlen, si1, si2, slen, bsize = \
227-
struct.unpack("<HBBIBBHBBHH", header[:18])
228-
return (
229-
method == 8 and # Deflate method used
230-
flags & 4 and # There are extra fields
231-
xlen == 6 and # The extra field should be of length 6
232-
si1 == 66 and # BGZIP magic number one
233-
si2 == 67 and # BGZIP magic number two
234-
slen == 2 # The length of the 16 bit integer that stores
235-
# the size of the block
236-
)
223+
def _read_exact(fp, n):
224+
'''Read exactly *n* bytes from `fp`
225+
226+
This method is required because fp may be unbuffered,
227+
i.e. return short reads.
228+
'''
229+
data = fp.read(n)
230+
while len(data) < n:
231+
b = fp.read(n - len(data))
232+
if not b:
233+
raise EOFError("Compressed file ended before the "
234+
"end-of-stream marker was reached")
235+
data += b
236+
return data
237+
238+
239+
def _read_gzip_header(fp):
240+
'''Read a gzip header from `fp` and progress to the end of the header.
241+
242+
Returns None if header not present. Parses mtime from the header, looks
243+
for BGZF format blocks and parses the block size, setting it to None if
244+
not present. Returns a tuple of mtime, block_size if a header was present.
245+
'''
246+
# Do not use read_exact because a header may not be present. Read twice
247+
# since fp might be unbuffered.
248+
magic = fp.read(1) + fp.read(1)
249+
if magic == b'':
250+
return None
251+
252+
if magic != b'\037\213':
253+
raise BadGzipFile('Not a gzipped file (%r)' % magic)
254+
255+
common_fields = _read_exact(fp, 8)
256+
(method, flag, last_mtime) = struct.unpack("<BBIxx", common_fields)
257+
if method != 8:
258+
raise BadGzipFile('Unknown compression method')
259+
block_size = None
260+
if not flag: # Likely when data compressed in memory
261+
return last_mtime, block_size
262+
header = magic + common_fields
263+
if flag & FEXTRA:
264+
# Read & discard the extra field, if present
265+
encoded_length = _read_exact(fp, 2)
266+
extra_len, = struct.unpack("<H", encoded_length)
267+
extra_field = _read_exact(fp, extra_len)
268+
# Bgzip file detection
269+
if extra_len == 6:
270+
s1, s2, slen, bsize = struct.unpack("<BBHH", extra_field)
271+
if s1 == 66 and s2 == 67 and slen == 2:
272+
# Bgzip magic and correct slen.
273+
block_size = bsize
274+
header = header + encoded_length + extra_field
275+
if flag & FNAME:
276+
# Read and discard a null-terminated string containing the filename
277+
while True:
278+
s = _read_exact(fp, 1)
279+
header += s
280+
if s == b'\000':
281+
break
282+
if flag & FCOMMENT:
283+
# Read and discard a null-terminated string containing a comment
284+
while True:
285+
s = _read_exact(fp, 1)
286+
header += s
287+
if s == b'\000':
288+
break
289+
if flag & FHCRC:
290+
header_crc_encoded = _read_exact(fp, 2)
291+
header_crc, = struct.unpack("<H", header_crc_encoded)
292+
crc = isal_zlib.crc32(header) & 0xFFFF
293+
if header_crc != crc:
294+
raise BadGzipFile(f"Corrupted gzip header. Checksums do not "
295+
f"match: {crc:04x} != {header_crc:04x}")
296+
return last_mtime, block_size
237297

238298

239299
class _PaddedFile(gzip._PaddedFile):
@@ -266,14 +326,21 @@ def __init__(self, fp):
266326
self._new_member = True
267327
self._last_mtime = None
268328
self._read_buffer_size = READ_BUFFER_SIZE
269-
if hasattr(fp, "peek") and detect_bgzip(fp.peek(18)):
270-
# bgzip consists of puny little blocks of max 64K uncompressed data
271-
# so in practice probably more around 16K in compressed size. A
272-
# 128K buffer is a massive overshoot and slows down the
273-
# decompression.
274-
# bgzip stores the block size, so it can be unpacked more
275-
# efficiently but this is outside scope for python-isal.
276-
self._read_buffer_size = 16 * 1024
329+
330+
def _read_gzip_header(self):
331+
header_info = _read_gzip_header(self._fp)
332+
if header_info is None:
333+
return False
334+
# Get the BGZF block size from the header if present. If the read
335+
# buffer size is set to exactly the block size, there will be less
336+
# overhead as reading the file will stop right before the gzip trailer.
337+
# On normal gzip files nothing happens and this optimization is not
338+
# detrimental.
339+
last_mtime, block_size = header_info
340+
self._last_mtime = last_mtime
341+
self._read_buffer_size = (block_size if block_size is not None
342+
else READ_BUFFER_SIZE)
343+
return True
277344

278345
def read(self, size=-1):
279346
if size < 0:
@@ -299,7 +366,9 @@ def read(self, size=-1):
299366
if self._new_member:
300367
# If the _new_member flag is set, we have to
301368
# jump to the next member, if there is one.
302-
self._init_read()
369+
self._crc = isal_zlib.crc32(b"")
370+
# Decompressed size of unconcatenated stream
371+
self._stream_size = 0
303372
if not self._read_gzip_header():
304373
self._size = self._pos
305374
return b""
@@ -363,61 +432,22 @@ def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
363432
return header + compressed
364433

365434

366-
def _gzip_header_end(data: bytes) -> int:
367-
"""
368-
Find the start of the raw deflate block in a gzip file.
369-
:param data: Compressed data that starts with a gzip header.
370-
:return: The end of the header / start of the raw deflate block.
371-
"""
372-
eof_error = EOFError("Compressed file ended before the end-of-stream "
373-
"marker was reached")
374-
if len(data) < 10:
375-
raise eof_error
376-
# We are not interested in mtime, xfl and os flags.
377-
magic, method, flags = struct.unpack("<HBB", data[:4])
378-
if magic != 0x8b1f:
379-
raise BadGzipFile(f"Not a gzipped file ({repr(data[:2])})")
380-
if method != 8:
381-
raise BadGzipFile("Unknown compression method")
382-
if not flags: # Likely when data compressed in memory
383-
return 10
384-
pos = 10
385-
if flags & FEXTRA:
386-
if len(data) < pos + 2:
387-
raise eof_error
388-
xlen, = struct.unpack("<H", data[pos: pos+2])
389-
pos += 2 + xlen
390-
if flags & FNAME:
391-
pos = data.find(b"\x00", pos) + 1
392-
# pos will be -1 + 1 when null byte not found.
393-
if not pos:
394-
raise eof_error
395-
if flags & FCOMMENT:
396-
pos = data.find(b"\x00", pos) + 1
397-
if not pos:
398-
raise eof_error
399-
if flags & FHCRC:
400-
if len(data) < pos + 2:
401-
raise eof_error
402-
header_crc, = struct.unpack("<H", data[pos: pos+2])
403-
# CRC is stored as a 16-bit integer by taking last bits of crc32.
404-
crc = isal_zlib.crc32(data[:pos]) & 0xFFFF
405-
if header_crc != crc:
406-
raise BadGzipFile(f"Corrupted gzip header. Checksums do not "
407-
f"match: {crc:04x} != {header_crc:04x}")
408-
pos += 2
409-
return pos
410-
411-
412435
def decompress(data):
413436
"""Decompress a gzip compressed string in one shot.
414437
Return the decompressed string.
438+
439+
This function checks for extra gzip members. Using
440+
isal_zlib.decompress(data, wbits=31) is faster in cases where only one
441+
gzip member is guaranteed to be present.
415442
"""
416443
decompressed_members = []
417444
while True:
418445
if not data: # Empty data returns empty bytestring
419446
return b"".join(decompressed_members)
420-
header_end = _gzip_header_end(data)
447+
fp = io.BytesIO(data)
448+
if _read_gzip_header(fp) is None:
449+
return b"".join(decompressed_members)
450+
header_end = fp.tell()
421451
# Use a zlib raw deflate compressor
422452
do = isal_zlib.decompressobj(wbits=-isal_zlib.MAX_WBITS)
423453
# Read all the data except the header

tests/test_igzip.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -392,8 +392,10 @@ def headers():
392392

393393

394394
@pytest.mark.parametrize("header", list(headers()))
395-
def test_gzip_header_end(header):
396-
assert igzip._gzip_header_end(header) == len(header)
395+
def test_read_gzip_header_position(header):
396+
fp = io.BytesIO(header)
397+
igzip._read_gzip_header(fp)
398+
assert fp.tell() == len(header)
397399

398400

399401
def test_header_too_short():

0 commit comments

Comments
 (0)