Skip to content

Commit b37e5db

Browse files
authored
Merge pull request #61 from pycompression/fixbug
Fix offset error in _PaddedFile caused by concatenated gzips.
2 parents 57a5f15 + ec4e155 commit b37e5db

File tree

4 files changed

+42
-3
lines changed

4 files changed

+42
-3
lines changed

CHANGELOG.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@ Changelog
77
.. This document is user facing. Please word the changes in such a way
88
.. that users understand how the changes affect the new version.
99
10+
version 0.8.1
11+
-----------------
12+
+ Fix a bug where multi-member gzip files were read incorrectly due to an
13+
offset error. This was caused by ISA-L's decompressobj having a small
14+
bitbuffer which was not taken properly into account in some circumstances.
15+
1016
version 0.8.0
1117
-----------------
1218
+ Speed up ``igzip.compress`` and ``igzip.decompress`` by improving the

src/isal/igzip.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import sys
3030
import time
3131
from typing import List, Optional, SupportsInt
32+
import _compression # noqa: I201 # Not third-party
3233

3334
from . import isal_zlib
3435

@@ -204,11 +205,35 @@ def write(self, data):
204205
return length
205206

206207

208+
class _PaddedFile(gzip._PaddedFile):
209+
# Override _PaddedFile from gzip as its prepend method assumes that
210+
# the prepended data is always read from its _buffer. Unfortunately in
211+
# isal_zlib.decompressobj there is a bitbuffer as well which may be added.
212+
# So an extra check is added to prepend to ensure no extra data in front
213+
# of the buffer was present (signalled by a negative self._read).
214+
def prepend(self, prepend=b''):
215+
if self._read is not None:
216+
# Assume data was read since the last prepend() call
217+
self._read -= len(prepend)
218+
if self._read >= 0:
219+
return
220+
# If self._read is negative the data was read further back and
221+
# the buffer needs to be reset.
222+
self._buffer = prepend
223+
self._length = len(self._buffer)
224+
self._read = 0
225+
226+
207227
class _IGzipReader(gzip._GzipReader):
208228
def __init__(self, fp):
209-
super().__init__(fp)
210-
self._decomp_factory = isal_zlib.decompressobj
211-
self._decompressor = self._decomp_factory(**self._decomp_args)
229+
# Call the init method of gzip._GzipReader's parent here.
230+
# It is not very invasive and allows us to override _PaddedFile
231+
_compression.DecompressReader.__init__(
232+
self, _PaddedFile(fp), isal_zlib.decompressobj,
233+
wbits=-isal_zlib.MAX_WBITS)
234+
# Set flag indicating start of a new member
235+
self._new_member = True
236+
self._last_mtime = None
212237

213238
def _add_read_data(self, data):
214239
# Use faster isal crc32 calculation and update the stream size in place

tests/data/concatenated.fastq.gz

90.5 KB
Binary file not shown.

tests/test_igzip.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,3 +288,11 @@ def test_header_corrupt():
288288
def test_truncated_header(trunc):
289289
with pytest.raises(EOFError):
290290
igzip.decompress(trunc)
291+
292+
293+
def test_concatenated_gzip():
294+
concat = Path(__file__).parent / "data" / "concatenated.fastq.gz"
295+
data = gzip.decompress(concat.read_bytes())
296+
with igzip.open(concat, "rb") as igzip_h:
297+
result = igzip_h.read()
298+
assert data == result

0 commit comments

Comments
 (0)