Skip to content

Commit 5c137bb

Browse files
committed
Use _GzipReader and add tests
1 parent 30f3921 commit 5c137bb

File tree

3 files changed

+77
-94
lines changed

3 files changed

+77
-94
lines changed

src/zlib_ng/gzip_ng.py

Lines changed: 10 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import _compression # noqa: I201 # Not third-party
2929

3030
from . import zlib_ng
31+
from .zlib_ng import _GzipReader
3132

3233
__all__ = ["GzipFile", "open", "compress", "decompress", "BadGzipFile",
3334
"READ_BUFFER_SIZE"]
@@ -36,19 +37,14 @@
3637
_COMPRESS_LEVEL_TRADEOFF = zlib_ng.Z_DEFAULT_COMPRESSION
3738
_COMPRESS_LEVEL_BEST = zlib_ng.Z_BEST_COMPRESSION
3839

39-
#: The amount of data that is read in at once when decompressing a file.
40-
#: Increasing this value may increase performance.
41-
#: 128K is also the size used by pigz and cat to read files from the
42-
# filesystem.
43-
READ_BUFFER_SIZE = 128 * 1024
40+
# The amount of data that is read in at once when decompressing a file.
41+
# Increasing this value may increase performance.
42+
READ_BUFFER_SIZE = 512 * 1024
4443

4544
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
4645
READ, WRITE = 1, 2
4746

48-
try:
49-
BadGzipFile = gzip.BadGzipFile # type: ignore
50-
except AttributeError: # Versions lower than 3.8 do not have BadGzipFile
51-
BadGzipFile = OSError # type: ignore
47+
BadGzipFile = gzip.BadGzipFile # type: ignore
5248

5349

5450
# The open function was copied from the CPython source with minor adjustments.
@@ -149,7 +145,7 @@ def __init__(self, filename=None, mode=None,
149145
zlib_ng.DEF_MEM_LEVEL,
150146
0)
151147
if self.mode == READ:
152-
raw = _GzipNGReader(self.fileobj)
148+
raw = _GzipReader(self.fileobj, READ_BUFFER_SIZE)
153149
self._buffer = io.BufferedReader(raw)
154150

155151
def __repr__(self):
@@ -180,73 +176,9 @@ def write(self, data):
180176
return length
181177

182178

183-
class _GzipNGReader(gzip._GzipReader):
184-
def __init__(self, fp):
185-
# Call the init method of gzip._GzipReader's parent here.
186-
# It is not very invasive and allows us to override _PaddedFile
187-
_compression.DecompressReader.__init__(
188-
self, gzip._PaddedFile(fp), zlib_ng._ZlibDecompressor,
189-
wbits=-zlib_ng.MAX_WBITS)
190-
# Set flag indicating start of a new member
191-
self._new_member = True
192-
self._last_mtime = None
193-
194-
def read(self, size=-1):
195-
if size < 0:
196-
return self.readall()
197-
# size=0 is special because decompress(max_length=0) is not supported
198-
if not size:
199-
return b""
200-
201-
# For certain input data, a single
202-
# call to decompress() may not return
203-
# any data. In this case, retry until we get some data or reach EOF.
204-
while True:
205-
if self._decompressor.eof:
206-
# Ending case: we've come to the end of a member in the file,
207-
# so finish up this member, and read a new gzip header.
208-
# Check the CRC and file size, and set the flag so we read
209-
# a new member
210-
self._read_eof()
211-
self._new_member = True
212-
self._decompressor = self._decomp_factory(
213-
**self._decomp_args)
214-
215-
if self._new_member:
216-
# If the _new_member flag is set, we have to
217-
# jump to the next member, if there is one.
218-
self._init_read()
219-
if not self._read_gzip_header():
220-
self._size = self._pos
221-
return b""
222-
self._new_member = False
223-
224-
# Read a chunk of data from the file
225-
if self._decompressor.needs_input:
226-
buf = self._fp.read(READ_BUFFER_SIZE)
227-
uncompress = self._decompressor.decompress(buf, size)
228-
else:
229-
uncompress = self._decompressor.decompress(b"", size)
230-
if self._decompressor.unused_data != b"":
231-
# Prepend the already read bytes to the fileobj so they can
232-
# be seen by _read_eof() and _read_gzip_header()
233-
self._fp.prepend(self._decompressor.unused_data)
234-
235-
if uncompress != b"":
236-
break
237-
if buf == b"":
238-
raise EOFError("Compressed file ended before the "
239-
"end-of-stream marker was reached")
240-
241-
self._crc = zlib_ng.crc32(uncompress, self._crc)
242-
self._stream_size += len(uncompress)
243-
self._pos += len(uncompress)
244-
return uncompress
245-
246-
247179
# Aliases for improved compatibility with CPython gzip module.
248180
GzipFile = GzipNGFile
249-
_GzipReader = _GzipNGReader
181+
_GzipNGReader = _GzipReader
250182

251183

252184
def _read_exact(fp, n):
@@ -342,25 +274,9 @@ def decompress(data):
342274
"""Decompress a gzip compressed string in one shot.
343275
Return the decompressed string.
344276
"""
345-
decompressed_members = []
346-
while True:
347-
fp = io.BytesIO(data)
348-
if _read_gzip_header(fp) is None:
349-
return b"".join(decompressed_members)
350-
# Use a zlib raw deflate compressor
351-
do = zlib_ng.decompressobj(wbits=-zlib_ng.MAX_WBITS)
352-
# Read all the data except the header
353-
decompressed = do.decompress(data[fp.tell():])
354-
if not do.eof or len(do.unused_data) < 8:
355-
raise EOFError("Compressed file ended before the end-of-stream "
356-
"marker was reached")
357-
crc, length = struct.unpack("<II", do.unused_data[:8])
358-
if crc != zlib_ng.crc32(decompressed):
359-
raise BadGzipFile("CRC check failed")
360-
if length != (len(decompressed) & 0xffffffff):
361-
raise BadGzipFile("Incorrect length of data produced")
362-
decompressed_members.append(decompressed)
363-
data = do.unused_data[8:].lstrip(b"\x00")
277+
fp = io.BytesIO(data)
278+
reader = _GzipReader(fp, max(len(data), 16))
279+
return reader.readall()
364280

365281

366282
def _argument_parser():

tests/data/test.fastq.bgzip.gz

1.48 MB
Binary file not shown.

tests/test_gzip_ng.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,9 +373,76 @@ def test_truncated_header(trunc):
373373
gzip_ng.decompress(trunc)
374374

375375

376+
def test_very_long_header_in_data():
377+
# Header with the FNAME flag set and a very long (256 KiB) filename.
378+
header = (b"\x1f\x8b\x08\x08\x00\x00\x00\x00\x00\xff" + 256 * 1024 * b"A" +
379+
b"\x00")
380+
compressed = header + zlib.compress(b"", 3, -15) + 8 * b"\00"
381+
assert gzip_ng.decompress(compressed) == b""
382+
383+
384+
def test_very_long_header_in_file():
385+
# Header with the FNAME flag set and a filename of 2 * READ_BUFFER_SIZE bytes.
386+
header = (b"\x1f\x8b\x08\x08\x00\x00\x00\x00\x00\xff" +
387+
gzip_ng.READ_BUFFER_SIZE * 2 * b"A" +
388+
b"\x00")
389+
compressed = header + zlib.compress(b"", 3, -15) + 8 * b"\00"
390+
f = io.BytesIO(compressed)
391+
with gzip_ng.open(f) as gzip_file:
392+
assert gzip_file.read() == b""
393+
394+
376395
def test_concatenated_gzip():
377396
concat = Path(__file__).parent / "data" / "concatenated.fastq.gz"
378397
data = gzip.decompress(concat.read_bytes())
379398
with gzip_ng.open(concat, "rb") as gzip_ng_h:
380399
result = gzip_ng_h.read()
381400
assert data == result
401+
402+
403+
def test_seek():
404+
from io import SEEK_CUR, SEEK_END, SEEK_SET
405+
with tempfile.NamedTemporaryFile("wb", delete=False) as tmpfile:
406+
tmpfile.write(gzip.compress(b"X" * 500 + b"A" + b"X" * 499))
407+
tmpfile.write(gzip.compress(b"X" * 500 + b"B" + b"X" * 499))
408+
tmpfile.write(gzip.compress(b"X" * 500 + b"C" + b"X" * 499))
409+
tmpfile.write(gzip.compress(b"X" * 500 + b"D" + b"X" * 499))
410+
with gzip_ng.open(tmpfile.name, "rb") as gzip_file:
411+
# Start testing forward seek
412+
gzip_file.seek(500)
413+
assert gzip_file.read(1) == b"A"
414+
gzip_file.seek(1500)
415+
assert gzip_file.read(1) == b"B"
416+
# Test reverse
417+
gzip_file.seek(500)
418+
assert gzip_file.read(1) == b"A"
419+
# Again, but with explicit SEEK_SET
420+
gzip_file.seek(500, SEEK_SET)
421+
assert gzip_file.read(1) == b"A"
422+
gzip_file.seek(1500, SEEK_SET)
423+
assert gzip_file.read(1) == b"B"
424+
gzip_file.seek(500, SEEK_SET)
425+
assert gzip_file.read(1) == b"A"
426+
# Seeking from current position
427+
gzip_file.seek(500)
428+
gzip_file.seek(2000, SEEK_CUR)
429+
assert gzip_file.read(1) == b"C"
430+
gzip_file.seek(-1001, SEEK_CUR)
431+
assert gzip_file.read(1) == b"B"
432+
# Seeking from end
433+
# Any positive offset from SEEK_END should end up at the end of the data.
434+
gzip_file.seek(200, SEEK_END)
435+
assert gzip_file.read(1) == b""
436+
gzip_file.seek(-1500, SEEK_END)
437+
assert gzip_file.read(1) == b"C"
438+
os.remove(tmpfile.name)
439+
440+
441+
def test_bgzip():
442+
bgzip_file = Path(__file__).parent / "data" / "test.fastq.bgzip.gz"
443+
gzip_file = Path(__file__).parent / "data" / "test.fastq.gz"
444+
with gzip_ng.open(bgzip_file, "rb") as bgz:
445+
bgz_data = bgz.read()
446+
with gzip_ng.open(gzip_file, "rb") as gz:
447+
gz_data = gz.read()
448+
assert bgz_data == gz_data

0 commit comments

Comments
 (0)