Skip to content

Commit a75d160

Browse files
authored
Merge pull request #151 from pycompression/cgzipmodule
Rewrite _IGzipReader in C for even less overhead.
2 parents fc6ff1b + 67ea6c8 commit a75d160

File tree

9 files changed

+743
-243
lines changed

9 files changed

+743
-243
lines changed

.github/workflows/ci.yml

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,10 @@ jobs:
1919
- uses: actions/checkout@v2.3.4
2020
with:
2121
submodules: recursive
22-
- name: Set up Python 3.7
22+
- name: Set up Python 3.8
2323
uses: actions/setup-python@v2.2.1
2424
with:
25-
python-version: 3.7
25+
python-version: 3.8
2626
- name: Install tox
2727
run: pip install tox
2828
- name: Lint
@@ -39,10 +39,10 @@ jobs:
3939
- uses: actions/checkout@v2.3.4
4040
with:
4141
submodules: recursive
42-
- name: Set up Python 3.7
42+
- name: Set up Python 3.8
4343
uses: actions/setup-python@v2.2.1
4444
with:
45-
python-version: 3.7
45+
python-version: 3.8
4646
- name: Install isal
4747
run: sudo apt-get install libisal-dev
4848
- name: Install tox and upgrade setuptools and pip
@@ -57,20 +57,18 @@ jobs:
5757
strategy:
5858
matrix:
5959
python-version:
60-
- "3.7"
6160
- "3.8"
6261
- "3.9"
6362
- "3.10"
6463
- "3.11"
65-
- "pypy-3.7"
6664
- "pypy-3.8"
6765
- "pypy-3.9"
6866
os: ["ubuntu-latest"]
6967
include:
7068
- os: "macos-latest"
71-
python-version: 3.7
69+
python-version: 3.8
7270
- os: "windows-latest"
73-
python-version: 3.7
71+
python-version: 3.8
7472
steps:
7573
- uses: actions/checkout@v2.3.4
7674
with:
@@ -106,7 +104,7 @@ jobs:
106104
strategy:
107105
matrix:
108106
python_version:
109-
- "3.7"
107+
- "3.8"
110108
steps:
111109
- uses: actions/checkout@v2.3.4
112110
with:
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
import sys
2+
3+
from isal.isal_zlib import _GzipReader
4+
5+
if __name__ == "__main__":
6+
with open(sys.argv[1], "rb") as f:
7+
reader = _GzipReader(f, 512 * 1024)
8+
while True:
9+
block = reader.read(128 * 1024)
10+
if not block:
11+
break

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
[metadata]
2-
license_file=LICENSE
2+
license_files=LICENSE

setup.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,6 @@ def build_isa_l():
158158
classifiers=[
159159
"Programming Language :: Python :: 3 :: Only",
160160
"Programming Language :: Python :: 3",
161-
"Programming Language :: Python :: 3.7",
162161
"Programming Language :: Python :: 3.8",
163162
"Programming Language :: Python :: 3.9",
164163
"Programming Language :: Python :: 3.10",
@@ -173,6 +172,6 @@ def build_isa_l():
173172
"Operating System :: MacOS",
174173
"Operating System :: Microsoft :: Windows",
175174
],
176-
python_requires=">=3.7", # We use METH_FASTCALL
175+
python_requires=">=3.8", # BadGzipFile imported
177176
ext_modules=EXTENSIONS
178177
)

src/isal/igzip.py

Lines changed: 12 additions & 213 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,9 @@
3636
import sys
3737
import time
3838
from typing import Optional, SupportsInt
39-
import _compression # noqa: I201 # Not third-party
4039

4140
from . import igzip_lib, isal_zlib
41+
from .isal_zlib import _GzipReader
4242

4343
__all__ = ["IGzipFile", "open", "compress", "decompress", "BadGzipFile",
4444
"READ_BUFFER_SIZE"]
@@ -47,19 +47,16 @@
4747
_COMPRESS_LEVEL_TRADEOFF = isal_zlib.ISAL_DEFAULT_COMPRESSION
4848
_COMPRESS_LEVEL_BEST = isal_zlib.ISAL_BEST_COMPRESSION
4949

50-
#: The amount of data that is read in at once when decompressing a file.
51-
#: Increasing this value may increase performance.
52-
#: 128K is also the size used by pigz and cat to read files from the
53-
# filesystem.
54-
READ_BUFFER_SIZE = 128 * 1024
50+
# The amount of data that is read in at once when decompressing a file.
51+
# Increasing this value may increase performance.
52+
# After 512K the performance does not increase anymore on a Ryzen 5 3600 test
53+
# system.
54+
READ_BUFFER_SIZE = 512 * 1024
5555

5656
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
5757
READ, WRITE = 1, 2
5858

59-
try:
60-
BadGzipFile = gzip.BadGzipFile # type: ignore
61-
except AttributeError: # Versions lower than 3.8 do not have BadGzipFile
62-
BadGzipFile = OSError # type: ignore
59+
BadGzipFile = gzip.BadGzipFile # type: ignore
6360

6461

6562
# The open method was copied from the CPython source with minor adjustments.
@@ -166,7 +163,7 @@ def __init__(self, filename=None, mode=None,
166163
isal_zlib.DEF_MEM_LEVEL,
167164
0)
168165
if self.mode == READ:
169-
raw = _IGzipReader(self.fileobj)
166+
raw = _GzipReader(self.fileobj, READ_BUFFER_SIZE)
170167
self._buffer = io.BufferedReader(raw)
171168

172169
def __repr__(self):
@@ -220,186 +217,9 @@ def write(self, data):
220217
return length
221218

222219

223-
def _read_exact(fp, n):
224-
'''Read exactly *n* bytes from `fp`
225-
226-
This method is required because fp may be unbuffered,
227-
i.e. return short reads.
228-
'''
229-
data = fp.read(n)
230-
while len(data) < n:
231-
b = fp.read(n - len(data))
232-
if not b:
233-
raise EOFError("Compressed file ended before the "
234-
"end-of-stream marker was reached")
235-
data += b
236-
return data
237-
238-
239-
def _read_gzip_header(fp):
240-
'''Read a gzip header from `fp` and progress to the end of the header.
241-
242-
Returns None if header not present. Parses mtime from the header, looks
243-
for BGZF format blocks and parses the block size, setting it to None if
244-
not present. Returns a tuple of mtime, block_size if a header was present.
245-
'''
246-
# Do not use read_exact because a header may not be present. Read twice
247-
# since fp might be unbuffered.
248-
magic = fp.read(1) + fp.read(1)
249-
if magic == b'':
250-
return None
251-
252-
if magic != b'\037\213':
253-
raise BadGzipFile('Not a gzipped file (%r)' % magic)
254-
255-
common_fields = _read_exact(fp, 8)
256-
(method, flag, last_mtime) = struct.unpack("<BBIxx", common_fields)
257-
if method != 8:
258-
raise BadGzipFile('Unknown compression method')
259-
block_size = None
260-
if not flag: # Likely when data compressed in memory
261-
return last_mtime, block_size
262-
header = magic + common_fields
263-
if flag & FEXTRA:
264-
# Read & discard the extra field, if present
265-
encoded_length = _read_exact(fp, 2)
266-
extra_len, = struct.unpack("<H", encoded_length)
267-
extra_field = _read_exact(fp, extra_len)
268-
# Bgzip file detection
269-
if extra_len == 6:
270-
s1, s2, slen, bsize = struct.unpack("<BBHH", extra_field)
271-
if s1 == 66 and s2 == 67 and slen == 2:
272-
# Bgzip magic and correct slen.
273-
block_size = bsize
274-
header = header + encoded_length + extra_field
275-
if flag & FNAME:
276-
# Read and discard a null-terminated string containing the filename
277-
while True:
278-
s = _read_exact(fp, 1)
279-
header += s
280-
if s == b'\000':
281-
break
282-
if flag & FCOMMENT:
283-
# Read and discard a null-terminated string containing a comment
284-
while True:
285-
s = _read_exact(fp, 1)
286-
header += s
287-
if s == b'\000':
288-
break
289-
if flag & FHCRC:
290-
header_crc_encoded = _read_exact(fp, 2)
291-
header_crc, = struct.unpack("<H", header_crc_encoded)
292-
crc = isal_zlib.crc32(header) & 0xFFFF
293-
if header_crc != crc:
294-
raise BadGzipFile(f"Corrupted gzip header. Checksums do not "
295-
f"match: {crc:04x} != {header_crc:04x}")
296-
return last_mtime, block_size
297-
298-
299-
class _PaddedFile(gzip._PaddedFile):
300-
# Overwrite _PaddedFile from gzip as its prepend method assumes that
301-
# the prepended data is always read from its _buffer. Unfortunately in
302-
# isal_zlib.decompressobj there is a bitbuffer as well which may be added.
303-
# So an extra check is added to prepend to ensure no extra data in front
304-
# of the buffer was present. (Negative self._read).
305-
def prepend(self, prepend=b''):
306-
if self._read is not None:
307-
# Assume data was read since the last prepend() call
308-
self._read -= len(prepend)
309-
if self._read >= 0:
310-
return
311-
# If self._read is negative the data was read further back and
312-
# the buffer needs to be reset.
313-
self._buffer = prepend
314-
self._length = len(self._buffer)
315-
self._read = 0
316-
317-
318-
class _IGzipReader(gzip._GzipReader):
319-
def __init__(self, fp):
320-
# Call the init method of gzip._GzipReader's parent here.
321-
# It is not very invasive and allows us to override _PaddedFile
322-
_compression.DecompressReader.__init__(
323-
self, _PaddedFile(fp), igzip_lib.IgzipDecompressor,
324-
hist_bits=igzip_lib.MAX_HIST_BITS, flag=igzip_lib.DECOMP_DEFLATE)
325-
# Set flag indicating start of a new member
326-
self._new_member = True
327-
self._last_mtime = None
328-
self._read_buffer_size = READ_BUFFER_SIZE
329-
330-
def _read_gzip_header(self):
331-
header_info = _read_gzip_header(self._fp)
332-
if header_info is None:
333-
return False
334-
# Get the BGZF block size from the header if present. If the read
335-
# buffer size is set to exactly the block size, there will be less
336-
# overhead as reading the file will stop right before the gzip trailer.
337-
# On normal gzip files nothing happens and this optimization is not
338-
# detrimental.
339-
last_mtime, block_size = header_info
340-
self._last_mtime = last_mtime
341-
self._read_buffer_size = (block_size if block_size is not None
342-
else READ_BUFFER_SIZE)
343-
return True
344-
345-
def read(self, size=-1):
346-
if size < 0:
347-
return self.readall()
348-
# size=0 is special because decompress(max_length=0) is not supported
349-
if not size:
350-
return b""
351-
352-
# For certain input data, a single
353-
# call to decompress() may not return
354-
# any data. In this case, retry until we get some data or reach EOF.
355-
while True:
356-
if self._decompressor.eof:
357-
# Ending case: we've come to the end of a member in the file,
358-
# so finish up this member, and read a new gzip header.
359-
# Check the CRC and file size, and set the flag so we read
360-
# a new member
361-
self._read_eof()
362-
self._new_member = True
363-
self._decompressor = self._decomp_factory(
364-
**self._decomp_args)
365-
366-
if self._new_member:
367-
# If the _new_member flag is set, we have to
368-
# jump to the next member, if there is one.
369-
self._crc = isal_zlib.crc32(b"")
370-
# Decompressed size of unconcatenated stream
371-
self._stream_size = 0
372-
if not self._read_gzip_header():
373-
self._size = self._pos
374-
return b""
375-
self._new_member = False
376-
377-
# Read a chunk of data from the file
378-
if self._decompressor.needs_input:
379-
buf = self._fp.read(self._read_buffer_size)
380-
uncompress = self._decompressor.decompress(buf, size)
381-
else:
382-
uncompress = self._decompressor.decompress(b"", size)
383-
if self._decompressor.unused_data != b"":
384-
# Prepend the already read bytes to the fileobj so they can
385-
# be seen by _read_eof() and _read_gzip_header()
386-
self._fp.prepend(self._decompressor.unused_data)
387-
388-
if uncompress != b"":
389-
break
390-
if buf == b"":
391-
raise EOFError("Compressed file ended before the "
392-
"end-of-stream marker was reached")
393-
394-
self._crc = isal_zlib.crc32(uncompress, self._crc)
395-
self._stream_size += len(uncompress)
396-
self._pos += len(uncompress)
397-
return uncompress
398-
399-
400220
# Aliases for improved compatibility with CPython gzip module.
401221
GzipFile = IGzipFile
402-
_GzipReader = _IGzipReader
222+
_IGzipReader = _GzipReader
403223

404224

405225
def _create_simple_gzip_header(compresslevel: int,
@@ -440,28 +260,9 @@ def decompress(data):
440260
isal_zlib.decompress(data, wbits=31) is faster in cases where only one
441261
gzip member is guaranteed to be present.
442262
"""
443-
decompressed_members = []
444-
while True:
445-
if not data: # Empty data returns empty bytestring
446-
return b"".join(decompressed_members)
447-
fp = io.BytesIO(data)
448-
if _read_gzip_header(fp) is None:
449-
return b"".join(decompressed_members)
450-
header_end = fp.tell()
451-
# Use a zlib raw deflate compressor
452-
do = isal_zlib.decompressobj(wbits=-isal_zlib.MAX_WBITS)
453-
# Read all the data except the header
454-
decompressed = do.decompress(data[header_end:])
455-
if not do.eof or len(do.unused_data) < 8:
456-
raise EOFError("Compressed file ended before the end-of-stream "
457-
"marker was reached")
458-
crc, length = struct.unpack("<II", do.unused_data[:8])
459-
if crc != isal_zlib.crc32(decompressed):
460-
raise BadGzipFile("CRC check failed")
461-
if length != (len(decompressed) & 0xffffffff):
462-
raise BadGzipFile("Incorrect length of data produced")
463-
decompressed_members.append(decompressed)
464-
data = do.unused_data[8:].lstrip(b"\x00")
263+
fp = io.BytesIO(data)
264+
reader = _GzipReader(fp, max(len(data), 16))
265+
return reader.readall()
465266

466267

467268
def _argument_parser():
@@ -563,8 +364,6 @@ def main():
563364
else:
564365
out_file = sys.stdout.buffer
565366

566-
global READ_BUFFER_SIZE
567-
READ_BUFFER_SIZE = args.buffer_size
568367
try:
569368
while True:
570369
block = in_file.read(args.buffer_size)

src/isal/isal_zlib.pyi

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
# This file is part of python-isal which is distributed under the
66
# PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2.
77

8+
import typing
9+
810
ISAL_BEST_SPEED: int
911
ISAL_BEST_COMPRESSION: int
1012
ISAL_DEFAULT_COMPRESSION: int
@@ -61,3 +63,16 @@ def compressobj(level: int = ISAL_DEFAULT_COMPRESSION,
6163
strategy: int = Z_DEFAULT_STRATEGY,
6264
zdict = None) -> Compress: ...
6365
def decompressobj(wbits: int = MAX_WBITS, zdict = None) -> Decompress: ...
66+
67+
class _GzipReader:
68+
def __init__(self, fp: typing.BinaryIO, buffersize: int = 32 * 1024): ...
69+
def readinto(self, obj) -> int: ...
70+
def readable(self) -> bool: ...
71+
def writable(self) -> bool: ...
72+
def seekable(self) -> bool: ...
73+
def tell(self) -> int: ...
74+
def seek(self, offset: int, whence: int): ...
75+
def close(self): ...
76+
def readall(self) -> bytes: ...
77+
def read(self, __size: int): ...
78+
def flush(self): ...

0 commit comments

Comments
 (0)