Skip to content

Commit d51768d

Browse files
authored
Merge pull request #78 from pycompression/igzipdecompressor
Make decompression slightly faster for the CLI tool
2 parents d10aa55 + c65712a commit d51768d

File tree

12 files changed

+471
-39
lines changed

12 files changed

+471
-39
lines changed

CHANGELOG.rst

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,22 @@ Changelog
77
.. This document is user facing. Please word the changes in such a way
88
.. that users understand how the changes affect the new version.
99
10+
version 0.11.0-dev
11+
------------------
12+
In this release the relatively slow decompression rate of ``python -m isal.igzip``
13+
has been improved. Previously it was 19% slower than ``igzip`` when used with
14+
the ``-d`` flag for decompressing; now it is just 8% slower.
15+
16+
+ Fix a bug in the build system which caused some docstring and parameter
17+
information on ``igzip_lib`` and ``isal_zlib`` to disappear in the
18+
documentation and the REPL.
19+
+ Increase the buffer size for ``python -m isal.igzip`` so it is now closer
20+
to speeds reached with ``igzip``.
21+
+ Add a ``READ_BUFFER_SIZE`` attribute to ``igzip`` which allows setting the
22+
amount of raw data that is read at once.
23+
+ Add an ``igzip_lib.IgzipDecompressor`` object which can decompress without
24+
using an unconsumed_tail and is therefore more efficient.
25+
1026
version 0.10.0
1127
------------------
1228
+ Added an ``igzip_lib`` module which allows more direct access to ISA-L's

docs/index.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ API-documentation: igzip
104104
========================
105105

106106
.. automodule:: isal.igzip
107-
:members: compress, decompress, open
107+
:members: compress, decompress, open, BadGzipFile, GzipFile, READ_BUFFER_SIZE
108108

109109
.. autoclass:: IGzipFile
110110
:members:

profile_igzipreader.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66

77
def main():
8+
igzip.READ_BUFFER_SIZE = 32 * 1024
89
with igzip.open(sys.argv[1], mode="rb") as gzip_h:
910
while True:
1011
block = gzip_h.read(32*1024)

setup.py

Lines changed: 13 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,6 @@
3636
SYSTEM_IS_WINDOWS = sys.platform.startswith("win")
3737

3838

39-
def default_compiler_directives():
40-
return dict(language_level="3",
41-
binding=True)
42-
43-
4439
class IsalExtension(Extension):
4540
"""Custom extension to allow for targeted modification."""
4641
pass
@@ -110,23 +105,20 @@ def build_extension(self, ext):
110105
"include")]
111106
# -fPIC needed for proper static linking
112107
ext.extra_compile_args = ["-fPIC"]
113-
114-
# Import cython here so python setup.py can be used without
115-
# installing cython.
116-
from Cython.Build import cythonize
117-
compiler_directives = default_compiler_directives()
118-
line_tracing_enabled = os.getenv("CYTHON_COVERAGE") is not None
119-
if line_tracing_enabled:
120-
# Add cython directives for coverage support.
121-
compiler_directives.update(linetrace=True)
122-
cythonized_exts = cythonize(
123-
ext, compiler_directives=compiler_directives)
124-
125-
for cython_ext in cythonized_exts:
126-
if line_tracing_enabled:
108+
if os.getenv("CYTHON_COVERAGE") is not None:
109+
# Import cython here so python setup.py can be used without
110+
# installing cython.
111+
from Cython.Build import cythonize
112+
# Add cython directives and macros for coverage support.
113+
cythonized_exts = cythonize(ext, compiler_directives=dict(
114+
linetrace=True
115+
))
116+
for cython_ext in cythonized_exts:
127117
cython_ext.define_macros = [("CYTHON_TRACE_NOGIL", "1")]
128-
cython_ext._needs_stub = False
129-
super().build_extension(cython_ext)
118+
cython_ext._needs_stub = False
119+
super().build_extension(cython_ext)
120+
return
121+
super().build_extension(ext)
130122

131123

132124
# Use a cache to prevent isa-l from being built twice. According to the

src/isal/_isal.pyx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1919
# SOFTWARE.
2020

21+
# cython: language_level=3
22+
2123
from .version cimport ISAL_MAJOR_VERSION as C_ISAL_MAJOR_VERSION
2224
from .version cimport ISAL_MINOR_VERSION as C_ISAL_MINOR_VERSION
2325
from .version cimport ISAL_PATCH_VERSION as C_ISAL_PATCH_VERSION

src/isal/crc.pxd

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1919
# SOFTWARE.
2020

21+
# cython: language_level=3
22+
2123
cdef extern from "<isa-l/crc.h>":
2224
cdef unsigned int crc32_gzip_refl(
2325
unsigned int init_crc, #!< initial CRC value, 32 bits

src/isal/igzip.py

Lines changed: 62 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,17 @@
3333

3434
from . import igzip_lib, isal_zlib
3535

36-
__all__ = ["IGzipFile", "open", "compress", "decompress", "BadGzipFile"]
36+
__all__ = ["IGzipFile", "open", "compress", "decompress", "BadGzipFile",
37+
"READ_BUFFER_SIZE"]
3738

3839
_COMPRESS_LEVEL_FAST = isal_zlib.ISAL_BEST_SPEED
3940
_COMPRESS_LEVEL_TRADEOFF = isal_zlib.ISAL_DEFAULT_COMPRESSION
4041
_COMPRESS_LEVEL_BEST = isal_zlib.ISAL_BEST_COMPRESSION
4142

43+
#: The amount of data that is read in at once when decompressing a file.
44+
#: Increasing this value may increase performance.
45+
READ_BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE
46+
4247
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
4348

4449
try:
@@ -229,8 +234,8 @@ def __init__(self, fp):
229234
# Call the init method of gzip._GzipReader's parent here.
230235
# It is not very invasive and allows us to override _PaddedFile
231236
_compression.DecompressReader.__init__(
232-
self, _PaddedFile(fp), isal_zlib.decompressobj,
233-
wbits=-isal_zlib.MAX_WBITS)
237+
self, _PaddedFile(fp), igzip_lib.IgzipDecompressor,
238+
hist_bits=igzip_lib.MAX_HIST_BITS, flag=igzip_lib.DECOMP_DEFLATE)
234239
# Set flag indicating start of a new member
235240
self._new_member = True
236241
self._last_mtime = None
@@ -241,6 +246,57 @@ def _add_read_data(self, data):
241246
self._crc = isal_zlib.crc32(data, self._crc)
242247
self._stream_size += len(data)
243248

249+
def read(self, size=-1):
250+
if size < 0:
251+
return self.readall()
252+
# size=0 is special because decompress(max_length=0) is not supported
253+
if not size:
254+
return b""
255+
256+
# For certain input data, a single
257+
# call to decompress() may not return
258+
# any data. In this case, retry until we get some data or reach EOF.
259+
while True:
260+
if self._decompressor.eof:
261+
# Ending case: we've come to the end of a member in the file,
262+
# so finish up this member, and read a new gzip header.
263+
# Check the CRC and file size, and set the flag so we read
264+
# a new member
265+
self._read_eof()
266+
self._new_member = True
267+
self._decompressor = self._decomp_factory(
268+
**self._decomp_args)
269+
270+
if self._new_member:
271+
# If the _new_member flag is set, we have to
272+
# jump to the next member, if there is one.
273+
self._init_read()
274+
if not self._read_gzip_header():
275+
self._size = self._pos
276+
return b""
277+
self._new_member = False
278+
279+
# Read a chunk of data from the file
280+
if self._decompressor.needs_input:
281+
buf = self._fp.read(READ_BUFFER_SIZE)
282+
uncompress = self._decompressor.decompress(buf, size)
283+
else:
284+
uncompress = self._decompressor.decompress(b"", size)
285+
if self._decompressor.unused_data != b"":
286+
# Prepend the already read bytes to the fileobj so they can
287+
# be seen by _read_eof() and _read_gzip_header()
288+
self._fp.prepend(self._decompressor.unused_data)
289+
290+
if uncompress != b"":
291+
break
292+
if buf == b"":
293+
raise EOFError("Compressed file ended before the "
294+
"end-of-stream marker was reached")
295+
296+
self._add_read_data(uncompress)
297+
self._pos += len(uncompress)
298+
return uncompress
299+
244300

245301
# Aliases for improved compatibility with CPython gzip module.
246302
GzipFile = IGzipFile
@@ -382,7 +438,7 @@ def _argument_parser():
382438
# diminishing returns hit. _compression.BUFFER_SIZE = 8k. But 32K is
383439
# ~6% faster.
384440
parser.add_argument("-b", "--buffer-size",
385-
default=32 * 1024, type=int,
441+
default=128 * 1024, type=int,
386442
help=argparse.SUPPRESS)
387443
return parser
388444

@@ -418,6 +474,8 @@ def main():
418474
elif not args.compress and args.file is not None:
419475
out_file = io.open(base, "wb")
420476

477+
global READ_BUFFER_SIZE
478+
READ_BUFFER_SIZE = args.buffer_size
421479
try:
422480
while True:
423481
block = in_file.read(args.buffer_size)

src/isal/igzip_lib.pxd

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@
1818
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1919
# SOFTWARE.
2020

21+
# cython: language_level=3
22+
# cython: binding=True
23+
2124
cdef extern from "<isa-l/igzip_lib.h>":
2225
# Deflate compression standard defines
2326
int ISAL_DEF_MAX_HDR_SIZE
@@ -490,16 +493,16 @@ cdef:
490493

491494
cdef int mem_level_to_bufsize(int compression_level, int mem_level, unsigned int *bufsize)
492495

493-
cpdef compress(data,
494-
int level= ?,
495-
int flag = ?,
496-
int mem_level = ?,
497-
int hist_bits = ?,
496+
cdef _compress(data,
497+
int level,
498+
int flag,
499+
int mem_level,
500+
int hist_bits,
498501
)
499502

500-
cpdef decompress(data,
501-
int flag = ?,
502-
int hist_bits= ?,
503-
Py_ssize_t bufsize= ?)
503+
cdef _decompress(data,
504+
int flag,
505+
int hist_bits,
506+
Py_ssize_t bufsize)
504507

505508
cdef bytes view_bitbuffer(inflate_state * stream)

src/isal/igzip_lib.pyi

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,3 +53,10 @@ def compress(data, level: int = ISAL_DEFAULT_COMPRESSION,
5353
def decompress(data, flag: int = DECOMP_DEFLATE,
5454
hist_bits: int = MAX_HIST_BITS,
5555
bufsize: int = DEF_BUF_SIZE) -> bytes: ...
56+
57+
class IgzipDecompressor:
58+
unused_data: bytes
59+
needs_input: bool
60+
eof: bool
61+
62+
def decompress(self, data, max_length = -1) -> bytes: ...

0 commit comments

Comments
 (0)