Skip to content

Commit 901133f

Browse files
authored
Merge pull request #86 from pycompression/release_0.11.0
Release 0.11.0
2 parents dc92b71 + d4265ab commit 901133f

14 files changed

+624
-74
lines changed

CHANGELOG.rst

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,29 @@ Changelog
77
.. This document is user facing. Please word the changes in such a way
88
.. that users understand how the changes affect the new version.
99
10+
version 0.11.0
11+
------------------
12+
In this release the relatively slow decompression of ``python -m isal.igzip``
13+
has been improved in both speed and usability. Previously it was 19% slower
14+
than ``igzip`` when used with the ``-d`` flag for decompressing, now it is
15+
just 8% slower. Also some extra flags were added to make it easier to select
16+
the output file.
17+
18+
+ Prompt before an output file is overwritten with the ``python -m isal.igzip``
19+
command line utility and provide the ``-f`` or ``--force`` flags to force
20+
overwriting.
21+
+ Added ``-o`` and ``--output`` flags to the ``python -m isal.igzip`` command
22+
line utility to allow the user to select the destination of the output file.
23+
+ Fix a bug in the build system which caused some docstring and parameter
24+
information on ``igzip_lib`` and ``isal_zlib`` to disappear in the
25+
documentation and the REPL.
26+
+ Increase the buffer size for ``python -m isal.igzip`` so it is now closer
27+
to speeds reached with ``igzip``.
28+
+ Add a ``READ_BUFFER_SIZE`` attribute to ``igzip`` which allows setting the
29+
amount of raw data that is read at once.
30+
+ Add an ``igzip_lib.IgzipDecompressor`` object which can decompress without
31+
using an unconsumed_tail and is therefore more efficient.
32+
1033
version 0.10.0
1134
------------------
1235
+ Added an ``igzip_lib`` module which allows more direct access to ISA-L's

docs/index.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ API-documentation: igzip
104104
========================
105105

106106
.. automodule:: isal.igzip
107-
:members: compress, decompress, open
107+
:members: compress, decompress, open, BadGzipFile, GzipFile, READ_BUFFER_SIZE
108108

109109
.. autoclass:: IGzipFile
110110
:members:

profile_igzipreader.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66

77
def main():
8+
igzip.READ_BUFFER_SIZE = 32 * 1024
89
with igzip.open(sys.argv[1], mode="rb") as gzip_h:
910
while True:
1011
block = gzip_h.read(32*1024)

setup.py

Lines changed: 14 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,6 @@
3636
SYSTEM_IS_WINDOWS = sys.platform.startswith("win")
3737

3838

39-
def default_compiler_directives():
40-
return dict(language_level="3",
41-
binding=True)
42-
43-
4439
class IsalExtension(Extension):
4540
"""Custom extension to allow for targeted modification."""
4641
pass
@@ -110,23 +105,20 @@ def build_extension(self, ext):
110105
"include")]
111106
# -fPIC needed for proper static linking
112107
ext.extra_compile_args = ["-fPIC"]
113-
114-
# Import cython here so python setup.py can be used without
115-
# installing cython.
116-
from Cython.Build import cythonize
117-
compiler_directives = default_compiler_directives()
118-
line_tracing_enabled = os.getenv("CYTHON_COVERAGE") is not None
119-
if line_tracing_enabled:
120-
# Add cython directives for coverage support.
121-
compiler_directives.update(linetrace=True)
122-
cythonized_exts = cythonize(
123-
ext, compiler_directives=compiler_directives)
124-
125-
for cython_ext in cythonized_exts:
126-
if line_tracing_enabled:
108+
if os.getenv("CYTHON_COVERAGE") is not None:
109+
# Import cython here so python setup.py can be used without
110+
# installing cython.
111+
from Cython.Build import cythonize
112+
# Add cython directives and macros for coverage support.
113+
cythonized_exts = cythonize(ext, compiler_directives=dict(
114+
linetrace=True
115+
))
116+
for cython_ext in cythonized_exts:
127117
cython_ext.define_macros = [("CYTHON_TRACE_NOGIL", "1")]
128-
cython_ext._needs_stub = False
129-
super().build_extension(cython_ext)
118+
cython_ext._needs_stub = False
119+
super().build_extension(cython_ext)
120+
return
121+
super().build_extension(ext)
130122

131123

132124
# Use a cache to prevent isa-l from being built twice. According to the
@@ -178,7 +170,7 @@ def build_isa_l(compiler_command: str, compiler_options: str):
178170

179171
setup(
180172
name="isal",
181-
version="0.10.0",
173+
version="0.11.0",
182174
description="Faster zlib and gzip compatible compression and "
183175
"decompression by providing python bindings for the ISA-L "
184176
"library.",

src/isal/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,4 +39,4 @@
3939
"__version__"
4040
]
4141

42-
__version__ = "0.10.0"
42+
__version__ = "0.11.0"

src/isal/_isal.pyx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1919
# SOFTWARE.
2020

21+
# cython: language_level=3
22+
2123
from .version cimport ISAL_MAJOR_VERSION as C_ISAL_MAJOR_VERSION
2224
from .version cimport ISAL_MINOR_VERSION as C_ISAL_MINOR_VERSION
2325
from .version cimport ISAL_PATCH_VERSION as C_ISAL_PATCH_VERSION

src/isal/crc.pxd

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1919
# SOFTWARE.
2020

21+
# cython: language_level=3
22+
2123
cdef extern from "<isa-l/crc.h>":
2224
cdef unsigned int crc32_gzip_refl(
2325
unsigned int init_crc, #!< initial CRC value, 32 bits

src/isal/igzip.py

Lines changed: 109 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,17 @@
3333

3434
from . import igzip_lib, isal_zlib
3535

36-
__all__ = ["IGzipFile", "open", "compress", "decompress", "BadGzipFile"]
36+
__all__ = ["IGzipFile", "open", "compress", "decompress", "BadGzipFile",
37+
"READ_BUFFER_SIZE"]
3738

3839
_COMPRESS_LEVEL_FAST = isal_zlib.ISAL_BEST_SPEED
3940
_COMPRESS_LEVEL_TRADEOFF = isal_zlib.ISAL_DEFAULT_COMPRESSION
4041
_COMPRESS_LEVEL_BEST = isal_zlib.ISAL_BEST_COMPRESSION
4142

43+
#: The amount of data that is read in at once when decompressing a file.
44+
#: Increasing this value may increase performance.
45+
READ_BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE
46+
4247
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
4348

4449
try:
@@ -229,8 +234,8 @@ def __init__(self, fp):
229234
# Call the init method of gzip._GzipReader's parent here.
230235
# It is not very invasive and allows us to override _PaddedFile
231236
_compression.DecompressReader.__init__(
232-
self, _PaddedFile(fp), isal_zlib.decompressobj,
233-
wbits=-isal_zlib.MAX_WBITS)
237+
self, _PaddedFile(fp), igzip_lib.IgzipDecompressor,
238+
hist_bits=igzip_lib.MAX_HIST_BITS, flag=igzip_lib.DECOMP_DEFLATE)
234239
# Set flag indicating start of a new member
235240
self._new_member = True
236241
self._last_mtime = None
@@ -241,6 +246,57 @@ def _add_read_data(self, data):
241246
self._crc = isal_zlib.crc32(data, self._crc)
242247
self._stream_size += len(data)
243248

249+
def read(self, size=-1):
250+
if size < 0:
251+
return self.readall()
252+
# size=0 is special because decompress(max_length=0) is not supported
253+
if not size:
254+
return b""
255+
256+
# For certain input data, a single
257+
# call to decompress() may not return
258+
# any data. In this case, retry until we get some data or reach EOF.
259+
while True:
260+
if self._decompressor.eof:
261+
# Ending case: we've come to the end of a member in the file,
262+
# so finish up this member, and read a new gzip header.
263+
# Check the CRC and file size, and set the flag so we read
264+
# a new member
265+
self._read_eof()
266+
self._new_member = True
267+
self._decompressor = self._decomp_factory(
268+
**self._decomp_args)
269+
270+
if self._new_member:
271+
# If the _new_member flag is set, we have to
272+
# jump to the next member, if there is one.
273+
self._init_read()
274+
if not self._read_gzip_header():
275+
self._size = self._pos
276+
return b""
277+
self._new_member = False
278+
279+
# Read a chunk of data from the file
280+
if self._decompressor.needs_input:
281+
buf = self._fp.read(READ_BUFFER_SIZE)
282+
uncompress = self._decompressor.decompress(buf, size)
283+
else:
284+
uncompress = self._decompressor.decompress(b"", size)
285+
if self._decompressor.unused_data != b"":
286+
# Prepend the already read bytes to the fileobj so they can
287+
# be seen by _read_eof() and _read_gzip_header()
288+
self._fp.prepend(self._decompressor.unused_data)
289+
290+
if uncompress != b"":
291+
break
292+
if buf == b"":
293+
raise EOFError("Compressed file ended before the "
294+
"end-of-stream marker was reached")
295+
296+
self._add_read_data(uncompress)
297+
self._pos += len(uncompress)
298+
return uncompress
299+
244300

245301
# Aliases for improved compatibility with CPython gzip module.
246302
GzipFile = IGzipFile
@@ -376,13 +432,18 @@ def _argument_parser():
376432
dest="compress",
377433
const=False,
378434
help="Decompress the file instead of compressing.")
379-
parser.add_argument("-c", "--stdout", action="store_true",
380-
help="write on standard output")
435+
output_group = parser.add_mutually_exclusive_group()
436+
output_group.add_argument("-c", "--stdout", action="store_true",
437+
help="write on standard output")
438+
output_group.add_argument("-o", "--output",
439+
help="Write to this output file")
440+
parser.add_argument("-f", "--force", action="store_true",
441+
help="Overwrite output without prompting")
381442
# -b flag not taken by either gzip or igzip. Hidden attribute. Above 32K
382443
# diminishing returns hit. _compression.BUFFER_SIZE = 8k. But 32K is about
383444
# ~6% faster.
384445
parser.add_argument("-b", "--buffer-size",
385-
default=32 * 1024, type=int,
446+
default=128 * 1024, type=int,
386447
help=argparse.SUPPRESS)
387448
return parser
388449

@@ -392,32 +453,49 @@ def main():
392453

393454
compresslevel = args.compresslevel or _COMPRESS_LEVEL_TRADEOFF
394455

395-
# Determine input file
396-
if args.compress and args.file is None:
397-
in_file = sys.stdin.buffer
398-
elif args.compress and args.file is not None:
399-
in_file = io.open(args.file, mode="rb")
400-
elif not args.compress and args.file is None:
401-
in_file = IGzipFile(mode="rb", fileobj=sys.stdin.buffer)
402-
elif not args.compress and args.file is not None:
403-
base, extension = os.path.splitext(args.file)
404-
if extension != ".gz" and not args.stdout:
405-
sys.exit(f"filename doesn't end in .gz: {args.file!r}. "
406-
f"Cannot determine output filename.")
407-
in_file = open(args.file, "rb")
408-
409-
# Determine output file
410-
if args.compress and (args.file is None or args.stdout):
411-
out_file = IGzipFile(mode="wb", compresslevel=compresslevel,
412-
fileobj=sys.stdout.buffer)
413-
elif args.compress and args.file is not None:
414-
out_file = open(args.file + ".gz", mode="wb",
415-
compresslevel=compresslevel)
416-
elif not args.compress and (args.file is None or args.stdout):
417-
out_file = sys.stdout.buffer
418-
elif not args.compress and args.file is not None:
419-
out_file = io.open(base, "wb")
456+
if args.output:
457+
out_filepath = args.output
458+
elif args.stdout:
459+
out_filepath = None # to stdout
460+
elif args.file is None:
461+
out_filepath = None # to stdout
462+
else:
463+
if args.compress:
464+
out_filepath = args.file + ".gz"
465+
else:
466+
out_filepath, extension = os.path.splitext(args.file)
467+
if extension != ".gz" and not args.stdout:
468+
sys.exit(f"filename doesn't end in .gz: {args.file!r}. "
469+
f"Cannot determine output filename.")
470+
if out_filepath is not None and not args.force:
471+
if os.path.exists(out_filepath):
472+
yes_or_no = input(f"{out_filepath} already exists; "
473+
f"do you wish to overwrite (y/n)?")
474+
if yes_or_no not in {"y", "Y", "yes"}:
475+
sys.exit("not overwritten")
476+
477+
if args.compress:
478+
if args.file is None:
479+
in_file = sys.stdin.buffer
480+
else:
481+
in_file = io.open(args.file, mode="rb")
482+
if out_filepath is not None:
483+
out_file = open(out_filepath, "wb", compresslevel=compresslevel)
484+
else:
485+
out_file = IGzipFile(mode="wb", fileobj=sys.stdout.buffer,
486+
compresslevel=compresslevel)
487+
else:
488+
if args.file:
489+
in_file = open(args.file, mode="rb")
490+
else:
491+
in_file = IGzipFile(mode="rb", fileobj=sys.stdin.buffer)
492+
if out_filepath is not None:
493+
out_file = io.open(out_filepath, mode="wb")
494+
else:
495+
out_file = sys.stdout.buffer
420496

497+
global READ_BUFFER_SIZE
498+
READ_BUFFER_SIZE = args.buffer_size
421499
try:
422500
while True:
423501
block = in_file.read(args.buffer_size)

src/isal/igzip_lib.pxd

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@
1818
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1919
# SOFTWARE.
2020

21+
# cython: language_level=3
22+
# cython: binding=True
23+
2124
cdef extern from "<isa-l/igzip_lib.h>":
2225
# Deflate compression standard defines
2326
int ISAL_DEF_MAX_HDR_SIZE
@@ -490,16 +493,16 @@ cdef:
490493

491494
cdef int mem_level_to_bufsize(int compression_level, int mem_level, unsigned int *bufsize)
492495

493-
cpdef compress(data,
494-
int level= ?,
495-
int flag = ?,
496-
int mem_level = ?,
497-
int hist_bits = ?,
496+
cdef _compress(data,
497+
int level,
498+
int flag,
499+
int mem_level,
500+
int hist_bits,
498501
)
499502

500-
cpdef decompress(data,
501-
int flag = ?,
502-
int hist_bits= ?,
503-
Py_ssize_t bufsize= ?)
503+
cdef _decompress(data,
504+
int flag,
505+
int hist_bits,
506+
Py_ssize_t bufsize)
504507

505508
cdef bytes view_bitbuffer(inflate_state * stream)

src/isal/igzip_lib.pyi

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,3 +53,10 @@ def compress(data, level: int = ISAL_DEFAULT_COMPRESSION,
5353
def decompress(data, flag: int = DECOMP_DEFLATE,
5454
hist_bits: int = MAX_HIST_BITS,
5555
bufsize: int = DEF_BUF_SIZE) -> bytes: ...
56+
57+
class IgzipDecompressor:
58+
unused_data: bytes
59+
needs_input: bool
60+
eof: bool
61+
62+
def decompress(self, data, max_length = -1) -> bytes: ...

0 commit comments

Comments
 (0)