Skip to content

Commit 85d573c

Browse files
authored
Merge pull request #55 from pycompression/release_0.7.0
Release 0.7.0
2 parents 24925a4 + f3be8f9 commit 85d573c

File tree

10 files changed

+176
-190
lines changed

10 files changed

+176
-190
lines changed

.github/workflows/ci.yml

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,8 @@ jobs:
138138
submodules: recursive
139139
- uses: actions/setup-python@v2
140140
name: Install Python
141-
- name: Install cibuildwheel twine
142-
run: python -m pip install cibuildwheel twine
141+
- name: Install cibuildwheel twine wheel
142+
run: python -m pip install cibuildwheel twine wheel
143143
- name: Install build dependencies (Macos)
144144
run: brew install nasm automake autoconf
145145
if: runner.os == 'macOS'
@@ -149,8 +149,6 @@ jobs:
149149
- name: Install nasm (Windows)
150150
uses: ilammy/[email protected]
151151
if: runner.os == 'Windows'
152-
- name: Install cibuildwheel twine wheel
153-
run: python -m pip install cibuildwheel twine wheel
154152
- name: Build wheels
155153
run: cibuildwheel --output-dir dist
156154
env:
@@ -159,8 +157,8 @@ jobs:
159157
CIBW_MANYLINUX_X86_64_IMAGE: "manylinux2014"
160158
# Fully test the built wheels again.
161159
CIBW_TEST_REQUIRES: "pytest"
162-
# Simple test that requires the project to be build correctly
163-
CIBW_TEST_COMMAND: "pytest {project}/tests/test_igzip.py"
160+
# Test everything to be sure.
161+
CIBW_TEST_COMMAND: "pytest {project}/tests"
164162
- name: Build sdist
165163
if: "runner.os == 'Linux'"
166164
run: python setup.py sdist

CHANGELOG.rst

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,19 @@ Changelog
77
.. This document is user facing. Please word the changes in such a way
88
.. that users understand how the changes affect the new version.
99
10+
version 0.7.0
11+
-----------------
12+
+ Remove workarounds in the ``igzip`` module for the ``unconsumed_tail``
13+
and ``unused_data`` bugs. ``igzip._IGzipReader`` now functions the same
14+
as ``gzip._GzipReader`` with only a few calls replaced with ``isal_zlib``
15+
calls for speed.
16+
+ Correctly implement ``unused_data`` and ``unconsumed_tail`` on
17+
``isal_zlib.Decompress`` objects.
18+
It works the same as in CPython's zlib now.
19+
+ Correctly implement ``flush`` on ``isal_zlib.Compress`` and
20+
``isal_zlib.Decompress`` objects.
21+
It works the same as in CPython's zlib now.
22+
1023
version 0.6.1
1124
-----------------
1225
+ Fix a crash that occurs when opening a file that did not end in ``.gz`` while

README.rst

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -130,10 +130,10 @@ Differences with zlib and gzip modules
130130
+ Compression level 0 in ``zlib`` and ``gzip`` means **no compression**, while
131131
in ``isal_zlib`` and ``igzip`` this is the **lowest compression level**.
132132
This is a design choice that was inherited from the ISA-L library.
133-
+ Compression levels range from 0 to 3, not 1 to 9.
134-
+ ``igzip`` contains a class ``IGzipFile`` instead of ``GzipFile``. Since the
135-
compression levels are not compatible, a difference in naming was chosen to
136-
reflect this.
133+
+ Compression levels range from 0 to 3, not 1 to 9. ``isal_zlib.Z_DEFAULT_COMPRESSION``
134+
has been aliased to ``isal_zlib.ISAL_DEFAULT_COMPRESSION`` (2).
135+
+ ``isal_zlib`` only supports ``Z_NO_FLUSH``, ``Z_SYNC_FLUSH``, ``Z_FULL_FLUSH`` and
136+
``Z_FINISH``. Other flush modes are not supported and will raise errors.
137137
+ ``zlib.Z_DEFAULT_STRATEGY``, ``zlib.Z_RLE`` etc. are exposed as
138138
``isal_zlib.Z_DEFAULT_STRATEGY``, ``isal_zlib.Z_RLE`` etc. for compatibility
139139
reasons. However, ``isal_zlib`` only supports a default strategy and will
@@ -142,14 +142,14 @@ Differences with zlib and gzip modules
142142
``isal_zlib`` supports memory levels smallest, small, medium, large and
143143
largest. These have been mapped to levels 1, 2-3, 4-6, 7-8 and 9. So
144144
``isal_zlib`` can be used with zlib compatible memory levels.
145-
+ ``isal_zlib`` has a ``compressobj`` and ``decompressobj`` implementation.
146-
However, the unused_data and unconsumed_tail for the Decompress object, only
147-
work properly when using gzip compatible compression. (25 <= wbits <= 31).
148-
+ The flush implementation for the Compress object behavious differently from
149-
the zlib equivalent. The flush implementation is sufficient for
150-
the ``igzip`` module to work 100% in compliance with the ``gzip`` tests from
151-
CPython. It does not however work for all the ``zlib`` compliance tests
152-
(see above). This is an area that still needs work.
145+
+ ``zlib`` methods have a ``data`` argument which is positional only. In
146+
isal_zlib this is not enforced and it can also be called as a keyword argument.
147+
This is due to implementing ``isal_zlib`` in cython and maintaining backwards
148+
compatibility with python 3.6.
149+
+ ``igzip.open`` returns a class ``IGzipFile`` instead of ``GzipFile``. Since
150+
the compression levels are not compatible, a difference in naming was chosen
151+
to reflect this. ``igzip.GzipFile`` does exist as an alias of
152+
``igzip.IGzipFile`` for compatibility reasons.
153153

154154
Contributing
155155
------------

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ def build_isa_l():
152152

153153
setup(
154154
name="isal",
155-
version="0.6.1",
155+
version="0.7.0",
156156
description="Faster zlib and gzip compatible compression and "
157157
"decompression by providing python bindings for the ISA-L "
158158
"library.",

src/isal/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,4 +39,4 @@
3939
"__version__"
4040
]
4141

42-
__version__ = "0.6.1"
42+
__version__ = "0.7.0"

src/isal/igzip.py

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -152,9 +152,6 @@ def __repr__(self):
152152
s = repr(self.fileobj)
153153
return '<igzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
154154

155-
def flush(self, zlib_mode=isal_zlib.Z_SYNC_FLUSH):
156-
super().flush(zlib_mode)
157-
158155
def _write_gzip_header(self, compresslevel=_COMPRESS_LEVEL_TRADEOFF):
159156
# Python 3.9 added a `compresslevel` parameter to write gzip header.
160157
# This only determines the value of one extra flag. Because this change
@@ -206,24 +203,18 @@ class _IGzipReader(gzip._GzipReader):
206203
def __init__(self, fp):
207204
super().__init__(fp)
208205
self._decomp_factory = isal_zlib.decompressobj
209-
self._decomp_args = dict(wbits=64+isal_zlib.MAX_WBITS)
210-
# Set wbits such that ISAL_GZIP_NO_HDR_VER is used. This means that
211-
# it does not read a header, and it verifies the trailer.
212206
self._decompressor = self._decomp_factory(**self._decomp_args)
213207

214208
def _add_read_data(self, data):
215-
# isa-l verifies the trailer data, so no need to keep track of the crc.
216-
self._stream_size = self._stream_size + len(data)
217-
218-
def _read_eof(self):
219-
# Gzip files can be padded with zeroes and still have archives.
220-
# Consume all zero bytes and set the file position to the first
221-
# non-zero byte. See http://www.gzip.org/#faq8
222-
c = b"\x00"
223-
while c == b"\x00":
224-
c = self._fp.read(1)
225-
if c:
226-
self._fp.prepend(c)
209+
# Use faster isal crc32 calculation and update the stream size in place
210+
# compared to CPython gzip
211+
self._crc = isal_zlib.crc32(data, self._crc)
212+
self._stream_size += len(data)
213+
214+
215+
# Aliases for improved compatibility with CPython gzip module.
216+
GzipFile = IGzipFile
217+
_GzipReader = _IGzipReader
227218

228219

229220
# Plagiarized from gzip.py from python's stdlib.

src/isal/isal_zlib.pyx

Lines changed: 62 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ import zlib
2929

3030
from .crc cimport crc32_gzip_refl
3131
from .igzip_lib cimport *
32-
from libc.stdint cimport UINT64_MAX, UINT32_MAX, uint32_t
32+
from libc.stdint cimport UINT64_MAX, UINT32_MAX
3333
from cpython.mem cimport PyMem_Malloc, PyMem_Free
3434
from cpython.buffer cimport PyBUF_READ, PyBUF_C_CONTIGUOUS, PyObject_GetBuffer, \
3535
PyBuffer_Release
@@ -45,8 +45,6 @@ ISAL_DEFAULT_COMPRESSION = 2
4545
Z_BEST_SPEED = ISAL_BEST_SPEED
4646
Z_BEST_COMPRESSION = ISAL_BEST_COMPRESSION
4747
Z_DEFAULT_COMPRESSION = ISAL_DEFAULT_COMPRESSION
48-
cdef int ISAL_DEFAULT_COMPRESSION_I = ISAL_DEFAULT_COMPRESSION
49-
cdef int ZLIB_DEFAULT_COMPRESSION_I = zlib.Z_DEFAULT_COMPRESSION
5048

5149
DEF_BUF_SIZE = zlib.DEF_BUF_SIZE
5250
DEF_MEM_LEVEL = zlib.DEF_MEM_LEVEL
@@ -65,13 +63,10 @@ Z_FILTERED=zlib.Z_FILTERED
6563
Z_FIXED=zlib.Z_FIXED
6664

6765
# Flush methods
68-
ISAL_NO_FLUSH=NO_FLUSH
69-
ISAL_SYNC_FLUSH=SYNC_FLUSH
70-
ISAL_FULL_FLUSH=FULL_FLUSH
71-
72-
Z_NO_FLUSH=ISAL_NO_FLUSH
73-
Z_SYNC_FLUSH=ISAL_SYNC_FLUSH
74-
Z_FINISH=ISAL_FULL_FLUSH
66+
Z_NO_FLUSH=zlib.Z_NO_FLUSH
67+
Z_SYNC_FLUSH=zlib.Z_SYNC_FLUSH
68+
Z_FULL_FLUSH=zlib.Z_FULL_FLUSH
69+
Z_FINISH=zlib.Z_FINISH
7570

7671
class IsalError(OSError):
7772
"""Exception raised on compression and decompression errors."""
@@ -126,12 +121,6 @@ def crc32(data, value = 0):
126121
finally:
127122
PyBuffer_Release(buffer)
128123

129-
cdef Py_ssize_t Py_ssize_t_min(Py_ssize_t a, Py_ssize_t b):
130-
if a <= b:
131-
return a
132-
else:
133-
return b
134-
135124
ctypedef fused stream_or_state:
136125
isal_zstream
137126
inflate_state
@@ -147,7 +136,7 @@ cdef void arrange_input_buffer(stream_or_state *stream, Py_ssize_t *remains):
147136
remains[0] -= stream.avail_in
148137

149138
def compress(data,
150-
int level=ISAL_DEFAULT_COMPRESSION_I,
139+
int level=ISAL_DEFAULT_COMPRESSION,
151140
int wbits = ISAL_DEF_MAX_HIST_BITS):
152141
"""
153142
Compresses the bytes in *data*. Returns a bytes object with the
@@ -165,9 +154,6 @@ def compress(data,
165154
-9 to -15 will generate a raw compressed string with
166155
no headers and trailers.
167156
"""
168-
if level == ZLIB_DEFAULT_COMPRESSION_I:
169-
level = ISAL_DEFAULT_COMPRESSION_I
170-
171157
# Initialise stream
172158
cdef isal_zstream stream
173159
cdef unsigned int level_buf_size = zlib_mem_level_to_isal(level, DEF_MEM_LEVEL)
@@ -352,7 +338,7 @@ def compressobj(int level=ISAL_DEFAULT_COMPRESSION,
352338
no headers and trailers.
353339
:param memLevel: The amount of memory used for the internal compression
354340
state. Higher values use more memory for better speed and
355-
smaller output.
341+
smaller output. Values between 1 and 9 are supported.
356342
:zdict: A predefined compression dictionary. A sequence of bytes
357343
that are expected to occur frequently in the to be
358344
compressed data. The most common subsequences should come
@@ -392,8 +378,6 @@ cdef class Compress:
392378
err = isal_deflate_set_dict(&self.stream, zdict, zdict_length)
393379
if err != COMP_OK:
394380
check_isal_deflate_rc(err)
395-
if level == ZLIB_DEFAULT_COMPRESSION_I:
396-
level = ISAL_DEFAULT_COMPRESSION_I
397381
self.stream.level = level
398382
self.stream.level_buf_size = zlib_mem_level_to_isal(level, memLevel)
399383
self.level_buf = <unsigned char *>PyMem_Malloc(self.stream.level_buf_size * sizeof(char))
@@ -452,27 +436,34 @@ cdef class Compress:
452436
finally:
453437
PyBuffer_Release(buffer)
454438

455-
def flush(self, int mode=FULL_FLUSH):
439+
def flush(self, mode=zlib.Z_FINISH):
456440
"""
457441
All pending input is processed, and a bytes object containing the
458442
remaining compressed output is returned.
459443
460-
:param mode: Defaults to ISAL_FULL_FLUSH (Z_FINISH equivalent) which
444+
:param mode: Defaults to Z_FINISH which
461445
finishes the compressed stream and prevents compressing
462-
any more data. The only other supported method is
463-
ISAL_SYNC_FLUSH (Z_SYNC_FLUSH) equivalent.
446+
any more data. The other supported methods are
447+
Z_NO_FLUSH, Z_SYNC_FLUSH and Z_FULL_FLUSH.
464448
"""
465-
if mode == NO_FLUSH:
449+
450+
if mode == zlib.Z_NO_FLUSH:
466451
# Flushing with no_flush does nothing.
467452
return b""
468-
469-
self.stream.end_of_stream = 1
470-
self.stream.flush = mode
453+
elif mode == zlib.Z_FINISH:
454+
self.stream.flush = FULL_FLUSH
455+
self.stream.end_of_stream = 1
456+
elif mode == zlib.Z_FULL_FLUSH:
457+
self.stream.flush = FULL_FLUSH
458+
elif mode == zlib.Z_SYNC_FLUSH:
459+
self.stream.flush=SYNC_FLUSH
460+
else:
461+
raise IsalError("Unsupported flush mode")
471462

472463
# Initialise output buffer
473464
out = []
474465

475-
while self.stream.internal_state.state != ZSTATE_END:
466+
while True:
476467
self.stream.next_out = self.obuf # Reset output buffer.
477468
self.stream.avail_out = self.obuflen
478469
err = isal_deflate(&self.stream)
@@ -485,6 +476,10 @@ cdef class Compress:
485476
# the data is appended to a list.
486477
# TODO: Improve this with the buffer protocol.
487478
out.append(self.obuf[:self.obuflen - self.stream.avail_out])
479+
if self.stream.avail_out != 0: # All input is processed and therefore all output flushed.
480+
break
481+
if self.stream.avail_in != 0:
482+
raise AssertionError("There should be no available input after flushing.")
488483
return b"".join(out)
489484

490485
cdef class Decompress:
@@ -528,19 +523,41 @@ cdef class Decompress:
528523
if self.obuf is not NULL:
529524
PyMem_Free(self.obuf)
530525

526+
def _view_bitbuffer(self):
527+
"""Shows the 64-bitbuffer of the internal inflate_state. It contains
528+
a maximum of 8 bytes. This data is already read-in so is not part
529+
of the unconsumed tail."""
530+
bits_in_buffer = self.stream.read_in_length
531+
read_in_length = bits_in_buffer // 8
532+
if read_in_length == 0:
533+
return b""
534+
remainder = bits_in_buffer % 8
535+
read_in = self.stream.read_in
536+
# The bytes are added by bitshifting, so in reverse order. Reading the
537+
# 64-bit integer into 8 bytes little-endian provides the characters in
538+
# the correct order.
539+
return (read_in >> remainder).to_bytes(8, "little")[:read_in_length]
540+
531541
cdef save_unconsumed_input(self, Py_buffer *data):
532542
cdef Py_ssize_t old_size, new_size, left_size
533543
cdef bytes new_data
534544
if self.stream.block_state == ISAL_BLOCK_FINISH:
535545
self.eof = 1
536546
if self.stream.avail_in > 0:
537-
old_size = len(self.unused_data)
538547
left_size = <unsigned char*>data.buf + data.len - self.stream.next_in
539-
if left_size > (PY_SSIZE_T_MAX - old_size):
540-
raise MemoryError()
541548
new_data = PyBytes_FromStringAndSize(<char *>self.stream.next_in, left_size)
542-
self.unused_data += new_data
543-
if self.stream.avail_in > 0 or self.unconsumed_tail:
549+
else:
550+
new_data = b""
551+
if not self.unused_data:
552+
# The block is finished and this decompressobject can not be
553+
# used anymore. Some unused data is in the bitbuffer and has to
554+
# be recovered. Only when self.unused_data is empty. Otherwise
555+
# we assume the bitbuffer data is already added.
556+
self.unused_data = self._view_bitbuffer()
557+
self.unused_data += new_data
558+
if self.unconsumed_tail:
559+
self.unconsumed_tail = b"" # When there is unused_data unconsumed tail should be b""
560+
elif self.stream.avail_in > 0 or self.unconsumed_tail:
544561
left_size = <unsigned char*>data.buf + data.len - self.stream.next_in
545562
new_data = PyBytes_FromStringAndSize(<char *>self.stream.next_in, left_size)
546563
self.unconsumed_tail = new_data
@@ -646,9 +663,9 @@ cdef class Decompress:
646663
cdef Py_ssize_t unused_bytes
647664

648665
try:
649-
while self.stream.block_state != ISAL_BLOCK_FINISH and ibuflen !=0:
666+
while True:
650667
arrange_input_buffer(&self.stream, &ibuflen)
651-
while (self.stream.block_state != ISAL_BLOCK_FINISH):
668+
while True:
652669
self.stream.next_out = obuf # Reset output buffer.
653670
self.stream.avail_out = obuflen
654671
err = isal_inflate(&self.stream)
@@ -660,17 +677,20 @@ cdef class Decompress:
660677
# Instead of output buffer resizing as the zlibmodule.c example
661678
# the data is appended to a list.
662679
# TODO: Improve this with the buffer protocol.
680+
if self.stream.avail_out == obuflen:
681+
break
663682
bytes_written = obuflen - self.stream.avail_out
664683
total_bytes += bytes_written
665684
out.append(obuf[:bytes_written])
685+
if self.stream.avail_out != 0:
686+
break
687+
if self.stream.block_state == ISAL_BLOCK_FINISH or ibuflen == 0:
688+
break
666689
self.save_unconsumed_input(buffer)
667690
return b"".join(out)
668691
finally:
669692
PyMem_Free(obuf)
670693

671-
@property
672-
def crc(self):
673-
return self.stream.crc
674694

675695
cdef wbits_to_flag_and_hist_bits_deflate(int wbits,
676696
unsigned short * hist_bits,
@@ -707,9 +727,6 @@ cdef wbits_to_flag_and_hist_bits_inflate(int wbits,
707727
elif 40 <= wbits <= 47: # Accept gzip or zlib
708728
hist_bits[0] = wbits - 32
709729
crc_flag[0] = ISAL_GZIP if gzip else ISAL_ZLIB
710-
elif 72 <=wbits <= 79:
711-
hist_bits[0] = wbits - 64
712-
crc_flag[0] = ISAL_GZIP_NO_HDR_VER
713730
else:
714731
raise ValueError("Invalid wbits value")
715732

0 commit comments

Comments
 (0)