Skip to content

Commit d611848

Browse files
authored
Merge pull request #7 from rhpvorderman/segfault
Fix testing issues
2 parents f73d256 + 9da2e7e commit d611848

File tree

9 files changed

+359
-329
lines changed

9 files changed

+359
-329
lines changed

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ Changelog
99
1010
version 0.1.0-dev
1111
-----------------
12+
+ Ensure the igzip module is fully compatible with the gzip stdlib module.
1213
+ Add compliance tests from CPython to ensure isal_zlib and igzip are validated
1314
to the same standards as the zlib and gzip modules.
1415
+ Added a working gzip app using ``python -m isal.igzip``

README.rst

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -71,18 +71,10 @@ Differences with zlib and gzip modules
7171
+ ``isal_zlib`` only supports ``FLUSH``, ``SYNC_FLUSH`` and ``FULL_FLUSH``
7272
``FINISH`` is aliased to ``FULL_FLUSH`` (and works correctly as such).
7373
+ ``isal_zlib`` has a ``compressobj`` and ``decompressobj`` implementation.
74-
The resulting objects only support a maximum of 4 GB input for their
75-
``compress`` and ``decompress`` methods unlike ``zlib`` which supports
76-
an unlimited size. This difference arises because
77-
the zlib C library only supports very small amounts (64 KB) of input and the
78-
python zlibmodule.c has created a workaround so larger amounts can be
79-
supported. This workaround allows anything up to the maximum size that python
80-
can describe.
81-
However for isa-l the maximum supported size is 4GB. for isal_zlib it was
82-
decided that since the ``compressobj`` and ``decompressobj`` are only used in
83-
streaming applications, 4 GB is ample. This simplifies the underlying code
84-
quite a bit. If you need to compress or decompress larger sizes than 4 GB
85-
in memory then the ``compress`` and ``decompress`` methods support this.
74+
However, the unused_data and unconsumed_tail for the Decompress object only
75+
work properly when using gzip compatible compression (25 <= wbits <= 31).
76+
+ The flush implementation for the Compress object behaves differently from
77+
the zlib equivalent.
8678

8779
Contributing
8880
------------

src/isal/crc.pxd

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@
2121
# cython: language_level=3
2222

2323
cdef extern from "<isa-l/crc.h>":
24-
cdef unsigned long crc32_gzip_refl(
25-
unsigned long init_crc, #!< initial CRC value, 32 bits
24+
cdef unsigned int crc32_gzip_refl(
25+
unsigned int init_crc, #!< initial CRC value, 32 bits
2626
const unsigned char *buf, #!< buffer to calculate CRC on
2727
unsigned long long len #!< buffer length in bytes (64-bit data)
2828
)

src/isal/igzip.py

Lines changed: 44 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,12 @@
2222
Library to speed up its methods."""
2323

2424
import argparse
25-
import functools
2625
import gzip
2726
import io
2827
import os
28+
import sys
2929

3030
import _compression
31-
import sys
3231

3332
from . import isal_zlib
3433

@@ -41,11 +40,13 @@
4140

4241
BUFFER_SIZE = _compression.BUFFER_SIZE
4342

44-
class BadGzipFile(OSError):
45-
pass
43+
try:
44+
BadGzipFile = gzip.BadGzipFile
45+
except AttributeError: # Versions lower than 3.8 do not have BadGzipFile
46+
BadGzipFile = OSError
4647

4748

48-
# The open method was copied from the python source with minor adjustments.
49+
# The open method was copied from the CPython source with minor adjustments.
4950
def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_TRADEOFF,
5051
encoding=None, errors=None, newline=None):
5152
"""Open a gzip-compressed file in binary or text mode. This uses the isa-l
@@ -104,7 +105,7 @@ def __init__(self, filename=None, mode=None,
104105
isal_zlib.ISAL_BEST_SPEED, isal_zlib.ISAL_BEST_COMPRESSION
105106
))
106107
super().__init__(filename, mode, compresslevel, fileobj, mtime)
107-
if hasattr(self, "compress"):
108+
if self.mode == gzip.WRITE:
108109
self.compress = isal_zlib.compressobj(compresslevel,
109110
isal_zlib.DEFLATED,
110111
-isal_zlib.MAX_WBITS,
@@ -121,6 +122,20 @@ def __repr__(self):
121122
def flush(self, zlib_mode=isal_zlib.Z_SYNC_FLUSH):
122123
super().flush(zlib_mode)
123124

125+
def _write_gzip_header(self, compresslevel=_COMPRESS_LEVEL_TRADEOFF):
126+
# Determine what xfl flag is written for the compression level.
127+
# Equate the fast level to gzip level 1. All the other levels are
128+
# medium.
129+
if sys.version_info[0] == 3 and sys.version_info[1] < 7:
130+
# Correct header introduced in 3.7
131+
super()._write_gzip_header()
132+
else:
133+
if compresslevel == _COMPRESS_LEVEL_FAST:
134+
compresslevel = gzip._COMPRESS_LEVEL_FAST
135+
else:
136+
compresslevel = gzip._COMPRESS_LEVEL_TRADEOFF
137+
super()._write_gzip_header(compresslevel)
138+
124139
def write(self, data):
125140
self._check_not_closed()
126141
if self.mode != gzip.WRITE:
@@ -145,63 +160,28 @@ def write(self, data):
145160
return length
146161

147162

148-
# The gzip._GzipReader does all sorts of complex stuff. While using the
149-
# standard DecompressReader by _compression relies more on the C implementation
150-
# side of things. It is much simpler. Gzip header interpretation and gzip
151-
# checksum checking is already implemented in the isa-l library. So no need
152-
# to do so in pure python.
153-
class _IGzipReader(_compression.DecompressReader):
163+
class _IGzipReader(gzip._GzipReader):
154164
def __init__(self, fp):
155-
super().__init__(gzip._PaddedFile(fp), isal_zlib.decompressobj,
156-
trailing_error=isal_zlib.IsalError,
157-
wbits=16 + isal_zlib.MAX_WBITS)
158-
159-
# Created by mixing and matching gzip._GzipReader and
160-
# _compression.DecompressReader
161-
def read(self, size=-1):
162-
if size < 0:
163-
return self.readall()
164-
# size=0 is special because decompress(max_length=0) is not supported
165-
if not size:
166-
return b""
167-
168-
# For certain input data, a single
169-
# call to decompress() may not return
170-
# any data. In this case, retry until we get some data or reach EOF.
171-
uncompress = b""
172-
while True:
173-
if self._decompressor.eof:
174-
buf = (self._decompressor.unused_data or
175-
self._fp.read(BUFFER_SIZE))
176-
if not buf:
177-
break
178-
# Continue to next stream.
179-
self._decompressor = self._decomp_factory(
180-
**self._decomp_args)
181-
try:
182-
uncompress = self._decompressor.decompress(buf, size)
183-
except self._trailing_error:
184-
# Trailing data isn't a valid compressed stream; ignore it.
185-
break
186-
else:
187-
# Read a chunk of data from the file
188-
buf = self._fp.read(BUFFER_SIZE)
189-
uncompress = self._decompressor.decompress(buf, size)
190-
if self._decompressor.unconsumed_tail != b"":
191-
self._fp.prepend(self._decompressor.unconsumed_tail)
192-
elif self._decompressor.unused_data != b"":
193-
# Prepend the already read bytes to the fileobj so they can
194-
# be seen by _read_eof() and _read_gzip_header()
195-
self._fp.prepend(self._decompressor.unused_data)
196-
197-
if uncompress != b"":
198-
break
199-
if buf == b"":
200-
raise EOFError("Compressed file ended before the "
201-
"end-of-stream marker was reached")
202-
203-
self._pos += len(uncompress)
204-
return uncompress
165+
super().__init__(fp)
166+
self._decomp_factory = isal_zlib.decompressobj
167+
self._decomp_args = dict(wbits=64+isal_zlib.MAX_WBITS)
168+
# Set wbits such that ISAL_GZIP_NO_HDR_VER is used. This means that
169+
# it does not read a header, and it verifies the trailer.
170+
self._decompressor = self._decomp_factory(**self._decomp_args)
171+
172+
def _add_read_data(self, data):
173+
# isa-l verifies the trailer data, so no need to keep track of the crc.
174+
self._stream_size = self._stream_size + len(data)
175+
176+
def _read_eof(self):
177+
# Gzip files can be padded with zeroes and still have archives.
178+
# Consume all zero bytes and set the file position to the first
179+
# non-zero byte. See http://www.gzip.org/#faq8
180+
c = b"\x00"
181+
while c == b"\x00":
182+
c = self._fp.read(1)
183+
if c:
184+
self._fp.prepend(c)
205185

206186

207187
# Plagiarized from gzip.py from python's stdlib.
@@ -216,12 +196,12 @@ def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
216196
return buf.getvalue()
217197

218198

219-
# Unlike stdlib, do not use the roundabout way of doing this via a file.
220199
def decompress(data):
221200
"""Decompress a gzip compressed string in one shot.
222201
Return the decompressed string.
223202
"""
224-
return isal_zlib.decompress(data, wbits=16 + isal_zlib.MAX_WBITS)
203+
with IGzipFile(fileobj=io.BytesIO(data)) as f:
204+
return f.read()
225205

226206

227207
def main():

src/isal/igzip_lib.pxd

Lines changed: 50 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -163,48 +163,44 @@ cdef extern from "<isa-l/igzip_lib.h>":
163163

164164
cdef struct BitBuf2:
165165
unsigned long long m_bits #!< bits in the bit buffer
166-
unsigned long m_bits_count; #!< number of valid bits in the bit buffer
166+
unsigned int m_bits_count; #!< number of valid bits in the bit buffer
167167
unsigned char *m_out_buff #!< current index of buffer to write to
168168
unsigned char *m_out_end #!< end of buffer to write to
169169
unsigned char *m_out_start #!< start of buffer to write to
170170

171171
cdef struct isal_zstate:
172-
unsigned long total_in_start #!< Not used, may be replaced with something else
173-
unsigned long block_next #!< Start of current deflate block in the input
174-
unsigned long block_end #!< End of current deflate block in the input
175-
unsigned long dist_mask #!< Distance mask used.
176-
unsigned long hash_mask
172+
unsigned int total_in_start #!< Not used, may be replaced with something else
173+
unsigned int block_next #!< Start of current deflate block in the input
174+
unsigned int block_end #!< End of current deflate block in the input
175+
unsigned int dist_mask #!< Distance mask used.
176+
unsigned int hash_mask
177177
isal_zstate_state state #!< Current state in processing the data stream
178178
BitBuf2 bitbuf
179-
unsigned long crc #!< Current checksum without finalize step if any (adler)
179+
unsigned int crc #!< Current checksum without finalize step if any (adler)
180180
unsigned char has_wrap_hdr #!< keeps track of wrapper header
181181
unsigned char has_eob_hdr #!< keeps track of eob on the last deflate block
182182
unsigned char has_hist #!< flag to track if there is match history
183-
unsigned int has_level_buf_init #!< flag to track if user supplied memory has been initialized.
184-
unsigned long count #!< used for partial header/trailer writes
183+
unsigned short has_level_buf_init #!< flag to track if user supplied memory has been initialized.
184+
unsigned int count #!< used for partial header/trailer writes
185185
unsigned char tmp_out_buff[16] #! temporary array
186-
unsigned long tmp_out_start #!< temporary variable
187-
unsigned long tmp_out_end #!< temporary variable
188-
unsigned long b_bytes_valid #!< number of valid bytes in buffer
189-
unsigned long b_bytes_processed #!< number of bytes processed in buffer
190-
# Below values give compile errors because they are dynamic in size
191-
#unsigned char buffer[2 * IGZIP_HIST_SIZE + ISAL_LOOK_AHEAD] #!< Internal buffer
192-
# Stream should be setup such that the head is cache aligned
193-
#unsigned int head[IGZIP_LVL0_HASH_SIZE] #!< Hash array
186+
unsigned int tmp_out_start #!< temporary variable
187+
unsigned int tmp_out_end #!< temporary variable
188+
unsigned int b_bytes_valid #!< number of valid bytes in buffer
189+
unsigned int b_bytes_processed #!< number of bytes processed in buffer
194190

195191
cdef struct isal_hufftables:
196192
pass
197193

198194
cdef struct isal_zstream:
199195
unsigned char *next_in #!< Next input byte
200-
unsigned long avail_in #!< number of bytes available at next_in
201-
unsigned long total_in_start #!< total number of bytes read so far
196+
unsigned int avail_in #!< number of bytes available at next_in
197+
unsigned int total_in_start #!< total number of bytes read so far
202198
unsigned char *next_out #!< Next output byte
203-
unsigned long avail_out #!< number of bytes available at next_out
204-
unsigned long total_out #!< total number of bytes written so far
199+
unsigned int avail_out #!< number of bytes available at next_out
200+
unsigned int total_out #!< total number of bytes written so far
205201
isal_hufftables *hufftables #!< Huffman encoding used when compressing
206-
unsigned long level #!< Compression level to use
207-
unsigned long level_buf_size #!< Size of level_buf
202+
unsigned int level #!< Compression level to use
203+
unsigned int level_buf_size #!< Size of level_buf
208204
unsigned char * level_buf #!< User allocated buffer required for different compression levels
209205
unsigned short end_of_stream #!< non-zero if this is the last input buffer
210206
unsigned short flush #!< Flush type can be NO_FLUSH, SYNC_FLUSH or FULL_FLUSH
@@ -220,19 +216,20 @@ cdef extern from "<isa-l/igzip_lib.h>":
220216

221217
cdef struct inflate_state:
222218
unsigned char *next_out #!< Next output byte
223-
unsigned long avail_out #!< number of bytes available at next_out
224-
unsigned long total_out #!< total number of bytes written so far
219+
unsigned int avail_out #!< number of bytes available at next_out
220+
unsigned int total_out #!< total number of bytes written so far
225221
unsigned char *next_in #!< Next input byte
226-
unsigned long avail_in #!< number of bytes available at next_in
227-
long read_in_length #!< Bits in read_in
222+
unsigned int avail_in #!< number of bytes available at next_in
223+
unsigned long long read_in #!< Bits buffered to handle unaligned streams
224+
int read_in_length #!< Bits in read_in
228225
inflate_huff_code_large lit_huff_code #!< Structure for decoding lit/len symbols
229226
inflate_huff_code_small dist_huff_code #!< Structure for decoding dist symbols
230227
isal_block_state block_state #!< Current decompression state
231-
unsigned long dict_length #!< Length of dictionary used
232-
unsigned long bfinal #!< Flag identifying final block
233-
unsigned long crc_flag #!< Flag identifying whether to track of crc
234-
unsigned long crc #!< Contains crc or adler32 of output if crc_flag is set
235-
unsigned long hist_bits #!< Log base 2 of maximum lookback distance
228+
unsigned int dict_length #!< Length of dictionary used
229+
unsigned int bfinal #!< Flag identifying final block
230+
unsigned int crc_flag #!< Flag identifying whether to track crc
231+
unsigned int crc #!< Contains crc or adler32 of output if crc_flag is set
232+
unsigned int hist_bits #!< Log base 2 of maximum lookback distance
236233
# Other members are omitted because they are not in use yet.
237234

238235
# Compression functions
@@ -267,7 +264,7 @@ cdef extern from "<isa-l/igzip_lib.h>":
267264
# */
268265
cdef int isal_deflate_set_dict(isal_zstream *stream,
269266
unsigned char *dict,
270-
unsigned long dict_len )
267+
unsigned int dict_len )
271268

272269

273270
#/**
@@ -355,24 +352,6 @@ cdef extern from "<isa-l/igzip_lib.h>":
355352
# */
356353
cdef int isal_deflate_stateless(isal_zstream *stream)
357354

358-
# Other functions
359-
360-
# /**
361-
# * @brief Calculate Adler-32 checksum, runs appropriate version.
362-
# *
363-
# * This function determines what instruction sets are enabled and selects the
364-
# * appropriate version at runtime.
365-
# *
366-
# * @param init: initial Adler-32 value
367-
# * @param buf: buffer to calculate checksum on
368-
# * @param len: buffer length in bytes
369-
# *
370-
# * @returns 32-bit Adler-32 checksum
371-
# */
372-
373-
unsigned long isal_adler32(unsigned long init,
374-
const unsigned char *buf,
375-
unsigned long long len)
376355

377356
###########################
378357
# Inflate functions
@@ -406,7 +385,7 @@ cdef extern from "<isa-l/igzip_lib.h>":
406385
# * @returns COMP_OK,
407386
# * ISAL_INVALID_STATE (dictionary could not be set)
408387
# */
409-
int isal_inflate_set_dict(inflate_state *state, unsigned char *dict, unsigned long dict_len)
388+
int isal_inflate_set_dict(inflate_state *state, unsigned char *dict, unsigned int dict_len)
410389

411390
# /**
412391
# * @brief Fast data (deflate) decompression for storage applications.
@@ -454,3 +433,22 @@ cdef extern from "<isa-l/igzip_lib.h>":
454433
# * ISAL_INCORRECT_CHECKSUM.
455434
# */
456435
int isal_inflate(inflate_state *state)
436+
437+
##########################
438+
# Other functions
439+
##########################
440+
# /**
441+
# * @brief Calculate Adler-32 checksum, runs appropriate version.
442+
# *
443+
# * This function determines what instruction sets are enabled and selects the
444+
# * appropriate version at runtime.
445+
# *
446+
# * @param init: initial Adler-32 value
447+
# * @param buf: buffer to calculate checksum on
448+
# * @param len: buffer length in bytes
449+
# *
450+
# * @returns 32-bit Adler-32 checksum
451+
# */
452+
unsigned int isal_adler32(unsigned int init,
453+
const unsigned char *buf,
454+
unsigned long long len)

0 commit comments

Comments
 (0)