Skip to content

Commit 828838e

Browse files
committed
Add igzip_lib decompressor
1 parent d10aa55 commit 828838e

File tree

3 files changed

+378
-2
lines changed

3 files changed

+378
-2
lines changed

src/isal/igzip.py

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,8 +229,8 @@ def __init__(self, fp):
229229
# Call the init method of gzip._GzipReader's parent here.
230230
# It is not very invasive and allows us to override _PaddedFile
231231
_compression.DecompressReader.__init__(
232-
self, _PaddedFile(fp), isal_zlib.decompressobj,
233-
wbits=-isal_zlib.MAX_WBITS)
232+
self, _PaddedFile(fp), igzip_lib.IgzipDecompressor,
233+
hist_bits=igzip_lib.MAX_HIST_BITS, flag=igzip_lib.DECOMP_DEFLATE)
234234
# Set flag indicating start of a new member
235235
self._new_member = True
236236
self._last_mtime = None
@@ -241,6 +241,57 @@ def _add_read_data(self, data):
241241
self._crc = isal_zlib.crc32(data, self._crc)
242242
self._stream_size += len(data)
243243

244+
def read(self, size=-1):
    """Read and return up to *size* bytes of decompressed data.

    :param size: maximum number of bytes to return. A negative value
                 delegates to ``readall()``; ``0`` returns ``b""``
                 immediately.
    :return: a non-empty bytes object, or ``b""`` when no further gzip
             member header can be read.
    :raises EOFError: when the underlying file yields no more data while
                      the decompressor produced no output for it.
    """
    if size < 0:
        return self.readall()
    # size=0 is special because decompress(max_length=0) is not supported
    if not size:
        return b""

    # For certain input data, a single call to decompress() may not return
    # any data. In this case, retry until we get some data or reach EOF.
    while True:
        if self._decompressor.eof:
            # Ending case: we've come to the end of a member in the file,
            # so finish up this member, and read a new gzip header.
            # Check the CRC and file size, and set the flag so we read
            # a new member
            self._read_eof()
            self._new_member = True
            self._decompressor = self._decomp_factory(
                **self._decomp_args)

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            self._init_read()
            if not self._read_gzip_header():
                self._size = self._pos
                return b""
            self._new_member = False

        # ``buf`` stays None when this iteration did not read from the
        # file. Without this reset the EOF check below would hit an
        # unbound local whenever the decompressor still held buffered
        # input (needs_input is False) but produced no output from it.
        buf = None
        if self._decompressor.needs_input:
            # Read a chunk of data from the file
            buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)
            uncompress = self._decompressor.decompress(buf, size)
        else:
            # The decompressor still has input of its own; feed nothing.
            uncompress = self._decompressor.decompress(b"", size)
        if self._decompressor.unused_data != b"":
            # Prepend the already read bytes to the fileobj so they can
            # be seen by _read_eof() and _read_gzip_header()
            self._fp.prepend(self._decompressor.unused_data)

        if uncompress != b"":
            break
        # Only an empty read from the file itself means the file ended.
        if buf == b"":
            raise EOFError("Compressed file ended before the "
                           "end-of-stream marker was reached")

    self._add_read_data(uncompress)
    self._pos += len(uncompress)
    return uncompress
294+
244295

245296
# Aliases for improved compatibility with CPython gzip module.
246297
GzipFile = IGzipFile

src/isal/igzip_lib.pyx

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ This module comes with the following constants:
6565
"""
6666

6767
from libc.stdint cimport UINT64_MAX, UINT32_MAX
68+
from libc.string cimport memmove, memcpy
6869
from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
6970
from cpython.buffer cimport PyBUF_C_CONTIGUOUS, PyObject_GetBuffer, PyBuffer_Release
7071
from cpython.bytes cimport PyBytes_FromStringAndSize
@@ -327,6 +328,170 @@ cdef bytes view_bitbuffer(inflate_state * stream):
327328
return (read_in >> remainder).to_bytes(8, "little")[:read_in_length]
328329

329330

331+
cdef class IgzipDecompressor:
    """Decompress object for handling streaming decompression.

    Public attributes:

    - ``unused_data``: bytes found after the end of the compressed stream.
      Only set once ``eof`` is true; ``b""`` before that.
    - ``eof``: true once the end-of-stream marker has been reached.
    - ``needs_input``: false when ``decompress`` can be called again with
      ``b""`` because input is still buffered internally (or the stream
      has ended); true when new input is required to make progress.
    """
    cdef public bytes unused_data
    cdef public bint eof
    cdef public bint needs_input
    cdef inflate_state stream
    # Internal buffer holding input that a previous call did not consume.
    cdef unsigned char * input_buffer
    cdef size_t input_buffer_size
    # Number of pending input bytes starting at stream.next_in, tracked as
    # a Py_ssize_t separately from stream.avail_in.
    cdef Py_ssize_t avail_in_real

    def __dealloc__(self):
        if self.input_buffer != NULL:
            PyMem_Free(self.input_buffer)

    def __cinit__(self,
                  flag=ISAL_DEFLATE,
                  hist_bits=ISAL_DEF_MAX_HIST_BITS,
                  zdict = None):
        isal_inflate_init(&self.stream)

        self.stream.hist_bits = hist_bits
        self.stream.crc_flag = flag
        cdef Py_ssize_t zdict_length
        if zdict:
            zdict_length = len(zdict)
            if zdict_length > UINT32_MAX:
                raise OverflowError("zdict length does not fit in an unsigned int")
            err = isal_inflate_set_dict(&self.stream, zdict, zdict_length)
            # NOTE(review): an inflate call is checked with the deflate
            # constant/helper here -- confirm this is intentional.
            if err != COMP_OK:
                check_isal_deflate_rc(err)
        self.unused_data = b""
        self.eof = False
        self.input_buffer = NULL
        self.input_buffer_size = 0
        self.avail_in_real = 0
        self.needs_input = True

    def _view_bitbuffer(self):
        """Shows the 64-bitbuffer of the internal inflate_state. It contains
        a maximum of 8 bytes. This data is already read-in so is not part
        of the unconsumed tail."""
        return view_bitbuffer(&self.stream)

    cdef decompress_buf(self, Py_ssize_t max_length, unsigned char ** obuf):
        """Run isal_inflate until the stream finishes, the pending input is
        used up, or the output buffer cannot be arranged any further.

        ``obuf[0]`` is reset to NULL on entry and then managed by
        ``arrange_output_buffer_with_maximum``; the caller owns whatever
        pointer is left in it and must free it. Sets ``self.eof`` when the
        stream reaches ISAL_BLOCK_FINISH.
        """
        obuf[0] = NULL
        cdef Py_ssize_t obuflen = DEF_BUF_SIZE_I
        cdef int err
        if obuflen > max_length:
            obuflen = max_length
        while True:
            obuflen = arrange_output_buffer_with_maximum(&self.stream, obuf, obuflen, max_length)
            if obuflen == -1:
                raise MemoryError("Unsufficient memory for buffer allocation")
            elif obuflen == -2:
                # presumably: the output buffer is already at max_length
                break
            arrange_input_buffer(&self.stream, &self.avail_in_real)
            err = isal_inflate(&self.stream)
            # Hand the input that isal_inflate left unconsumed back to the
            # Py_ssize_t counter.
            self.avail_in_real += self.stream.avail_in
            if err != ISAL_DECOMP_OK:
                check_isal_inflate_rc(err)
            if self.stream.block_state == ISAL_BLOCK_FINISH:
                self.eof = 1
                break
            elif self.avail_in_real == 0:
                break
        return

    def decompress(self, data, Py_ssize_t max_length = -1):
        """
        Decompress data, returning a bytes object containing the uncompressed
        data corresponding to at least part of the input.

        Input that could not be processed in this call is kept in an
        internal buffer and ``needs_input`` is set to False; call again
        (``data`` may be ``b""``) to continue. When the end of the stream is
        reached, ``eof`` becomes true and every byte read past the end of
        the stream is stored in ``unused_data``.

        :param data: a C-contiguous bytes-like object with compressed data.
        :param max_length: if negative, the output size is not limited.
                           Otherwise it is passed on as the maximum size
                           of the output buffer.
        :raises EOFError: if the end of the stream was already reached.
        """
        if self.eof:
            raise EOFError("End of stream already reached")
        cdef bint input_buffer_in_use

        cdef Py_ssize_t hard_limit
        if max_length < 0:
            hard_limit = PY_SSIZE_T_MAX
        else:
            hard_limit = max_length

        cdef unsigned int avail_now
        cdef unsigned int avail_total
        # Cython makes sure error is handled when acquiring buffer fails.
        cdef Py_buffer buffer_data
        cdef Py_buffer* buffer = &buffer_data
        PyObject_GetBuffer(data, buffer, PyBUF_C_CONTIGUOUS)
        cdef Py_ssize_t ibuflen = buffer.len
        cdef unsigned char * data_ptr = <unsigned char*>buffer.buf

        cdef unsigned char * tmp
        cdef size_t offset
        # Initialise output buffer
        cdef unsigned char *obuf = NULL

        try:
            if self.stream.next_in != NULL:
                # Input from an earlier call is still pending in
                # input_buffer: append the new data behind it.
                # avail_now: free bytes after the pending data.
                avail_now = (self.input_buffer + self.input_buffer_size) - \
                    (self.stream.next_in + self.avail_in_real)
                # avail_total: free bytes once the pending data is moved
                # to the start of the buffer.
                avail_total = self.input_buffer_size - self.avail_in_real
                if avail_total < ibuflen:
                    # Not enough room even after compacting: grow.
                    offset = self.stream.next_in - self.input_buffer
                    new_size = self.input_buffer_size + ibuflen - avail_now
                    tmp = <unsigned char*>PyMem_Realloc(self.input_buffer, new_size)
                    if tmp == NULL:
                        raise MemoryError()
                    self.input_buffer = tmp
                    self.input_buffer_size = new_size
                    self.stream.next_in = self.input_buffer + offset
                elif avail_now < ibuflen:
                    # Enough room overall: compact the pending data.
                    memmove(self.input_buffer, self.stream.next_in,
                            self.avail_in_real)
                    self.stream.next_in = self.input_buffer
                memcpy(<void *>(self.stream.next_in + self.avail_in_real), data_ptr, buffer.len)
                self.avail_in_real += ibuflen
                input_buffer_in_use = 1
            else:
                # Nothing pending: decompress straight from the caller's
                # buffer without copying.
                self.stream.next_in = data_ptr
                self.avail_in_real = ibuflen
                input_buffer_in_use = 0

            self.decompress_buf(hard_limit, &obuf)
            if obuf == NULL:
                self.stream.next_in = NULL
                return b""
            if self.eof:
                self.needs_input = False
                # Bytes already pulled into the bit buffer are past the
                # end of the stream as well. They must be reported even
                # when no further input bytes are left; otherwise a
                # trailer that fits entirely in the bit buffer is lost.
                unused = self._view_bitbuffer()
                if self.avail_in_real > 0:
                    unused += PyBytes_FromStringAndSize(<char *>self.stream.next_in, self.avail_in_real)
                self.unused_data = unused
            elif self.avail_in_real == 0:
                self.stream.next_in = NULL
                self.needs_input = True
            else:
                self.needs_input = False
                if not input_buffer_in_use:
                    # next_in points into the caller's buffer, which is
                    # released below; save the tail in our own buffer.

                    # Discard buffer if too small.
                    # Resizing may needlessly copy the current contents.
                    if self.input_buffer != NULL and self.input_buffer_size < self.avail_in_real:
                        PyMem_Free(self.input_buffer)
                        self.input_buffer = NULL

                    # Allocate if necessary
                    if self.input_buffer == NULL:
                        self.input_buffer = <unsigned char *>PyMem_Malloc(self.avail_in_real)
                        if self.input_buffer == NULL:
                            raise MemoryError()
                        self.input_buffer_size = self.avail_in_real

                    # Copy tail
                    memcpy(self.input_buffer, self.stream.next_in, self.avail_in_real)
                    self.stream.next_in = self.input_buffer
            return PyBytes_FromStringAndSize(<char*>obuf, self.stream.next_out - obuf)
        finally:
            PyBuffer_Release(buffer)
            PyMem_Free(obuf)
493+
494+
330495
cdef int mem_level_to_bufsize(int compression_level, int mem_level, unsigned int *bufsize):
331496
"""
332497
Convert zlib memory levels to isal equivalents

0 commit comments

Comments
 (0)