Skip to content

Commit ff370da

Browse files
authored
Merge pull request #106 from pycompression/fasterungzip
Improve GzipReader compression rates.
2 parents 21f1f94 + 589824a commit ff370da

File tree

10 files changed

+61
-33
lines changed

10 files changed

+61
-33
lines changed

CHANGELOG.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@ that PyPy is no longer supported.
3333
``isal_zlib.IsalError`` has been removed.
3434
+ The base class for ``isal_zlib.error`` and ``igzip_lib.IsalError`` is now
3535
``Exception`` instead of ``OSError``.
36+
+ GzipReader now uses larger input and output buffers (128k) by default and
37+
IgzipDecompressor.decompress has been updated to allocate ``maxsize`` buffers
38+
when these are of reasonable size, instead of growing the buffer to maxsize
39+
on every call. This has improved gzip decompression speeds by 7%.
3640
+ ISA-L library version variables are now available on windows as well.
3741
+ Wheels are now always built with nasm for the x86 architecture. Previously
3842
yasm was used for Linux and MacOS due to build issues that have since been

benchmark.py renamed to benchmark_scripts/benchmark.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
from isal import igzip, isal_zlib # noqa: F401 used in timeit strings
1010

11-
DATA_DIR = Path(__file__).parent / "tests" / "data"
11+
DATA_DIR = Path(__file__).parent.parent / "tests" / "data"
1212
COMPRESSED_FILE = DATA_DIR / "test.fastq.gz"
1313
with gzip.open(str(COMPRESSED_FILE), mode="rb") as file_h:
1414
data = file_h.read()
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
import sys
2+
3+
from isal import igzip
4+
5+
with igzip.open(sys.argv[1], "rb") as gzip_file:
6+
while True:
7+
block = gzip_file.read(128 * 1024)
8+
if not block:
9+
break

benchmark_scripts/gzipreadlines.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
import sys
2+
3+
from isal import igzip
4+
5+
with igzip.open(sys.argv[1], "rb") as gzip_file:
6+
for line in gzip_file:
7+
pass

profile_igzipreader.py renamed to benchmark_scripts/profile_igzipreader.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,9 @@
55

66

77
def main():
8-
igzip.READ_BUFFER_SIZE = 32 * 1024
98
with igzip.open(sys.argv[1], mode="rb") as gzip_h:
109
while True:
11-
block = gzip_h.read(32*1024)
10+
block = gzip_h.read(128*1024)
1211
if block == b"":
1312
return
1413

src/isal/igzip.py

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,9 @@
4949

5050
#: The amount of data that is read in at once when decompressing a file.
5151
#: Increasing this value may increase performance.
52-
READ_BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE
52+
#: 128K is also the size used by pigz and cat to read files from the
53+
#: filesystem.
54+
READ_BUFFER_SIZE = 128 * 1024
5355

5456
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
5557

@@ -164,7 +166,7 @@ def __init__(self, filename=None, mode=None,
164166
0)
165167
if self.mode == gzip.READ:
166168
raw = _IGzipReader(self.fileobj)
167-
self._buffer = io.BufferedReader(raw)
169+
self._buffer = io.BufferedReader(raw, buffer_size=READ_BUFFER_SIZE)
168170

169171
def __repr__(self):
170172
s = repr(self.fileobj)
@@ -247,12 +249,6 @@ def __init__(self, fp):
247249
self._new_member = True
248250
self._last_mtime = None
249251

250-
def _add_read_data(self, data):
251-
# Use faster isal crc32 calculation and update the stream size in place
252-
# compared to CPython gzip
253-
self._crc = isal_zlib.crc32(data, self._crc)
254-
self._stream_size += len(data)
255-
256252
def read(self, size=-1):
257253
if size < 0:
258254
return self.readall()
@@ -300,7 +296,8 @@ def read(self, size=-1):
300296
raise EOFError("Compressed file ended before the "
301297
"end-of-stream marker was reached")
302298

303-
self._add_read_data(uncompress)
299+
self._crc = isal_zlib.crc32(uncompress, self._crc)
300+
self._stream_size += len(uncompress)
304301
self._pos += len(uncompress)
305302
return uncompress
306303

@@ -451,7 +448,7 @@ def _argument_parser():
451448
# diminishing returns hit. _compression.BUFFER_SIZE = 8k. But 32K is about
452449
# ~6% faster.
453450
parser.add_argument("-b", "--buffer-size",
454-
default=128 * 1024, type=int,
451+
default=READ_BUFFER_SIZE, type=int,
455452
help=argparse.SUPPRESS)
456453
return parser
457454

src/isal/igzip_lib.c

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -121,12 +121,27 @@ decompress_buf(IgzipDecompressor *self, Py_ssize_t max_length)
121121
compare against max_length and PyBytes_GET_SIZE we declare it as
122122
signed */
123123
PyObject * RetVal = NULL;
124-
Py_ssize_t obuflen = DEF_BUF_SIZE;
124+
Py_ssize_t hard_limit;
125+
126+
Py_ssize_t obuflen;
127+
125128
int err;
126129

127-
if (obuflen > max_length)
130+
if (max_length < 0) {
131+
hard_limit = PY_SSIZE_T_MAX;
132+
obuflen = DEF_BUF_SIZE;
133+
} else {
134+
// Assume that decompressor is used in file decompression with a fixed
135+
// block size of max_length. In that case we will reach max_length almost
136+
// always (except at the end of the file). So it makes sense to allocate
137+
// max_length.
138+
hard_limit = max_length;
128139
obuflen = max_length;
129-
140+
if (obuflen > DEF_MAX_INITIAL_BUF_SIZE){
141+
// Safeguard against memory overflow.
142+
obuflen = DEF_MAX_INITIAL_BUF_SIZE;
143+
}
144+
}
130145

131146
do {
132147
arrange_input_buffer(&(self->state.avail_in), &(self->avail_in_real));
@@ -136,7 +151,7 @@ decompress_buf(IgzipDecompressor *self, Py_ssize_t max_length)
136151
&(self->state.next_out),
137152
&RetVal,
138153
obuflen,
139-
max_length);
154+
hard_limit);
140155
if (obuflen == -1){
141156
PyErr_SetString(PyExc_MemoryError,
142157
"Unsufficient memory for buffer allocation");
@@ -176,12 +191,6 @@ decompress(IgzipDecompressor *self, uint8_t *data, size_t len, Py_ssize_t max_le
176191
char input_buffer_in_use;
177192
PyObject *result;
178193

179-
Py_ssize_t hard_limit;
180-
if (max_length < 0) {
181-
hard_limit = PY_SSIZE_T_MAX;
182-
} else {
183-
hard_limit = max_length;
184-
}
185194
/* Prepend unconsumed input if necessary */
186195
if (self->state.next_in != NULL) {
187196
size_t avail_now, avail_total;
@@ -227,7 +236,7 @@ decompress(IgzipDecompressor *self, uint8_t *data, size_t len, Py_ssize_t max_le
227236
input_buffer_in_use = 0;
228237
}
229238

230-
result = decompress_buf(self, hard_limit);
239+
result = decompress_buf(self, max_length);
231240
if(result == NULL) {
232241
self->state.next_in = NULL;
233242
return NULL;

src/isal/igzip_lib_impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
/* Initial buffer size. */
2828
#define DEF_BUF_SIZE (16*1024)
29+
#define DEF_MAX_INITIAL_BUF_SIZE (16 * 1024 * 1024)
2930
#define ISAL_BEST_SPEED ISAL_DEF_MIN_LEVEL
3031
#define ISAL_BEST_COMPRESSION ISAL_DEF_MAX_LEVEL
3132
#define ISAL_DEFAULT_COMPRESSION 2

tests/test_gzip_compliance.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -572,13 +572,15 @@ def test_bytes_filename(self):
572572

573573
def test_decompress_limited(self):
574574
"""Decompressed data buffering should be limited"""
575-
bomb = igzip.compress(b'\0' * int(2e6), compresslevel=3)
576-
self.assertLess(len(bomb), io.DEFAULT_BUFFER_SIZE)
575+
bomb_size = int(2e6)
576+
self.assertLess(igzip.READ_BUFFER_SIZE, bomb_size)
577+
bomb = gzip.compress(b'\0' * bomb_size, compresslevel=9)
578+
self.assertLess(len(bomb), igzip.READ_BUFFER_SIZE)
577579

578580
bomb = io.BytesIO(bomb)
579581
decomp = igzip.GzipFile(fileobj=bomb)
580582
self.assertEqual(decomp.read(1), b'\0')
581-
max_decomp = 1 + io.DEFAULT_BUFFER_SIZE
583+
max_decomp = 1 + igzip.READ_BUFFER_SIZE
582584
self.assertLessEqual(decomp._buffer.raw.tell(), max_decomp,
583585
"Excessive amount of data was decompressed")
584586

tox.ini

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@ deps=flake8
3434
pytest
3535
skip_install=True
3636
commands =
37-
flake8 src tests setup.py benchmark.py
38-
mypy src/ tests
37+
flake8 src tests setup.py benchmark_scripts
38+
mypy src/ tests benchmark_scripts
3939

4040
[testenv:twine_check]
4141
deps=build
@@ -67,19 +67,19 @@ commands=
6767
[testenv:benchmark-all]
6868
deps=
6969
commands=
70-
python ./benchmark.py --all
70+
python ./benchmark_scripts/benchmark.py --all
7171

7272
[testenv:benchmark-functions]
7373
deps=
7474
commands=
75-
python ./benchmark.py --functions
75+
python ./benchmark_scripts/benchmark.py --functions
7676

7777
[testenv:benchmark-gzip]
7878
deps=
7979
commands=
80-
python ./benchmark.py --gzip
80+
python ./benchmark_scripts/benchmark.py --gzip
8181

8282
[testenv:benchmark-checksums]
8383
deps=
8484
commands=
85-
python ./benchmark.py --checksums
85+
python ./benchmark_scripts/benchmark.py --checksums

0 commit comments

Comments
 (0)