Skip to content

Commit 1d89ae7

Browse files
authored
Merge pull request #58 from pycompression/release_0.8.0
Release 0.8.0
2 parents 90dabed + 050dd39 commit 1d89ae7

File tree

8 files changed

+532
-244
lines changed

8 files changed

+532
-244
lines changed

CHANGELOG.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,18 @@ Changelog
77
.. This document is user facing. Please word the changes in such a way
88
.. that users understand how the changes affect the new version.
99
10+
version 0.8.0
11+
-----------------
12+
+ Speed up ``igzip.compress`` and ``igzip.decompress`` by improving the
13+
implementation.
14+
+ Make sure compiler arguments are passed to the ISA-L compilation step. Previously
15+
ISA-L was compiled without optimisation steps, causing the statically linked
16+
library to be significantly slower.
17+
+ An unused constant from the ``isal_zlib`` library was removed:
18+
``ISAL_DEFAULT_HIST_BITS``.
19+
+ Refactor isal_zlib.pyx to work almost the same as zlibmodule.c. This has made
20+
the code look cleaner and has reduced some overhead.
21+
1022
version 0.7.0
1123
-----------------
1224
+ Remove workarounds in the ``igzip`` module for the ``unconsumed_tail``

benchmark.py

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import argparse
22
import gzip
3+
import io # noqa: F401 used in timeit strings
34
import timeit
45
import zlib
56
from pathlib import Path
@@ -65,12 +66,12 @@ def benchmark(name: str,
6566
number=number, **kwargs)
6667
isal_time = timeit.timeit(isal_string, **timeit_kwargs)
6768
zlib_time = timeit.timeit(zlib_string, **timeit_kwargs)
68-
isal_nanosecs = round(isal_time * (1_000_000 / number), 2)
69-
zlib_nanosecs = round(zlib_time * (1_000_000 / number), 2)
69+
isal_microsecs = round(isal_time * (1_000_000 / number), 2)
70+
zlib_microsecs = round(zlib_time * (1_000_000 / number), 2)
7071
ratio = round(isal_time / zlib_time, 2)
7172
print("{0}\t{1}\t{2}\t{3}".format(name,
72-
isal_nanosecs,
73-
zlib_nanosecs,
73+
isal_microsecs,
74+
zlib_microsecs,
7475
ratio))
7576

7677

@@ -82,6 +83,8 @@ def argument_parser() -> argparse.ArgumentParser:
8283
parser.add_argument("--checksums", action="store_true")
8384
parser.add_argument("--functions", action="store_true")
8485
parser.add_argument("--gzip", action="store_true")
86+
parser.add_argument("--sizes", action="store_true")
87+
parser.add_argument("--objects", action="store_true")
8588
return parser
8689

8790

@@ -96,19 +99,34 @@ def argument_parser() -> argparse.ArgumentParser:
9699
"isal_zlib.adler32(data_block)",
97100
"zlib.adler32(data_block)")
98101
if args.functions or args.all:
99-
benchmark("Compression", sizes,
102+
benchmark("zlib compression", sizes,
100103
"isal_zlib.compress(data_block, 1)",
101104
"zlib.compress(data_block, 1)")
102105

103-
benchmark("Decompression", compressed_sizes,
106+
benchmark("zlib decompression", compressed_sizes,
104107
"isal_zlib.decompress(data_block)",
105108
"zlib.decompress(data_block)")
106109

107110
if args.gzip or args.all:
108-
benchmark("Compression", sizes,
111+
benchmark("gzip compression", sizes,
109112
"igzip.compress(data_block, 1)",
110113
"gzip.compress(data_block, 1)")
111114

112-
benchmark("Decompression", compressed_sizes_gzip,
115+
benchmark("gzip decompression", compressed_sizes_gzip,
113116
"igzip.decompress(data_block)",
114117
"gzip.decompress(data_block)")
118+
if args.objects or args.all:
119+
benchmark("zlib Compress instantiation", {"": b""},
120+
"a = isal_zlib.compressobj()",
121+
"a = zlib.compressobj()")
122+
benchmark("zlib Decompress instantiation", {"": b""},
123+
"a = isal_zlib.decompressobj()",
124+
"a = zlib.decompressobj()")
125+
benchmark("Gzip Writer instantiation", {"": b""},
126+
"a = igzip.GzipFile(fileobj=io.BytesIO(), mode='wb' )",
127+
"a = gzip.GzipFile(fileobj=io.BytesIO(), mode='wb')")
128+
benchmark("Gzip Reader instantiation", {"": b""},
129+
"a = igzip.GzipFile(fileobj=io.BytesIO(), mode='rb' )",
130+
"a = gzip.GzipFile(fileobj=io.BytesIO(), mode='rb')")
131+
if args.sizes or args.all:
132+
show_sizes()

setup.py

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1818
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1919
# SOFTWARE.
20+
import copy
2021
import functools
2122
import os
2223
import shutil
@@ -76,7 +77,18 @@ def build_extension(self, ext):
7677
raise NotImplementedError(
7778
f"Unsupported platform: {sys.platform}")
7879
else:
79-
isa_l_prefix_dir = build_isa_l()
80+
if self.compiler.compiler_type == "msvc":
81+
compiler = copy.deepcopy(self.compiler)
82+
compiler.initialize()
83+
compiler_command = f'"{compiler.cc}"'
84+
compiler_args = compiler.compile_options
85+
elif self.compiler.compiler_type == "unix":
86+
compiler_command = self.compiler.compiler[0]
87+
compiler_args = self.compiler.compiler[1:]
88+
else:
89+
raise NotImplementedError("Unknown compiler")
90+
isa_l_prefix_dir = build_isa_l(compiler_command,
91+
" ".join(compiler_args))
8092
if SYSTEM_IS_UNIX:
8193
ext.extra_objects = [
8294
os.path.join(isa_l_prefix_dir, "lib", "libisal.a")]
@@ -113,7 +125,7 @@ def build_extension(self, ext):
113125
# 'cache' is only available from python 3.9 onwards.
114126
# see: https://docs.python.org/3/library/functools.html#functools.cache
115127
@functools.lru_cache(maxsize=None)
116-
def build_isa_l():
128+
def build_isa_l(compiler_command: str, compiler_options: str):
117129
# Creating temporary directories
118130
build_dir = tempfile.mktemp()
119131
temp_prefix = tempfile.mkdtemp()
@@ -123,7 +135,13 @@ def build_isa_l():
123135
# it.
124136
build_env = os.environ.copy()
125137
# Add -fPIC flag to allow static compilation
126-
build_env["CFLAGS"] = build_env.get("CFLAGS", "") + " -fPIC"
138+
build_env["CC"] = compiler_command
139+
if SYSTEM_IS_UNIX:
140+
build_env["CFLAGS"] = compiler_options + " -fPIC"
141+
elif SYSTEM_IS_WINDOWS:
142+
# The nmake file has CFLAGS_REL for all the compiler options.
143+
# This is added to CFLAGS with all the necessary include options.
144+
build_env["CFLAGS_REL"] = compiler_options
127145
if hasattr(os, "sched_getaffinity"):
128146
cpu_count = len(os.sched_getaffinity(0))
129147
else: # sched_getaffinity not available on all platforms
@@ -133,11 +151,10 @@ def build_isa_l():
133151
subprocess.run(os.path.join(build_dir, "autogen.sh"), **run_args)
134152
subprocess.run([os.path.join(build_dir, "configure"),
135153
"--prefix", temp_prefix], **run_args)
136-
subprocess.run(["make", "-j", str(cpu_count)],
137-
**run_args)
138-
subprocess.run(["make", "install"], **run_args)
154+
subprocess.run(["make", "-j", str(cpu_count)], **run_args)
155+
subprocess.run(["make", "-j", str(cpu_count), "install"], **run_args)
139156
elif SYSTEM_IS_WINDOWS:
140-
subprocess.run(["nmake", "/f", "Makefile.nmake"], **run_args)
157+
subprocess.run(["nmake", "/E", "/f", "Makefile.nmake"], **run_args)
141158
Path(temp_prefix, "include").mkdir()
142159
print(temp_prefix, file=sys.stderr)
143160
shutil.copytree(os.path.join(build_dir, "include"),
@@ -152,7 +169,7 @@ def build_isa_l():
152169

153170
setup(
154171
name="isal",
155-
version="0.7.0",
172+
version="0.8.0",
156173
description="Faster zlib and gzip compatible compression and "
157174
"decompression by providing python bindings for the ISA-L "
158175
"library.",

src/isal/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,4 +39,4 @@
3939
"__version__"
4040
]
4141

42-
__version__ = "0.7.0"
42+
__version__ = "0.8.0"

src/isal/igzip.py

Lines changed: 95 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,10 @@
2525
import gzip
2626
import io
2727
import os
28+
import struct
2829
import sys
30+
import time
31+
from typing import List, Optional, SupportsInt
2932

3033
from . import isal_zlib
3134

@@ -35,6 +38,8 @@
3538
_COMPRESS_LEVEL_TRADEOFF = isal_zlib.ISAL_DEFAULT_COMPRESSION
3639
_COMPRESS_LEVEL_BEST = isal_zlib.ISAL_BEST_COMPRESSION
3740

41+
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
42+
3843
try:
3944
BadGzipFile = gzip.BadGzipFile # type: ignore
4045
except AttributeError: # Versions lower than 3.8 do not have BadGzipFile
@@ -52,7 +57,7 @@ def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_TRADEOFF,
5257
5358
The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for
5459
binary mode, or "rt", "wt", "xt" or "at" for text mode. The default mode is
55-
"rb", and the default compresslevel is 9.
60+
"rb", and the default compresslevel is 2.
5661
5762
For binary mode, this function is equivalent to the GzipFile constructor:
5863
GzipFile(filename, mode, compresslevel). In this case, the encoding, errors
@@ -217,24 +222,105 @@ def _add_read_data(self, data):
217222
_GzipReader = _IGzipReader
218223

219224

220-
# Plagiarized from gzip.py from python's stdlib.
225+
def _create_simple_gzip_header(compresslevel: int,
226+
mtime: Optional[SupportsInt] = None) -> bytes:
227+
"""
228+
Create a simple gzip header with no extra fields.
229+
:param compresslevel: Compresslevel used to determine the xfl bytes.
230+
:param mtime: The mtime (must support conversion to a 32-bit integer).
231+
:return: A bytes object representing the gzip header.
232+
"""
233+
if mtime is None:
234+
mtime = time.time()
235+
# There is no best compression level. ISA-L only provides algorithms for
236+
# fast and medium levels.
237+
xfl = 4 if compresslevel == _COMPRESS_LEVEL_FAST else 0
238+
# Pack ID1 and ID2 magic bytes, method (8=deflate), header flags (no extra
239+
# fields added to header), mtime, xfl and os (255 for unknown OS).
240+
return struct.pack("<BBBBLBB", 0x1f, 0x8b, 8, 0, int(mtime), xfl, 255)
241+
242+
221243
def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
222244
"""Compress data in one shot and return the compressed string.
223245
Optional argument is the compression level, in range of 0-3.
224246
"""
225-
buf = io.BytesIO()
226-
with IGzipFile(fileobj=buf, mode='wb',
227-
compresslevel=compresslevel, mtime=mtime) as f:
228-
f.write(data)
229-
return buf.getvalue()
247+
header = _create_simple_gzip_header(compresslevel, mtime)
248+
# Compress the data without header or trailer in a raw deflate block.
249+
compressed = isal_zlib.compress(data, compresslevel, wbits=-15)
250+
length = len(data) & 0xFFFFFFFF
251+
crc = isal_zlib.crc32(data)
252+
trailer = struct.pack("<LL", crc, length)
253+
return header + compressed + trailer
254+
255+
256+
def _gzip_header_end(data: bytes) -> int:
257+
"""
258+
Find the start of the raw deflate block in a gzip file.
259+
:param data: Compressed data that starts with a gzip header.
260+
:return: The end of the header / start of the raw deflate block.
261+
"""
262+
eof_error = EOFError("Compressed file ended before the end-of-stream "
263+
"marker was reached")
264+
if len(data) < 10:
265+
raise eof_error
266+
# We are not interested in mtime, xfl and os flags.
267+
magic, method, flags = struct.unpack("<HBB", data[:4])
268+
if magic != 0x8b1f:
269+
raise BadGzipFile(f"Not a gzipped file ({repr(data[:2])})")
270+
if method != 8:
271+
raise BadGzipFile("Unknown compression method")
272+
pos = 10
273+
if flags & FEXTRA:
274+
if len(data) < pos + 2:
275+
raise eof_error
276+
xlen = int.from_bytes(data[pos: pos + 2], "little", signed=False)
277+
pos += 2 + xlen
278+
if flags & FNAME:
279+
pos = data.find(b"\x00", pos) + 1
280+
# pos will be 0 (-1 + 1) when the null byte is not found.
281+
if not pos:
282+
raise eof_error
283+
if flags & FCOMMENT:
284+
pos = data.find(b"\x00", pos) + 1
285+
if not pos:
286+
raise eof_error
287+
if flags & FHCRC:
288+
if len(data) < pos + 2:
289+
raise eof_error
290+
header_crc = int.from_bytes(data[pos: pos + 2], "little", signed=False)
291+
# The header CRC is stored as a 16-bit integer: the lowest 16 bits of crc32.
292+
crc = isal_zlib.crc32(data[:pos]) & 0xFFFF
293+
if header_crc != crc:
294+
raise BadGzipFile(f"Corrupted header. Checksums do not "
295+
f"match: {crc} != {header_crc}")
296+
pos += 2
297+
return pos
230298

231299

232300
def decompress(data):
233301
"""Decompress a gzip compressed string in one shot.
234302
Return the decompressed string.
235303
"""
236-
with IGzipFile(fileobj=io.BytesIO(data)) as f:
237-
return f.read()
304+
all_blocks: List[bytes] = []
305+
while True:
306+
if data == b"":
307+
break
308+
header_end = _gzip_header_end(data)
309+
do = isal_zlib.decompressobj(-15)
310+
block = do.decompress(data[header_end:]) + do.flush()
311+
if not do.eof or len(do.unused_data) < 8:
312+
raise EOFError("Compressed file ended before the end-of-stream "
313+
"marker was reached")
314+
checksum, length = struct.unpack("<II", do.unused_data[:8])
315+
crc = isal_zlib.crc32(block)
316+
if crc != checksum:
317+
raise BadGzipFile("CRC check failed")
318+
if length != len(block):
319+
raise BadGzipFile("Incorrect length of data produced")
320+
all_blocks.append(block)
321+
# Remove all padding null bytes and start next block.
322+
data = do.unused_data[8:].lstrip(b"\x00")
323+
return b"".join(all_blocks)
238324

239325

240326
def main():

src/isal/isal_zlib.pyi

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ Z_DEFAULT_COMPRESSION: int
2828
DEF_BUF_SIZE: int
2929
DEF_MEM_LEVEL: int
3030
MAX_WBITS: int
31-
ISAL_DEFAULT_HIST_BITS: int
3231

3332
DEFLATED: int
3433

@@ -38,12 +37,9 @@ Z_HUFFMAN_ONLY: int
3837
Z_FILTERED: int
3938
Z_FIXED: int
4039

41-
ISAL_NO_FLUSH: int
42-
ISAL_SYNC_FLUSH: int
43-
ISAL_FULL_FLUSH: int
44-
4540
Z_NO_FLUSH: int
4641
Z_SYNC_FLUSH: int
42+
Z_FULL_FLUSH: int
4743
Z_FINISH: int
4844

4945
class IsalError(OSError): ...

0 commit comments

Comments
 (0)