-
Notifications
You must be signed in to change notification settings - Fork 16
Support for lz4 compression #163 #168
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 3 commits
34efbb2
00de271
7ab5809
bcd85b4
0d1a114
64e55de
8e474a8
bf32deb
9376298
d3992d9
0a3bacd
ea4d533
9bf878d
2ee242d
e5194f4
f59fa84
15d59db
2aea1ca
4c18c24
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,3 +5,8 @@ __pycache__/ | |
.tox | ||
venv/ | ||
src/xopen/_version.py | ||
settings.json | ||
.coverage | ||
coverage.xml | ||
.vscode/* | ||
.DS_Store |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,30 +5,40 @@ build-backend = "setuptools.build_meta" | |
[project] | ||
name = "xopen" | ||
authors = [ | ||
{name = "Marcel Martin", email = "[email protected]"}, | ||
gnzsnz marked this conversation as resolved.
Show resolved
Hide resolved
|
||
{name = "Ruben Vorderman", email = "[email protected]"} | ||
{ name = "Marcel Martin", email = "[email protected]" }, | ||
{ name = "Ruben Vorderman", email = "[email protected]" }, | ||
] | ||
description = "Open compressed files transparently" | ||
readme = "README.rst" | ||
license = {text = "MIT"} | ||
license = { text = "MIT" } | ||
classifiers = [ | ||
"Development Status :: 5 - Production/Stable", | ||
"License :: OSI Approved :: MIT License", | ||
"Programming Language :: Python :: 3" | ||
"Programming Language :: Python :: 3", | ||
] | ||
requires-python = ">=3.8" | ||
dynamic = ["version"] | ||
dependencies = [ | ||
'isal>=1.6.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"', | ||
'zlib-ng>=0.4.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"' | ||
'zlib-ng>=0.4.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"', | ||
] | ||
|
||
[project.urls] | ||
homepage = "https://github.com/pycompression/xopen/" | ||
|
||
[project.optional-dependencies] | ||
dev = ["pytest"] | ||
dev = [ | ||
"pytest", | ||
"pytest-timeout", | ||
"tox", | ||
"black", | ||
"flake8", | ||
"mypy", | ||
"twine", | ||
"setuptools_scm[toml]", | ||
] | ||
gnzsnz marked this conversation as resolved.
Show resolved
Hide resolved
|
||
zstd = ["zstandard<1"] | ||
lz4 = ["lz4>=4.3.2"] | ||
gnzsnz marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
[tool.setuptools_scm] | ||
write_to = "src/xopen/_version.py" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -43,6 +43,7 @@ | |
XOPEN_DEFAULT_BZ2_COMPRESSION = 9 | ||
XOPEN_DEFAULT_XZ_COMPRESSION = 6 | ||
XOPEN_DEFAULT_ZST_COMPRESSION = 3 | ||
XOPEN_DEFAULT_LZ4_COMPRESSION = 1 | ||
|
||
igzip: Optional[ModuleType] | ||
isal_zlib: Optional[ModuleType] | ||
|
@@ -70,6 +71,11 @@ | |
except ImportError: | ||
zstandard = None # type: ignore | ||
|
||
try: | ||
import lz4.frame # type: ignore | ||
except ImportError: | ||
lz4 = None | ||
|
||
try: | ||
import fcntl | ||
|
||
|
@@ -120,6 +126,7 @@ class _ProgramSettings: | |
"zstd": _ProgramSettings(("zstd",), tuple(range(1, 20)), "-T"), | ||
"pigz": _ProgramSettings(("pigz", "--no-name"), tuple(range(0, 10)) + (11,), "-p"), | ||
"gzip": _ProgramSettings(("gzip", "--no-name"), tuple(range(1, 10))), | ||
"lz4": _ProgramSettings(("lz4",), tuple(range(1, 13)), "-T"), | ||
} | ||
|
||
|
||
|
@@ -551,6 +558,39 @@ def _open_zst( | |
return io.BufferedWriter(f) # mode "ab" and "wb" | ||
|
||
|
||
def _open_lz4( | ||
filename: FileOrPath, | ||
mode: str, | ||
compresslevel: Optional[int], | ||
threads: Optional[int], | ||
): | ||
assert mode in ("rb", "ab", "wb") | ||
if compresslevel is None: | ||
compresslevel = XOPEN_DEFAULT_LZ4_COMPRESSION | ||
|
||
if threads != 0: | ||
try: | ||
return _PipedCompressionProgram( | ||
filename, | ||
mode, | ||
compresslevel, | ||
threads, | ||
program_settings=_PROGRAM_SETTINGS["lz4"], | ||
) | ||
except OSError: | ||
if lz4 is None: | ||
# No fallback available | ||
raise | ||
|
||
if lz4 is None: | ||
raise ImportError("lz4 module not available") | ||
rhpvorderman marked this conversation as resolved.
Show resolved
Hide resolved
|
||
f = lz4.frame.LZ4FrameFile(filename, mode, compression_level=compresslevel) | ||
if "r" in mode: | ||
return f | ||
# Buffer writes on lz4.open to mitigate overhead of small writes | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For gzip this overhead is present because gzip is written in Python. Did you benchmark this to check whether it makes a difference for small writes? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No, I just followed what the other compression formats were doing. For small writes we could use dictionaries for zstd and lz4, which should boost performance for small writes. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Gzip and Bzip2
Xopen does not support that. Gzip can also use dictionaries, but xopen does not provide the handles for that. That is more suited for low level libraries. |
||
return io.BufferedWriter(f) | ||
|
||
|
||
def _open_gz( | ||
filename: FileOrPath, | ||
mode: str, | ||
|
@@ -683,6 +723,10 @@ def _detect_format_from_content(filename: FileOrPath) -> Optional[str]: | |
elif bs[:4] == b"\x28\xb5\x2f\xfd": | ||
# https://datatracker.ietf.org/doc/html/rfc8478#section-3.1.1 | ||
return "zst" | ||
elif bs[:4] == b"\x04\x22\x4d\x18": | ||
# https://en.wikipedia.org/wiki/LZ4_(compression_algorithm) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'd rather have a link to a specification or other technical documentation here. There is no guarantee that Wikipedia will keep listing the magic bytes. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed in 4c18c24 |
||
return "lz4" | ||
|
||
return None | ||
finally: | ||
if closefd: | ||
|
@@ -694,7 +738,7 @@ def _detect_format_from_extension(filename: Union[str, bytes]) -> Optional[str]: | |
Attempt to detect file format from the filename extension. | ||
Return None if no format could be detected. | ||
""" | ||
for ext in ("bz2", "xz", "gz", "zst"): | ||
for ext in ("bz2", "xz", "gz", "zst", "lz4"): | ||
if isinstance(filename, bytes): | ||
if filename.endswith(b"." + ext.encode()): | ||
return ext | ||
|
@@ -797,6 +841,7 @@ def xopen( # noqa: C901 | |
- .bz2 uses bzip2 compression | ||
- .xz uses xz/lzma compression | ||
- .zst uses zstandard compression | ||
- .lz4 uses lz4 compression | ||
- otherwise, no compression is used | ||
|
||
When reading, if a file name extension is available, the format is detected | ||
|
@@ -808,7 +853,7 @@ def xopen( # noqa: C901 | |
compresslevel is the compression level for writing to gzip, xz and zst files. | ||
This parameter is ignored for the other compression formats. | ||
If set to None, a default depending on the format is used: | ||
gzip: 6, xz: 6, zstd: 3. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Note to self: didn't we change the gzip level to 1? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, we did. |
||
gzip: 6, xz: 6, zstd: 3, lz4: 1. | ||
gnzsnz marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
When threads is None (the default), compressed file formats are read or written | ||
using a pipe to a subprocess running an external tool such as, | ||
|
@@ -828,7 +873,7 @@ def xopen( # noqa: C901 | |
|
||
format overrides the autodetection of input and output formats. This can be | ||
useful when compressed output needs to be written to a file without an | ||
extension. Possible values are "gz", "xz", "bz2", "zst". | ||
extension. Possible values are "gz", "xz", "bz2", "zst", "lz4". | ||
""" | ||
if mode in ("r", "w", "a"): | ||
mode += "t" # type: ignore | ||
|
@@ -844,10 +889,10 @@ def xopen( # noqa: C901 | |
elif _file_is_a_socket_or_pipe(filename): | ||
filename = open(filename, binary_mode) # type: ignore | ||
|
||
if format not in (None, "gz", "xz", "bz2", "zst"): | ||
if format not in (None, "gz", "xz", "bz2", "zst", "lz4"): | ||
raise ValueError( | ||
f"Format not supported: {format}. " | ||
f"Choose one of: 'gz', 'xz', 'bz2', 'zst'" | ||
f"Choose one of: 'gz', 'xz', 'bz2', 'zst', 'lz4'." | ||
) | ||
detected_format = format or _detect_format_from_extension(filepath) | ||
if detected_format is None and "r" in mode: | ||
|
@@ -861,6 +906,8 @@ def xopen( # noqa: C901 | |
opened_file = _open_bz2(filename, binary_mode, compresslevel, threads) | ||
elif detected_format == "zst": | ||
opened_file = _open_zst(filename, binary_mode, compresslevel, threads) | ||
elif detected_format == "lz4": | ||
opened_file = _open_lz4(filename, binary_mode, compresslevel, threads) | ||
else: | ||
opened_file, _ = _file_or_path_to_binary_stream(filename, binary_mode) | ||
|
||
|
Uh oh!
There was an error while loading. Please reload this page.