Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,15 +54,16 @@ jobs:
optional-deps: false
with-libs: false
with-zstandard: true
with-lz4: true
- os: windows-latest
python-version: "3.10"
steps:
- name: Install optional tools macOS
if: runner.os == 'macOS' && matrix.optional-deps
run: brew install pigz pbzip2 isa-l zstd
run: brew install pigz pbzip2 isa-l zstd lz4
- name: Install optional tools Linux
if: runner.os == 'Linux' && matrix.optional-deps
run: sudo apt-get install pigz pbzip2 isal zstd
run: sudo apt-get install pigz pbzip2 isal zstd lz4
- name: Remove xz
if: runner.os == 'Linux' && !matrix.optional-deps
run: while which xz; do sudo rm $(which xz); done
Expand All @@ -84,6 +85,9 @@ jobs:
- name: Test with zstandard
if: matrix.with-zstandard
run: tox -e zstd
- name: Test with lz4
if: matrix.with-lz4
run: tox -e lz4
- name: Upload coverage report
uses: codecov/codecov-action@v3

Expand Down
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,8 @@ __pycache__/
.tox
venv/
src/xopen/_version.py
settings.json
.coverage
coverage.xml
.vscode/*
.DS_Store
21 changes: 18 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ Supported compression formats are:
- bzip2 (``.bz2``)
- xz (``.xz``)
- Zstandard (``.zst``) (optional)
- lz4 (``.lz4``) (optional)

``xopen`` is compatible with Python versions 3.8 and later.

Expand Down Expand Up @@ -73,7 +74,7 @@ The function opens the file using a function suitable for the detected
file format and returns an open file-like object.

When writing, the file format is chosen based on the file name extension:
``.gz``, ``.bz2``, ``.xz``, ``.zst``. This can be overriden with ``format``.
``.gz``, ``.bz2``, ``.xz``, ``.zst``, ``.lz4``. This can be overridden with ``format``.
If the extension is not recognized, no compression is used.

When reading and a file name extension is available, the format is detected
Expand Down Expand Up @@ -101,13 +102,13 @@ preferred locale encoding.
**compresslevel**:
The compression level for writing to gzip, xz, Zstandard and lz4 files.
If set to None, a default depending on the format is used:
gzip: 1, xz: 6, Zstandard: 3.
gzip: 1, xz: 6, Zstandard: 3, lz4: 1.

This parameter is ignored for other compression formats.

**format**:
Override the autodetection of the input or output format.
Possible values are: ``"gz"``, ``"xz"``, ``"bz2"``, ``"zst"``.
Possible values are: ``"gz"``, ``"xz"``, ``"bz2"``, ``"zst"``, ``"lz4"``.

**threads**:
Set the number of additional threads spawned for compression or decompression.
Expand Down Expand Up @@ -180,6 +181,20 @@ program or the Python ``zstandard`` package needs to be installed.
To ensure that you get the correct ``zstandard`` version, you can specify the ``zstd`` extra for
``xopen``, that is, install it using ``pip install xopen[zstd]``.

Optional lz4 support
--------------------------

For reading and writing lz4 (``.lz4``) files, either the ``lz4`` command-line
program or the Python ``lz4`` package needs to be installed.

* If the ``threads`` parameter to ``xopen()`` is ``None`` (the default) or any value greater than 0,
``xopen`` uses an external ``lz4`` process.
* If the above fails (because no ``lz4`` program is available) or if ``threads`` is 0,
the ``lz4`` package is used.

To ensure that ``lz4`` is installed, you can specify the ``lz4`` extra for
``xopen``, that is, install it using ``pip install xopen[lz4]``.


Changelog
---------
Expand Down
22 changes: 16 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,30 +5,40 @@ build-backend = "setuptools.build_meta"
[project]
name = "xopen"
authors = [
{name = "Marcel Martin", email = "[email protected]"},
{name = "Ruben Vorderman", email = "[email protected]"}
{ name = "Marcel Martin", email = "[email protected]" },
{ name = "Ruben Vorderman", email = "[email protected]" },
]
description = "Open compressed files transparently"
readme = "README.rst"
license = {text = "MIT"}
license = { text = "MIT" }
classifiers = [
"Development Status :: 5 - Production/Stable",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3"
"Programming Language :: Python :: 3",
]
requires-python = ">=3.8"
dynamic = ["version"]
dependencies = [
'isal>=1.6.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"',
'zlib-ng>=0.4.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"'
'zlib-ng>=0.4.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"',
]

[project.urls]
homepage = "https://github.com/pycompression/xopen/"

[project.optional-dependencies]
dev = ["pytest"]
dev = [
"pytest",
"pytest-timeout",
"tox",
"black",
"flake8",
"mypy",
"twine",
"setuptools_scm[toml]",
]
zstd = ["zstandard<1"]
lz4 = ["lz4>=4.3.2"]

[tool.setuptools_scm]
write_to = "src/xopen/_version.py"
Expand Down
57 changes: 52 additions & 5 deletions src/xopen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
XOPEN_DEFAULT_BZ2_COMPRESSION = 9
XOPEN_DEFAULT_XZ_COMPRESSION = 6
XOPEN_DEFAULT_ZST_COMPRESSION = 3
XOPEN_DEFAULT_LZ4_COMPRESSION = 1

igzip: Optional[ModuleType]
isal_zlib: Optional[ModuleType]
Expand Down Expand Up @@ -70,6 +71,11 @@
except ImportError:
zstandard = None # type: ignore

try:
import lz4.frame # type: ignore
except ImportError:
lz4 = None

try:
import fcntl

Expand Down Expand Up @@ -120,6 +126,7 @@ class _ProgramSettings:
"zstd": _ProgramSettings(("zstd",), tuple(range(1, 20)), "-T"),
"pigz": _ProgramSettings(("pigz", "--no-name"), tuple(range(0, 10)) + (11,), "-p"),
"gzip": _ProgramSettings(("gzip", "--no-name"), tuple(range(1, 10))),
"lz4": _ProgramSettings(("lz4",), tuple(range(1, 13)), "-T"),
}


Expand Down Expand Up @@ -551,6 +558,39 @@ def _open_zst(
return io.BufferedWriter(f) # mode "ab" and "wb"


def _open_lz4(
filename: FileOrPath,
mode: str,
compresslevel: Optional[int],
threads: Optional[int],
):
assert mode in ("rb", "ab", "wb")
if compresslevel is None:
compresslevel = XOPEN_DEFAULT_LZ4_COMPRESSION

if threads != 0:
try:
return _PipedCompressionProgram(
filename,
mode,
compresslevel,
threads,
program_settings=_PROGRAM_SETTINGS["lz4"],
)
except OSError:
if lz4 is None:
# No fallback available
raise

if lz4 is None:
raise ImportError("lz4 module not available")
f = lz4.frame.LZ4FrameFile(filename, mode, compression_level=compresslevel)
if "r" in mode:
return f
# Buffer writes on lz4.open to mitigate overhead of small writes
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For Gzip this overhead is present because gzip is written in Python. Did you benchmark this to check whether it makes a difference for small writes?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, I just followed what the other compression formats were doing.

for small writes we could use dictionaries for zstd and lz4, which should boost performance for small writes

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gzip and Bzip2 write calls are implemented in python. They have massive overhead. If the object you are writing to is implemented in C, that usually is not the case. I recommend benchmarking whether the BufferedWriter helps.

> for small writes we could use dictionaries for zstd and lz4, which should boost performance for small writes

Xopen does not support that. Gzip can also use dictionaries, but xopen does not provide the handles for that. That is more suited for low level libraries.

return io.BufferedWriter(f)


def _open_gz(
filename: FileOrPath,
mode: str,
Expand Down Expand Up @@ -683,6 +723,10 @@ def _detect_format_from_content(filename: FileOrPath) -> Optional[str]:
elif bs[:4] == b"\x28\xb5\x2f\xfd":
# https://datatracker.ietf.org/doc/html/rfc8478#section-3.1.1
return "zst"
elif bs[:4] == b"\x04\x22\x4d\x18":
# https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd rather have a link to a specification or other technical documentation here. There is no guarantee that wikipedia will keep listing the magic bytes.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed on 4c18c24

return "lz4"

return None
finally:
if closefd:
Expand All @@ -694,7 +738,7 @@ def _detect_format_from_extension(filename: Union[str, bytes]) -> Optional[str]:
Attempt to detect file format from the filename extension.
Return None if no format could be detected.
"""
for ext in ("bz2", "xz", "gz", "zst"):
for ext in ("bz2", "xz", "gz", "zst", "lz4"):
if isinstance(filename, bytes):
if filename.endswith(b"." + ext.encode()):
return ext
Expand Down Expand Up @@ -797,6 +841,7 @@ def xopen( # noqa: C901
- .bz2 uses bzip2 compression
- .xz uses xz/lzma compression
- .zst uses zstandard compression
- .lz4 uses lz4 compression
- otherwise, no compression is used

When reading, if a file name extension is available, the format is detected
Expand All @@ -808,7 +853,7 @@ def xopen( # noqa: C901
compresslevel is the compression level for writing to gzip, xz, zst and lz4 files.
This parameter is ignored for the other compression formats.
If set to None, a default depending on the format is used:
gzip: 6, xz: 6, zstd: 3.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note to self: didn't we change the gzip level to 1?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes we did.

gzip: 1, xz: 6, zstd: 3, lz4: 1.

When threads is None (the default), compressed file formats are read or written
using a pipe to a subprocess running an external tool such as,
Expand All @@ -828,7 +873,7 @@ def xopen( # noqa: C901

format overrides the autodetection of input and output formats. This can be
useful when compressed output needs to be written to a file without an
extension. Possible values are "gz", "xz", "bz2", "zst".
extension. Possible values are "gz", "xz", "bz2", "zst", "lz4".
"""
if mode in ("r", "w", "a"):
mode += "t" # type: ignore
Expand All @@ -844,10 +889,10 @@ def xopen( # noqa: C901
elif _file_is_a_socket_or_pipe(filename):
filename = open(filename, binary_mode) # type: ignore

if format not in (None, "gz", "xz", "bz2", "zst"):
if format not in (None, "gz", "xz", "bz2", "zst", "lz4"):
raise ValueError(
f"Format not supported: {format}. "
f"Choose one of: 'gz', 'xz', 'bz2', 'zst'"
f"Choose one of: 'gz', 'xz', 'bz2', 'zst', 'lz4'."
)
detected_format = format or _detect_format_from_extension(filepath)
if detected_format is None and "r" in mode:
Expand All @@ -861,6 +906,8 @@ def xopen( # noqa: C901
opened_file = _open_bz2(filename, binary_mode, compresslevel, threads)
elif detected_format == "zst":
opened_file = _open_zst(filename, binary_mode, compresslevel, threads)
elif detected_format == "lz4":
opened_file = _open_lz4(filename, binary_mode, compresslevel, threads)
else:
opened_file, _ = _file_or_path_to_binary_stream(filename, binary_mode)

Expand Down
Binary file added tests/file.txt.lz4
Binary file not shown.
10 changes: 9 additions & 1 deletion tests/test_piped.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
_ProgramSettings,
)

extensions = ["", ".gz", ".bz2", ".xz", ".zst"]
extensions = ["", ".gz", ".bz2", ".xz", ".zst", ".lz4"]

try:
import fcntl
Expand Down Expand Up @@ -57,16 +57,24 @@ def available_zstd_programs():
return []


def available_lz4_programs():
    """Return the piped-program settings usable for lz4 tests.

    The result is a one-element list holding the ``"lz4"`` entry of
    ``_PROGRAM_SETTINGS`` when an ``lz4`` executable is found on ``PATH``,
    and an empty list otherwise.
    """
    # shutil.which() yields None when no executable is found.
    return [_PROGRAM_SETTINGS["lz4"]] if shutil.which("lz4") else []


PIPED_GZIP_PROGRAMS = available_gzip_programs()
PIPED_BZIP2_PROGRAMS = available_bzip2_programs()
PIPED_XZ_PROGRAMS = available_xz_programs()
PIPED_ZST_PROGRAMS = available_zstd_programs()
PIPED_LZ4_PROGRAMS = available_lz4_programs()

ALL_PROGRAMS_WITH_EXTENSION = (
list(zip(PIPED_GZIP_PROGRAMS, cycle([".gz"])))
+ list(zip(PIPED_BZIP2_PROGRAMS, cycle([".bz2"])))
+ list(zip(PIPED_XZ_PROGRAMS, cycle([".xz"])))
+ list(zip(PIPED_ZST_PROGRAMS, cycle([".zst"])))
+ list(zip(PIPED_LZ4_PROGRAMS, cycle([".lz4"])))
)


Expand Down
Loading
Loading