Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,10 @@ jobs:
steps:
- name: Install optional tools macOS
if: runner.os == 'macOS' && matrix.optional-deps
run: brew install pigz pbzip2 isa-l zstd
run: brew install pigz pbzip2 isa-l zstd lz4
- name: Install optional tools Linux
if: runner.os == 'Linux' && matrix.optional-deps
run: sudo apt-get install pigz pbzip2 isal zstd
run: sudo apt-get install pigz pbzip2 isal zstd lz4
- name: Remove xz
if: runner.os == 'Linux' && !matrix.optional-deps
run: while which xz; do sudo rm $(which xz); done
Expand Down
10 changes: 7 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ Supported compression formats are:
- gzip (``.gz``)
- bzip2 (``.bz2``)
- xz (``.xz``)
- lz4 (``.lz4``)
- Zstandard (``.zst``) (optional)


Expand Down Expand Up @@ -71,7 +72,7 @@ The function opens the file using a function suitable for the detected
file format and returns an open file-like object.

When writing, the file format is chosen based on the file name extension:
``.gz``, ``.bz2``, ``.xz``, ``.zst``. This can be overriden with ``format``.
``.gz``, ``.bz2``, ``.xz``, ``.zst``, ``.lz4``. This can be overridden with ``format``.
If the extension is not recognized, no compression is used.

When reading and a file name extension is available, the format is detected
Expand Down Expand Up @@ -99,13 +100,13 @@ preferred locale encoding.
**compresslevel**:
The compression level for writing to gzip, xz, Zstandard and lz4 files.
If set to None, a default depending on the format is used:
gzip: 1, xz: 6, Zstandard: 3.
gzip: 1, xz: 6, Zstandard: 3, lz4: 0.

This parameter is ignored for other compression formats.

**format**:
Override the autodetection of the input or output format.
Possible values are: ``"gz"``, ``"xz"``, ``"bz2"``, ``"zst"``.
Possible values are: ``"gz"``, ``"xz"``, ``"bz2"``, ``"zst"``, ``"lz4"``.

**threads**:
Set the number of additional threads spawned for compression or decompression.
Expand Down Expand Up @@ -138,6 +139,9 @@ built-in support for multithreaded compression.

For bz2 files, `pbzip2 (parallel bzip2) <http://compression.great-site.net/pbzip2/>`_ is used.

For lz4 files, the `python-lz4 <https://python-lz4.readthedocs.io/en/stable/index.html>`_
package is used.

``xopen`` falls back to Python’s built-in functions
(``gzip.open``, ``lzma.open``, ``bz2.open``)
if none of the other methods can be used.
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ requires-python = ">=3.9"
dynamic = ["version"]
dependencies = [
'isal>=1.6.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"',
'zlib-ng>=0.4.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"'
'zlib-ng>=0.4.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"',
'lz4>4.3.1; platform_python_implementation != "PyPy"',
]

[project.urls]
Expand Down
77 changes: 71 additions & 6 deletions src/xopen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
XOPEN_DEFAULT_BZ2_COMPRESSION = 9
XOPEN_DEFAULT_XZ_COMPRESSION = 6
XOPEN_DEFAULT_ZST_COMPRESSION = 3
XOPEN_DEFAULT_LZ4_COMPRESSION = 0

igzip: Optional[ModuleType]
isal_zlib: Optional[ModuleType]
Expand Down Expand Up @@ -70,6 +71,11 @@
except ImportError:
zstandard = None # type: ignore

try:
import lz4.frame # type: ignore
except ImportError:
lz4 = None

try:
import fcntl

Expand Down Expand Up @@ -120,6 +126,7 @@ class _ProgramSettings:
"zstd": _ProgramSettings(("zstd",), tuple(range(1, 20)), "-T"),
"pigz": _ProgramSettings(("pigz", "--no-name"), tuple(range(0, 10)) + (11,), "-p"),
"gzip": _ProgramSettings(("gzip", "--no-name"), tuple(range(1, 10))),
"lz4": _ProgramSettings(("lz4",), tuple(range(0, 17))),
}


Expand Down Expand Up @@ -551,6 +558,57 @@ def _open_zst(
return io.BufferedWriter(f) # mode "ab" and "wb"


def _open_lz4(
    filename: FileOrPath,
    mode: str,
    compresslevel: Optional[int],
    threads: Optional[int],
):
    """
    Open an lz4-compressed file for reading, writing or appending.

    The Python bindings (``lz4.frame``) are used directly for reading and
    whenever ``threads == 0``, provided they are installed. Otherwise the
    ``lz4`` command-line program is tried, first with the ``-T`` threads
    option and then without it. If the program cannot be used at all, the
    Python bindings serve as a last resort.

    If compresslevel is None, XOPEN_DEFAULT_LZ4_COMPRESSION is used.

    Raises the error of the last failed attempt to run the ``lz4`` program
    (FileNotFoundError or another OSError) if the program is unusable and
    the Python bindings are not installed.
    """
    assert mode in ("rb", "ab", "wb")
    if compresslevel is None:
        compresslevel = XOPEN_DEFAULT_LZ4_COMPRESSION

    def open_with_bindings():
        return lz4.frame.LZ4FrameFile(
            filename, mode, compression_level=compresslevel
        )

    if lz4 is not None and (mode == "rb" or threads == 0):
        # Use Python bindings
        return open_with_bindings()

    # Attempt to use the CLI program.
    #
    # Notes:
    #
    # - Multithreading in lz4 is only supported for compression, not for
    #   decompression.
    # - Older versions of lz4 (such as v1.9.4, which comes with Ubuntu 24.04)
    #   do not support multithreading. They fail if one tries to pass the -T
    #   option.
    # - The newer versions use a default of -T0, which chooses the number of
    #   threads automatically (presumably the number of available cores).
    import copy

    plain_settings = _PROGRAM_SETTINGS["lz4"]
    # Work on a copy so that the shared module-level settings stay untouched.
    threaded_settings = copy.copy(plain_settings)
    threaded_settings.threads_flag = "-T"

    cli_error: Optional[OSError] = None
    # Try with the -T option first, then without it.
    for program_settings in (threaded_settings, plain_settings):
        try:
            return _PipedCompressionProgram(
                filename,
                mode,
                compresslevel,
                threads,
                program_settings=program_settings,
            )
        except FileNotFoundError as error:
            # Binary not found: re-trying without -T cannot succeed either.
            cli_error = error
            break
        except OSError as error:
            # Assume the problem is that the -T option is not supported and
            # re-try without it (or give up after the second attempt).
            cli_error = error

    # The lz4 binary could not be used; use the Python bindings if available.
    if lz4 is not None:
        return open_with_bindings()
    assert cli_error is not None
    raise cli_error
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, this should fall back to using the bindings if calling the lz4 binary fails.



def _open_gz(
filename: FileOrPath,
mode: str,
Expand Down Expand Up @@ -683,6 +741,10 @@ def _detect_format_from_content(filename: FileOrPath) -> Optional[str]:
elif bs[:4] == b"\x28\xb5\x2f\xfd":
# https://datatracker.ietf.org/doc/html/rfc8478#section-3.1.1
return "zst"
elif bs[:4] == b"\x04\x22\x4d\x18":
# https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md
return "lz4"

return None
finally:
if closefd:
Expand All @@ -694,7 +756,7 @@ def _detect_format_from_extension(filename: Union[str, bytes]) -> Optional[str]:
Attempt to detect file format from the filename extension.
Return None if no format could be detected.
"""
for ext in ("bz2", "xz", "gz", "zst"):
for ext in ("bz2", "xz", "gz", "zst", "lz4"):
if isinstance(filename, bytes):
if filename.endswith(b"." + ext.encode()):
return ext
Expand All @@ -717,7 +779,7 @@ def _file_or_path_to_binary_stream(
# object is not binary, this will crash at a later point.
return file_or_path, False # type: ignore
raise TypeError(
f"Unsupported type for {file_or_path}, " f"{file_or_path.__class__.__name__}."
f"Unsupported type for {file_or_path}, {file_or_path.__class__.__name__}."
)


Expand Down Expand Up @@ -797,6 +859,7 @@ def xopen( # noqa: C901
- .bz2 uses bzip2 compression
- .xz uses xz/lzma compression
- .zst uses zstandard compression
- .lz4 uses lz4 compression
- otherwise, no compression is used

When reading, if a file name extension is available, the format is detected
Expand All @@ -808,7 +871,7 @@ def xopen( # noqa: C901
compresslevel is the compression level for writing to gzip, xz, zst and lz4 files.
This parameter is ignored for the other compression formats.
If set to None, a default depending on the format is used:
gzip: 6, xz: 6, zstd: 3.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note to self: didn't we change the gzip level to 1?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes we did.

gzip: 1, xz: 6, zstd: 3, lz4: 0.

When threads is None (the default), compressed file formats are read or written
using a pipe to a subprocess running an external tool such as,
Expand All @@ -828,7 +891,7 @@ def xopen( # noqa: C901

format overrides the autodetection of input and output formats. This can be
useful when compressed output needs to be written to a file without an
extension. Possible values are "gz", "xz", "bz2", "zst".
extension. Possible values are "gz", "xz", "bz2", "zst", "lz4".
"""
if mode in ("r", "w", "a"):
mode += "t" # type: ignore
Expand All @@ -844,10 +907,10 @@ def xopen( # noqa: C901
elif _file_is_a_socket_or_pipe(filename):
filename = open(filename, binary_mode) # type: ignore

if format not in (None, "gz", "xz", "bz2", "zst"):
if format not in (None, "gz", "xz", "bz2", "zst", "lz4"):
raise ValueError(
f"Format not supported: {format}. "
f"Choose one of: 'gz', 'xz', 'bz2', 'zst'"
f"Choose one of: 'gz', 'xz', 'bz2', 'zst', 'lz4'."
)
detected_format = format or _detect_format_from_extension(filepath)
if detected_format is None and "r" in mode:
Expand All @@ -861,6 +924,8 @@ def xopen( # noqa: C901
opened_file = _open_bz2(filename, binary_mode, compresslevel, threads)
elif detected_format == "zst":
opened_file = _open_zst(filename, binary_mode, compresslevel, threads)
elif detected_format == "lz4":
opened_file = _open_lz4(filename, binary_mode, compresslevel, threads)
else:
opened_file, _ = _file_or_path_to_binary_stream(filename, binary_mode)

Expand Down
Binary file added tests/file.txt.lz4
Binary file not shown.
10 changes: 9 additions & 1 deletion tests/test_piped.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
_ProgramSettings,
)

extensions = ["", ".gz", ".bz2", ".xz", ".zst"]
extensions = ["", ".gz", ".bz2", ".xz", ".zst", ".lz4"]

try:
import fcntl
Expand Down Expand Up @@ -57,16 +57,24 @@ def available_zstd_programs():
return []


def available_lz4_programs():
    """
    Return a one-element list with the lz4 program settings if an ``lz4``
    executable is found by ``shutil.which``, and an empty list otherwise.
    """
    lz4_executable = shutil.which("lz4")
    return [_PROGRAM_SETTINGS["lz4"]] if lz4_executable else []


PIPED_GZIP_PROGRAMS = available_gzip_programs()
PIPED_BZIP2_PROGRAMS = available_bzip2_programs()
PIPED_XZ_PROGRAMS = available_xz_programs()
PIPED_ZST_PROGRAMS = available_zstd_programs()
PIPED_LZ4_PROGRAMS = available_lz4_programs()

ALL_PROGRAMS_WITH_EXTENSION = (
list(zip(PIPED_GZIP_PROGRAMS, cycle([".gz"])))
+ list(zip(PIPED_BZIP2_PROGRAMS, cycle([".bz2"])))
+ list(zip(PIPED_XZ_PROGRAMS, cycle([".xz"])))
+ list(zip(PIPED_ZST_PROGRAMS, cycle([".zst"])))
+ list(zip(PIPED_LZ4_PROGRAMS, cycle([".lz4"])))
)


Expand Down
34 changes: 26 additions & 8 deletions tests/test_xopen.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,35 +2,41 @@
Tests for the xopen.xopen function
"""
import bz2
import subprocess
import sys
import tempfile
from contextlib import contextmanager
import functools
import gzip
import io
import lzma
import os
from pathlib import Path
import shutil
import subprocess
import sys
import tempfile
from contextlib import contextmanager
from pathlib import Path


import pytest

from xopen import xopen, _detect_format_from_content
from xopen import _detect_format_from_content, xopen

try:
import lz4.frame
except ImportError:
lz4 = None
try:
import zstandard
except ImportError:
zstandard = None


# TODO this is duplicated in test_piped.py
TEST_DIR = Path(__file__).parent
CONTENT_LINES = ["Testing, testing ...\n", "The second line.\n"]
CONTENT = "".join(CONTENT_LINES)
extensions = ["", ".gz", ".bz2", ".xz"]
if shutil.which("zstd") or zstandard:
extensions += [".zst"]
if shutil.which("lz4") or lz4:
extensions += [".lz4"]
base = os.path.join(os.path.dirname(__file__), "file.txt")
files = [base + ext for ext in extensions]

Expand Down Expand Up @@ -369,6 +375,10 @@ def test_read_no_threads(ext):
}
if ext == ".zst" and zstandard is None:
return
if ext == ".lz4" and lz4 is None:
return
if ext == ".lz4" and lz4.frame is not None:
klasses[".lz4"] = lz4.frame.LZ4FrameFile
klass = klasses[ext]
with xopen(TEST_DIR / f"file.txt{ext}", "rb", threads=0) as f:
assert isinstance(f, klass), f
Expand Down Expand Up @@ -401,6 +411,10 @@ def test_write_no_threads(tmp_path, ext):
# Skip zst because if python-zstandard is not installed,
# we fall back to an external process even when threads=0
return
if ext == ".lz4" and lz4 is None:
return
if ext == ".lz4" and lz4.frame is not None:
klasses[".lz4"] = lz4.frame.LZ4FrameFile
klass = klasses[ext]
with xopen(tmp_path / f"out{ext}", "wb", threads=0) as f:
if isinstance(f, io.BufferedWriter):
Expand Down Expand Up @@ -613,7 +627,6 @@ def test_xopen_zst_long_window_size(threads):
def test_pass_file_object_for_reading(ext, threads):
if ext == ".zst" and zstandard is None:
return

with open(TEST_DIR / f"file.txt{ext}", "rb") as fh:
with xopen(fh, mode="rb", threads=threads) as f:
assert f.readline() == CONTENT_LINES[0].encode("utf-8")
Expand Down Expand Up @@ -641,6 +654,11 @@ def test_pass_bytesio_for_reading_and_writing(ext, threads):
format = None
if ext == ".zst" and zstandard is None:
return
if ext == ".lz4" and lz4 is None and threads == 0:
pytest.skip("lz4 not working for BytesIO in piped write mode")
if ext == ".lz4" and threads != 0:
# _PipedCompressionProgram not working on write mode
pytest.skip("lz4 not working for BytesIO in piped write mode")
first_line = CONTENT_LINES[0].encode("utf-8")
writer = xopen(filelike, "wb", format=format, threads=threads)
writer.write(first_line)
Expand Down
Loading