diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ff4788b..225c86d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -59,10 +59,10 @@ jobs: steps: - name: Install optional tools macOS if: runner.os == 'macOS' && matrix.optional-deps - run: brew install pigz pbzip2 isa-l zstd + run: brew install pigz pbzip2 isa-l zstd lz4 - name: Install optional tools Linux if: runner.os == 'Linux' && matrix.optional-deps - run: sudo apt-get install pigz pbzip2 isal zstd + run: sudo apt-get install pigz pbzip2 isal zstd lz4 - name: Remove xz if: runner.os == 'Linux' && !matrix.optional-deps run: while which xz; do sudo rm $(which xz); done diff --git a/README.rst b/README.rst index 2e5c268..cb3ea91 100644 --- a/README.rst +++ b/README.rst @@ -26,6 +26,7 @@ Supported compression formats are: - gzip (``.gz``) - bzip2 (``.bz2``) - xz (``.xz``) +- lz4 (``.lz4``) - Zstandard (``.zst``) (optional) @@ -71,7 +72,7 @@ The function opens the file using a function suitable for the detected file format and returns an open file-like object. When writing, the file format is chosen based on the file name extension: -``.gz``, ``.bz2``, ``.xz``, ``.zst``. This can be overriden with ``format``. +``.gz``, ``.bz2``, ``.xz``, ``.zst``, ``.lz4``. This can be overridden with ``format``. If the extension is not recognized, no compression is used. When reading and a file name extension is available, the format is detected @@ -99,13 +100,13 @@ preferred locale encoding. **compresslevel**: The compression level for writing to gzip, xz and Zstandard files. If set to None, a default depending on the format is used: -gzip: 1, xz: 6, Zstandard: 3. +gzip: 1, xz: 6, Zstandard: 3, lz4: 0. This parameter is ignored for other compression formats. **format**: Override the autodetection of the input or output format. -Possible values are: ``"gz"``, ``"xz"``, ``"bz2"``, ``"zst"``. +Possible values are: ``"gz"``, ``"xz"``, ``"bz2"``, ``"zst"``, ``"lz4"``. 
**threads**: Set the number of additional threads spawned for compression or decompression. @@ -138,6 +139,9 @@ built-in support for multithreaded compression. For bz2 files, `pbzip2 (parallel bzip2) `_ is used. +For lz4 files, the `python lz4 <https://python-lz4.readthedocs.io/en/stable/index.html>`_ +package is used. + ``xopen`` falls back to Python’s built-in functions (``gzip.open``, ``lzma.open``, ``bz2.open``) if none of the other methods can be used. diff --git a/pyproject.toml b/pyproject.toml index c0f5bc2..cf99591 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,8 @@ requires-python = ">=3.9" dynamic = ["version"] dependencies = [ 'isal>=1.6.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"', - 'zlib-ng>=0.4.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"' + 'zlib-ng>=0.4.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"', + 'lz4>4.3.1; platform_python_implementation != "PyPy"', ] [project.urls] diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index 89f5137..5980fde 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -43,6 +43,7 @@ XOPEN_DEFAULT_BZ2_COMPRESSION = 9 XOPEN_DEFAULT_XZ_COMPRESSION = 6 XOPEN_DEFAULT_ZST_COMPRESSION = 3 +XOPEN_DEFAULT_LZ4_COMPRESSION = 0 igzip: Optional[ModuleType] isal_zlib: Optional[ModuleType] @@ -70,6 +71,11 @@ except ImportError: zstandard = None # type: ignore +try: + import lz4.frame # type: ignore +except ImportError: + lz4 = None + try: import fcntl @@ -120,6 +126,7 @@ class _ProgramSettings: "zstd": _ProgramSettings(("zstd",), tuple(range(1, 20)), "-T"), "pigz": _ProgramSettings(("pigz", "--no-name"), tuple(range(0, 10)) + (11,), "-p"), "gzip": _ProgramSettings(("gzip", "--no-name"), tuple(range(1, 10))), + "lz4": _ProgramSettings(("lz4",), tuple(range(0, 17))), } @@ -551,6 +558,41 @@ def _open_zst( return io.BufferedWriter(f) # mode "ab" and "wb" 
+def _open_lz4(
+    filename: FileOrPath,
+    mode: str,
+    compresslevel: Optional[int],
+    threads: Optional[int],
+):
+    """
+    Open an lz4 file in binary mode ("rb", "ab" or "wb").
+
+    If importable, the python-lz4 bindings are used for reading and, when
+    threads == 0, for writing. Otherwise the external ``lz4`` program is run
+    via _PipedCompressionProgram; if that raises OSError, the bindings serve
+    as fallback, and the OSError propagates when they are missing too.
+    compresslevel None selects XOPEN_DEFAULT_LZ4_COMPRESSION.
+    """
+    assert mode in ("rb", "ab", "wb")
+    if compresslevel is None:
+        compresslevel = XOPEN_DEFAULT_LZ4_COMPRESSION
+
+    if lz4 is None or (mode != "rb" and threads != 0):
+        try:
+            return _PipedCompressionProgram(
+                filename,
+                mode,
+                compresslevel,
+                threads,
+                program_settings=_PROGRAM_SETTINGS["lz4"],
+            )
+        except OSError:
+            if lz4 is None:
+                raise  # no fallback available
+    # Python bindings: the preferred path, or the fallback after a failed CLI start
+    return lz4.frame.LZ4FrameFile(filename, mode, compression_level=compresslevel)
+
+
 def _open_gz(
     filename: FileOrPath,
     mode: str,
@@ -683,6 +725,10 @@ def _detect_format_from_content(filename: FileOrPath) -> Optional[str]:
         elif bs[:4] == b"\x28\xb5\x2f\xfd":
             # https://datatracker.ietf.org/doc/html/rfc8478#section-3.1.1
             return "zst"
+        elif bs[:4] == b"\x04\x22\x4d\x18":
+            # https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md
+            return "lz4"
+
         return None
     finally:
         if closefd:
@@ -694,7 +740,7 @@ def _detect_format_from_extension(filename: Union[str, bytes]) -> Optional[str]:
     Attempt to detect file format from the filename extension.
     Return None if no format could be detected.
     """
-    for ext in ("bz2", "xz", "gz", "zst"):
+    for ext in ("bz2", "xz", "gz", "zst", "lz4"):
         if isinstance(filename, bytes):
             if filename.endswith(b"." + ext.encode()):
                 return ext
@@ -717,7 +763,7 @@ def _file_or_path_to_binary_stream(
         # object is not binary, this will crash at a later point.
         return file_or_path, False  # type: ignore
     raise TypeError(
-        f"Unsupported type for {file_or_path}, " f"{file_or_path.__class__.__name__}."
+        f"Unsupported type for {file_or_path}, {file_or_path.__class__.__name__}."
) @@ -797,6 +843,7 @@ def xopen( # noqa: C901 - .bz2 uses bzip2 compression - .xz uses xz/lzma compression - .zst uses zstandard compression + - .lz4 uses lz4 compression - otherwise, no compression is used When reading, if a file name extension is available, the format is detected @@ -808,7 +855,7 @@ def xopen( # noqa: C901 compresslevel is the compression level for writing to gzip, xz and zst files. This parameter is ignored for the other compression formats. If set to None, a default depending on the format is used: - gzip: 6, xz: 6, zstd: 3. + gzip: 6, xz: 6, zstd: 3, lz4: 0. When threads is None (the default), compressed file formats are read or written using a pipe to a subprocess running an external tool such as, @@ -828,7 +875,7 @@ def xopen( # noqa: C901 format overrides the autodetection of input and output formats. This can be useful when compressed output needs to be written to a file without an - extension. Possible values are "gz", "xz", "bz2", "zst". + extension. Possible values are "gz", "xz", "bz2", "zst", "lz4". """ if mode in ("r", "w", "a"): mode += "t" # type: ignore @@ -844,10 +891,10 @@ def xopen( # noqa: C901 elif _file_is_a_socket_or_pipe(filename): filename = open(filename, binary_mode) # type: ignore - if format not in (None, "gz", "xz", "bz2", "zst"): + if format not in (None, "gz", "xz", "bz2", "zst", "lz4"): raise ValueError( f"Format not supported: {format}. " - f"Choose one of: 'gz', 'xz', 'bz2', 'zst'" + f"Choose one of: 'gz', 'xz', 'bz2', 'zst', 'lz4'." 
) detected_format = format or _detect_format_from_extension(filepath) if detected_format is None and "r" in mode: @@ -861,6 +908,8 @@ def xopen( # noqa: C901 opened_file = _open_bz2(filename, binary_mode, compresslevel, threads) elif detected_format == "zst": opened_file = _open_zst(filename, binary_mode, compresslevel, threads) + elif detected_format == "lz4": + opened_file = _open_lz4(filename, binary_mode, compresslevel, threads) else: opened_file, _ = _file_or_path_to_binary_stream(filename, binary_mode) diff --git a/tests/file.txt.lz4 b/tests/file.txt.lz4 new file mode 100644 index 0000000..5b2ed80 Binary files /dev/null and b/tests/file.txt.lz4 differ diff --git a/tests/test_piped.py b/tests/test_piped.py index 9f8afbe..eba903f 100644 --- a/tests/test_piped.py +++ b/tests/test_piped.py @@ -18,7 +18,7 @@ _ProgramSettings, ) -extensions = ["", ".gz", ".bz2", ".xz", ".zst"] +extensions = ["", ".gz", ".bz2", ".xz", ".zst", ".lz4"] try: import fcntl @@ -57,16 +57,24 @@ def available_zstd_programs(): return [] +def available_lz4_programs(): + if shutil.which("lz4"): + return [_PROGRAM_SETTINGS["lz4"]] + return [] + + PIPED_GZIP_PROGRAMS = available_gzip_programs() PIPED_BZIP2_PROGRAMS = available_bzip2_programs() PIPED_XZ_PROGRAMS = available_xz_programs() PIPED_ZST_PROGRAMS = available_zstd_programs() +PIPED_LZ4_PROGRAMS = available_lz4_programs() ALL_PROGRAMS_WITH_EXTENSION = ( list(zip(PIPED_GZIP_PROGRAMS, cycle([".gz"]))) + list(zip(PIPED_BZIP2_PROGRAMS, cycle([".bz2"]))) + list(zip(PIPED_XZ_PROGRAMS, cycle([".xz"]))) + list(zip(PIPED_ZST_PROGRAMS, cycle([".zst"]))) + + list(zip(PIPED_LZ4_PROGRAMS, cycle([".lz4"]))) ) diff --git a/tests/test_xopen.py b/tests/test_xopen.py index 9e8f816..8f1a21b 100644 --- a/tests/test_xopen.py +++ b/tests/test_xopen.py @@ -2,33 +2,37 @@ Tests for the xopen.xopen function """ import bz2 -import subprocess -import sys -import tempfile -from contextlib import contextmanager import functools import gzip import io import lzma import 
os -from pathlib import Path import shutil +import subprocess +import sys +import tempfile +from contextlib import contextmanager +from pathlib import Path + import pytest -from xopen import xopen, _detect_format_from_content +from xopen import _detect_format_from_content, xopen +try: + import lz4.frame +except ImportError: + lz4 = None try: import zstandard except ImportError: zstandard = None - # TODO this is duplicated in test_piped.py TEST_DIR = Path(__file__).parent CONTENT_LINES = ["Testing, testing ...\n", "The second line.\n"] CONTENT = "".join(CONTENT_LINES) -extensions = ["", ".gz", ".bz2", ".xz"] +extensions = ["", ".gz", ".bz2", ".xz", ".lz4"] if shutil.which("zstd") or zstandard: extensions += [".zst"] base = os.path.join(os.path.dirname(__file__), "file.txt") @@ -109,6 +113,8 @@ def test_binary(fname): def test_roundtrip(ext, tmp_path, threads, mode): if ext == ".zst" and threads == 0 and zstandard is None: return + if ext == ".lz4" and shutil.which("lz4") is None: + pytest.skip("lz4 not installed") path = tmp_path / f"file{ext}" data = b"Hello" if mode == "b" else "Hello" with xopen(path, "w" + mode, threads=threads) as f: @@ -203,6 +209,8 @@ def test_next(fname): def test_has_iter_method(ext, tmp_path): + if ext == ".lz4" and shutil.which("lz4") is None: + pytest.skip("lz4 not installed") path = tmp_path / f"out{ext}" with xopen(path, mode="w") as f: # Writing anything isn’t strictly necessary, but if we don’t, then @@ -270,6 +278,8 @@ def test_invalid_compression_level(tmp_path): def test_append(ext, threads, tmp_path): if ext == ".zst" and zstandard is None and threads == 0: pytest.skip("No zstandard installed") + if ext == ".lz4" and shutil.which("lz4") is None: + pytest.skip("lz4 not installed") text = b"AB" reference = text + text path = tmp_path / f"the-file{ext}" @@ -286,6 +296,8 @@ def test_append(ext, threads, tmp_path): @pytest.mark.parametrize("ext", extensions) def test_append_text(ext, tmp_path): + if ext == ".lz4" and 
shutil.which("lz4") is None: + pytest.skip("lz4 not installed") text = "AB" reference = text + text path = tmp_path / f"the-file{ext}" @@ -369,12 +381,18 @@ def test_read_no_threads(ext): } if ext == ".zst" and zstandard is None: return + if ext == ".lz4" and lz4 is None: + return + if ext == ".lz4" and lz4.frame is not None: + klasses[".lz4"] = lz4.frame.LZ4FrameFile klass = klasses[ext] with xopen(TEST_DIR / f"file.txt{ext}", "rb", threads=0) as f: assert isinstance(f, klass), f def test_write_threads(tmp_path, ext): + if ext == ".lz4" and shutil.which("lz4") is None: + pytest.skip("lz4 not installed") path = tmp_path / f"out.{ext}" with xopen(path, mode="w", threads=3) as f: f.write("hello") @@ -401,6 +419,11 @@ def test_write_no_threads(tmp_path, ext): # Skip zst because if python-zstandard is not installed, # we fall back to an external process even when threads=0 return + if ext == ".lz4" and lz4 is None: + # Skip lz4 if lz4 is not installed + return + if ext == ".lz4" and lz4.frame is not None: + klasses[".lz4"] = lz4.frame.LZ4FrameFile klass = klasses[ext] with xopen(tmp_path / f"out{ext}", "wb", threads=0) as f: if isinstance(f, io.BufferedWriter): @@ -450,6 +473,8 @@ def test_read_pathlib_binary(fname): def test_write_pathlib(ext, tmp_path): + if ext == ".lz4" and shutil.which("lz4") is None: + pytest.skip("lz4 not installed") path = tmp_path / f"hello.txt{ext}" with xopen(path, mode="wt") as f: f.write("hello") @@ -458,6 +483,8 @@ def test_write_pathlib(ext, tmp_path): def test_write_pathlib_binary(ext, tmp_path): + if ext == ".lz4" and shutil.which("lz4") is None: + pytest.skip("lz4 not installed") path = tmp_path / f"hello.txt{ext}" with xopen(path, mode="wb") as f: f.write(b"hello") @@ -495,6 +522,8 @@ def test_falls_back_to_lzma_open(lacking_xz_permissions): def test_open_many_writers(tmp_path, ext): + if ext == ".lz4" and shutil.which("lz4") is None: + pytest.skip("lz4 not installed") files = [] # Because lzma.open allocates a lot of memory, # open 
fewer files to avoid MemoryError on 32-bit architectures @@ -540,6 +569,8 @@ def test_override_output_format_wrong_format(tmp_path): @pytest.mark.parametrize("opener", OPENERS) @pytest.mark.parametrize("extension", extensions) def test_text_encoding_newline_passthrough(opener, extension, tmp_path): + if extension == ".lz4" and shutil.which("lz4") is None: + pytest.skip("lz4 not installed") if extension == ".zst" and zstandard is None: return # "Eén ree\nTwee reeën\n" latin-1 encoded with \r for as line separator. @@ -555,6 +586,8 @@ def test_text_encoding_newline_passthrough(opener, extension, tmp_path): @pytest.mark.parametrize("opener", OPENERS) @pytest.mark.parametrize("extension", extensions) def test_text_encoding_errors(opener, extension, tmp_path): + if extension == ".lz4" and shutil.which("lz4") is None: + pytest.skip("lz4 not installed") if extension == ".zst" and zstandard is None: return # "Eén ree\nTwee reeën\n" latin-1 encoded. This is not valid ascii. @@ -613,7 +646,6 @@ def test_xopen_zst_long_window_size(threads): def test_pass_file_object_for_reading(ext, threads): if ext == ".zst" and zstandard is None: return - with open(TEST_DIR / f"file.txt{ext}", "rb") as fh: with xopen(fh, mode="rb", threads=threads) as f: assert f.readline() == CONTENT_LINES[0].encode("utf-8") @@ -641,6 +673,11 @@ def test_pass_bytesio_for_reading_and_writing(ext, threads): format = None if ext == ".zst" and zstandard is None: return + if ext == ".lz4" and lz4 is None and threads == 0: + pytest.skip("lz4 not working for BytesIO in piped write mode") + if ext == ".lz4" and threads != 0: + # _PipedCompressionProgram not working on write mode + pytest.skip("lz4 not working for BytesIO in piped write mode") first_line = CONTENT_LINES[0].encode("utf-8") writer = xopen(filelike, "wb", format=format, threads=threads) writer.write(first_line) @@ -690,6 +727,8 @@ def test_xopen_read_from_pipe(ext, threads): @pytest.mark.parametrize("threads", (0, 1)) def 
test_xopen_write_to_pipe(threads, ext): + if ext == ".lz4" and shutil.which("lz4") is None: + pytest.skip("lz4 not installed") if ext == ".zst" and zstandard is None: return format = ext.lstrip(".")