Skip to content
Open
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,10 @@ jobs:
steps:
- name: Install optional tools macOS
if: runner.os == 'macOS' && matrix.optional-deps
run: brew install pigz pbzip2 isa-l zstd
run: brew install pigz pbzip2 isa-l zstd lz4
- name: Install optional tools Linux
if: runner.os == 'Linux' && matrix.optional-deps
run: sudo apt-get install pigz pbzip2 isal zstd
run: sudo apt-get install pigz pbzip2 isal zstd lz4
- name: Remove xz
if: runner.os == 'Linux' && !matrix.optional-deps
run: while which xz; do sudo rm $(which xz); done
Expand Down
10 changes: 7 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ Supported compression formats are:
- gzip (``.gz``)
- bzip2 (``.bz2``)
- xz (``.xz``)
- lz4 (``.lz4``)
- Zstandard (``.zst``) (optional)


Expand Down Expand Up @@ -71,7 +72,7 @@ The function opens the file using a function suitable for the detected
file format and returns an open file-like object.

When writing, the file format is chosen based on the file name extension:
``.gz``, ``.bz2``, ``.xz``, ``.zst``. This can be overriden with ``format``.
``.gz``, ``.bz2``, ``.xz``, ``.zst``, ``.lz4``. This can be overridden with ``format``.
If the extension is not recognized, no compression is used.

When reading and a file name extension is available, the format is detected
Expand Down Expand Up @@ -99,13 +100,13 @@ preferred locale encoding.
**compresslevel**:
The compression level for writing to gzip, xz and Zstandard files.
If set to None, a default depending on the format is used:
gzip: 1, xz: 6, Zstandard: 3.
gzip: 1, xz: 6, Zstandard: 3, lz4: 0.

This parameter is ignored for other compression formats.

**format**:
Override the autodetection of the input or output format.
Possible values are: ``"gz"``, ``"xz"``, ``"bz2"``, ``"zst"``.
Possible values are: ``"gz"``, ``"xz"``, ``"bz2"``, ``"zst"``, ``"lz4"``.

**threads**:
Set the number of additional threads spawned for compression or decompression.
Expand Down Expand Up @@ -138,6 +139,9 @@ built-in support for multithreaded compression.

For bz2 files, `pbzip2 (parallel bzip2) <http://compression.great-site.net/pbzip2/>`_ is used.

For lz4 files, the `python-lz4 <https://python-lz4.readthedocs.io/en/stable/index.html>`_
package is used.

``xopen`` falls back to Python’s built-in functions
(``gzip.open``, ``lzma.open``, ``bz2.open``)
if none of the other methods can be used.
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ requires-python = ">=3.9"
dynamic = ["version"]
dependencies = [
'isal>=1.6.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"',
'zlib-ng>=0.4.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"'
'zlib-ng>=0.4.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"',
'lz4>4.3.1; platform_python_implementation != "PyPy"',
]

[project.urls]
Expand Down
62 changes: 56 additions & 6 deletions src/xopen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
XOPEN_DEFAULT_BZ2_COMPRESSION = 9
XOPEN_DEFAULT_XZ_COMPRESSION = 6
XOPEN_DEFAULT_ZST_COMPRESSION = 3
XOPEN_DEFAULT_LZ4_COMPRESSION = 0

igzip: Optional[ModuleType]
isal_zlib: Optional[ModuleType]
Expand Down Expand Up @@ -70,6 +71,11 @@
except ImportError:
zstandard = None # type: ignore

try:
import lz4.frame # type: ignore
except ImportError:
lz4 = None

try:
import fcntl

Expand Down Expand Up @@ -120,6 +126,7 @@ class _ProgramSettings:
"zstd": _ProgramSettings(("zstd",), tuple(range(1, 20)), "-T"),
"pigz": _ProgramSettings(("pigz", "--no-name"), tuple(range(0, 10)) + (11,), "-p"),
"gzip": _ProgramSettings(("gzip", "--no-name"), tuple(range(1, 10))),
"lz4": _ProgramSettings(("lz4",), tuple(range(0, 17))),
}


Expand Down Expand Up @@ -551,6 +558,42 @@ def _open_zst(
return io.BufferedWriter(f) # mode "ab" and "wb"


def _open_lz4(
    filename: FileOrPath,
    mode: str,
    compresslevel: Optional[int],
    threads: Optional[int],
):
    """
    Open an lz4-compressed file in binary mode.

    mode must be one of "rb", "ab" or "wb".

    compresslevel is the lz4 compression level. If it is None,
    XOPEN_DEFAULT_LZ4_COMPRESSION is used.

    threads selects the backend. If the Python lz4 bindings could be
    imported, they are used for reading, and for writing or appending
    when threads is 0. In all other cases the external ``lz4`` program
    is run through _PipedCompressionProgram.

    Return an lz4.frame.LZ4FrameFile or a _PipedCompressionProgram.
    If starting the external program raises OSError, it is retried once
    without a threads flag; an OSError from that second attempt
    propagates to the caller.
    """
    assert mode in ("rb", "ab", "wb")
    if compresslevel is None:
        compresslevel = XOPEN_DEFAULT_LZ4_COMPRESSION

    use_python_bindings = lz4 is not None and (
        mode == "rb" or (mode in ("ab", "wb") and threads == 0)
    )
    if use_python_bindings:
        return lz4.frame.LZ4FrameFile(filename, mode, compression_level=compresslevel)

    # No "else" needed: the branch above always returns.
    # Fall back to the command-line program.
    program_settings = _PROGRAM_SETTINGS["lz4"]
    try:
        return _PipedCompressionProgram(
            filename,
            mode,
            compresslevel,
            threads,
            program_settings=program_settings,
        )
    except OSError:
        # Retry without a threads flag. Work on a copy so that the shared
        # module-level _PROGRAM_SETTINGS entry is never modified in place.
        import copy

        settings_without_threads = copy.copy(program_settings)
        settings_without_threads.threads_flag = None
        return _PipedCompressionProgram(
            filename,
            mode,
            compresslevel,
            threads,
            program_settings=settings_without_threads,
        )


def _open_gz(
filename: FileOrPath,
mode: str,
Expand Down Expand Up @@ -683,6 +726,10 @@ def _detect_format_from_content(filename: FileOrPath) -> Optional[str]:
elif bs[:4] == b"\x28\xb5\x2f\xfd":
# https://datatracker.ietf.org/doc/html/rfc8478#section-3.1.1
return "zst"
elif bs[:4] == b"\x04\x22\x4d\x18":
# https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd rather have a link to a specification or other technical documentation here. There is no guarantee that wikipedia will keep listing the magic bytes.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed on 4c18c24

return "lz4"

return None
finally:
if closefd:
Expand All @@ -694,7 +741,7 @@ def _detect_format_from_extension(filename: Union[str, bytes]) -> Optional[str]:
Attempt to detect file format from the filename extension.
Return None if no format could be detected.
"""
for ext in ("bz2", "xz", "gz", "zst"):
for ext in ("bz2", "xz", "gz", "zst", "lz4"):
if isinstance(filename, bytes):
if filename.endswith(b"." + ext.encode()):
return ext
Expand All @@ -717,7 +764,7 @@ def _file_or_path_to_binary_stream(
# object is not binary, this will crash at a later point.
return file_or_path, False # type: ignore
raise TypeError(
f"Unsupported type for {file_or_path}, " f"{file_or_path.__class__.__name__}."
f"Unsupported type for {file_or_path}, {file_or_path.__class__.__name__}."
)


Expand Down Expand Up @@ -797,6 +844,7 @@ def xopen( # noqa: C901
- .bz2 uses bzip2 compression
- .xz uses xz/lzma compression
- .zst uses zstandard compression
- .lz4 uses lz4 compression
- otherwise, no compression is used

When reading, if a file name extension is available, the format is detected
Expand All @@ -808,7 +856,7 @@ def xopen( # noqa: C901
compresslevel is the compression level for writing to gzip, xz and zst files.
This parameter is ignored for the other compression formats.
If set to None, a default depending on the format is used:
gzip: 6, xz: 6, zstd: 3.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note to self: didn't we change the gzip level to 1?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes we did.

gzip: 1, xz: 6, zstd: 3, lz4: 0.

When threads is None (the default), compressed file formats are read or written
using a pipe to a subprocess running an external tool such as,
Expand All @@ -828,7 +876,7 @@ def xopen( # noqa: C901

format overrides the autodetection of input and output formats. This can be
useful when compressed output needs to be written to a file without an
extension. Possible values are "gz", "xz", "bz2", "zst".
extension. Possible values are "gz", "xz", "bz2", "zst", "lz4".
"""
if mode in ("r", "w", "a"):
mode += "t" # type: ignore
Expand All @@ -844,10 +892,10 @@ def xopen( # noqa: C901
elif _file_is_a_socket_or_pipe(filename):
filename = open(filename, binary_mode) # type: ignore

if format not in (None, "gz", "xz", "bz2", "zst"):
if format not in (None, "gz", "xz", "bz2", "zst", "lz4"):
raise ValueError(
f"Format not supported: {format}. "
f"Choose one of: 'gz', 'xz', 'bz2', 'zst'"
f"Choose one of: 'gz', 'xz', 'bz2', 'zst', 'lz4'."
)
detected_format = format or _detect_format_from_extension(filepath)
if detected_format is None and "r" in mode:
Expand All @@ -861,6 +909,8 @@ def xopen( # noqa: C901
opened_file = _open_bz2(filename, binary_mode, compresslevel, threads)
elif detected_format == "zst":
opened_file = _open_zst(filename, binary_mode, compresslevel, threads)
elif detected_format == "lz4":
opened_file = _open_lz4(filename, binary_mode, compresslevel, threads)
else:
opened_file, _ = _file_or_path_to_binary_stream(filename, binary_mode)

Expand Down
Binary file added tests/file.txt.lz4
Binary file not shown.
10 changes: 9 additions & 1 deletion tests/test_piped.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
_ProgramSettings,
)

extensions = ["", ".gz", ".bz2", ".xz", ".zst"]
extensions = ["", ".gz", ".bz2", ".xz", ".zst", ".lz4"]

try:
import fcntl
Expand Down Expand Up @@ -57,16 +57,24 @@ def available_zstd_programs():
return []


def available_lz4_programs():
    """
    Return a one-element list holding the ``lz4`` program settings if an
    ``lz4`` executable is found by shutil.which, otherwise an empty list.
    """
    lz4_executable = shutil.which("lz4")
    if not lz4_executable:
        return []
    return [_PROGRAM_SETTINGS["lz4"]]


PIPED_GZIP_PROGRAMS = available_gzip_programs()
PIPED_BZIP2_PROGRAMS = available_bzip2_programs()
PIPED_XZ_PROGRAMS = available_xz_programs()
PIPED_ZST_PROGRAMS = available_zstd_programs()
PIPED_LZ4_PROGRAMS = available_lz4_programs()

ALL_PROGRAMS_WITH_EXTENSION = (
list(zip(PIPED_GZIP_PROGRAMS, cycle([".gz"])))
+ list(zip(PIPED_BZIP2_PROGRAMS, cycle([".bz2"])))
+ list(zip(PIPED_XZ_PROGRAMS, cycle([".xz"])))
+ list(zip(PIPED_ZST_PROGRAMS, cycle([".zst"])))
+ list(zip(PIPED_LZ4_PROGRAMS, cycle([".lz4"])))
)


Expand Down
Loading
Loading