Skip to content

Commit ca19044

Browse files
committed
Merge branch 'handle-zarr-3.1.0' of https://github.com/d-v-b/numcodecs into handle-zarr-3.1.0
2 parents 171e5a5 + 5cad56f commit ca19044

File tree

10 files changed

+302
-30
lines changed

10 files changed

+302
-30
lines changed

docs/compression/zstd.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ Zstd
77
.. autoattribute:: codec_id
88
.. automethod:: encode
99
.. automethod:: decode
10+
.. note::
11+
If the compressed data does not contain the decompressed size, streaming
12+
decompression will be used.
1013
.. automethod:: get_config
1114
.. automethod:: from_config
1215

docs/release.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,8 @@ Maintenance
9595

9696
Improvements
9797
~~~~~~~~~~~~
98+
* Add streaming decompression for ZSTD (:issue:`699`)
99+
By :user:`Mark Kittisopikul <mkitti>`.
98100
* Raise a custom `UnknownCodecError` when trying to retrieve an unavailable codec.
99101
By :user:`Cas Wognum <cwognum>`.
100102

numcodecs/abc.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,13 @@
2929
"""
3030

3131
from abc import ABC, abstractmethod
32-
from typing import Optional
3332

3433

3534
class Codec(ABC):
3635
"""Codec abstract base class."""
3736

3837
# override in sub-class
39-
codec_id: Optional[str] = None
38+
codec_id: str | None = None
4039
"""Codec identifier."""
4140

4241
@abstractmethod

numcodecs/checksum32.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import zlib
44
from contextlib import suppress
55
from types import ModuleType
6-
from typing import Literal, Optional
6+
from typing import Literal
77

88
import numpy as np
99
from typing_extensions import Buffer
@@ -12,7 +12,7 @@
1212
from .compat import ensure_contiguous_ndarray, ndarray_copy
1313
from .jenkins import jenkins_lookup3
1414

15-
_crc32c: Optional[ModuleType] = None
15+
_crc32c: ModuleType | None = None
1616
with suppress(ImportError):
1717
import crc32c as _crc32c # type: ignore[no-redef, unused-ignore]
1818

numcodecs/lzma.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from types import ModuleType
2-
from typing import Optional
32

4-
_lzma: Optional[ModuleType] = None
3+
_lzma: ModuleType | None = None
54
try:
65
import lzma as _lzma
76
except ImportError: # pragma: no cover

numcodecs/tests/test_pyzstd.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
# Check Zstd against pyzstd package
2+
3+
import numpy as np
4+
import pytest
5+
import pyzstd
6+
7+
from numcodecs.zstd import Zstd
8+
9+
test_data = [
10+
b"Hello World!",
11+
np.arange(113).tobytes(),
12+
np.arange(10, 15).tobytes(),
13+
np.random.randint(3, 50, size=(53,), dtype=np.uint16).tobytes(),
14+
]
15+
16+
17+
@pytest.mark.parametrize("input", test_data)
18+
def test_pyzstd_simple(input):
19+
"""
20+
Test if Zstd.[decode, encode] can perform the inverse operation to
21+
pyzstd.[compress, decompress] in the simple case.
22+
"""
23+
z = Zstd()
24+
assert z.decode(pyzstd.compress(input)) == input
25+
assert pyzstd.decompress(z.encode(input)) == input
26+
27+
28+
@pytest.mark.xfail
29+
@pytest.mark.parametrize("input", test_data)
30+
def test_pyzstd_simple_multiple_frames_decode(input):
31+
"""
32+
Test decompression of two concatenated frames of known sizes
33+
34+
numcodecs.zstd.Zstd currently fails because it only assesses the size of the
35+
first frame. Rather, it should keep iterating through all the frames until
36+
the end of the input buffer.
37+
"""
38+
z = Zstd()
39+
assert pyzstd.decompress(pyzstd.compress(input) * 2) == input * 2
40+
assert z.decode(pyzstd.compress(input) * 2) == input * 2
41+
42+
43+
@pytest.mark.parametrize("input", test_data)
44+
def test_pyzstd_simple_multiple_frames_encode(input):
45+
"""
46+
Test if pyzstd can decompress two concatenated frames from Zstd.encode
47+
"""
48+
z = Zstd()
49+
assert pyzstd.decompress(z.encode(input) * 2) == input * 2
50+
51+
52+
@pytest.mark.parametrize("input", test_data)
53+
def test_pyzstd_streaming(input):
54+
"""
55+
Test if Zstd can decode a single frame and concatenated frames in streaming
56+
mode where the decompressed size is not recorded in the frame header.
57+
"""
58+
pyzstd_c = pyzstd.ZstdCompressor()
59+
pyzstd_d = pyzstd.ZstdDecompressor()
60+
pyzstd_e = pyzstd.EndlessZstdDecompressor()
61+
z = Zstd()
62+
63+
d_bytes = input
64+
pyzstd_c.compress(d_bytes)
65+
c_bytes = pyzstd_c.flush()
66+
assert z.decode(c_bytes) == d_bytes
67+
assert pyzstd_d.decompress(z.encode(d_bytes)) == d_bytes
68+
69+
# Test multiple streaming frames
70+
assert z.decode(c_bytes * 2) == pyzstd_e.decompress(c_bytes * 2)
71+
assert z.decode(c_bytes * 3) == pyzstd_e.decompress(c_bytes * 3)
72+
assert z.decode(c_bytes * 4) == pyzstd_e.decompress(c_bytes * 4)
73+
assert z.decode(c_bytes * 5) == pyzstd_e.decompress(c_bytes * 5)
74+
assert z.decode(c_bytes * 7) == pyzstd_e.decompress(c_bytes * 7)
75+
assert z.decode(c_bytes * 11) == pyzstd_e.decompress(c_bytes * 11)
76+
assert z.decode(c_bytes * 13) == pyzstd_e.decompress(c_bytes * 13)
77+
assert z.decode(c_bytes * 99) == pyzstd_e.decompress(c_bytes * 99)

numcodecs/tests/test_zarr3.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -122,8 +122,8 @@ def test_generic_filter(
122122
],
123123
)
124124

125-
a[:, :] = data.copy()
126-
a = zarr.open_array(store / "generic", mode="r")
125+
a[:, :] = data.copy()
126+
a = zarr.open_array(store / "generic", mode="r")
127127
np.testing.assert_array_equal(data, a[:, :])
128128

129129

@@ -140,8 +140,8 @@ def test_generic_filter_bitround(store: StorePath):
140140
filters=[numcodecs.zarr3.BitRound(keepbits=3)],
141141
)
142142

143-
a[:, :] = data.copy()
144-
a = zarr.open_array(store / "generic_bitround", mode="r")
143+
a[:, :] = data.copy()
144+
a = zarr.open_array(store / "generic_bitround", mode="r")
145145
assert np.allclose(data, a[:, :], atol=0.1)
146146

147147

@@ -158,8 +158,8 @@ def test_generic_filter_quantize(store: StorePath):
158158
filters=[numcodecs.zarr3.Quantize(digits=3)],
159159
)
160160

161-
a[:, :] = data.copy()
162-
a = zarr.open_array(store / "generic_quantize", mode="r")
161+
a[:, :] = data.copy()
162+
a = zarr.open_array(store / "generic_quantize", mode="r")
163163
assert np.allclose(data, a[:, :], atol=0.001)
164164

165165

@@ -177,8 +177,8 @@ def test_generic_filter_packbits(store: StorePath):
177177
filters=[numcodecs.zarr3.PackBits()],
178178
)
179179

180-
a[:, :] = data.copy()
181-
a = zarr.open_array(store / "generic_packbits", mode="r")
180+
a[:, :] = data.copy()
181+
a = zarr.open_array(store / "generic_packbits", mode="r")
182182
np.testing.assert_array_equal(data, a[:, :])
183183

184184
with pytest.raises(ValueError, match=".*requires bool dtype.*"):
@@ -217,8 +217,8 @@ def test_generic_checksum(
217217
compressors=[codec_class()],
218218
)
219219

220-
a[:, :] = data.copy()
221-
a = zarr.open_array(store / "generic_checksum", mode="r")
220+
a[:, :] = data.copy()
221+
a = zarr.open_array(store / "generic_checksum", mode="r")
222222
np.testing.assert_array_equal(data, a[:, :])
223223

224224

@@ -267,8 +267,8 @@ def test_delta_astype(store: StorePath):
267267
],
268268
)
269269

270-
a[:, :] = data.copy()
271-
a = zarr.open_array(store / "generic", mode="r")
270+
a[:, :] = data.copy()
271+
a = zarr.open_array(store / "generic", mode="r")
272272
np.testing.assert_array_equal(data, a[:, :])
273273

274274

numcodecs/tests/test_zstd.py

Lines changed: 72 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,9 @@
11
import itertools
2+
import subprocess
23

34
import numpy as np
45
import pytest
56

6-
try:
7-
from numcodecs.zstd import Zstd
8-
except ImportError: # pragma: no cover
9-
pytest.skip("numcodecs.zstd not available", allow_module_level=True)
10-
11-
127
from numcodecs.tests.common import (
138
check_backwards_compatibility,
149
check_config,
@@ -17,6 +12,7 @@
1712
check_err_encode_object_buffer,
1813
check_repr,
1914
)
15+
from numcodecs.zstd import Zstd
2016

2117
codecs = [
2218
Zstd(),
@@ -90,3 +86,73 @@ def test_native_functions():
9086
assert Zstd.default_level() == 3
9187
assert Zstd.min_level() == -131072
9288
assert Zstd.max_level() == 22
89+
90+
91+
def test_streaming_decompression():
92+
# Test input frames with unknown frame content size
93+
codec = Zstd()
94+
95+
# If the zstd command line interface is available, check the bytes
96+
cli = zstd_cli_available()
97+
if cli:
98+
view_zstd_streaming_bytes()
99+
100+
# Encode bytes directly that were the result of streaming compression
101+
bytes_val = b'(\xb5/\xfd\x00Xa\x00\x00Hello World!'
102+
dec = codec.decode(bytes_val)
103+
dec_expected = b'Hello World!'
104+
assert dec == dec_expected
105+
if cli:
106+
assert bytes_val == generate_zstd_streaming_bytes(dec_expected)
107+
assert dec_expected == generate_zstd_streaming_bytes(bytes_val, decompress=True)
108+
109+
# Two consecutive frames given as input
110+
bytes2 = bytes(bytearray(bytes_val * 2))
111+
dec2 = codec.decode(bytes2)
112+
dec2_expected = b'Hello World!Hello World!'
113+
assert dec2 == dec2_expected
114+
if cli:
115+
assert dec2_expected == generate_zstd_streaming_bytes(bytes2, decompress=True)
116+
117+
# Single long frame that decompresses to a large output
118+
bytes3 = b'(\xb5/\xfd\x00X$\x02\x00\xa4\x03ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz\x01\x00:\xfc\xdfs\x05\x05L\x00\x00\x08s\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08k\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08c\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08[\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08S\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08K\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08C\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08u\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08m\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08e\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08]\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08U\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08M\x01\x00\xfc\xff9\x10\x02M\x00\x00\x08E\x01\x00\xfc\x7f\x1d\x08\x01'
119+
dec3 = codec.decode(bytes3)
120+
dec3_expected = b'ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz' * 1024 * 32
121+
assert dec3 == dec3_expected
122+
if cli:
123+
assert bytes3 == generate_zstd_streaming_bytes(dec3_expected)
124+
assert dec3_expected == generate_zstd_streaming_bytes(bytes3, decompress=True)
125+
126+
# Garbage input results in an error
127+
bytes4 = bytes(bytearray([0, 0, 0, 0, 0, 0, 0, 0]))
128+
with pytest.raises(RuntimeError, match='Zstd decompression error: invalid input data'):
129+
codec.decode(bytes4)
130+
131+
132+
def generate_zstd_streaming_bytes(input: bytes, *, decompress: bool = False) -> bytes:
133+
"""
134+
Use the zstd command line interface to compress or decompress bytes in streaming mode.
135+
"""
136+
if decompress:
137+
args = ["-d"]
138+
else:
139+
args = []
140+
141+
p = subprocess.run(["zstd", "--no-check", *args], input=input, capture_output=True)
142+
return p.stdout
143+
144+
145+
def view_zstd_streaming_bytes():
146+
bytes_val = generate_zstd_streaming_bytes(b"Hello world!")
147+
print(f" bytes_val = {bytes_val}")
148+
149+
bytes3 = generate_zstd_streaming_bytes(
150+
b"ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz" * 1024 * 32
151+
)
152+
print(f" bytes3 = {bytes3}")
153+
154+
155+
def zstd_cli_available() -> bool:
156+
return not subprocess.run(
157+
["zstd", "-V"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
158+
).returncode

numcodecs/zfpy.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,8 @@
22
from contextlib import suppress
33
from importlib.metadata import PackageNotFoundError, version
44
from types import ModuleType
5-
from typing import Optional
65

7-
_zfpy: Optional[ModuleType] = None
6+
_zfpy: ModuleType | None = None
87

98
_zfpy_version: tuple = ()
109
with suppress(PackageNotFoundError):

0 commit comments

Comments
 (0)