Skip to content

Commit 0183eb5

Browse files
committed
bring in contents of numcodecs.zarr3
1 parent c21d1f9 commit 0183eb5

File tree

1 file changed

+351
-0
lines changed

1 file changed

+351
-0
lines changed

src/zarr/codecs/_numcodecs.py

Lines changed: 351 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,351 @@
1+
"""
2+
This module provides the compatibility for :py:mod:`numcodecs` in Zarr version 3.
3+
4+
A compatibility module is required because the codec handling in Zarr version 3 is different from Zarr version 2.
5+
6+
You can use codecs from :py:mod:`numcodecs` by constructing codecs from :py:mod:`numcodecs.zarr3` using the same parameters as the original codecs.
7+
8+
>>> import zarr
9+
>>> import numcodecs.zarr3
>>> import numpy as np
10+
>>>
11+
>>> array = zarr.create_array(
12+
... store="data.zarr",
13+
... shape=(1024, 1024),
14+
... chunks=(64, 64),
15+
... dtype="uint32",
16+
... filters=[numcodecs.zarr3.Delta()],
17+
... compressors=[numcodecs.zarr3.BZ2(level=5)])
18+
>>> array[:] = np.arange(np.prod(array.shape)).reshape(array.shape).astype(array.dtype)
19+
20+
.. note::
21+
22+
Please note that the codecs in :py:mod:`numcodecs.zarr3` are not part of the Zarr version 3 specification.
23+
Using these codecs might cause interoperability issues with other Zarr implementations.
24+
"""
25+
26+
from __future__ import annotations
27+
28+
import asyncio
29+
import math
30+
from dataclasses import dataclass, replace
31+
from functools import cached_property
32+
from typing import TYPE_CHECKING, Any, Self
33+
from warnings import warn
34+
35+
import numpy as np
36+
37+
from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec
38+
from zarr.abc.metadata import Metadata
39+
from zarr.core.buffer.cpu import as_numpy_array_wrapper
40+
from zarr.core.common import JSON, parse_named_configuration, product
41+
from zarr.dtype import UInt8, ZDType, parse_dtype
42+
from zarr.errors import ZarrUserWarning
43+
from zarr.registry import get_numcodec
44+
45+
if TYPE_CHECKING:
46+
from zarr.abc.numcodec import Numcodec
47+
from zarr.core.array_spec import ArraySpec
48+
from zarr.core.buffer import Buffer, BufferPrototype, NDBuffer
49+
50+
# Namespace prefix carried by every codec name handled in this module; the
# helpers below strip it to obtain the bare numcodecs codec id.
CODEC_PREFIX = "numcodecs."
51+
52+
53+
def _expect_name_prefix(codec_name: str) -> str:
    """Strip the ``CODEC_PREFIX`` namespace from a codec name.

    Parameters
    ----------
    codec_name
        Prefixed codec name, e.g. ``"numcodecs.blosc"``.

    Returns
    -------
    str
        ``codec_name`` with the leading ``CODEC_PREFIX`` removed.

    Raises
    ------
    ValueError
        If ``codec_name`` does not start with ``CODEC_PREFIX``.
    """
    if codec_name.startswith(CODEC_PREFIX):
        return codec_name.removeprefix(CODEC_PREFIX)
    raise ValueError(
        f"Expected name to start with '{CODEC_PREFIX}'. Got {codec_name} instead."
    )  # pragma: no cover
59+
60+
61+
def _parse_codec_configuration(data: dict[str, JSON]) -> dict[str, JSON]:
    """Turn a named codec configuration into a flat config dict with an ``"id"``.

    Parameters
    ----------
    data
        Codec metadata, passed to ``parse_named_configuration`` to obtain the
        codec name and its configuration mapping.

    Returns
    -------
    dict[str, JSON]
        A dict whose ``"id"`` entry is the codec name without ``CODEC_PREFIX``,
        followed by the entries of the parsed configuration. An ``"id"`` key
        inside the parsed configuration takes precedence, because it is
        unpacked last.

    Raises
    ------
    ValueError
        If the parsed name does not start with ``CODEC_PREFIX``.
    """
    parsed_name, parsed_configuration = parse_named_configuration(data)
    # `_expect_name_prefix` already validates the prefix and raises the same
    # ValueError, so no separate startswith() check is needed here.
    codec_id = _expect_name_prefix(parsed_name)
    return {"id": codec_id, **parsed_configuration}
69+
70+
71+
@dataclass(frozen=True)
class _NumcodecsCodec(Metadata):
    """Shared base for the codec wrappers defined in this module.

    Concrete codec classes supply their name through the ``codec_name``
    keyword of the class statement (see ``__init_subclass__``). Instances hold
    the codec configuration, including an ``"id"`` entry, in ``codec_config``
    and obtain the wrapped codec lazily through ``get_numcodec``.
    """

    codec_name: str
    codec_config: dict[str, JSON]

    def __init_subclass__(cls, *, codec_name: str | None = None, **kwargs: Any) -> None:
        """To be used only when creating the actual public-facing codec class."""
        super().__init_subclass__(**kwargs)
        if codec_name is None:
            # Intermediate base classes do not name a codec; nothing to set up.
            return

        prefixed_name = f"{CODEC_PREFIX}{codec_name}"
        cls_name = f"{prefixed_name}.{cls.__name__}"
        cls.codec_name = prefixed_name
        cls.__doc__ = f"""
        See :class:`{cls_name}` for more details and parameters.
        """

    def __init__(self, **codec_config: JSON) -> None:
        """Store the codec configuration and warn about limited interoperability.

        Parameters
        ----------
        **codec_config
            Codec parameters. ``"id"`` may be omitted; when given it must match
            the un-prefixed ``codec_name`` of the class.

        Raises
        ------
        ValueError
            If ``codec_name`` is empty, lacks ``CODEC_PREFIX``, or disagrees
            with a supplied ``"id"``.
        """
        if not self.codec_name:
            raise ValueError(
                "The codec name needs to be supplied through the `codec_name` attribute."
            )  # pragma: no cover
        expected_id = _expect_name_prefix(self.codec_name)

        if "id" in codec_config:
            if codec_config["id"] != expected_id:
                raise ValueError(
                    f"Codec id does not match {expected_id}. Got: {codec_config['id']}."
                )  # pragma: no cover
        else:
            codec_config = {"id": expected_id, **codec_config}

        # The dataclass is frozen, so the attribute is written via object.__setattr__.
        object.__setattr__(self, "codec_config", codec_config)
        warn(
            "Numcodecs codecs are not in the Zarr version 3 specification and "
            "may not be supported by other zarr implementations.",
            category=ZarrUserWarning,
            stacklevel=2,
        )

    @cached_property
    def _codec(self) -> Numcodec:
        """The codec object returned by ``get_numcodec`` for ``codec_config`` (cached)."""
        return get_numcodec(self.codec_config)  # type: ignore[arg-type]

    @classmethod
    def from_dict(cls, data: dict[str, JSON]) -> Self:
        """Build an instance from a named codec configuration dict."""
        return cls(**_parse_codec_configuration(data))

    def to_dict(self) -> dict[str, JSON]:
        """Return ``{"name": ..., "configuration": ...}`` with ``"id"`` left out of the configuration."""
        configuration = {key: value for key, value in self.codec_config.items() if key != "id"}
        return {"name": self.codec_name, "configuration": configuration}

    def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int:
        """Not implemented in this base class."""
        raise NotImplementedError  # pragma: no cover

    # Override __repr__ because dynamically constructed classes don't seem to work otherwise
    def __repr__(self) -> str:
        shown_config = {key: value for key, value in self.codec_config.items() if key != "id"}
        cls_label = self.__class__.__name__
        return f"{cls_label}(codec_name={self.codec_name!r}, codec_config={shown_config!r})"
135+
136+
137+
class _NumcodecsBytesBytesCodec(_NumcodecsCodec, BytesBytesCodec):
    """Base for codecs that take a ``Buffer`` and return a ``Buffer``."""

    def __init__(self, **codec_config: JSON) -> None:
        super().__init__(**codec_config)

    async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer:
        """Decode one chunk in a worker thread via ``as_numpy_array_wrapper``."""
        decoded = await asyncio.to_thread(
            as_numpy_array_wrapper,
            self._codec.decode,
            chunk_data,
            chunk_spec.prototype,
        )
        return decoded

    def _encode(self, chunk_data: Buffer, prototype: BufferPrototype) -> Buffer:
        """Encode ``chunk_data`` synchronously and wrap the result in a ``Buffer``."""
        encoded = self._codec.encode(chunk_data.as_array_like())
        # Required for checksum codecs: an ndarray result is converted to bytes first.
        payload = encoded.tobytes() if isinstance(encoded, np.ndarray) else encoded
        return prototype.buffer.from_bytes(payload)

    async def _encode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer:
        """Run ``_encode`` for one chunk in a worker thread."""
        buffer_prototype = chunk_spec.prototype
        return await asyncio.to_thread(self._encode, chunk_data, buffer_prototype)
157+
158+
159+
class _NumcodecsArrayArrayCodec(_NumcodecsCodec, ArrayArrayCodec):
    """Base for codecs that take an ``NDBuffer`` and return an ``NDBuffer``."""

    def __init__(self, **codec_config: JSON) -> None:
        super().__init__(**codec_config)

    async def _decode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer:
        """Decode one chunk in a worker thread and reshape it to ``chunk_spec.shape``."""
        encoded_array = chunk_data.as_ndarray_like()
        decoded_array = await asyncio.to_thread(self._codec.decode, encoded_array)
        reshaped = decoded_array.reshape(chunk_spec.shape)
        return chunk_spec.prototype.nd_buffer.from_ndarray_like(reshaped)

    async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer:
        """Encode one chunk in a worker thread; the result is wrapped without reshaping."""
        raw_array = chunk_data.as_ndarray_like()
        encoded_array = await asyncio.to_thread(self._codec.encode, raw_array)
        return chunk_spec.prototype.nd_buffer.from_ndarray_like(encoded_array)
172+
173+
174+
class _NumcodecsArrayBytesCodec(_NumcodecsCodec, ArrayBytesCodec):
    """Base for codecs that encode an ``NDBuffer`` to a ``Buffer`` and decode it back."""

    def __init__(self, **codec_config: JSON) -> None:
        super().__init__(**codec_config)

    async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> NDBuffer:
        """Decode the chunk's bytes in a worker thread and reshape to ``chunk_spec.shape``."""
        raw_bytes = chunk_data.to_bytes()
        decoded_array = await asyncio.to_thread(self._codec.decode, raw_bytes)
        reshaped = decoded_array.reshape(chunk_spec.shape)
        return chunk_spec.prototype.nd_buffer.from_ndarray_like(reshaped)

    async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> Buffer:
        """Encode one chunk in a worker thread and wrap the result in a ``Buffer``."""
        raw_array = chunk_data.as_ndarray_like()
        encoded = await asyncio.to_thread(self._codec.encode, raw_array)
        return chunk_spec.prototype.buffer.from_bytes(encoded)
187+
188+
189+
# bytes-to-bytes codecs
190+
class Blosc(_NumcodecsBytesBytesCodec, codec_name="blosc"):
    # No overrides: everything is inherited from the bytes-to-bytes base class.
    ...
192+
193+
194+
class LZ4(_NumcodecsBytesBytesCodec, codec_name="lz4"):
    # No overrides: everything is inherited from the bytes-to-bytes base class.
    ...
196+
197+
198+
class Zstd(_NumcodecsBytesBytesCodec, codec_name="zstd"):
    # No overrides: everything is inherited from the bytes-to-bytes base class.
    ...
200+
201+
202+
class Zlib(_NumcodecsBytesBytesCodec, codec_name="zlib"):
    # No overrides: everything is inherited from the bytes-to-bytes base class.
    ...
204+
205+
206+
class GZip(_NumcodecsBytesBytesCodec, codec_name="gzip"):
    # No overrides: everything is inherited from the bytes-to-bytes base class.
    ...
208+
209+
210+
class BZ2(_NumcodecsBytesBytesCodec, codec_name="bz2"):
    # No overrides: everything is inherited from the bytes-to-bytes base class.
    ...
212+
213+
214+
class LZMA(_NumcodecsBytesBytesCodec, codec_name="lzma"):
    # No overrides: everything is inherited from the bytes-to-bytes base class.
    ...
216+
217+
218+
class Shuffle(_NumcodecsBytesBytesCodec, codec_name="shuffle"):
    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Shuffle:
        """Fill in ``elementsize`` from the array dtype when it is not configured.

        Returns ``self`` when ``elementsize`` is already set; otherwise a new
        ``Shuffle`` whose ``elementsize`` is the item size of the array's
        native dtype.
        """
        if self.codec_config.get("elementsize") is not None:
            return self  # pragma: no cover
        itemsize = array_spec.dtype.to_native_dtype().itemsize
        evolved_config = {**self.codec_config, "elementsize": itemsize}
        return Shuffle(**evolved_config)
224+
225+
226+
# array-to-array codecs ("filters")
227+
class Delta(_NumcodecsArrayArrayCodec, codec_name="delta"):
    def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec:
        """Return the chunk spec after encoding.

        When a truthy ``astype`` is configured, the returned spec carries that
        dtype; otherwise ``chunk_spec`` is returned unchanged.
        """
        astype = self.codec_config.get("astype")
        if not astype:
            return chunk_spec
        encoded_dtype = parse_dtype(np.dtype(astype), zarr_format=3)  # type: ignore[call-overload]
        return replace(chunk_spec, dtype=encoded_dtype)
233+
234+
235+
class BitRound(_NumcodecsArrayArrayCodec, codec_name="bitround"):
    # No overrides: everything is inherited from the array-to-array base class.
    ...
237+
238+
239+
class FixedScaleOffset(_NumcodecsArrayArrayCodec, codec_name="fixedscaleoffset"):
    def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec:
        """Return the chunk spec after encoding.

        When a truthy ``astype`` is configured, the returned spec carries that
        dtype; otherwise ``chunk_spec`` is returned unchanged.
        """
        astype = self.codec_config.get("astype")
        if not astype:
            return chunk_spec
        encoded_dtype = parse_dtype(np.dtype(astype), zarr_format=3)  # type: ignore[call-overload]
        return replace(chunk_spec, dtype=encoded_dtype)

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> FixedScaleOffset:
        """Fill in ``dtype`` from the array's native dtype when it is not configured."""
        if self.codec_config.get("dtype") is not None:
            return self
        native_dtype = array_spec.dtype.to_native_dtype()
        evolved_config = {**self.codec_config, "dtype": str(native_dtype)}
        return FixedScaleOffset(**evolved_config)
251+
252+
253+
class Quantize(_NumcodecsArrayArrayCodec, codec_name="quantize"):
    def __init__(self, **codec_config: JSON) -> None:
        super().__init__(**codec_config)

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Quantize:
        """Fill in ``dtype`` from the array's native dtype when it is not configured."""
        if self.codec_config.get("dtype") is not None:
            return self
        native_dtype = array_spec.dtype.to_native_dtype()
        evolved_config = {**self.codec_config, "dtype": str(native_dtype)}
        return Quantize(**evolved_config)
262+
263+
264+
class PackBits(_NumcodecsArrayArrayCodec, codec_name="packbits"):
    def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec:
        """Return the spec of the encoded chunk: a 1-D ``UInt8`` array.

        Its length is one plus the element count divided by eight, rounded up.
        """
        element_count = product(chunk_spec.shape)
        # NOTE(review): the extra leading byte is presumably a header written by
        # the packbits codec — confirm against the numcodecs implementation.
        packed_length = 1 + math.ceil(element_count / 8)
        return replace(chunk_spec, shape=(packed_length,), dtype=UInt8())

    # todo: remove this type: ignore when this class can be defined w.r.t.
    # a single zarr dtype API
    def validate(self, *, dtype: ZDType[Any, Any], **_kwargs: Any) -> None:
        """Raise ``ValueError`` unless the native dtype of ``dtype`` is ``bool``."""
        # this is bugged and will fail
        if dtype.to_native_dtype() == np.dtype("bool"):
            return
        raise ValueError(f"Packbits filter requires bool dtype. Got {dtype}.")
279+
280+
281+
class AsType(_NumcodecsArrayArrayCodec, codec_name="astype"):
    def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec:
        """Return ``chunk_spec`` with its dtype replaced by the configured ``encode_dtype``."""
        encode_dtype = np.dtype(self.codec_config["encode_dtype"])  # type: ignore[arg-type]
        return replace(chunk_spec, dtype=parse_dtype(encode_dtype, zarr_format=3))

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> AsType:
        """Fill in ``decode_dtype`` from the array's native dtype when it is not configured."""
        if self.codec_config.get("decode_dtype") is not None:
            return self
        # TODO: remove these coverage exemptions the correct way, i.e. with tests
        native_dtype = array_spec.dtype.to_native_dtype()  # pragma: no cover
        return AsType(**{**self.codec_config, "decode_dtype": str(native_dtype)})  # pragma: no cover
292+
293+
294+
# bytes-to-bytes checksum codecs
295+
class _NumcodecsChecksumCodec(_NumcodecsBytesBytesCodec):
    """Base for the bytes-to-bytes checksum codecs below."""

    def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int:
        """Return the encoded size: the input length plus four bytes."""
        checksum_nbytes = 4
        return input_byte_length + checksum_nbytes  # pragma: no cover
298+
299+
300+
class CRC32(_NumcodecsChecksumCodec, codec_name="crc32"):
    # No overrides: everything is inherited from the checksum base class.
    ...
302+
303+
304+
class CRC32C(_NumcodecsChecksumCodec, codec_name="crc32c"):
    # No overrides: everything is inherited from the checksum base class.
    ...
306+
307+
308+
class Adler32(_NumcodecsChecksumCodec, codec_name="adler32"):
    # No overrides: everything is inherited from the checksum base class.
    ...
310+
311+
312+
class Fletcher32(_NumcodecsChecksumCodec, codec_name="fletcher32"):
    # No overrides: everything is inherited from the checksum base class.
    ...
314+
315+
316+
class JenkinsLookup3(_NumcodecsChecksumCodec, codec_name="jenkins_lookup3"):
    # No overrides: everything is inherited from the checksum base class.
    ...
318+
319+
320+
# array-to-bytes codecs
321+
class PCodec(_NumcodecsArrayBytesCodec, codec_name="pcodec"):
    # No overrides: everything is inherited from the array-to-bytes base class.
    ...
323+
324+
325+
class ZFPY(_NumcodecsArrayBytesCodec, codec_name="zfpy"):
    # No overrides: everything is inherited from the array-to-bytes base class.
    ...
327+
328+
329+
# Public codec classes exported by this module.
# NOTE(review): the order (all-caps names first, then the rest alphabetically)
# looks like a linter-enforced sort — keep it when adding entries.
__all__ = [
    "BZ2",
    "CRC32",
    "CRC32C",
    "LZ4",
    "LZMA",
    "ZFPY",
    "Adler32",
    "AsType",
    "BitRound",
    "Blosc",
    "Delta",
    "FixedScaleOffset",
    "Fletcher32",
    "GZip",
    "JenkinsLookup3",
    "PCodec",
    "PackBits",
    "Quantize",
    "Shuffle",
    "Zlib",
    "Zstd",
]

0 commit comments

Comments
 (0)