|
| 1 | +""" |
| 2 | +This module provides the compatibility for :py:mod:`numcodecs` in Zarr version 3. |
| 3 | +
|
| 4 | +A compatibility module is required because the codec handling in Zarr version 3 is different from Zarr version 2. |
| 5 | +
|
| 6 | +You can use codecs from :py:mod:`numcodecs` by constructing codecs from :py:mod:`numcodecs.zarr3` using the same parameters as the original codecs. |
| 7 | +
|
| 8 | +>>> import zarr |
| 9 | +>>> import numcodecs.zarr3 |
| 10 | +>>> import numpy as np |
| 11 | +>>> array = zarr.create_array( |
| 12 | +... store="data.zarr", |
| 13 | +... shape=(1024, 1024), |
| 14 | +... chunks=(64, 64), |
| 15 | +... dtype="uint32", |
| 16 | +... filters=[numcodecs.zarr3.Delta()], |
| 17 | +... compressors=[numcodecs.zarr3.BZ2(level=5)]) |
| 18 | +>>> array[:] = np.arange(array.size).reshape(array.shape).astype(array.dtype) |
| 19 | +
|
| 20 | +.. note:: |
| 21 | +
|
| 22 | + Please note that the codecs in :py:mod:`numcodecs.zarr3` are not part of the Zarr version 3 specification. |
| 23 | + Using these codecs might cause interoperability issues with other Zarr implementations. |
| 24 | +""" |
| 25 | + |
| 26 | +from __future__ import annotations |
| 27 | + |
| 28 | +import asyncio |
| 29 | +import math |
| 30 | +from dataclasses import dataclass, replace |
| 31 | +from functools import cached_property |
| 32 | +from typing import TYPE_CHECKING, Any, Self |
| 33 | +from warnings import warn |
| 34 | + |
| 35 | +import numpy as np |
| 36 | + |
| 37 | +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec |
| 38 | +from zarr.abc.metadata import Metadata |
| 39 | +from zarr.core.buffer.cpu import as_numpy_array_wrapper |
| 40 | +from zarr.core.common import JSON, parse_named_configuration, product |
| 41 | +from zarr.dtype import UInt8, ZDType, parse_dtype |
| 42 | +from zarr.errors import ZarrUserWarning |
| 43 | +from zarr.registry import get_numcodec |
| 44 | + |
| 45 | +if TYPE_CHECKING: |
| 46 | + from zarr.abc.numcodec import Numcodec |
| 47 | + from zarr.core.array_spec import ArraySpec |
| 48 | + from zarr.core.buffer import Buffer, BufferPrototype, NDBuffer |
| 49 | + |
| 50 | +CODEC_PREFIX = "numcodecs." |
| 51 | + |
| 52 | + |
| 53 | +def _expect_name_prefix(codec_name: str) -> str: |
| 54 | + if not codec_name.startswith(CODEC_PREFIX): |
| 55 | + raise ValueError( |
| 56 | + f"Expected name to start with '{CODEC_PREFIX}'. Got {codec_name} instead." |
| 57 | + ) # pragma: no cover |
| 58 | + return codec_name.removeprefix(CODEC_PREFIX) |
| 59 | + |
| 60 | + |
def _parse_codec_configuration(data: dict[str, JSON]) -> dict[str, JSON]:
    """Convert a Zarr v3 named-configuration dict into a numcodecs-style config.

    Parameters
    ----------
    data
        Codec metadata, split into a name and a configuration mapping by
        ``parse_named_configuration``.

    Returns
    -------
    dict[str, JSON]
        The configuration entries plus an ``"id"`` key holding the codec name
        with ``CODEC_PREFIX`` stripped. The configuration is unpacked after
        ``"id"``, so an ``"id"`` entry inside it takes precedence.

    Raises
    ------
    ValueError
        If the parsed name does not start with ``CODEC_PREFIX``.
    """
    parsed_name, parsed_configuration = parse_named_configuration(data)
    # ``_expect_name_prefix`` already validates the prefix and raises the same
    # ValueError message, so no separate check is needed here.
    codec_id = _expect_name_prefix(parsed_name)
    return {"id": codec_id, **parsed_configuration}
| 69 | + |
| 70 | + |
@dataclass(frozen=True)
class _NumcodecsCodec(Metadata):
    """Shared base for the Zarr v3 wrappers around numcodecs codecs.

    Public codec classes are created by subclassing with the ``codec_name``
    class keyword (for example ``class Delta(..., codec_name="delta")``), which
    sets the class-level ``codec_name`` to ``CODEC_PREFIX + codec_name``.
    """

    # Fully prefixed codec name; assigned on the class by ``__init_subclass__``.
    codec_name: str
    # numcodecs-style configuration, always including an ``"id"`` entry.
    codec_config: dict[str, JSON]

    def __init_subclass__(cls, *, codec_name: str | None = None, **kwargs: Any) -> None:
        """To be used only when creating the actual public-facing codec class."""
        super().__init_subclass__(**kwargs)
        if codec_name is not None:
            namespace = codec_name

            cls_name = f"{CODEC_PREFIX}{namespace}.{cls.__name__}"
            cls.codec_name = f"{CODEC_PREFIX}{namespace}"
            # Replaces any docstring written in the subclass body.
            cls.__doc__ = f"""
            See :class:`{cls_name}` for more details and parameters.
            """

    def __init__(self, **codec_config: JSON) -> None:
        """Store the codec configuration and warn about spec compatibility.

        Parameters
        ----------
        **codec_config
            Keyword arguments forwarded to the underlying numcodecs codec. An
            ``"id"`` entry is optional, but if given it must equal the
            unprefixed codec name.

        Raises
        ------
        ValueError
            If the class has no ``codec_name``, if the name lacks
            ``CODEC_PREFIX``, or if a supplied ``"id"`` does not match.
        """
        # ``codec_name`` only exists as a class attribute when the subclass was
        # created with ``codec_name=...``; use getattr so a missing name gives
        # the ValueError below instead of an AttributeError.
        codec_name = getattr(self, "codec_name", None)
        if not codec_name:
            raise ValueError(
                "The codec name needs to be supplied through the `codec_name` attribute."
            )  # pragma: no cover
        unprefixed_codec_name = _expect_name_prefix(codec_name)

        if "id" not in codec_config:
            codec_config = {"id": unprefixed_codec_name, **codec_config}
        elif codec_config["id"] != unprefixed_codec_name:
            raise ValueError(
                f"Codec id does not match {unprefixed_codec_name}. Got: {codec_config['id']}."
            )  # pragma: no cover

        # The dataclass is frozen, so normal attribute assignment is blocked.
        object.__setattr__(self, "codec_config", codec_config)
        warn(
            "Numcodecs codecs are not in the Zarr version 3 specification and "
            "may not be supported by other zarr implementations.",
            category=ZarrUserWarning,
            stacklevel=2,
        )

    @cached_property
    def _codec(self) -> Numcodec:
        """The numcodecs codec built from ``codec_config`` (created once, then cached)."""
        return get_numcodec(self.codec_config)  # type: ignore[arg-type]

    @classmethod
    def from_dict(cls, data: dict[str, JSON]) -> Self:
        """Build an instance from Zarr v3 ``{"name", "configuration"}`` metadata."""
        codec_config = _parse_codec_configuration(data)
        return cls(**codec_config)

    def to_dict(self) -> dict[str, JSON]:
        """Return the metadata form: prefixed name plus configuration without ``"id"``."""
        configuration = {key: value for key, value in self.codec_config.items() if key != "id"}
        return {
            "name": self.codec_name,
            "configuration": configuration,
        }

    def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int:
        """Not implemented in this base class; always raises ``NotImplementedError``."""
        raise NotImplementedError  # pragma: no cover

    # Override __repr__ because dynamically constructed classes don't seem to work otherwise
    def __repr__(self) -> str:
        codec_config = {key: value for key, value in self.codec_config.items() if key != "id"}
        return f"{self.__class__.__name__}(codec_name={self.codec_name!r}, codec_config={codec_config!r})"
| 135 | + |
| 136 | + |
class _NumcodecsBytesBytesCodec(_NumcodecsCodec, BytesBytesCodec):
    """Base for numcodecs codecs that map a byte buffer to a byte buffer."""

    def __init__(self, **codec_config: JSON) -> None:
        # Pure pass-through to the base initializer.
        super().__init__(**codec_config)

    async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer:
        """Decode one chunk in a worker thread using the wrapped codec's ``decode``."""
        decode = self._codec.decode
        return await asyncio.to_thread(
            as_numpy_array_wrapper, decode, chunk_data, chunk_spec.prototype
        )

    def _encode(self, chunk_data: Buffer, prototype: BufferPrototype) -> Buffer:
        """Synchronously encode *chunk_data* and wrap the result in a ``prototype`` buffer."""
        encoded = self._codec.encode(chunk_data.as_array_like())
        # Required for checksum codecs: an ndarray result is converted to bytes first.
        payload = encoded.tobytes() if isinstance(encoded, np.ndarray) else encoded
        return prototype.buffer.from_bytes(payload)

    async def _encode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer:
        """Encode one chunk by running ``_encode`` in a worker thread."""
        prototype = chunk_spec.prototype
        return await asyncio.to_thread(self._encode, chunk_data, prototype)
| 157 | + |
| 158 | + |
class _NumcodecsArrayArrayCodec(_NumcodecsCodec, ArrayArrayCodec):
    """Base for numcodecs codecs that map an array to an array ("filters")."""

    def __init__(self, **codec_config: JSON) -> None:
        # Pure pass-through to the base initializer.
        super().__init__(**codec_config)

    async def _decode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer:
        """Decode one chunk in a worker thread and reshape it to ``chunk_spec.shape``."""
        decoded = await asyncio.to_thread(self._codec.decode, chunk_data.as_ndarray_like())
        reshaped = decoded.reshape(chunk_spec.shape)
        return chunk_spec.prototype.nd_buffer.from_ndarray_like(reshaped)

    async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer:
        """Encode one chunk in a worker thread; the result is wrapped without reshaping."""
        encoded = await asyncio.to_thread(self._codec.encode, chunk_data.as_ndarray_like())
        return chunk_spec.prototype.nd_buffer.from_ndarray_like(encoded)
| 172 | + |
| 173 | + |
class _NumcodecsArrayBytesCodec(_NumcodecsCodec, ArrayBytesCodec):
    """Base for numcodecs codecs that serialize an array into a byte buffer."""

    def __init__(self, **codec_config: JSON) -> None:
        # Pure pass-through to the base initializer.
        super().__init__(**codec_config)

    async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> NDBuffer:
        """Decode the chunk's bytes in a worker thread and reshape to ``chunk_spec.shape``."""
        decoded = await asyncio.to_thread(self._codec.decode, chunk_data.to_bytes())
        reshaped = decoded.reshape(chunk_spec.shape)
        return chunk_spec.prototype.nd_buffer.from_ndarray_like(reshaped)

    async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> Buffer:
        """Encode the chunk array in a worker thread and wrap the result as a byte buffer."""
        encoded = await asyncio.to_thread(self._codec.encode, chunk_data.as_ndarray_like())
        return chunk_spec.prototype.buffer.from_bytes(encoded)
| 187 | + |
| 188 | + |
| 189 | +# bytes-to-bytes codecs |
# Each class below adds no behavior of its own. The ``codec_name`` class keyword
# selects the wrapped numcodecs codec id; the registered name becomes
# ``CODEC_PREFIX + codec_name`` (e.g. "numcodecs.blosc").
class Blosc(_NumcodecsBytesBytesCodec, codec_name="blosc"):
    pass


class LZ4(_NumcodecsBytesBytesCodec, codec_name="lz4"):
    pass


class Zstd(_NumcodecsBytesBytesCodec, codec_name="zstd"):
    pass


class Zlib(_NumcodecsBytesBytesCodec, codec_name="zlib"):
    pass


class GZip(_NumcodecsBytesBytesCodec, codec_name="gzip"):
    pass


class BZ2(_NumcodecsBytesBytesCodec, codec_name="bz2"):
    pass


class LZMA(_NumcodecsBytesBytesCodec, codec_name="lzma"):
    pass
| 216 | + |
| 217 | + |
# Wrapper for the "shuffle" codec; fills in ``elementsize`` from the array dtype
# when the user did not provide one.
class Shuffle(_NumcodecsBytesBytesCodec, codec_name="shuffle"):
    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Shuffle:
        # An explicit ``elementsize`` is kept as-is.
        if self.codec_config.get("elementsize") is not None:
            return self  # pragma: no cover
        native_dtype = array_spec.dtype.to_native_dtype()
        return Shuffle(**(self.codec_config | {"elementsize": native_dtype.itemsize}))
| 224 | + |
| 225 | + |
| 226 | +# array-to-array codecs ("filters") |
# Wrapper for the "delta" codec; the encoded chunk takes the ``astype`` dtype
# when that option is configured.
class Delta(_NumcodecsArrayArrayCodec, codec_name="delta"):
    def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec:
        astype = self.codec_config.get("astype")
        if not astype:
            # No (or falsy) ``astype``: the chunk spec is unchanged.
            return chunk_spec
        encoded_dtype = parse_dtype(np.dtype(astype), zarr_format=3)  # type: ignore[call-overload]
        return replace(chunk_spec, dtype=encoded_dtype)
| 233 | + |
| 234 | + |
# Wrapper for the "bitround" codec; no behavior beyond the array-to-array base.
class BitRound(_NumcodecsArrayArrayCodec, codec_name="bitround"):
    pass
| 237 | + |
| 238 | + |
# Wrapper for the "fixedscaleoffset" codec. ``astype`` (if set) determines the
# encoded dtype; ``dtype`` is filled in from the array spec when missing.
class FixedScaleOffset(_NumcodecsArrayArrayCodec, codec_name="fixedscaleoffset"):
    def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec:
        astype = self.codec_config.get("astype")
        if not astype:
            # No (or falsy) ``astype``: the chunk spec is unchanged.
            return chunk_spec
        encoded_dtype = parse_dtype(np.dtype(astype), zarr_format=3)  # type: ignore[call-overload]
        return replace(chunk_spec, dtype=encoded_dtype)

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> FixedScaleOffset:
        # An explicit ``dtype`` is kept as-is.
        if self.codec_config.get("dtype") is not None:
            return self
        native_dtype = array_spec.dtype.to_native_dtype()
        return FixedScaleOffset(**(self.codec_config | {"dtype": str(native_dtype)}))
| 251 | + |
| 252 | + |
# Wrapper for the "quantize" codec; ``dtype`` is filled in from the array spec
# when the user did not provide one.
class Quantize(_NumcodecsArrayArrayCodec, codec_name="quantize"):
    def __init__(self, **codec_config: JSON) -> None:
        # Pure pass-through to the parent initializer.
        super().__init__(**codec_config)

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Quantize:
        # An explicit ``dtype`` is kept as-is.
        if self.codec_config.get("dtype") is not None:
            return self
        native_dtype = array_spec.dtype.to_native_dtype()
        return Quantize(**(self.codec_config | {"dtype": str(native_dtype)}))
| 262 | + |
| 263 | + |
# Wrapper for the "packbits" codec. The encoded chunk is described as a 1-D
# uint8 array of length ``1 + ceil(n_elements / 8)``.
class PackBits(_NumcodecsArrayArrayCodec, codec_name="packbits"):
    def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec:
        n_elements = product(chunk_spec.shape)
        packed_length = 1 + math.ceil(n_elements / 8)
        return replace(chunk_spec, shape=(packed_length,), dtype=UInt8())

    # TODO (carried over from the original author): revisit once this class can
    # be defined w.r.t. a single zarr dtype API.
    def validate(self, *, dtype: ZDType[Any, Any], **_kwargs: Any) -> None:
        # NOTE(review): the original author flagged this check as
        # "bugged and will fail" without saying why; confirm before relying on it.
        native_dtype = dtype.to_native_dtype()
        bool_dtype = np.dtype("bool")
        if native_dtype != bool_dtype:
            raise ValueError(f"Packbits filter requires bool dtype. Got {dtype}.")
| 279 | + |
| 280 | + |
# Wrapper for the "astype" codec. The encoded dtype comes from the required
# ``encode_dtype`` option; ``decode_dtype`` is filled in from the array spec
# when missing.
class AsType(_NumcodecsArrayArrayCodec, codec_name="astype"):
    def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec:
        encoded_dtype = parse_dtype(np.dtype(self.codec_config["encode_dtype"]), zarr_format=3)  # type: ignore[arg-type]
        return replace(chunk_spec, dtype=encoded_dtype)

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> AsType:
        # An explicit ``decode_dtype`` is kept as-is.
        if self.codec_config.get("decode_dtype") is not None:
            return self
        # TODO: remove these coverage exemptions the correct way, i.e. with tests
        native_dtype = array_spec.dtype.to_native_dtype()  # pragma: no cover
        return AsType(**(self.codec_config | {"decode_dtype": str(native_dtype)}))  # pragma: no cover
| 292 | + |
| 293 | + |
| 294 | +# bytes-to-bytes checksum codecs |
class _NumcodecsChecksumCodec(_NumcodecsBytesBytesCodec):
    """Base for the checksum codecs; reports an encoded size 4 bytes above the input."""

    def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int:
        # ``chunk_spec`` is not used in the computation.
        return 4 + input_byte_length  # pragma: no cover
| 298 | + |
| 299 | + |
# Checksum codecs: each class only selects its numcodecs codec id through the
# ``codec_name`` class keyword and inherits everything else.
class CRC32(_NumcodecsChecksumCodec, codec_name="crc32"):
    pass


class CRC32C(_NumcodecsChecksumCodec, codec_name="crc32c"):
    pass


class Adler32(_NumcodecsChecksumCodec, codec_name="adler32"):
    pass


class Fletcher32(_NumcodecsChecksumCodec, codec_name="fletcher32"):
    pass


class JenkinsLookup3(_NumcodecsChecksumCodec, codec_name="jenkins_lookup3"):
    pass
| 318 | + |
| 319 | + |
| 320 | +# array-to-bytes codecs |
# Array-to-bytes codecs: each class only selects its numcodecs codec id through
# the ``codec_name`` class keyword and inherits everything else.
class PCodec(_NumcodecsArrayBytesCodec, codec_name="pcodec"):
    pass


class ZFPY(_NumcodecsArrayBytesCodec, codec_name="zfpy"):
    pass
| 327 | + |
| 328 | + |
# Public API of this module: the user-facing codec classes only. The private
# ``_Numcodecs*`` base classes and helper functions are not listed.
__all__ = [
    "BZ2",
    "CRC32",
    "CRC32C",
    "LZ4",
    "LZMA",
    "ZFPY",
    "Adler32",
    "AsType",
    "BitRound",
    "Blosc",
    "Delta",
    "FixedScaleOffset",
    "Fletcher32",
    "GZip",
    "JenkinsLookup3",
    "PCodec",
    "PackBits",
    "Quantize",
    "Shuffle",
    "Zlib",
    "Zstd",
]
0 commit comments