diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst index 0ae8017ca9..e87187d63b 100644 --- a/docs/user-guide/config.rst +++ b/docs/user-guide/config.rst @@ -42,23 +42,43 @@ requires the value of ``codecs.bytes.name`` to be ``'custompackage.NewBytesCodec This is the current default configuration:: >>> zarr.config.pprint() - {'array': {'order': 'C', - 'write_empty_chunks': False}, - 'async': {'concurrency': 10, 'timeout': None}, - 'buffer': 'zarr.buffer.cpu.Buffer', - 'codec_pipeline': {'batch_size': 1, - 'path': 'zarr.core.codec_pipeline.BatchedCodecPipeline'}, - 'codecs': {'blosc': 'zarr.codecs.blosc.BloscCodec', + {'array': {'order': 'C', 'write_empty_chunks': False}, + 'async': {'concurrency': 10, 'timeout': None}, + 'buffer': 'zarr.buffer.cpu.Buffer', + 'codec_pipeline': {'batch_size': 1, + 'path': 'zarr.core.codec_pipeline.BatchedCodecPipeline'}, + 'codecs': {'blosc': 'zarr.codecs.blosc.BloscCodec', 'bytes': 'zarr.codecs.bytes.BytesCodec', 'crc32c': 'zarr.codecs.crc32c_.Crc32cCodec', 'endian': 'zarr.codecs.bytes.BytesCodec', 'gzip': 'zarr.codecs.gzip.GzipCodec', + 'numcodecs.adler32': 'zarr.codecs._numcodecs.Adler32', + 'numcodecs.astype': 'zarr.codecs._numcodecs.AsType', + 'numcodecs.bitround': 'zarr.codecs._numcodecs.BitRound', + 'numcodecs.blosc': 'zarr.codecs._numcodecs.Blosc', + 'numcodecs.bz2': 'zarr.codecs._numcodecs.BZ2', + 'numcodecs.crc32': 'zarr.codecs._numcodecs.CRC32', + 'numcodecs.crc32c': 'zarr.codecs._numcodecs.CRC32C', + 'numcodecs.delta': 'zarr.codecs._numcodecs.Delta', + 'numcodecs.fixedscaleoffset': 'zarr.codecs._numcodecs.FixedScaleOffset', + 'numcodecs.fletcher32': 'zarr.codecs._numcodecs.Fletcher32', + 'numcodecs.gZip': 'zarr.codecs._numcodecs.GZip', + 'numcodecs.jenkins_lookup3': 'zarr.codecs._numcodecs.JenkinsLookup3', + 'numcodecs.lz4': 'zarr.codecs._numcodecs.LZ4', + 'numcodecs.lzma': 'zarr.codecs._numcodecs.LZMA', + 'numcodecs.packbits': 'zarr.codecs._numcodecs.PackBits', + 'numcodecs.pcodec': 
'zarr.codecs._numcodecs.PCodec', + 'numcodecs.quantize': 'zarr.codecs._numcodecs.Quantize', + 'numcodecs.shuffle': 'zarr.codecs._numcodecs.Shuffle', + 'numcodecs.zfpy': 'zarr.codecs._numcodecs.ZFPY', + 'numcodecs.zlib': 'zarr.codecs._numcodecs.Zlib', + 'numcodecs.zstd': 'zarr.codecs._numcodecs.Zstd', 'sharding_indexed': 'zarr.codecs.sharding.ShardingCodec', 'transpose': 'zarr.codecs.transpose.TransposeCodec', 'vlen-bytes': 'zarr.codecs.vlen_utf8.VLenBytesCodec', 'vlen-utf8': 'zarr.codecs.vlen_utf8.VLenUTF8Codec', 'zstd': 'zarr.codecs.zstd.ZstdCodec'}, - 'default_zarr_format': 3, - 'json_indent': 2, - 'ndbuffer': 'zarr.buffer.cpu.NDBuffer', - 'threading': {'max_workers': None}} + 'default_zarr_format': 3, + 'json_indent': 2, + 'ndbuffer': 'zarr.buffer.cpu.NDBuffer', + 'threading': {'max_workers': None}} diff --git a/src/zarr/codecs/_numcodecs.py b/src/zarr/codecs/_numcodecs.py new file mode 100644 index 0000000000..0f84f57594 --- /dev/null +++ b/src/zarr/codecs/_numcodecs.py @@ -0,0 +1,374 @@ +""" +This module provides the compatibility for :py:mod:`numcodecs` in Zarr version 3. + +A compatibility module is required because the codec handling in Zarr version 3 is different from Zarr version 2. + +You can use codecs from :py:mod:`numcodecs` by constructing codecs from :py:mod:`numcodecs.zarr3` using the same parameters as the original codecs. + +>>> import zarr +>>> import numcodecs.zarr3 +>>> +>>> array = zarr.create_array( +... store="data.zarr", +... shape=(1024, 1024), +... chunks=(64, 64), +... dtype="uint32", +... filters=[numcodecs.zarr3.Delta()], +... compressors=[numcodecs.zarr3.BZ2(level=5)]) +>>> array[:] = np.arange(*array.shape).astype(array.dtype) + +.. note:: + + Please note that the codecs in :py:mod:`numcodecs.zarr3` are not part of the Zarr version 3 specification. + Using these codecs might cause interoperability issues with other Zarr implementations. 
def _parse_codec_configuration(data: dict[str, JSON]) -> dict[str, JSON]:
    """Translate Zarr v3 codec metadata into a numcodecs-style configuration.

    The codec name and its configuration mapping are extracted from ``data``
    with ``parse_named_configuration``. ``CODEC_PREFIX`` is stripped from the
    name and the remainder is returned under the ``"id"`` key together with
    the configuration entries. A configuration entry that is itself named
    ``"id"`` wins, because the configuration is unpacked last.

    Raises
    ------
    ValueError
        If the codec name does not start with ``CODEC_PREFIX``.
    """
    parsed_name, parsed_configuration = parse_named_configuration(data)
    # ``_expect_name_prefix`` validates the prefix (raising ValueError with the
    # "Expected name to start with ..." message) and strips it in one step, so
    # no separate ``startswith`` check is needed here.
    codec_id = _expect_name_prefix(parsed_name)
    return {"id": codec_id, **parsed_configuration}
class _NumcodecsBytesBytesCodec(_NumcodecsCodec, BytesBytesCodec):
    """Bytes-to-bytes adapter around a numcodecs codec.

    Encoding and decoding are delegated to ``self._codec`` and executed in a
    worker thread through ``asyncio.to_thread``.

    The initializer is deliberately inherited from ``_NumcodecsCodec``. That
    initializer emits its warning with ``stacklevel=2``; a pass-through
    ``__init__`` on this class would add one more frame, so the warning would
    be attributed to this module rather than to the code that constructs the
    codec.
    """

    async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer:
        """Decode ``chunk_data`` with the wrapped codec in a worker thread."""
        return await asyncio.to_thread(
            as_numpy_array_wrapper,
            self._codec.decode,
            chunk_data,
            chunk_spec.prototype,
        )

    def _encode(self, chunk_data: Buffer, prototype: BufferPrototype) -> Buffer:
        """Encode ``chunk_data`` synchronously and wrap the result in a ``prototype`` buffer."""
        encoded = self._codec.encode(chunk_data.as_array_like())
        # Required for checksum codecs: the encode result can be an ndarray,
        # which has to be turned into bytes before it is wrapped.
        if isinstance(encoded, np.ndarray):
            encoded = encoded.tobytes()
        return prototype.buffer.from_bytes(encoded)

    async def _encode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer:
        """Run ``_encode`` in a worker thread using the chunk's buffer prototype."""
        return await asyncio.to_thread(self._encode, chunk_data, chunk_spec.prototype)
# NOTE: no class docstring is written here because ``__init_subclass__`` assigns
# ``__doc__`` whenever ``codec_name`` is supplied.
class PackBits(_NumcodecsArrayArrayCodec, codec_name="packbits"):
    def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec:
        """Return the spec of the encoded chunk.

        The encoded chunk is described as a one-dimensional ``UInt8`` array
        holding ``1 + ceil(n / 8)`` elements, where ``n`` is the number of
        elements in the input chunk.
        """
        element_count = product(chunk_spec.shape)
        packed_length = 1 + math.ceil(element_count / 8)
        return replace(chunk_spec, shape=(packed_length,), dtype=UInt8())

    # TODO: revisit once this class can be defined w.r.t. a single zarr dtype API.
    # NOTE(review): this method was previously annotated as "bugged and will
    # fail"; the bool / non-bool branches look correct as written -- confirm.
    def validate(self, *, dtype: ZDType[Any, Any], **_kwargs: Any) -> None:
        """Reject every array dtype other than bool.

        Raises
        ------
        ValueError
            If the native dtype of ``dtype`` is not ``bool``.
        """
        native_dtype = dtype.to_native_dtype()
        if native_dtype == np.dtype("bool"):
            return
        raise ValueError(f"Packbits filter requires bool dtype. Got {dtype}.")
+register_codec("numcodecs.lz4", LZ4) +register_codec("numcodecs.lzma", LZMA) +register_codec("numcodecs.zfpy", ZFPY) +register_codec("numcodecs.adler32", Adler32) +register_codec("numcodecs.astype", AsType) +register_codec("numcodecs.bitround", BitRound) +register_codec("numcodecs.blosc", Blosc) +register_codec("numcodecs.delta", Delta) +register_codec("numcodecs.fixedscaleoffset", FixedScaleOffset) +register_codec("numcodecs.fletcher32", Fletcher32) +register_codec("numcodecs.gzip", GZip) +register_codec("numcodecs.jenkins_lookup3", JenkinsLookup3) +register_codec("numcodecs.pcodec", PCodec) +register_codec("numcodecs.packbits", PackBits) +register_codec("numcodecs.quantize", Quantize) +register_codec("numcodecs.shuffle", Shuffle) +register_codec("numcodecs.zlib", Zlib) +register_codec("numcodecs.zstd", Zstd) + +__all__ = [ + "BZ2", + "CRC32", + "CRC32C", + "LZ4", + "LZMA", + "ZFPY", + "Adler32", + "AsType", + "BitRound", + "Blosc", + "Delta", + "FixedScaleOffset", + "Fletcher32", + "GZip", + "JenkinsLookup3", + "PCodec", + "PackBits", + "Quantize", + "Shuffle", + "Zlib", + "Zstd", +] diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index cc3c33cd17..b955788642 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -125,6 +125,27 @@ def enable_gpu(self) -> ConfigSet: "transpose": "zarr.codecs.transpose.TransposeCodec", "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", + "numcodecs.bz2": "zarr.codecs._numcodecs.BZ2", + "numcodecs.crc32": "zarr.codecs._numcodecs.CRC32", + "numcodecs.crc32c": "zarr.codecs._numcodecs.CRC32C", + "numcodecs.lz4": "zarr.codecs._numcodecs.LZ4", + "numcodecs.lzma": "zarr.codecs._numcodecs.LZMA", + "numcodecs.zfpy": "zarr.codecs._numcodecs.ZFPY", + "numcodecs.adler32": "zarr.codecs._numcodecs.Adler32", + "numcodecs.astype": "zarr.codecs._numcodecs.AsType", + "numcodecs.bitround": "zarr.codecs._numcodecs.BitRound", + "numcodecs.blosc": 
"zarr.codecs._numcodecs.Blosc", + "numcodecs.delta": "zarr.codecs._numcodecs.Delta", + "numcodecs.fixedscaleoffset": "zarr.codecs._numcodecs.FixedScaleOffset", + "numcodecs.fletcher32": "zarr.codecs._numcodecs.Fletcher32", + "numcodecs.gZip": "zarr.codecs._numcodecs.GZip", + "numcodecs.jenkins_lookup3": "zarr.codecs._numcodecs.JenkinsLookup3", + "numcodecs.pcodec": "zarr.codecs._numcodecs.PCodec", + "numcodecs.packbits": "zarr.codecs._numcodecs.PackBits", + "numcodecs.shuffle": "zarr.codecs._numcodecs.Shuffle", + "numcodecs.quantize": "zarr.codecs._numcodecs.Quantize", + "numcodecs.zlib": "zarr.codecs._numcodecs.Zlib", + "numcodecs.zstd": "zarr.codecs._numcodecs.Zstd", }, "buffer": "zarr.buffer.cpu.Buffer", "ndbuffer": "zarr.buffer.cpu.NDBuffer", diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 46216205f7..11f2be234a 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -155,7 +155,6 @@ def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: codec_classes = __codec_registries[key] if not codec_classes: raise KeyError(key) - config_entry = config.get("codecs", {}).get(key) if config_entry is None: if len(codec_classes) == 1: diff --git a/tests/test_array.py b/tests/test_array.py index a316ee127f..742166a229 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1713,7 +1713,7 @@ def test_roundtrip_numcodecs() -> None: # Create the array with the correct codecs root = zarr.group(store) warn_msg = "Numcodecs codecs are not in the Zarr version 3 specification and may not be supported by other zarr implementations." 
@contextlib.contextmanager
def codec_conf() -> Iterator[Any]:
    """Temporarily map every ``numcodecs.*`` codec name to its zarr wrapper class.

    The mapping is merged underneath the current ``codecs`` config, so entries
    that are already configured take precedence. The keys match the names
    passed to ``register_codec`` in ``zarr.codecs._numcodecs`` (for example
    ``numcodecs.gzip`` and ``numcodecs.jenkins_lookup3``).

    Yields
    ------
    The object returned by ``config.set``. It is entered as a context manager
    around the ``yield`` so that the override is undone when the caller's
    ``with codec_conf():`` block ends, instead of leaking into later tests.
    """
    base_conf = config.get("codecs")
    new_conf = {
        "numcodecs.bz2": "zarr.codecs._numcodecs.BZ2",
        "numcodecs.crc32": "zarr.codecs._numcodecs.CRC32",
        "numcodecs.crc32c": "zarr.codecs._numcodecs.CRC32C",
        "numcodecs.lz4": "zarr.codecs._numcodecs.LZ4",
        "numcodecs.lzma": "zarr.codecs._numcodecs.LZMA",
        "numcodecs.zfpy": "zarr.codecs._numcodecs.ZFPY",
        "numcodecs.adler32": "zarr.codecs._numcodecs.Adler32",
        "numcodecs.astype": "zarr.codecs._numcodecs.AsType",
        "numcodecs.bitround": "zarr.codecs._numcodecs.BitRound",
        "numcodecs.blosc": "zarr.codecs._numcodecs.Blosc",
        "numcodecs.delta": "zarr.codecs._numcodecs.Delta",
        "numcodecs.fixedscaleoffset": "zarr.codecs._numcodecs.FixedScaleOffset",
        "numcodecs.fletcher32": "zarr.codecs._numcodecs.Fletcher32",
        "numcodecs.gzip": "zarr.codecs._numcodecs.GZip",
        "numcodecs.jenkins_lookup3": "zarr.codecs._numcodecs.JenkinsLookup3",
        "numcodecs.pcodec": "zarr.codecs._numcodecs.PCodec",
        "numcodecs.packbits": "zarr.codecs._numcodecs.PackBits",
        "numcodecs.shuffle": "zarr.codecs._numcodecs.Shuffle",
        "numcodecs.quantize": "zarr.codecs._numcodecs.Quantize",
        "numcodecs.zlib": "zarr.codecs._numcodecs.Zlib",
        "numcodecs.zstd": "zarr.codecs._numcodecs.Zstd",
    }

    override = config.set({"codecs": new_conf | base_conf})
    # Enter the override so its exit handler runs even when the body of the
    # caller's ``with`` block raises.
    with override:
        yield override
@pytest.mark.parametrize(
    "codec_class",
    [
        _numcodecs.Blosc,
        _numcodecs.LZ4,
        _numcodecs.Zstd,
        _numcodecs.Zlib,
        _numcodecs.GZip,
        _numcodecs.BZ2,
        _numcodecs.LZMA,
        _numcodecs.Shuffle,
    ],
)
def test_generic_compressor(codec_class: type[_numcodecs._NumcodecsBytesBytesCodec]) -> None:
    """Write a uint16 array through each bytes-to-bytes codec and read it back."""
    expected = np.arange(256, dtype="uint16").reshape(16, 16)

    # The codec is constructed inside the ``warns`` block because the
    # constructor is what emits the warning.
    with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR):
        arr = create_array(
            {},
            shape=expected.shape,
            chunks=(16, 16),
            dtype=expected.dtype,
            fill_value=0,
            compressors=[codec_class()],
        )

    arr[:, :] = expected.copy()
    roundtripped = arr[:, :]
    np.testing.assert_array_equal(expected, roundtripped)
def test_generic_filter_packbits() -> None:
    """PackBits round-trips a bool array and is rejected for a non-bool dtype."""
    mask = np.zeros((16, 16), dtype="bool")
    mask[:4, :] = True

    with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR):
        written = create_array(
            {},
            shape=mask.shape,
            chunks=(16, 16),
            dtype=mask.dtype,
            fill_value=0,
            filters=[_numcodecs.PackBits()],
        )

    written[:, :] = mask.copy()
    with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR):
        reopened = open_array(written.store, mode="r")
    np.testing.assert_array_equal(mask, reopened[:, :])

    # Constructing the codec warns first; create_array then rejects the dtype.
    with (
        pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR),
        pytest.raises(ValueError, match=".*requires bool dtype.*"),
    ):
        create_array(
            {},
            shape=mask.shape,
            chunks=(16, 16),
            dtype="uint32",
            fill_value=0,
            filters=[_numcodecs.PackBits()],
        )
@pytest.mark.parametrize("codec_class", [_numcodecs.PCodec, _numcodecs.ZFPY])
def test_generic_bytes_codec(codec_class: type[_numcodecs._NumcodecsArrayBytesCodec]) -> None:
    """Round-trip a float32 array using each array-to-bytes codec as the serializer."""
    # Resolve the wrapped codec up front; if it cannot be resolved the test is
    # marked as an expected failure instead of erroring.
    try:
        with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR):
            codec_class()._codec  # noqa: B018
    except ValueError as e:  # pragma: no cover
        if "codec not available" not in str(e):
            raise
        pytest.xfail(f"{codec_class.codec_name} is not available: {e}")
    except ImportError as e:  # pragma: no cover
        pytest.xfail(f"{codec_class.codec_name} is not available: {e}")

    expected = np.arange(256, dtype="float32").reshape(16, 16)

    with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR):
        arr = create_array(
            {},
            shape=expected.shape,
            chunks=(16, 16),
            dtype=expected.dtype,
            fill_value=0,
            serializer=codec_class(),
        )

    arr[:, :] = expected.copy()
    roundtripped = arr[:, :]
    np.testing.assert_array_equal(expected, roundtripped)
@pytest.mark.parametrize(
    "codec_cls",
    [
        _numcodecs.Blosc,
        _numcodecs.LZ4,
        _numcodecs.Zstd,
        _numcodecs.Zlib,
        _numcodecs.GZip,
        _numcodecs.BZ2,
        _numcodecs.LZMA,
        _numcodecs.Shuffle,
        _numcodecs.BitRound,
        _numcodecs.Delta,
        _numcodecs.FixedScaleOffset,
        _numcodecs.Quantize,
        _numcodecs.PackBits,
        _numcodecs.AsType,
        _numcodecs.CRC32,
        _numcodecs.CRC32C,
        _numcodecs.Adler32,
        _numcodecs.Fletcher32,
        _numcodecs.JenkinsLookup3,
        _numcodecs.PCodec,
        _numcodecs.ZFPY,
    ],
)
def test_codecs_pickleable(codec_cls: type[_numcodecs._NumcodecsCodec]) -> None:
    """A default-constructed codec compares equal to its pickle round-trip."""
    with pytest.warns(ZarrUserWarning, match=EXPECTED_WARNING_STR):
        original = codec_cls()

    restored = pickle.loads(pickle.dumps(original))
    assert restored == original
"zarr.codecs._numcodecs.Fletcher32", + "numcodecs.gZip": "zarr.codecs._numcodecs.GZip", + "numcodecs.jenkins_lookup3": "zarr.codecs._numcodecs.JenkinsLookup3", + "numcodecs.pcodec": "zarr.codecs._numcodecs.PCodec", + "numcodecs.packbits": "zarr.codecs._numcodecs.PackBits", + "numcodecs.shuffle": "zarr.codecs._numcodecs.Shuffle", + "numcodecs.quantize": "zarr.codecs._numcodecs.Quantize", + "numcodecs.zlib": "zarr.codecs._numcodecs.Zlib", + "numcodecs.zstd": "zarr.codecs._numcodecs.Zstd", }, "buffer": "zarr.buffer.cpu.Buffer", "ndbuffer": "zarr.buffer.cpu.NDBuffer",