diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index d5c995d2ca..b27b7e1f46 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -2,13 +2,23 @@ from abc import abstractmethod from collections.abc import Mapping -from typing import TYPE_CHECKING, Generic, TypeGuard, TypeVar - -from typing_extensions import ReadOnly, TypedDict +from typing import ( + TYPE_CHECKING, + ClassVar, + Generic, + Literal, + Self, + TypedDict, + TypeGuard, + TypeVar, + overload, +) + +from typing_extensions import Protocol, ReadOnly from zarr.abc.metadata import Metadata from zarr.core.buffer import Buffer, NDBuffer -from zarr.core.common import ChunkCoords, NamedConfig, concurrent_map +from zarr.core.common import ChunkCoords, NamedConfig, ZarrFormat, concurrent_map from zarr.core.config import config if TYPE_CHECKING: @@ -181,6 +191,34 @@ async def encode( """ return await _batching_helper(self._encode_single, chunks_and_specs) + @overload + def to_json(self, zarr_format: Literal[2]) -> CodecJSON_V2[str]: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> NamedConfig[str, Mapping[str, object]]: ... + + def to_json( + self, zarr_format: ZarrFormat + ) -> CodecJSON_V2[str] | NamedConfig[str, Mapping[str, object]]: + raise NotImplementedError + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + raise NotImplementedError + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + raise NotImplementedError + + @classmethod + def from_json(cls, data: CodecJSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls._from_json_v2(data) + elif zarr_format == 3: + return cls._from_json_v3(data) + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." + ) # pragma: no cover + class ArrayArrayCodec(BaseCodec[NDBuffer, NDBuffer]): """Base class for array-to-array codecs.""" @@ -471,3 +509,22 @@ async def wrap(chunk: CodecInput | None, chunk_spec: ArraySpec) -> CodecOutput | return await func(chunk, chunk_spec) return wrap + + +class Numcodec(Protocol): + """ + A protocol that models the ``numcodecs.abc.Codec`` interface. + """ + + codec_id: ClassVar[str] + + def encode(self, buf: Buffer | NDBuffer) -> Buffer | NDBuffer: ... + + def decode( + self, buf: Buffer | NDBuffer, out: Buffer | NDBuffer | None = None + ) -> Buffer | NDBuffer: ... + + def get_config(self) -> CodecJSON_V2[str]: ... + + @classmethod + def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index 53e981c3bd..2421bce37a 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -431,6 +431,7 @@ async def getsize(self, key: str) -> int: FileNotFoundError When the given key does not exist in the store. """ + # Note to implementers: this default implementation is very inefficient since # it requires reading the entire object. Many systems will have ways to get the # size of an object without reading it. 
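For orientation (not part of the patch): a minimal sketch of the `to_json`/`from_json` round-trip that the base-codec hooks above are meant to support, based on the `GzipCodec` behavior added later in this diff.

```python
# A minimal sketch, assuming the GzipCodec changes later in this diff.
from zarr.codecs.gzip import GzipCodec

codec = GzipCodec(level=3)

# Zarr format 2 serializes a codec as a flat mapping keyed by "id".
assert codec.to_json(zarr_format=2) == {"id": "gzip", "level": 3}

# Zarr format 3 uses a {"name": ..., "configuration": ...} mapping instead.
assert codec.to_json(zarr_format=3) == {"name": "gzip", "configuration": {"level": 3}}

# The base-class from_json dispatches to _from_json_v2 or _from_json_v3.
assert GzipCodec.from_json({"id": "gzip", "level": 3}, zarr_format=2) == codec
```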
diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 3c6c99c21c..8ea1143771 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -2,18 +2,26 @@ import asyncio from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal, Self, overload import numpy as np from numcodecs.compat import ensure_bytes, ensure_ndarray_like -from zarr.abc.codec import ArrayBytesCodec +from zarr.abc.codec import ( + ArrayArrayCodec, + ArrayBytesCodec, + BytesBytesCodec, + CodecJSON, + CodecJSON_V2, +) from zarr.registry import get_ndbuffer_class if TYPE_CHECKING: from zarr.abc.numcodec import Numcodec from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, NDBuffer + from zarr.core.buffer.core import BufferPrototype + from zarr.core.common import BaseConfig, NamedConfig, ZarrFormat @dataclass(frozen=True) @@ -98,3 +106,102 @@ async def _encode_single( def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int: raise NotImplementedError + + +@dataclass(frozen=True, kw_only=True) +class NumcodecsWrapper: + codec: Numcodec + + @overload + def to_json(self, zarr_format: Literal[2]) -> CodecJSON_V2[str]: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> NamedConfig[str, BaseConfig]: ... + + def to_json(self, zarr_format: ZarrFormat) -> CodecJSON_V2[str] | NamedConfig[str, BaseConfig]: + if zarr_format == 2: + return self.codec.get_config() + elif zarr_format == 3: + config = self.codec.get_config() + config_no_id = {k: v for k, v in config.items() if k != "id"} + return {"name": config["id"], "configuration": config_no_id} + raise ValueError(f"Unsupported zarr format: {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + raise NotImplementedError( + "This class does not support creating instances from JSON data for Zarr format 2." + ) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + raise NotImplementedError( + "This class does not support creating instances from JSON data for Zarr format 3." + ) + + def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int: + raise NotImplementedError + + def to_array_array(self) -> NumcodecsArrayArrayCodec: + """ + Use the ``codec`` attribute to create a NumcodecsArrayArrayCodec. + """ + return NumcodecsArrayArrayCodec(codec=self.codec) + + def to_bytes_bytes(self) -> NumcodecsBytesBytesCodec: + """ + Use the ``codec`` attribute to create a NumcodecsBytesBytesCodec. + """ + return NumcodecsBytesBytesCodec(codec=self.codec) + + def to_array_bytes(self) -> NumcodecsArrayBytesCodec: + """ + Use the ``codec`` attribute to create a NumcodecsArrayBytesCodec. 
+ """ + return NumcodecsArrayBytesCodec(codec=self.codec) + + +class NumcodecsBytesBytesCodec(NumcodecsWrapper, BytesBytesCodec): + async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer: + from zarr.core.buffer.cpu import as_numpy_array_wrapper + + return await asyncio.to_thread( + as_numpy_array_wrapper, + self.codec.decode, + chunk_data, + chunk_spec.prototype, + ) + + def _encode(self, chunk_bytes: Buffer, prototype: BufferPrototype) -> Buffer: + encoded = self.codec.encode(chunk_bytes.as_array_like()) + if isinstance(encoded, np.ndarray): # Required for checksum codecs + return prototype.buffer.from_bytes(encoded.tobytes()) + return prototype.buffer.from_bytes(encoded) + + async def _encode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer: + return await asyncio.to_thread(self._encode, chunk_data, chunk_spec.prototype) + + +@dataclass(kw_only=True, frozen=True) +class NumcodecsArrayArrayCodec(NumcodecsWrapper, ArrayArrayCodec): + async def _decode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: + chunk_ndarray = chunk_data.as_ndarray_like() + out = await asyncio.to_thread(self.codec.decode, chunk_ndarray) + return chunk_spec.prototype.nd_buffer.from_ndarray_like(out.reshape(chunk_spec.shape)) # type: ignore[union-attr] + + async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: + chunk_ndarray = chunk_data.as_ndarray_like() + out = await asyncio.to_thread(self.codec.encode, chunk_ndarray) + return chunk_spec.prototype.nd_buffer.from_ndarray_like(out) # type: ignore[arg-type] + + +@dataclass(kw_only=True, frozen=True) +class NumcodecsArrayBytesCodec(NumcodecsWrapper, ArrayBytesCodec): + async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> NDBuffer: + chunk_bytes = chunk_data.to_bytes() + out = await asyncio.to_thread(self.codec.decode, chunk_bytes) + return chunk_spec.prototype.nd_buffer.from_ndarray_like(out.reshape(chunk_spec.shape)) + + async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> Buffer: + chunk_ndarray = chunk_data.as_ndarray_like() + out = await asyncio.to_thread(self.codec.encode, chunk_ndarray) + return chunk_spec.prototype.buffer.from_bytes(out) diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index f89f127852..ab904afa1a 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -1,19 +1,32 @@ from __future__ import annotations import asyncio +from collections.abc import Mapping from dataclasses import dataclass, replace -from enum import Enum from functools import cached_property -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Final, + Literal, + NotRequired, + TypedDict, + TypeGuard, + overload, +) import numcodecs from numcodecs.blosc import Blosc from packaging.version import Version - -from zarr.abc.codec import BytesBytesCodec -from zarr.core.buffer.cpu import as_numpy_array_wrapper -from zarr.core.common import JSON, parse_enum, parse_named_configuration +from typing_extensions import ReadOnly + +from zarr.abc.codec import BytesBytesCodec, CodecJSON +from zarr.core.common import ( + JSON, + NamedRequiredConfig, + ZarrFormat, +) from zarr.core.dtype.common import HasItemSize +from zarr.errors import CodecValidationError from zarr.registry import register_codec if TYPE_CHECKING: @@ -22,39 +35,66 @@ from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer +BloscShuffle = Literal["noshuffle", "shuffle", "bitshuffle"] +BLOSC_SHUFFLE: Final 
= ("noshuffle", "shuffle", "bitshuffle") + +BloscCname = Literal["lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib"] +BLOSC_CNAME: Final = ("lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib") + + +class BloscConfigV2(TypedDict): + cname: BloscCname + clevel: int + shuffle: int + blocksize: int + typesize: NotRequired[int] -class BloscShuffle(Enum): + +class BloscConfigV3(TypedDict): + cname: BloscCname + clevel: int + shuffle: BloscShuffle + blocksize: int + typesize: int + + +class BloscJSON_V2(BloscConfigV2): """ - Enum for shuffle filter used by blosc. + The JSON form of the Blosc codec in Zarr V2. """ - noshuffle = "noshuffle" - shuffle = "shuffle" - bitshuffle = "bitshuffle" - - @classmethod - def from_int(cls, num: int) -> BloscShuffle: - blosc_shuffle_int_to_str = { - 0: "noshuffle", - 1: "shuffle", - 2: "bitshuffle", - } - if num not in blosc_shuffle_int_to_str: - raise ValueError(f"Value must be between 0 and 2. Got {num}.") - return BloscShuffle[blosc_shuffle_int_to_str[num]] + id: ReadOnly[Literal["blosc"]] -class BloscCname(Enum): +class BloscJSON_V3(NamedRequiredConfig[Literal["blosc"], BloscConfigV3]): """ - Enum for compression library used by blosc. + The JSON form of the Blosc codec in Zarr V3. """ - lz4 = "lz4" - lz4hc = "lz4hc" - blosclz = "blosclz" - zstd = "zstd" - snappy = "snappy" - zlib = "zlib" + +def check_json_v2(data: CodecJSON) -> TypeGuard[BloscJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"id", "clevel", "cname", "shuffle", "blocksize"} + and data["id"] == "blosc" + ) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[BloscJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "blosc" + and isinstance(data["configuration"], Mapping) + and set(data["configuration"].keys()) + == {"cname", "clevel", "shuffle", "blocksize", "typesize"} + ) + + +def parse_cname(value: object) -> BloscCname: + if value not in BLOSC_CNAME: + raise ValueError(f"Value must be one of {BLOSC_CNAME}. Got {value} instead.") + return value # See https://zarr.readthedocs.io/en/stable/user-guide/performance.html#configuring-blosc @@ -85,31 +125,35 @@ def parse_blocksize(data: JSON) -> int: raise TypeError(f"Value should be an int. Got {type(data)} instead.") +def parse_shuffle(data: object) -> BloscShuffle: + if data in BLOSC_SHUFFLE: + return data # type: ignore[return-value] + raise TypeError(f"Value must be one of {BLOSC_SHUFFLE}. 
Got {data} instead.") + + @dataclass(frozen=True) class BloscCodec(BytesBytesCodec): - """blosc codec""" - is_fixed_size = False typesize: int | None - cname: BloscCname = BloscCname.zstd - clevel: int = 5 - shuffle: BloscShuffle | None = BloscShuffle.noshuffle - blocksize: int = 0 + cname: BloscCname + clevel: int + shuffle: BloscShuffle | None + blocksize: int def __init__( self, *, typesize: int | None = None, - cname: BloscCname | str = BloscCname.zstd, + cname: BloscCname = "zstd", clevel: int = 5, - shuffle: BloscShuffle | str | None = None, + shuffle: BloscShuffle | None = None, blocksize: int = 0, ) -> None: typesize_parsed = parse_typesize(typesize) if typesize is not None else None - cname_parsed = parse_enum(cname, BloscCname) + cname_parsed = parse_cname(cname) clevel_parsed = parse_clevel(clevel) - shuffle_parsed = parse_enum(shuffle, BloscShuffle) if shuffle is not None else None + shuffle_parsed = parse_shuffle(shuffle) if shuffle is not None else None blocksize_parsed = parse_blocksize(blocksize) object.__setattr__(self, "typesize", typesize_parsed) @@ -120,24 +164,74 @@ def __init__( @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "blosc") - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) def to_dict(self) -> dict[str, JSON]: - if self.typesize is None: - raise ValueError("`typesize` needs to be set for serialization.") - if self.shuffle is None: - raise ValueError("`shuffle` needs to be set for serialization.") - return { - "name": "blosc", - "configuration": { - "typesize": self.typesize, - "cname": self.cname.value, + return self.to_json(zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + return cls( + cname=data["cname"], + clevel=data["clevel"], + shuffle=BLOSC_SHUFFLE[data["shuffle"]], + blocksize=data["blocksize"], + typesize=data.get("typesize", None), + ) + msg = ( + "Invalid Zarr V2 JSON representation of the blosc codec. " + f"Got {data!r}, expected a Mapping with keys ('id', 'cname', 'clevel', 'shuffle', 'blocksize', 'typesize')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + return cls( + typesize=data["configuration"]["typesize"], + cname=data["configuration"]["cname"], + clevel=data["configuration"]["clevel"], + shuffle=data["configuration"]["shuffle"], + blocksize=data["configuration"]["blocksize"], + ) + msg = ( + "Invalid Zarr V3 JSON representation of the blosc codec. " + f"Got {data!r}, expected a Mapping with keys ('name', 'configuration'), " + "where the 'configuration' key is a Mapping with keys ('cname', 'clevel', 'shuffle', 'blocksize', 'typesize')" + ) + raise CodecValidationError(msg) + + @overload + def to_json(self, zarr_format: Literal[2]) -> BloscJSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> BloscJSON_V3: ... 
+ + def to_json(self, zarr_format: ZarrFormat) -> BloscJSON_V2 | BloscJSON_V3: + if self.typesize is None or self.shuffle is None: + raise ValueError("typesize and shuffle need to be set for serialization.") + if zarr_format == 2: + return { + "id": "blosc", "clevel": self.clevel, - "shuffle": self.shuffle.value, + "cname": self.cname, + "shuffle": BLOSC_SHUFFLE.index(self.shuffle), "blocksize": self.blocksize, - }, - } + } + elif zarr_format == 3: + return { + "name": "blosc", + "configuration": { + "clevel": self.clevel, + "cname": self.cname, + "shuffle": self.shuffle, + "typesize": self.typesize, + "blocksize": self.blocksize, + }, + } + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." + ) # pragma: no cover def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: item_size = 1 @@ -147,10 +241,7 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: if new_codec.typesize is None: new_codec = replace(new_codec, typesize=item_size) if new_codec.shuffle is None: - new_codec = replace( - new_codec, - shuffle=(BloscShuffle.bitshuffle if item_size == 1 else BloscShuffle.shuffle), - ) + new_codec = replace(new_codec, shuffle="bitshuffle" if item_size == 1 else "shuffle") return new_codec @@ -158,15 +249,10 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: def _blosc_codec(self) -> Blosc: if self.shuffle is None: raise ValueError("`shuffle` needs to be set for decoding and encoding.") - map_shuffle_str_to_int = { - BloscShuffle.noshuffle: 0, - BloscShuffle.shuffle: 1, - BloscShuffle.bitshuffle: 2, - } config_dict = { - "cname": self.cname.name, + "cname": self.cname, "clevel": self.clevel, - "shuffle": map_shuffle_str_to_int[self.shuffle], + "shuffle": BLOSC_SHUFFLE.index(self.shuffle), "blocksize": self.blocksize, } # See https://github.com/zarr-developers/numcodecs/pull/713 @@ -179,6 +265,8 @@ async def _decode_single( chunk_bytes: Buffer, chunk_spec: ArraySpec, ) -> Buffer: + from zarr.core.buffer.cpu import as_numpy_array_wrapper + return await asyncio.to_thread( as_numpy_array_wrapper, self._blosc_codec.decode, chunk_bytes, chunk_spec.prototype ) diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 7576119c82..9682ff9860 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -1,15 +1,17 @@ from __future__ import annotations import sys +from collections.abc import Mapping from dataclasses import dataclass, replace from enum import Enum -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Final, Literal, NotRequired, TypedDict, TypeGuard, overload import numpy as np +from typing_extensions import ReadOnly -from zarr.abc.codec import ArrayBytesCodec +from zarr.abc.codec import ArrayBytesCodec, CodecJSON from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer -from zarr.core.common import JSON, parse_enum, parse_named_configuration +from zarr.core.common import JSON, NamedConfig, ZarrFormat from zarr.core.dtype.common import HasEndianness from zarr.registry import register_codec @@ -28,35 +30,111 @@ class Endian(Enum): little = "little" -default_system_endian = Endian(sys.byteorder) +# TODO: unify with the endianness defined in core.dtype.common +EndiannessStr = Literal["little", "big"] +ENDIANNESS_STR: Final = "little", "big" + +default_system_endian = sys.byteorder + + +class BytesConfig(TypedDict): + endian: NotRequired[EndiannessStr] + + +class BytesJSON_V2(BytesConfig): + """ + JSON representation of the bytes codec for zarr v2. 
+ """ + + id: ReadOnly[Literal["bytes"]] + + +BytesJSON_V3 = NamedConfig[Literal["bytes"], BytesConfig] | Literal["bytes"] + + +def parse_endianness(data: object) -> EndiannessStr: + if data in ENDIANNESS_STR: + return data # type: ignore [return-value] + raise ValueError(f"Invalid endianness: {data!r}. Expected one of {ENDIANNESS_STR}") + + +def check_json_v2(data: CodecJSON) -> TypeGuard[BytesJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) in ({"id", "endian"}, {"id"}) + and data["id"] == "bytes" + ) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[BytesJSON_V3]: + return data == "bytes" or ( + ( + isinstance(data, Mapping) + and set(data.keys()) in ({"name"}, {"name", "configuration"}) + and data["name"] == "bytes" + ) + and isinstance(data.get("configuration", {}), Mapping) + and set(data.get("configuration", {}).keys()) in ({"endian"}, set()) + ) @dataclass(frozen=True) class BytesCodec(ArrayBytesCodec): - """bytes codec""" - is_fixed_size = True - endian: Endian | None + endian: EndiannessStr | None - def __init__(self, *, endian: Endian | str | None = default_system_endian) -> None: - endian_parsed = None if endian is None else parse_enum(endian, Endian) + def __init__(self, *, endian: EndiannessStr | str | None = default_system_endian) -> None: + endian_parsed = None if endian is None else parse_endianness(endian) object.__setattr__(self, "endian", endian_parsed) @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration( - data, "bytes", require_configuration=False - ) - configuration_parsed = configuration_parsed or {} - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) def to_dict(self) -> dict[str, JSON]: - if self.endian is None: + return self.to_json(zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + return cls(endian=data.get("endian", None)) + raise ValueError(f"Invalid JSON: {data}") + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + # Three different representations of the exact same codec... + if data in ("bytes", {"name": "bytes"}, {"name": "bytes", "configuration": {}}): + return cls() + else: + return cls(endian=data["configuration"].get("endian", None)) + raise ValueError(f"Invalid JSON: {data}") + + @overload + def to_json(self, zarr_format: Literal[2]) -> BytesJSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> BytesJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> BytesJSON_V2 | BytesJSON_V3: + if zarr_format == 2: + if self.endian is not None: + return { + "id": "bytes", + "endian": self.endian, + } + return {"id": "bytes"} + elif zarr_format == 3: + if self.endian is not None: + return { + "name": "bytes", + "configuration": {"endian": self.endian}, + } return {"name": "bytes"} - else: - return {"name": "bytes", "configuration": {"endian": self.endian.value}} + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." 
+ ) # pragma: no cover def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: if not isinstance(array_spec.dtype, HasEndianness): @@ -75,9 +153,9 @@ async def _decode_single( ) -> NDBuffer: assert isinstance(chunk_bytes, Buffer) # TODO: remove endianness enum in favor of literal union - endian_str = self.endian.value if self.endian is not None else None - if isinstance(chunk_spec.dtype, HasEndianness): - dtype = replace(chunk_spec.dtype, endianness=endian_str).to_native_dtype() # type: ignore[call-arg] + endian = self.endian + if isinstance(chunk_spec.dtype, HasEndianness) and endian is not None: + dtype = replace(chunk_spec.dtype, endianness=endian).to_native_dtype() # type: ignore[call-arg] else: dtype = chunk_spec.dtype.to_native_dtype() as_array_like = chunk_bytes.as_array_like() @@ -109,7 +187,7 @@ async def _encode_single( ): # type-ignore is a numpy bug # see https://github.com/numpy/numpy/issues/26473 - new_dtype = chunk_array.dtype.newbyteorder(self.endian.name) # type: ignore[arg-type] + new_dtype = chunk_array.dtype.newbyteorder(self.endian) # type: ignore[arg-type] chunk_array = chunk_array.astype(new_dtype) nd_array = chunk_array.as_ndarray_like() diff --git a/src/zarr/codecs/crc32c_.py b/src/zarr/codecs/crc32c_.py index c2e30f689a..6fa656b3b8 100644 --- a/src/zarr/codecs/crc32c_.py +++ b/src/zarr/codecs/crc32c_.py @@ -1,14 +1,16 @@ from __future__ import annotations +from collections.abc import Mapping from dataclasses import dataclass -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Literal, TypedDict, TypeGuard, cast, overload import numpy as np import typing_extensions from crc32c import crc32c -from zarr.abc.codec import BytesBytesCodec -from zarr.core.common import JSON, parse_named_configuration +from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2 +from zarr.core.common import JSON, NamedConfig, ZarrFormat +from zarr.errors import CodecValidationError from zarr.registry import register_codec if TYPE_CHECKING: @@ -18,20 +20,77 @@ from zarr.core.buffer import Buffer +class Crc32Config(TypedDict): ... + + +class Crc32cJSON_V2(CodecJSON_V2[Literal["crc32c"]]): ... + + +class Crc32cJSON_V3(NamedConfig[Literal["crc32c"], Crc32Config]): ... + + +def check_json_v2(data: CodecJSON) -> TypeGuard[Crc32cJSON_V2]: + return isinstance(data, Mapping) and set(data.keys()) == {"id"} and data["id"] == "crc32c" + + +def check_json_v3(data: CodecJSON) -> TypeGuard[Crc32cJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) in ({"name", "configuration"}, {"name"}) + and data["name"] == "crc32c" + and data.get("configuration") in ({}, None) + ) + + @dataclass(frozen=True) class Crc32cCodec(BytesBytesCodec): - """crc32c codec""" - is_fixed_size = True @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: + return cls.from_json(data, zarr_format=3) - parse_named_configuration(data, "crc32c", require_configuration=False) - return cls() + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + return cls() + msg = ( + "Invalid Zarr V2 JSON representation of the crc32c codec. " + f"Got {data!r}, expected a Mapping with keys ('id')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + return cls() + msg = ( + "Invalid Zarr V3 JSON representation of the crc32c codec. 
" + f"Got {data!r}, expected a Mapping with keys ('name')" + ) + raise CodecValidationError(msg) + def to_dict(self) -> dict[str, JSON]: + return self.to_json(zarr_format=3) return {"name": "crc32c"} + @overload + def to_json(self, zarr_format: Literal[2]) -> Crc32cJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> Crc32cJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> CodecJSON: + if zarr_format == 2: + return {"id": "crc32c"} + elif zarr_format == 3: + return {"name": "crc32c"} + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." + ) # pragma: no cover + async def _decode_single( self, chunk_bytes: Buffer, diff --git a/src/zarr/codecs/gzip.py b/src/zarr/codecs/gzip.py index 9e6515a4d1..bddaa2bca6 100644 --- a/src/zarr/codecs/gzip.py +++ b/src/zarr/codecs/gzip.py @@ -1,14 +1,20 @@ from __future__ import annotations import asyncio +from collections.abc import Mapping from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal, TypedDict, TypeGuard, overload from numcodecs.gzip import GZip +from typing_extensions import ReadOnly -from zarr.abc.codec import BytesBytesCodec +from zarr.abc.codec import BytesBytesCodec, CodecJSON from zarr.core.buffer.cpu import as_numpy_array_wrapper -from zarr.core.common import JSON, parse_named_configuration +from zarr.core.common import ( + JSON, + NamedRequiredConfig, + ZarrFormat, +) from zarr.registry import register_codec if TYPE_CHECKING: @@ -28,10 +34,26 @@ def parse_gzip_level(data: JSON) -> int: return data +class GZipConfig(TypedDict): + level: int + + +class GZipJSON_V2(GZipConfig): + """ + The JSON form of the GZip codec in Zarr V2. + """ + + id: ReadOnly[Literal["gzip"]] + + +class GZipJSON_V3(NamedRequiredConfig[Literal["gzip"], GZipConfig]): + """ + The JSON form of the GZip codec in Zarr V3. + """ + + @dataclass(frozen=True) class GzipCodec(BytesBytesCodec): - """gzip codec""" - is_fixed_size = False level: int = 5 @@ -43,11 +65,56 @@ def __init__(self, *, level: int = 5) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "gzip") - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) def to_dict(self) -> dict[str, JSON]: - return {"name": "gzip", "configuration": {"level": self.level}} + return self.to_json(zarr_format=3) + + @overload + def to_json(self, zarr_format: Literal[2]) -> GZipJSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> GZipJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> GZipJSON_V2 | GZipJSON_V3: + if zarr_format == 2: + return {"id": "gzip", "level": self.level} + elif zarr_format == 3: + return {"name": "gzip", "configuration": {"level": self.level}} + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." 
+ ) # pragma: no cover + + @classmethod + def _check_json_v2(cls, data: CodecJSON) -> TypeGuard[GZipJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"id", "level"} + and data["id"] == "gzip" + and isinstance(data["level"], int) + ) + + @classmethod + def _check_json_v3(cls, data: CodecJSON) -> TypeGuard[GZipJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "gzip" + and isinstance(data["configuration"], dict) + and "level" in data["configuration"] + and isinstance(data["configuration"]["level"], int) + ) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if cls._check_json_v2(data): + return cls(level=data["level"]) + raise ValueError(f"Invalid GZip JSON data for Zarr format 2: {data!r}") + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if cls._check_json_v3(data): + return cls(level=data["configuration"]["level"]) + raise ValueError(f"Invalid GZip JSON data for Zarr format 3: {data!r}") async def _decode_single( self, diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 888d258649..e2fcb7020a 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -1,20 +1,35 @@ from __future__ import annotations -from collections.abc import Iterable, Mapping, MutableMapping +from collections.abc import Iterable, Mapping, MutableMapping, Sequence from dataclasses import dataclass, field, replace from enum import Enum from functools import lru_cache from operator import itemgetter -from typing import TYPE_CHECKING, Any, NamedTuple, cast +from typing import ( + TYPE_CHECKING, + Any, + Literal, + NamedTuple, + NotRequired, + Self, + TypedDict, + TypeGuard, + cast, + overload, +) import numpy as np import numpy.typing as npt +from typing_extensions import ReadOnly from zarr.abc.codec import ( ArrayBytesCodec, ArrayBytesCodecPartialDecodeMixin, ArrayBytesCodecPartialEncodeMixin, Codec, + CodecJSON, + CodecJSON_V2, + CodecJSON_V3, CodecPipeline, ) from zarr.abc.store import ( @@ -38,8 +53,8 @@ from zarr.core.common import ( ChunkCoords, ChunkCoordsLike, + NamedRequiredConfig, parse_enum, - parse_named_configuration, parse_shapelike, product, ) @@ -52,6 +67,7 @@ morton_order_iter, ) from zarr.core.metadata.v3 import parse_codecs +from zarr.errors import CodecValidationError from zarr.registry import get_ndbuffer_class, get_pipeline_class, register_codec if TYPE_CHECKING: @@ -65,6 +81,33 @@ ShardMapping = Mapping[ChunkCoords, Buffer] ShardMutableMapping = MutableMapping[ChunkCoords, Buffer] +IndexLocation = Literal["start", "end"] + + +class ShardingConfigV2(TypedDict): + codecs: tuple[CodecJSON_V2[str], ...] + chunk_shape: tuple[int, ...] + index_codecs: tuple[CodecJSON_V2[str], ...] + index_location: NotRequired[Literal["start", "end"]] + + +class ShardingConfigV3(TypedDict): + codecs: tuple[CodecJSON_V3, ...] + chunk_shape: tuple[int, ...] + index_codecs: tuple[CodecJSON_V3, ...] + index_location: NotRequired[Literal["start", "end"]] + + +class ShardingJSON_V2(ShardingConfigV2): + """ + The JSON form of the sharding codec in Zarr V2. + """ + + id: ReadOnly[Literal["sharding_indexed"]] + + +class ShardingJSON_V3(NamedRequiredConfig[Literal["sharding_indexed"], ShardingConfigV3]): ... 
+ class ShardingCodecIndexLocation(Enum): """ @@ -79,6 +122,37 @@ def parse_index_location(data: object) -> ShardingCodecIndexLocation: return parse_enum(data, ShardingCodecIndexLocation) +def check_json_v2(data: CodecJSON) -> TypeGuard[ShardingJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) in ( + {"id", "codecs", "chunk_shape", "index_codecs"}, + {"id", "codecs", "chunk_shape", "index_codecs", "index_location"}, + ) + and data["id"] == "sharding_indexed" + and isinstance(data["chunk_shape"], Sequence) + and not isinstance(data["chunk_shape"], str) + and isinstance(data["codecs"], Sequence) + and not isinstance(data["codecs"], str) + and isinstance(data["index_codecs"], Sequence) + and not isinstance(data["index_codecs"], str) + ) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[ShardingJSON_V3]: + # TODO: Automate this with a function that does runtime type checking on typeddicts. + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "sharding_indexed" + and isinstance(data["configuration"], Mapping) + and set(data["configuration"].keys()) + == {"codecs", "chunk_shape", "index_codecs", "index_location"} + and isinstance(data["configuration"]["chunk_shape"], Sequence) + and not isinstance(data["configuration"]["chunk_shape"], str) + and isinstance(data["configuration"]["codecs"], Sequence) + and not isinstance(data["configuration"]["codecs"], str) + and isinstance(data["configuration"]["index_codecs"], Sequence) + and not isinstance(data["configuration"]["index_codecs"], str) + and data["configuration"]["index_location"] in ("start", "end") + ) + + @dataclass(frozen=True) class _ShardingByteGetter(ByteGetter): shard_dict: ShardMapping @@ -333,8 +407,6 @@ async def finalize( class ShardingCodec( ArrayBytesCodec, ArrayBytesCodecPartialDecodeMixin, ArrayBytesCodecPartialEncodeMixin ): - """Sharding codec""" - chunk_shape: ChunkCoords codecs: tuple[Codec, ...] index_codecs: tuple[Codec, ...] @@ -385,23 +457,76 @@ def __setstate__(self, state: dict[str, Any]) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "sharding_indexed") - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + return cls( + codecs=data["codecs"], + index_codecs=data["index_codecs"], + index_location=data.get("index_location", "end"), + chunk_shape=data["chunk_shape"], + ) + msg = ( + "Invalid Zarr V2 JSON representation of the sharding codec. " + f"Got {data!r}, expected a Mapping with keys ('id', 'codecs', 'index_codecs', 'chunk_shape', 'index_location')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + return cls( + codecs=data["configuration"]["codecs"], + index_codecs=data["configuration"]["index_codecs"], + index_location=data["configuration"]["index_location"], + chunk_shape=data["configuration"]["chunk_shape"], + ) + msg = ( + "Invalid Zarr V3 JSON representation of the sharding codec. 
" + f"Got {data!r}, expected a Mapping with keys ('name', 'configuration')" + "Where the 'configuration' key is a Mapping with keys ('codecs', 'index_codecs', 'index_location', 'chunk_shape')" + ) + raise CodecValidationError(msg) @property def codec_pipeline(self) -> CodecPipeline: return get_pipeline_class().from_codecs(self.codecs) def to_dict(self) -> dict[str, JSON]: - return { - "name": "sharding_indexed", - "configuration": { + return self.to_json(zarr_format=3) + + @overload + def to_json(self, zarr_format: Literal[2]) -> ShardingJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> ShardingJSON_V3: ... + + def to_json(self, zarr_format: int) -> ShardingJSON_V2 | ShardingJSON_V3: + if zarr_format == 2: + return { + "id": "sharding_indexed", + "codecs": tuple(s.to_json(zarr_format=zarr_format) for s in self.codecs), + "index_codecs": tuple( + s.to_json(zarr_format=zarr_format) for s in self.index_codecs + ), "chunk_shape": self.chunk_shape, - "codecs": tuple(s.to_dict() for s in self.codecs), - "index_codecs": tuple(s.to_dict() for s in self.index_codecs), "index_location": self.index_location.value, - }, - } + } + elif zarr_format == 3: + return { + "name": "sharding_indexed", + "configuration": { + "chunk_shape": self.chunk_shape, + "codecs": tuple(s.to_json(zarr_format=zarr_format) for s in self.codecs), + "index_codecs": tuple( + s.to_json(zarr_format=zarr_format) for s in self.index_codecs + ), + "index_location": self.index_location.value, + }, + } + raise ValueError(f"Unsupported Zarr format {zarr_format}. Expected 2 or 3.") def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: shard_spec = self._get_chunk_spec(array_spec) diff --git a/src/zarr/codecs/transpose.py b/src/zarr/codecs/transpose.py index d6310d38a4..7c713780da 100644 --- a/src/zarr/codecs/transpose.py +++ b/src/zarr/codecs/transpose.py @@ -1,14 +1,21 @@ from __future__ import annotations -from collections.abc import Iterable +from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass, replace -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Literal, Self, TypedDict, TypeGuard, cast, overload import numpy as np +from typing_extensions import ReadOnly -from zarr.abc.codec import ArrayArrayCodec +from zarr.abc.codec import ArrayArrayCodec, CodecJSON from zarr.core.array_spec import ArraySpec -from zarr.core.common import JSON, ChunkCoordsLike, parse_named_configuration +from zarr.core.common import ( + JSON, + ChunkCoordsLike, + NamedRequiredConfig, + ZarrFormat, +) +from zarr.errors import CodecValidationError from zarr.registry import register_codec if TYPE_CHECKING: @@ -27,10 +34,46 @@ def parse_transpose_order(data: JSON | Iterable[int]) -> tuple[int, ...]: return tuple(cast("Iterable[int]", data)) +class TransposeConfig(TypedDict): + order: tuple[int, ...] + + +class TransposeJSON_V2(TransposeConfig): + """ + The JSON form of the Transpose codec in Zarr V2. + """ + + id: ReadOnly[Literal["transpose"]] + + +class TransposeJSON_V3(NamedRequiredConfig[Literal["transpose"], TransposeConfig]): + """ + The JSON form of the Transpose codec in Zarr V3. 
+ """ + + +def check_json_v2(data: CodecJSON) -> TypeGuard[TransposeJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"id", "configuration"} + and data["id"] == "transpose" + and isinstance(data["order"], Sequence) + and not isinstance(data["order"], str) + ) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[TransposeJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "transpose" + and isinstance(data["configuration"], Mapping) + and set(data["configuration"].keys()) == {"order"} + ) + + @dataclass(frozen=True) class TransposeCodec(ArrayArrayCodec): - """Transpose codec""" - is_fixed_size = True order: tuple[int, ...] @@ -42,12 +85,47 @@ def __init__(self, *, order: ChunkCoordsLike) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "transpose") - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: str | Mapping[str, object]) -> Self: + if check_json_v2(data): + return cls(order=data["order"]) # type: ignore[arg-type] + msg = ( + "Invalid Zarr V2 JSON representation of the transpose codec. " + f"Got {data!r}, expected a Mapping with keys ('id', 'order')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: str | Mapping[str, object]) -> Self: + if check_json_v3(data): + return cls(order=data["configuration"]["order"]) + msg = ( + "Invalid Zarr V3 JSON representation of the transpose codec. " + f"Got {data!r}, expected a Mapping with keys ('name', 'configuration')" + "Where the 'configuration' key is a Mapping with keys ('order')" + ) + raise CodecValidationError(msg) def to_dict(self) -> dict[str, JSON]: return {"name": "transpose", "configuration": {"order": tuple(self.order)}} + @overload + def to_json(self, zarr_format: Literal[2]) -> TransposeJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> TransposeJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> CodecJSON: + if zarr_format == 2: + return {"id": "transpose", "order": self.order} + elif zarr_format == 3: + return {"name": "transpose", "configuration": {"order": self.order}} + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." + ) # pragma: no cover + def validate( self, shape: tuple[int, ...], diff --git a/src/zarr/codecs/vlen_utf8.py b/src/zarr/codecs/vlen_utf8.py index 28c64be1c0..4124883ab0 100644 --- a/src/zarr/codecs/vlen_utf8.py +++ b/src/zarr/codecs/vlen_utf8.py @@ -1,14 +1,14 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal, TypedDict, TypeGuard, overload import numpy as np from numcodecs.vlen import VLenBytes, VLenUTF8 -from zarr.abc.codec import ArrayBytesCodec +from zarr.abc.codec import ArrayBytesCodec, CodecJSON, CodecJSON_V2 from zarr.core.buffer import Buffer, NDBuffer -from zarr.core.common import JSON, parse_named_configuration +from zarr.core.common import JSON, NamedConfig, ZarrFormat, parse_named_configuration from zarr.registry import register_codec if TYPE_CHECKING: @@ -22,12 +22,29 @@ _vlen_bytes_codec = VLenBytes() +class VlenUF8Config(TypedDict): ... + + +class VLenUTF8JSON_V2(CodecJSON_V2[Literal["vlen-utf8"]]): ... + + +class VLenUTF8JSON_V3(NamedConfig[Literal["vlen-utf8"], VlenUF8Config]): ... 
+ + +class VLenBytesConfig(TypedDict): ... + + +class VLenBytesJSON_V2(CodecJSON_V2[Literal["vlen-bytes"]]): ... + + +VLenBytesJSON_V3 = NamedConfig[Literal["vlen-bytes"], VLenBytesConfig] | Literal["vlen-bytes"] + + @dataclass(frozen=True) class VLenUTF8Codec(ArrayBytesCodec): - """Variable-length UTF8 codec""" - @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: + return cls.from_json(data, zarr_format=3) - _, configuration_parsed = parse_named_configuration( - data, "vlen-utf8", require_configuration=False - ) @@ -37,6 +54,40 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: def to_dict(self) -> dict[str, JSON]: return {"name": "vlen-utf8", "configuration": {}} + @overload + def to_json(self, zarr_format: Literal[2]) -> VLenUTF8JSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> VLenUTF8JSON_V3: ... + def to_json(self, zarr_format: ZarrFormat) -> VLenUTF8JSON_V2 | VLenUTF8JSON_V3: + if zarr_format == 2: + return {"id": "vlen-utf8"} + else: + return {"name": "vlen-utf8"} + + @classmethod + def _check_json_v2(cls, data: CodecJSON) -> TypeGuard[VLenUTF8JSON_V2]: + return data == {"id": "vlen-utf8"} + + @classmethod + def _check_json_v3(cls, data: CodecJSON) -> TypeGuard[VLenUTF8JSON_V3]: + return data in ( + {"name": "vlen-utf8"}, + {"name": "vlen-utf8", "configuration": {}}, + "vlen-utf8", + ) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if cls._check_json_v2(data): + return cls() + raise ValueError(f"Invalid VLenUTF8 JSON data for Zarr format 2: {data!r}") + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if cls._check_json_v3(data): + return cls() + raise ValueError(f"Invalid VLenUTF8 JSON data for Zarr format 3: {data!r}") + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return self @@ -74,15 +125,45 @@ def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) - class VLenBytesCodec(ArrayBytesCodec): @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration( - data, "vlen-bytes", require_configuration=False - ) - configuration_parsed = configuration_parsed or {} - return cls(**configuration_parsed) + return cls.from_json(data, zarr_format=3) def to_dict(self) -> dict[str, JSON]: return {"name": "vlen-bytes", "configuration": {}} + @overload + def to_json(self, zarr_format: Literal[2]) -> VLenBytesJSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> VLenBytesJSON_V3: ... 
+ def to_json(self, zarr_format: ZarrFormat) -> VLenBytesJSON_V2 | VLenBytesJSON_V3: + if zarr_format == 2: + return {"id": "vlen-bytes"} + else: + return {"name": "vlen-bytes"} + + @classmethod + def _check_json_v2(cls, data: CodecJSON) -> TypeGuard[VLenBytesJSON_V2]: + return data == {"id": "vlen-bytes"} + + @classmethod + def _check_json_v3(cls, data: CodecJSON) -> TypeGuard[VLenBytesJSON_V3]: + return data in ( + {"name": "vlen-bytes"}, + {"name": "vlen-bytes", "configuration": {}}, + "vlen-bytes", + ) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if cls._check_json_v2(data): + return cls() + raise ValueError(f"Invalid VLenBytes JSON data for Zarr format 2: {data!r}") + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if cls._check_json_v3(data): + return cls() + raise ValueError(f"Invalid VLenBytes JSON data for Zarr format 3: {data!r}") + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return self diff --git a/src/zarr/codecs/zstd.py b/src/zarr/codecs/zstd.py index ead41e7b5f..ba0bc2461a 100644 --- a/src/zarr/codecs/zstd.py +++ b/src/zarr/codecs/zstd.py @@ -1,17 +1,24 @@ from __future__ import annotations import asyncio +from collections.abc import Mapping from dataclasses import dataclass from functools import cached_property -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal, Self, TypedDict, TypeGuard, overload import numcodecs from numcodecs.zstd import Zstd from packaging.version import Version +from typing_extensions import ReadOnly -from zarr.abc.codec import BytesBytesCodec +from zarr.abc.codec import BytesBytesCodec, CodecJSON from zarr.core.buffer.cpu import as_numpy_array_wrapper -from zarr.core.common import JSON, parse_named_configuration +from zarr.core.common import ( + JSON, + NamedRequiredConfig, + ZarrFormat, +) +from zarr.errors import CodecValidationError from zarr.registry import register_codec if TYPE_CHECKING: @@ -21,6 +28,43 @@ from zarr.core.buffer import Buffer +class ZstdConfig_V2(TypedDict): + level: int + + +class ZstdConfig_V3(TypedDict): + level: int + checksum: bool + + +class ZstdJSON_V2(ZstdConfig_V2): + """ + The JSON form of the ZStandard codec in Zarr v2. + """ + + id: ReadOnly[Literal["zstd"]] + + +class ZstdJSON_V3(NamedRequiredConfig[Literal["zstd"], ZstdConfig_V3]): + """ + The JSON form of the ZStandard codec in Zarr v3. 
+ """ + + +def check_json_v2(data: CodecJSON) -> TypeGuard[ZstdJSON_V2]: + return isinstance(data, Mapping) and set(data.keys()).issuperset({"id", "level"}) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[ZstdJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "zstd" + and isinstance(data["configuration"], Mapping) + and set(data["configuration"].keys()) == {"level", "checksum"} + ) + + def parse_zstd_level(data: JSON) -> int: if isinstance(data, int): if data >= 23: @@ -37,8 +81,6 @@ def parse_checksum(data: JSON) -> bool: @dataclass(frozen=True) class ZstdCodec(BytesBytesCodec): - """zstd codec""" - is_fixed_size = True level: int = 0 @@ -61,11 +103,52 @@ def __init__(self, *, level: int = 0, checksum: bool = False) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "zstd") - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + if "checksum" in data: + return cls(level=data["level"], checksum=data["checksum"]) + else: + return cls(level=data["level"]) + + msg = ( + "Invalid Zarr V2 JSON representation of the zstd codec. " + f"Got {data!r}, expected a Mapping with keys ('id', 'level')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + return cls( + level=data["configuration"]["level"], checksum=data["configuration"]["checksum"] + ) + msg = ( + "Invalid Zarr V3 JSON representation of the zstd codec. " + f"Got {data!r}, expected a Mapping with keys ('name', 'configuration') " + "Where the 'configuration' key is a Mapping with keys ('level', 'checksum')" + ) + raise CodecValidationError(msg) def to_dict(self) -> dict[str, JSON]: - return {"name": "zstd", "configuration": {"level": self.level, "checksum": self.checksum}} + return self.to_json(zarr_format=3) + + @overload + def to_json(self, zarr_format: Literal[2]) -> ZstdJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> ZstdJSON_V3: ... 
+ + def to_json(self, zarr_format: ZarrFormat) -> ZstdJSON_V2 | ZstdJSON_V3: + if zarr_format == 2: + return {"id": "zstd", "level": self.level} + else: + return { + "name": "zstd", + "configuration": {"level": self.level, "checksum": self.checksum}, + } @cached_property def _zstd_codec(self) -> Zstd: diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 2ce33df7ba..aead755362 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3,7 +3,7 @@ import json import warnings from asyncio import gather -from collections.abc import Iterable +from collections.abc import Iterable, Sequence from dataclasses import dataclass, field, replace from itertools import starmap from logging import getLogger @@ -24,10 +24,10 @@ import zarr from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec -from zarr.abc.numcodec import Numcodec, _is_numcodec +from zarr.abc.numcodec import Numcodec from zarr.abc.store import Store, set_or_delete -from zarr.codecs._v2 import V2Codec from zarr.codecs.bytes import BytesCodec +from zarr.codecs.transpose import TransposeCodec from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec from zarr.codecs.zstd import ZstdCodec from zarr.core._info import ArrayInfo @@ -60,24 +60,20 @@ ZarrFormat, _default_zarr_format, _warn_order_kwarg, - ceildiv, concurrent_map, parse_shapelike, product, ) -from zarr.core.config import categorize_data_type from zarr.core.config import config as zarr_config from zarr.core.dtype import ( VariableLengthBytes, VariableLengthUTF8, ZDType, ZDTypeLike, - parse_dtype, + parse_data_type, ) from zarr.core.dtype.common import HasEndianness, HasItemSize, HasObjectCodec from zarr.core.indexing import ( - AsyncOIndex, - AsyncVIndex, BasicIndexer, BasicSelection, BlockIndex, @@ -94,6 +90,7 @@ Selection, VIndex, _iter_grid, + ceildiv, check_fields, check_no_multi_fields, is_pure_fancy_indexing, @@ -111,7 +108,7 @@ T_ArrayMetadata, ) from zarr.core.metadata.v2 import ( - CompressorLikev2, + CompressorLike_V2, get_object_codec_id, parse_compressor, parse_filters, @@ -129,7 +126,7 @@ from zarr.storage._utils import _relativize_path if TYPE_CHECKING: - from collections.abc import Iterator, Sequence + from collections.abc import Iterator from typing import Self import numpy.typing as npt @@ -203,8 +200,26 @@ def create_codec_pipeline(metadata: ArrayMetadata, *, store: Store | None = None if isinstance(metadata, ArrayV3Metadata): return get_pipeline_class().from_codecs(metadata.codecs) elif isinstance(metadata, ArrayV2Metadata): - v2_codec = V2Codec(filters=metadata.filters, compressor=metadata.compressor) - return get_pipeline_class().from_codecs([v2_codec]) + _codecs: tuple[Codec, ...] = () + if metadata.filters is not None: + _codecs += metadata.filters + if metadata.compressor is not None: + _codecs += (metadata.compressor,) + if not any(isinstance(codec, ArrayBytesCodec) for codec in _codecs) and not isinstance( + metadata.dtype, HasObjectCodec + ): + # The role filled by the ArrayBytesCodec was implicit in zarr v2. So a valid zarr v2-style + # chain of filters + compressor might not contain a codec identifiable as an array-bytes codec. + # In such a case, we will insert a bytes codec that applies no endian transformation. + # We skip this insertion if the data type is an instance of HasObjectCodec, because + # in zarr v2 these data types required a special codec that functioned like an array bytes codec. 
+ _codecs = (BytesCodec(endian=None),) + _codecs + if metadata.order == "F": + # Zarr V2 supports declaring the order of an array in metadata. Using the zarr v3 codec + # framework, we express C or F ordered arrays by adding a transpose codec to the front + # of the list of codecs. + _codecs = (TransposeCodec(order=tuple(reversed(range(metadata.ndim)))),) + _codecs + return get_pipeline_class().from_codecs(_codecs) raise TypeError # pragma: no cover @@ -341,7 +356,7 @@ async def create( dimension_separator: Literal[".", "/"] | None = None, order: MemoryOrder | None = None, filters: list[dict[str, JSON]] | None = None, - compressor: CompressorLikev2 | Literal["auto"] = "auto", + compressor: CompressorLike_V2 | Literal["auto"] = "auto", # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, @@ -618,7 +633,7 @@ async def _create( Deprecated in favor of :func:`zarr.api.asynchronous.create_array`. """ - dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) + dtype_parsed = parse_data_type(dtype, zarr_format=zarr_format) store_path = await make_store_path(store) shape = parse_shapelike(shape) @@ -817,8 +832,8 @@ def _create_metadata_v2( order: MemoryOrder, dimension_separator: Literal[".", "/"] | None = None, fill_value: Any | None = DEFAULT_FILL_VALUE, - filters: Iterable[dict[str, JSON] | Numcodec] | None = None, - compressor: CompressorLikev2 = None, + filters: Iterable[CompressorLike_V2] | None = None, + compressor: CompressorLike_V2 | None = None, attributes: dict[str, JSON] | None = None, ) -> ArrayV2Metadata: if dimension_separator is None: @@ -855,8 +870,8 @@ async def _create_v2( config: ArrayConfig, dimension_separator: Literal[".", "/"] | None = None, fill_value: Any | None = DEFAULT_FILL_VALUE, - filters: Iterable[dict[str, JSON] | Numcodec] | None = None, - compressor: CompressorLike = "auto", + filters: Iterable[CompressorLike_V2] | None = None, + compressor: CompressorLike_V2 | Literal["auto"] = "auto", attributes: dict[str, JSON] | None = None, overwrite: bool = False, ) -> AsyncArray[ArrayV2Metadata]: @@ -868,14 +883,9 @@ async def _create_v2( else: await ensure_no_existing_node(store_path, zarr_format=2) - compressor_parsed: CompressorLikev2 + compressor_parsed: CompressorLike_V2 if compressor == "auto": compressor_parsed = default_compressor_v2(dtype) - elif isinstance(compressor, BytesBytesCodec): - raise ValueError( - "Cannot use a BytesBytesCodec as a compressor for zarr v2 arrays. " - "Use a numcodecs codec directly instead." 
- ) else: compressor_parsed = compressor @@ -1426,56 +1436,6 @@ async def getitem( ) return await self._get_selection(indexer, prototype=prototype) - async def get_orthogonal_selection( - self, - selection: OrthogonalSelection, - *, - out: NDBuffer | None = None, - fields: Fields | None = None, - prototype: BufferPrototype | None = None, - ) -> NDArrayLikeOrScalar: - if prototype is None: - prototype = default_buffer_prototype() - indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) - return await self._get_selection( - indexer=indexer, out=out, fields=fields, prototype=prototype - ) - - async def get_mask_selection( - self, - mask: MaskSelection, - *, - out: NDBuffer | None = None, - fields: Fields | None = None, - prototype: BufferPrototype | None = None, - ) -> NDArrayLikeOrScalar: - if prototype is None: - prototype = default_buffer_prototype() - indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) - return await self._get_selection( - indexer=indexer, out=out, fields=fields, prototype=prototype - ) - - async def get_coordinate_selection( - self, - selection: CoordinateSelection, - *, - out: NDBuffer | None = None, - fields: Fields | None = None, - prototype: BufferPrototype | None = None, - ) -> NDArrayLikeOrScalar: - if prototype is None: - prototype = default_buffer_prototype() - indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) - out_array = await self._get_selection( - indexer=indexer, out=out, fields=fields, prototype=prototype - ) - - if hasattr(out_array, "shape"): - # restore shape - out_array = np.array(out_array).reshape(indexer.sel_shape) - return out_array - async def _save_metadata(self, metadata: ArrayMetadata, ensure_parents: bool = False) -> None: """ Asynchronously save the array metadata. @@ -1517,7 +1477,7 @@ async def _set_selection( if isinstance(array_like, np._typing._SupportsArrayFunc): # TODO: need to handle array types that don't support __array_function__ # like PyTorch and JAX - array_like_ = cast("np._typing._SupportsArrayFunc", array_like) + array_like_ = cast(np._typing._SupportsArrayFunc, array_like) value = np.asanyarray(value, dtype=self.dtype, like=array_like_) else: if not hasattr(value, "shape"): @@ -1531,8 +1491,7 @@ async def _set_selection( value = value.astype(dtype=self.dtype, order="A") else: value = np.array(value, dtype=self.dtype, order="A") - value = cast("NDArrayLike", value) - + value = cast(NDArrayLike, value) # We accept any ndarray like object from the user and convert it # to a NDBuffer (or subclass). From this point onwards, we only pass # Buffer and NDBuffer between components. @@ -1607,19 +1566,6 @@ async def setitem( ) return await self._set_selection(indexer, value, prototype=prototype) - @property - def oindex(self) -> AsyncOIndex[T_ArrayMetadata]: - """Shortcut for orthogonal (outer) indexing, see :func:`get_orthogonal_selection` and - :func:`set_orthogonal_selection` for documentation and examples.""" - return AsyncOIndex(self) - - @property - def vindex(self) -> AsyncVIndex[T_ArrayMetadata]: - """Shortcut for vectorized (inner) indexing, see :func:`get_coordinate_selection`, - :func:`set_coordinate_selection`, :func:`get_mask_selection` and - :func:`set_mask_selection` for documentation and examples.""" - return AsyncVIndex(self) - async def resize(self, new_shape: ShapeLike, delete_outside_chunks: bool = True) -> None: """ Asynchronously resize the array to a new shape. 
@@ -4300,7 +4246,7 @@ async def init_array( from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation - zdtype = parse_dtype(dtype, zarr_format=zarr_format) + zdtype = parse_data_type(dtype, zarr_format=zarr_format) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format @@ -4709,26 +4655,6 @@ def _parse_chunk_key_encoding( return result -def _get_default_chunk_encoding_v3( - dtype: ZDType[TBaseDType, TBaseScalar], -) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: - """ - Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype. - """ - - dtype_category = categorize_data_type(dtype) - - filters = zarr_config.get("array.v3_default_filters").get(dtype_category) - compressors = zarr_config.get("array.v3_default_compressors").get(dtype_category) - serializer = zarr_config.get("array.v3_default_serializer").get(dtype_category) - - return ( - tuple(_parse_array_array_codec(f) for f in filters), - _parse_array_bytes_codec(serializer), - tuple(_parse_bytes_bytes_codec(c) for c in compressors), - ) - - def default_filters_v3(dtype: ZDType[Any, Any]) -> tuple[ArrayArrayCodec, ...]: """ Given a data type, return the default filters for that data type. @@ -4772,7 +4698,7 @@ def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec: return serializer -def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[Numcodec] | None: +def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[Codec] | None: """ Given a data type, return the default filters for that data type. @@ -4781,13 +4707,9 @@ def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[Numcodec] | None: """ if isinstance(dtype, HasObjectCodec): if dtype.object_codec_id == "vlen-bytes": - from numcodecs import VLenBytes - - return (VLenBytes(),) + return (VLenBytesCodec(),) elif dtype.object_codec_id == "vlen-utf8": - from numcodecs import VLenUTF8 - - return (VLenUTF8(),) + return (VLenUTF8Codec(),) else: msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}." raise ValueError(msg) @@ -4798,7 +4720,7 @@ def default_compressor_v2(dtype: ZDType[Any, Any]) -> Numcodec: """ Given a data type, return the default compressors for that data type. - This is just the numcodecs ``Zstd`` codec. + This is just the ``Zstd`` codec. """ from numcodecs import Zstd @@ -4821,12 +4743,9 @@ def _parse_chunk_encoding_v2( _compressor = None elif compressor == "auto": _compressor = default_compressor_v2(dtype) - elif isinstance(compressor, tuple | list) and len(compressor) == 1: + elif isinstance(compressor, Sequence) and len(compressor) == 1: _compressor = parse_compressor(compressor[0]) else: - if isinstance(compressor, Iterable) and not isinstance(compressor, dict): - msg = f"For Zarr format 2 arrays, the `compressor` must be a single codec. Got an iterable with type {type(compressor)} instead." - raise TypeError(msg) _compressor = parse_compressor(compressor) if filters is None: @@ -4834,14 +4753,6 @@ def _parse_chunk_encoding_v2( elif filters == "auto": _filters = default_filters_v2(dtype) else: - if isinstance(filters, Iterable): - for idx, f in enumerate(filters): - if not _is_numcodec(f): - msg = ( - "For Zarr format 2 arrays, all elements of `filters` must be numcodecs codecs. " - f"Element at index {idx} has type {type(f)}, which is not a numcodecs codec." 
- ) - raise TypeError(msg) _filters = parse_filters(filters) if isinstance(dtype, HasObjectCodec): # check the filters and the compressor for the object codec required for this data type @@ -4849,12 +4760,12 @@ if _compressor is None: object_codec_id = None else: - object_codec_id = get_object_codec_id((_compressor.get_config(),)) + object_codec_id = get_object_codec_id((_compressor.to_json(zarr_format=2),)) else: object_codec_id = get_object_codec_id( ( - *[f.get_config() for f in _filters], - _compressor.get_config() if _compressor is not None else None, + *[f.to_json(zarr_format=2) for f in _filters], + _compressor.to_json(zarr_format=2) if _compressor is not None else None, ) ) if object_codec_id is None: @@ -4894,7 +4805,9 @@ def _parse_chunk_encoding_v3( maybe_array_array = (filters,) else: maybe_array_array = cast("Iterable[Codec | dict[str, JSON]]", filters) - out_array_array = tuple(_parse_array_array_codec(c) for c in maybe_array_array) + out_array_array = tuple( + _parse_array_array_codec(c, zarr_format=3) for c in maybe_array_array + ) if serializer == "auto": out_array_bytes = default_serializer_v3(dtype) @@ -4902,26 +4815,34 @@ # TODO: ensure that the serializer is compatible with the ndarray produced by the # array-array codecs. For example, if a sequence of array-array codecs produces an # array with a single-byte data type, then the serializer should not specify endianness. - out_array_bytes = _parse_array_bytes_codec(serializer) + out_array_bytes = _parse_array_bytes_codec(serializer, zarr_format=3) if compressors is None: out_bytes_bytes: tuple[BytesBytesCodec, ...] = () elif compressors == "auto": out_bytes_bytes = default_compressors_v3(dtype) else: - maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]] - if isinstance(compressors, dict | Codec): + maybe_bytes_bytes: Iterable[Codec | dict[str, JSON] | Numcodec] + if isinstance(compressors, dict | Codec | Numcodec): maybe_bytes_bytes = (compressors,) else: - maybe_bytes_bytes = cast("Iterable[Codec | dict[str, JSON]]", compressors) + maybe_bytes_bytes = compressors # type: ignore[assignment] - out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes) + out_bytes_bytes = tuple( + _parse_bytes_bytes_codec(c, zarr_format=3) for c in maybe_bytes_bytes + ) + + # specialize codecs as needed given the dtype + + # TODO: refactor so that the config only contains the name of the codec, and we use the dtype + # to create the codec instance, instead of storing a dict representation of a full codec. # TODO: ensure that the serializer is compatible with the ndarray produced by the # array-array codecs. For example, if a sequence of array-array codecs produces an # array with a single-byte data type, then the serializer should not specify endianness.
- - # TODO: add checks to ensure that the right serializer is used for vlen data types + if isinstance(out_array_bytes, BytesCodec) and not isinstance(dtype, HasEndianness): + # The default endianness in the bytescodec might not be None, so we need to replace it + out_array_bytes = replace(out_array_bytes, endian=None) return out_array_array, out_array_bytes, out_bytes_bytes @@ -4942,8 +4863,6 @@ def _parse_deprecated_compressor( compressors = () else: compressors = (compressor,) - elif zarr_format == 2 and compressor == compressors == "auto": - compressors = ({"id": "blosc"},) return compressors diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 3bc3c1cfc7..3eeb1aabd5 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -14,6 +14,7 @@ Codec, CodecPipeline, ) +from zarr.codecs._v2 import NumcodecsWrapper from zarr.core.common import ChunkCoords, concurrent_map from zarr.core.config import config from zarr.core.indexing import SelectorTuple, is_scalar @@ -88,7 +89,6 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: @classmethod def from_codecs(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self: array_array_codecs, array_bytes_codec, bytes_bytes_codecs = codecs_from_list(codecs) - return cls( array_array_codecs=array_array_codecs, array_bytes_codec=array_bytes_codec, @@ -220,7 +220,6 @@ async def encode_batch( zip(chunk_array_batch, chunk_specs, strict=False) ) chunk_specs = resolve_batched(aa_codec, chunk_specs) - chunk_bytes_batch = await self.array_bytes_codec.encode( zip(chunk_array_batch, chunk_specs, strict=False) ) @@ -495,16 +494,169 @@ def codecs_from_list( from zarr.codecs.sharding import ShardingCodec array_array: tuple[ArrayArrayCodec, ...] = () - array_bytes_maybe: ArrayBytesCodec | None = None + array_bytes_maybe: ArrayBytesCodec bytes_bytes: tuple[BytesBytesCodec, ...] = () - if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(tuple(codecs)) > 1: + # Handle two cases: + # either all of the codecs are NumcodecsWrapper instances, in which case we cast the last one + # to array-bytes and the rest to array-array; + # or one of the codecs is an array-bytes codec, in which case we convert any preceding + # NumcodecsWrapper instances to array-array, and any following NumcodecsWrapper instances + # to bytes-bytes. (An illustrative sketch of this partitioning appears after the errors.py + # hunk below.) + + codecs_tup = tuple(codecs) + array_array_idcs: tuple[tuple[int, ArrayArrayCodec], ...] = () + array_bytes_idcs: tuple[tuple[int, ArrayBytesCodec], ...] = () + bytes_bytes_idcs: tuple[tuple[int, BytesBytesCodec], ...] = () + numcodec_wrapper_idcs: tuple[tuple[int, NumcodecsWrapper], ...] = () + + for idx, codec in enumerate(codecs_tup): + match codec: + case ArrayArrayCodec(): + array_array_idcs += ((idx, codec),) + case ArrayBytesCodec(): + array_bytes_idcs += ((idx, codec),) + case BytesBytesCodec(): + bytes_bytes_idcs += ((idx, codec),) + case NumcodecsWrapper(): # type: ignore[union-attr] + numcodec_wrapper_idcs += ((idx, codec),) + + if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(codecs_tup) > 1: warn( "Combining a `sharding_indexed` codec disables partial reads and " "writes, which may lead to inefficient performance.", category=ZarrUserWarning, stacklevel=3, ) + if len(array_bytes_idcs) == 0: + # There is no array-bytes codec. Unless we can find a numcodec wrapper to act as an + # array-bytes codec, this is an error.
+ if len(numcodec_wrapper_idcs) == 0: + msg = ( + f"The codecs {codecs_tup} do not include an ArrayBytesCodec or a codec castable to an " + "ArrayBytesCodec, such as a NumcodecsWrapper. This is an invalid sequence of codecs." + ) + raise ValueError(msg) + elif len(numcodec_wrapper_idcs) == len(codecs_tup): + # All the codecs are numcodecs wrappers. This means we have no information about which + # codec is array-array, array-bytes, and bytes-bytes, so we just cast the numcodecs wrappers + # into a sequence of array-array codecs terminated by a single array-bytes codec. + # This choice is almost arbitrary. + # It would be equally valid to convert the first codec to an array-bytes, and the remaining + # codecs to bytes-bytes, or to pick a random codec and convert it to array-bytes, then + # convert all the preceding codecs to array-array and the following codecs to bytes-bytes. + # But we know from experience that the Zarr V2-style chunk encoding pipelines typically + # start with array-array transformations, so casting all but one of the unknown codecs + # to array-array is a safe choice. + array_bytes_maybe = codecs_tup[-1].to_array_bytes() + array_array = tuple(c.to_array_array() for c in codecs_tup[:-1]) + else: + # There are no array-bytes codecs, there is at least one numcodec wrapper, but there are + # also some array-array and/or bytes-bytes codecs + if len(array_array_idcs) > 0: + # There is at least one array-array codec. We will use it as a reference point for + # casting any numcodecs wrappers. + last_array_array_idx = array_array_idcs[-1][0] + + if last_array_array_idx == len(codecs_tup) - 1: + # The last codec is an ArrayArrayCodec, but there is no ArrayBytesCodec. This + # cannot be fixed by converting numcodecs wrappers, so we raise an exception. + raise ValueError( + "The last codec is an ArrayArrayCodec, but there is no ArrayBytesCodec." + ) + + for idx, aac in enumerate(codecs_tup[: (last_array_array_idx + 1)]): + # Iterate over the codecs leading up to the last array-array codec. + if isinstance(aac, ArrayArrayCodec): + # Any array-array codec gets added to the list of array-array codecs + array_array += (aac,) + elif isinstance(aac, NumcodecsWrapper): + # Any numcodecs wrapper gets converted to an array-array codec + array_array += (aac.to_array_array(),) + else: + # Any other kind of codec is invalid and we raise an exception. + msg = f"Invalid codec {aac} at index {idx}. Expected an ArrayArrayCodec" + raise TypeError(msg) + + if isinstance(codecs_tup[last_array_array_idx + 1], NumcodecsWrapper): + # The codec following the last array-array codec is a numcodecs wrapper. + # We will cast it to an array-bytes codec. + array_bytes_maybe = codecs_tup[last_array_array_idx + 1].to_array_bytes() + else: + # The codec following the last array-array codec was a bytes-bytes codec, or + # something else entirely. This is invalid and we raise an exception. + msg = ( + f"Invalid codec {codecs_tup[last_array_array_idx + 1]} at index " + f"{last_array_array_idx + 1}. " + "Expected a NumcodecsWrapper or an ArrayBytesCodec, got " + f"{type(codecs_tup[last_array_array_idx + 1])}" + ) + raise TypeError(msg) + + start = last_array_array_idx + 2 + for idx, rem in enumerate(codecs_tup[start:]): + # We have already checked the codec after the last array-array codec, so we start + # iterating over the codecs after that.
+ if isinstance(rem, BytesBytesCodec): + bytes_bytes += (rem,) + elif isinstance(rem, NumcodecsWrapper): + bytes_bytes += (rem.to_bytes_bytes(),) + else: + msg = f"Invalid codec {rem} at index {start + idx}. Expected a BytesBytesCodec" + raise TypeError(msg) + else: + # There are no array-array codecs, just numcodecs wrappers and bytes-bytes codecs + first_bytes_bytes_idx = bytes_bytes_idcs[0][0] + if first_bytes_bytes_idx == 0: + raise ValueError( + "The first codec is a BytesBytesCodec, but there is no ArrayBytesCodec." + ) + else: + # Iterate over all codecs. Cast all numcodecs wrappers to array-array codecs, until + # the codec immediately prior to the first bytes-bytes codec, which we cast to + # an array-bytes codec. All codecs after that point are cast to bytes-bytes codecs. + for idx, bb_codec in enumerate(codecs_tup): + if idx < first_bytes_bytes_idx - 1: + # This must be a numcodecs wrapper. Cast it to array-array. + array_array += (bb_codec.to_array_array(),) + elif idx == first_bytes_bytes_idx - 1: + array_bytes_maybe = bb_codec.to_array_bytes() + else: + if isinstance(bb_codec, BytesBytesCodec): + bytes_bytes += (bb_codec,) + elif isinstance(bb_codec, NumcodecsWrapper): + bytes_bytes += (bb_codec.to_bytes_bytes(),) + else: + msg = f"Invalid codec {bb_codec} at index {idx}. Expected a NumcodecsWrapper" + raise TypeError(msg) + + elif len(array_bytes_idcs) == 1: + bb_idx, ab_codec = array_bytes_idcs[0] + array_bytes_maybe = ab_codec + + end = bb_idx + + for idx, aa_codec in enumerate(codecs_tup[:end]): + if isinstance(aa_codec, ArrayArrayCodec): + array_array += (aa_codec,) + elif isinstance(aa_codec, NumcodecsWrapper): + array_array += (aa_codec.to_array_array(),) + else: + msg = f"Invalid codec {aa_codec} at index {idx}. Expected an ArrayArrayCodec" + raise TypeError(msg) + start = bb_idx + 1 + if bb_idx < len(codecs_tup) - 1: + for idx, bb_codec in enumerate(codecs_tup[start:]): + if isinstance(bb_codec, NumcodecsWrapper): + bytes_bytes += (bb_codec.to_bytes_bytes(),) + elif isinstance(bb_codec, BytesBytesCodec): + bytes_bytes += (bb_codec,) + else: + msg = f"Invalid codec {bb_codec} at index {start + idx}. Expected a BytesBytesCodec" + raise TypeError(msg) + else: + raise ValueError("Expected at most one ArrayBytesCodec, but more than one was found.") + + return array_array, array_bytes_maybe, bytes_bytes for prev_codec, cur_codec in pairwise((None, *codecs)): if isinstance(cur_codec, ArrayArrayCodec): @@ -542,7 +694,7 @@ def codecs_from_list( f"Got {type(prev_codec)} instead." ) bytes_bytes += (cur_codec,) - else: + elif isinstance(cur_codec, NumcodecsWrapper): raise TypeError if array_bytes_maybe is None: diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 4c0247426e..52ffdb3df5 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -14,6 +14,7 @@ Final, Generic, Literal, + NotRequired, TypedDict, TypeVar, cast, @@ -48,13 +49,32 @@ DimensionNames = Iterable[str | None] | None TName = TypeVar("TName", bound=str) -TConfig = TypeVar("TConfig", bound=Mapping[str, object]) +BaseConfig = Mapping[str, object] +TConfig = TypeVar("TConfig", bound=BaseConfig) class NamedConfig(TypedDict, Generic[TName, TConfig]): """ - A typed dictionary representing an object with a name and configuration, where the configuration - is a mapping of string keys to values, e.g. another typed dictionary or a JSON object. + A typed dictionary representing an object with `"name"` and `"configuration"` keys. + + The configuration key is not required.
+ + This class is generic with two type parameters: the type of the name (``TName``) and the type of + the configuration (``TConfig``). + """ + + name: ReadOnly[TName] + """The name of the object.""" + + configuration: NotRequired[ReadOnly[TConfig]] + """The configuration of the object.""" + + +class NamedRequiredConfig(TypedDict, Generic[TName, TConfig]): + """ + A typed dictionary representing an object with `"name"` and `"configuration"` keys. + + The configuration is required. This class is generic with two type parameters: the type of the name (``TName``) and the type of the configuration (``TConfig``). diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index efc6bd7949..6475ccb1ef 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -1,17 +1,20 @@ from __future__ import annotations import warnings -from collections.abc import Iterable, Sequence +from collections.abc import Iterable, Mapping, Sequence from functools import cached_property from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict, cast +from zarr.abc.codec import ArrayArrayCodec, Codec, Numcodec from zarr.abc.metadata import Metadata -from zarr.abc.numcodec import Numcodec, _is_numcodec +from zarr.abc.numcodec import _is_numcodec +from zarr.codecs._v2 import NumcodecsWrapper +from zarr.core.buffer.core import default_buffer_prototype from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.dtype import get_data_type_from_json -from zarr.core.dtype.common import OBJECT_CODEC_IDS, DTypeSpec_V2 +from zarr.core.dtype.common import OBJECT_CODEC_IDS from zarr.errors import ZarrUserWarning -from zarr.registry import get_numcodec +from zarr.registry import get_codec if TYPE_CHECKING: from typing import Literal, Self @@ -20,12 +23,12 @@ from zarr.core.buffer import Buffer, BufferPrototype from zarr.core.common import ChunkCoords + from zarr.core.dtype.common import DTypeSpec_V2 from zarr.core.dtype.wrapper import ( TBaseDType, TBaseScalar, TDType_co, TScalar_co, - ZDType, ) import json @@ -43,6 +46,9 @@ parse_shapelike, ) from zarr.core.config import config, parse_indexing_order +from zarr.core.dtype.wrapper import ( + ZDType, +) from zarr.core.metadata.common import parse_attributes @@ -56,7 +62,7 @@ class ArrayV2MetadataDict(TypedDict): # Union of acceptable types for v2 compressors -CompressorLikev2: TypeAlias = dict[str, JSON] | Numcodec | None +CompressorLike_V2: TypeAlias = Mapping[str, JSON] | Numcodec | Codec @dataclass(frozen=True, kw_only=True) @@ -66,9 +72,9 @@ class ArrayV2Metadata(Metadata): dtype: ZDType[TBaseDType, TBaseScalar] fill_value: int | float | str | bytes | None = None order: MemoryOrder = "C" - filters: tuple[Numcodec, ...] | None = None + filters: tuple[Codec, ...] | None = None dimension_separator: Literal[".", "/"] = "."
- compressor: Numcodec | None + compressor: Codec attributes: dict[str, JSON] = field(default_factory=dict) zarr_format: Literal[2] = field(init=False, default=2) @@ -81,8 +87,8 @@ def __init__( fill_value: Any, order: MemoryOrder, dimension_separator: Literal[".", "/"] = ".", - compressor: CompressorLikev2 = None, - filters: Iterable[Numcodec | dict[str, JSON]] | None = None, + compressor: CompressorLike_V2 | None = None, + filters: Iterable[CompressorLike_V2] | None = None, attributes: dict[str, JSON] | None = None, ) -> None: """ @@ -90,6 +96,9 @@ def __init__( """ shape_parsed = parse_shapelike(shape) chunks_parsed = parse_shapelike(chunks) + # TODO: remove this + if not isinstance(dtype, ZDType): + raise TypeError compressor_parsed = parse_compressor(compressor) order_parsed = parse_indexing_order(order) dimension_separator_parsed = parse_separator(dimension_separator) @@ -99,6 +108,18 @@ def __init__( fill_value_parsed = dtype.cast_scalar(fill_value) else: fill_value_parsed = fill_value + + array_spec = ArraySpec( + shape=shape_parsed, + dtype=dtype, + fill_value=fill_value_parsed, + config=ArrayConfig.from_dict({}), # TODO: config is not needed here. + prototype=default_buffer_prototype(), # TODO: prototype is not needed here. + ) + if compressor_parsed is not None: + compressor_parsed = compressor_parsed.evolve_from_array_spec(array_spec) + if filters_parsed is not None: + filters_parsed = tuple(fp.evolve_from_array_spec(array_spec) for fp in filters_parsed) attributes_parsed = parse_attributes(attributes) object.__setattr__(self, "shape", shape_parsed) @@ -132,10 +153,10 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: json_indent = config.get("json_indent") return { ZARRAY_JSON: prototype.buffer.from_bytes( - json.dumps(zarray_dict, indent=json_indent, allow_nan=True).encode() + json.dumps(zarray_dict, indent=json_indent, allow_nan=False).encode() ), ZATTRS_JSON: prototype.buffer.from_bytes( - json.dumps(zattrs_dict, indent=json_indent, allow_nan=True).encode() + json.dumps(zattrs_dict, indent=json_indent, allow_nan=False).encode() ), } @@ -158,6 +179,7 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: object_codec_id = get_object_codec_id((_compressor,)) # we add a layer of indirection here around the dtype attribute of the array metadata # because we also need to know the object codec id, if any, to resolve the data type + dtype_spec: DTypeSpec_V2 = { "name": data["dtype"], "object_codec_id": object_codec_id, @@ -197,34 +219,24 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: def to_dict(self) -> dict[str, JSON]: zarray_dict = super().to_dict() - if _is_numcodec(zarray_dict["compressor"]): - codec_config = zarray_dict["compressor"].get_config() - # Hotfix for https://github.com/zarr-developers/zarr-python/issues/2647 - if codec_config["id"] == "zstd" and not codec_config.get("checksum", False): - codec_config.pop("checksum") - zarray_dict["compressor"] = codec_config - + if self.compressor is not None: + zarray_dict["compressor"] = self.compressor.to_json(zarr_format=2) + else: + zarray_dict["compressor"] = None + new_filters = [] if zarray_dict["filters"] is not None: - raw_filters = zarray_dict["filters"] - # TODO: remove this when we can stratically type the output JSON data structure - # entirely - if not isinstance(raw_filters, list | tuple): - raise TypeError("Invalid type for filters. 
Expected a list or tuple.") - new_filters = [] - for f in raw_filters: - if _is_numcodec(f): - new_filters.append(f.get_config()) - else: - new_filters.append(f) - zarray_dict["filters"] = new_filters + new_filters.extend([f.to_json(zarr_format=2) for f in self.filters]) + else: + new_filters = None + zarray_dict["filters"] = new_filters # serialize the fill value after dtype-specific JSON encoding if self.fill_value is not None: fill_value = self.dtype.to_json_scalar(self.fill_value, zarr_format=2) zarray_dict["fill_value"] = fill_value - # pull the "name" attribute out of the dtype spec returned by self.dtype.to_json - zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2)["name"] + # serialize the dtype after fill value-specific JSON encoding + zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2)["name"] # type: ignore[assignment] return zarray_dict @@ -262,20 +274,23 @@ def parse_zarr_format(data: object) -> Literal[2]: raise ValueError(f"Invalid value. Expected 2. Got {data}.") -def parse_filters(data: object) -> tuple[Numcodec, ...] | None: +def parse_filters(data: object) -> tuple[ArrayArrayCodec | NumcodecsWrapper, ...] | None: """ Parse a potential tuple of filters """ - out: list[Numcodec] = [] + out: list[Codec | NumcodecsWrapper] = [] if data is None: return data if isinstance(data, Iterable): for idx, val in enumerate(data): - if _is_numcodec(val): + if isinstance(val, (Codec, NumcodecsWrapper)): out.append(val) + elif _is_numcodec(val): + out.append(NumcodecsWrapper(codec=val)) elif isinstance(val, dict): - out.append(get_numcodec(val)) # type: ignore[arg-type] + codec = get_codec(val, zarr_format=2) + out.append(codec) else: msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead." raise TypeError(msg) @@ -286,19 +301,28 @@ def parse_filters(data: object) -> tuple[Numcodec, ...] | None: return tuple(out) # take a single codec instance and wrap it in a tuple if _is_numcodec(data): + return (NumcodecsWrapper(codec=data),) + elif isinstance(data, Codec): return (data,) msg = f"Invalid filters. Expected None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead." raise TypeError(msg) -def parse_compressor(data: object) -> Numcodec | None: +def parse_compressor(data: object) -> Codec | NumcodecsWrapper | None: """ Parse a potential compressor. """ - if data is None or _is_numcodec(data): + # TODO: only validate the compressor in one place. currently we do it twice, once in init_array + # and again when constructing metadata + if data is None or isinstance(data, Codec | NumcodecsWrapper): return data + if _is_numcodec(data): + try: + return get_codec(data.get_config(), zarr_format=2) + except KeyError: + return NumcodecsWrapper(codec=data) if isinstance(data, dict): - return get_numcodec(data) # type: ignore[arg-type] + return get_codec(data, zarr_format=2) msg = f"Invalid compressor. Expected None, a numcodecs.abc.Codec, or a dict representation of a numcodecs.abc.Codec. Got {type(data)} instead." 
raise ValueError(msg) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 6f79fb4b09..4711fc96ae 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -30,13 +30,12 @@ ZARR_JSON, ChunkCoords, DimensionNames, - parse_named_configuration, parse_shapelike, ) from zarr.core.config import config from zarr.core.metadata.common import parse_attributes from zarr.errors import MetadataValidationError, NodeTypeValidationError -from zarr.registry import get_codec_class +from zarr.registry import get_codec def parse_zarr_format(data: object) -> Literal[3]: @@ -63,8 +62,7 @@ def parse_codecs(data: object) -> tuple[Codec, ...]: ): # Can't use Codec here because of mypy limitation out += (c,) else: - name_parsed, _ = parse_named_configuration(c, require_configuration=False) - out += (get_codec_class(name_parsed).from_dict(c),) + out += (get_codec(c, zarr_format=3),) return out @@ -82,9 +80,14 @@ def validate_array_bytes_codec(codecs: tuple[Codec, ...]) -> ArrayBytesCodec: def validate_codecs(codecs: tuple[Codec, ...], dtype: ZDType[TBaseDType, TBaseScalar]) -> None: """Check that the codecs are valid for the given dtype""" + # avoid circular import from zarr.codecs.sharding import ShardingCodec + from zarr.core.codec_pipeline import codecs_from_list - abc = validate_array_bytes_codec(codecs) + array_array_codecs, array_bytes_codec, bytes_bytes_codecs = codecs_from_list(codecs) + _codecs = (*array_array_codecs, array_bytes_codec, *bytes_bytes_codecs) + + abc = validate_array_bytes_codec(_codecs) # Recursively resolve array-bytes codecs within sharding codecs while isinstance(abc, ShardingCodec): @@ -288,7 +291,7 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: d = self.to_dict() return { ZARR_JSON: prototype.buffer.from_bytes( - json.dumps(d, allow_nan=True, indent=json_indent).encode() + json.dumps(d, allow_nan=False, indent=json_indent).encode() ) } @@ -335,6 +338,10 @@ def to_dict(self) -> dict[str, JSON]: if out_dict["dimension_names"] is None: out_dict.pop("dimension_names") + out_dict["codecs"] = () + for codec in self.codecs: + out_dict["codecs"] += (codec.to_json(zarr_format=3),) + # TODO: replace the `to_dict` / `from_dict` on the `Metadata` class with # to_json, from_json, and have ZDType inherit from `Metadata` # until then, we have this hack here, which relies on the fact that to_dict will pass through diff --git a/src/zarr/errors.py b/src/zarr/errors.py index 0055ea3c6c..3239d59c0d 100644 --- a/src/zarr/errors.py +++ b/src/zarr/errors.py @@ -100,3 +100,6 @@ class ZarrRuntimeWarning(RuntimeWarning): """ A warning for dubious runtime behavior. """ + + +class CodecValidationError(ValueError): + """ + An error raised when codec metadata fails validation. + """
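# --- Editor's sketch (illustrative, not part of the patch) ---
# The partitioning performed by codecs_from_list above splits a flat codec
# sequence into (array-array, array-bytes, bytes-bytes) groups. A minimal
# sketch of the expected behavior, assuming the standard codecs shipped
# with zarr:

from zarr.codecs import BytesCodec, TransposeCodec, ZstdCodec
from zarr.core.codec_pipeline import codecs_from_list

aa, ab, bb = codecs_from_list((TransposeCodec(order=(1, 0)), BytesCodec(), ZstdCodec()))
assert aa == (TransposeCodec(order=(1, 0)),)  # array-array prefix
assert isinstance(ab, BytesCodec)  # exactly one array-bytes codec
assert bb == (ZstdCodec(),)  # bytes-bytes suffix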
diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 46216205f7..e3ca422697 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -17,12 +17,14 @@ ArrayBytesCodec, BytesBytesCodec, Codec, + CodecJSON, CodecJSON_V2, CodecPipeline, ) from zarr.abc.numcodec import Numcodec + from zarr.codecs._v2 import NumcodecsWrapper from zarr.core.buffer import Buffer, NDBuffer - from zarr.core.common import JSON + from zarr.core.common import JSON, ZarrFormat __all__ = [ "Registry", @@ -144,15 +146,18 @@ def register_buffer(cls: type[Buffer], qualname: str | None = None) -> None: __buffer_registry.register(cls, qualname) -def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: +def _get_codec_class( + key: str, registry: dict[str, Registry[Codec]], *, reload_config: bool = False +) -> type[Codec]: if reload_config: _reload_config() - if key in __codec_registries: + if key in registry: # logger.debug("Auto loading codec '%s' from entrypoint", codec_id) - __codec_registries[key].lazy_load() + registry[key].lazy_load() + + codec_classes = registry[key] - codec_classes = __codec_registries[key] if not codec_classes: raise KeyError(key) @@ -173,6 +178,42 @@ def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: raise KeyError(key) +def get_codec(request: CodecJSON, *, zarr_format: ZarrFormat) -> Codec | NumcodecsWrapper: + """ + Get an instance of a codec from a name and a configuration + """ + from zarr.codecs._v2 import NumcodecsWrapper + + codec_name: str + if zarr_format == 2: + if isinstance(request, str): + raise TypeError( + f"Invalid request type {type(request)} for zarr format 2. Expected dict, got {request!r}" + ) + else: + codec_name = request["id"] + elif zarr_format == 3: + if isinstance(request, str): + codec_name = request + else: + codec_name = request["name"] + else: + raise ValueError( + f"Invalid zarr format. Must be 2 or 3, got {zarr_format!r}" + ) # pragma: no cover + try: + codec_cls = get_codec_class(codec_name) + return codec_cls.from_json(request, zarr_format=zarr_format) + except KeyError: + # if we can't find the codec in the zarr python registry, try the numcodecs registry + codec = get_numcodec(request) + return NumcodecsWrapper(codec=codec) + + +def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: + return _get_codec_class(key, __codec_registries, reload_config=reload_config) + + def _resolve_codec(data: dict[str, JSON]) -> Codec: """ Get a codec instance from a dict representation of that codec. @@ -181,19 +222,28 @@ def _resolve_codec(data: dict[str, JSON]) -> Codec: return get_codec_class(data["name"]).from_dict(data) # type: ignore[arg-type] -def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec: +def _parse_bytes_bytes_codec( + data: dict[str, JSON] | Codec | Numcodec, *, zarr_format: ZarrFormat +) -> BytesBytesCodec: """ Normalize the input to a ``BytesBytesCodec`` instance. If the input is already a ``BytesBytesCodec``, it is returned as is. If the input is a dict, it is converted to a ``BytesBytesCodec`` instance via the ``_resolve_codec`` function. 
""" + # avoid circular import, AKA a sign that this function is in the wrong place from zarr.abc.codec import BytesBytesCodec + from zarr.codecs._v2 import Numcodec, NumcodecsBytesBytesCodec, NumcodecsWrapper + result: BytesBytesCodec if isinstance(data, dict): - result = _resolve_codec(data) + result = get_codec(data, zarr_format=zarr_format) + if isinstance(result, NumcodecsWrapper): + result = result.to_bytes_bytes() if not isinstance(result, BytesBytesCodec): msg = f"Expected a dict representation of a BytesBytesCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) + elif isinstance(data, Numcodec): + return NumcodecsBytesBytesCodec(codec=data) else: if not isinstance(data, BytesBytesCodec): raise TypeError(f"Expected a BytesBytesCodec. Got {type(data)} instead.") @@ -201,19 +251,26 @@ def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec: return result -def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec: +def _parse_array_bytes_codec( + data: dict[str, JSON] | Codec | Numcodec, *, zarr_format: ZarrFormat +) -> ArrayBytesCodec: """ Normalize the input to a ``ArrayBytesCodec`` instance. If the input is already a ``ArrayBytesCodec``, it is returned as is. If the input is a dict, it is converted to a ``ArrayBytesCodec`` instance via the ``_resolve_codec`` function. """ from zarr.abc.codec import ArrayBytesCodec + from zarr.codecs.numcodec import Numcodec, NumcodecsArrayBytesCodec, NumcodecsWrapper if isinstance(data, dict): - result = _resolve_codec(data) + result = get_codec(data, zarr_format=zarr_format) + if isinstance(result, NumcodecsWrapper): + result = result.to_array_bytes() if not isinstance(result, ArrayBytesCodec): msg = f"Expected a dict representation of a ArrayBytesCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) + elif isinstance(data, Numcodec): + return NumcodecsArrayBytesCodec(codec=data) else: if not isinstance(data, ArrayBytesCodec): raise TypeError(f"Expected a ArrayBytesCodec. Got {type(data)} instead.") @@ -221,19 +278,26 @@ def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec: return result -def _parse_array_array_codec(data: dict[str, JSON] | Codec) -> ArrayArrayCodec: +def _parse_array_array_codec( + data: dict[str, JSON] | Codec | Numcodec, *, zarr_format: ZarrFormat +) -> ArrayArrayCodec: """ Normalize the input to a ``ArrayArrayCodec`` instance. If the input is already a ``ArrayArrayCodec``, it is returned as is. If the input is a dict, it is converted to a ``ArrayArrayCodec`` instance via the ``_resolve_codec`` function. """ from zarr.abc.codec import ArrayArrayCodec + from zarr.codecs.numcodec import Numcodec, NumcodecsArrayArrayCodec, NumcodecsWrapper if isinstance(data, dict): - result = _resolve_codec(data) - if not isinstance(result, ArrayArrayCodec): + result = get_codec(data, zarr_format=zarr_format) + if isinstance(result, NumcodecsWrapper): + result = result.to_array_array() + elif not isinstance(result, ArrayArrayCodec): msg = f"Expected a dict representation of a ArrayArrayCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) + elif isinstance(data, Numcodec): + return NumcodecsArrayArrayCodec(codec=data) else: if not isinstance(data, ArrayArrayCodec): raise TypeError(f"Expected a ArrayArrayCodec. 
Got {type(data)} instead.") diff --git a/tests/test_api.py b/tests/test_api.py index 69fc9b5b16..4420606f39 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1299,17 +1299,6 @@ def test_v2_without_compressor() -> None: assert arr.compressors == () -def test_v2_with_v3_compressor() -> None: - # Check trying to create a v2 array with a v3 compressor fails - with pytest.raises( - ValueError, - match="Cannot use a BytesBytesCodec as a compressor for zarr v2 arrays. Use a numcodecs codec directly instead.", - ): - zarr.create( - store={}, shape=(1), dtype="uint8", zarr_format=2, compressor=zarr.codecs.BloscCodec() - ) - - def add_empty_file(path: Path) -> Path: fpath = path / "a.txt" fpath.touch() diff --git a/tests/test_array.py b/tests/test_array.py index a316ee127f..6e6d7eca7e 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1137,8 +1137,8 @@ def test_dtype_roundtrip( (ZstdCodec(level=3),), (ZstdCodec(level=3), GzipCodec(level=0)), ZstdCodec(level=3), - {"name": "zstd", "configuration": {"level": 3}}, - ({"name": "zstd", "configuration": {"level": 3}},), + {"name": "zstd", "configuration": {"level": 3, "checksum": True}}, + ({"name": "zstd", "configuration": {"level": 3, "checksum": True}},), ], ) @pytest.mark.parametrize( diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index 6e6e9df383..296889a55a 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -8,10 +8,50 @@ import zarr from zarr.abc.store import Store from zarr.codecs import BloscCodec +from zarr.codecs.blosc import ( + BLOSC_CNAME, + BLOSC_SHUFFLE, + BloscCname, + BloscJSON_V2, + BloscJSON_V3, + BloscShuffle, +) from zarr.core.buffer import default_buffer_prototype from zarr.storage import StorePath +@pytest.mark.parametrize("shuffle", BLOSC_SHUFFLE) +@pytest.mark.parametrize("cname", BLOSC_CNAME) +@pytest.mark.parametrize("clevel", [1, 2]) +@pytest.mark.parametrize("blocksize", [1, 2]) +@pytest.mark.parametrize("typesize", [1, 2]) +def test_to_json_v2( + cname: BloscCname, shuffle: BloscShuffle, clevel: int, blocksize: int, typesize: int +) -> None: + codec = BloscCodec( + shuffle=shuffle, cname=cname, clevel=clevel, blocksize=blocksize, typesize=typesize + ) + expected_v2: BloscJSON_V2 = { + "id": "blosc", + "cname": cname, + "clevel": clevel, + "shuffle": BLOSC_SHUFFLE.index(shuffle), + "blocksize": blocksize, + } + expected_v3: BloscJSON_V3 = { + "name": "blosc", + "configuration": { + "cname": cname, + "clevel": clevel, + "shuffle": shuffle, + "blocksize": blocksize, + "typesize": typesize, + }, + } + assert codec.to_json(zarr_format=2) == expected_v2 + assert codec.to_json(zarr_format=3) == expected_v3 + + @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) @pytest.mark.parametrize("dtype", ["uint8", "uint16"]) async def test_blosc_evolve(store: Store, dtype: str) -> None: diff --git a/tests/test_codecs/test_endian.py b/tests/test_codecs/test_bytes.py similarity index 76% rename from tests/test_codecs/test_endian.py rename to tests/test_codecs/test_bytes.py index ab64afb1b8..e991885fa6 100644 --- a/tests/test_codecs/test_endian.py +++ b/tests/test_codecs/test_bytes.py @@ -1,4 +1,4 @@ -from typing import Literal +from typing import TYPE_CHECKING, Literal import numpy as np import pytest @@ -10,6 +10,26 @@ from .test_codecs import _AsyncArrayProxy +if TYPE_CHECKING: + from zarr.codecs.bytes import BytesJSON_V2, BytesJSON_V3 + + +@pytest.mark.parametrize("endian", ["big", "little"]) +def test_bytescodec_to_json(endian: 
Literal["big", "little"]) -> None: + codec = BytesCodec(endian=endian) + expected_v2: BytesJSON_V2 = { + "id": "bytes", + "endian": endian, + } + expected_v3: BytesJSON_V3 = { + "name": "bytes", + "configuration": { + "endian": endian, + }, + } + assert codec.to_json(zarr_format=2) == expected_v2 + assert codec.to_json(zarr_format=3) == expected_v3 + @pytest.mark.filterwarnings("ignore:The endianness of the requested serializer") @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) diff --git a/tests/test_codecs/test_gzip.py b/tests/test_codecs/test_gzip.py index 4753036c87..db146c04de 100644 --- a/tests/test_codecs/test_gzip.py +++ b/tests/test_codecs/test_gzip.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING + import numpy as np import pytest @@ -6,6 +8,26 @@ from zarr.codecs import GzipCodec from zarr.storage import StorePath +if TYPE_CHECKING: + from zarr.codecs.gzip import GZipJSON_V2, GZipJSON_V3 + + +@pytest.mark.parametrize("level", [1, 5, 9]) +def test_json(level: int) -> None: + codec = GzipCodec(level=level) + expected_v2: GZipJSON_V2 = { + "id": "gzip", + "level": level, + } + expected_v3: GZipJSON_V3 = { + "name": "gzip", + "configuration": { + "level": level, + }, + } + assert codec.to_json(zarr_format=2) == expected_v2 + assert codec.to_json(zarr_format=3) == expected_v3 + @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) def test_gzip(store: Store) -> None: diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index eb80545ff3..ba25b07da4 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -1,5 +1,6 @@ import pickle -from typing import Any +from codecs import Codec +from typing import TYPE_CHECKING, Any, Literal import numpy as np import numpy.typing as npt @@ -16,6 +17,8 @@ ShardingCodecIndexLocation, TransposeCodec, ) +from zarr.codecs.bytes import BytesCodec +from zarr.codecs.crc32c_ import Crc32cCodec from zarr.core.buffer import NDArrayLike, default_buffer_prototype from zarr.errors import ZarrUserWarning from zarr.storage import StorePath, ZipStore @@ -23,6 +26,45 @@ from ..conftest import ArrayRequest from .test_codecs import _AsyncArrayProxy, order_from_dim +if TYPE_CHECKING: + from zarr.codecs.sharding import ShardingJSON_V2, ShardingJSON_V3 + + +@pytest.mark.parametrize("index_location", ["start", "end"]) +@pytest.mark.parametrize("chunk_shape", [(32, 32), (64, 64)]) +@pytest.mark.parametrize("codecs", [(BytesCodec(),)]) +@pytest.mark.parametrize("index_codecs", [(Crc32cCodec(),)]) +def test_sharding_codec_to_json( + index_location: Literal["start", "end"], + chunk_shape: tuple[int, ...], + codecs: tuple[Codec, ...], + index_codecs: tuple[Codec, ...], +) -> None: + codec = ShardingCodec( + chunk_shape=chunk_shape, + codecs=codecs, + index_location=index_location, + index_codecs=index_codecs, + ) + expected_v2: ShardingJSON_V2 = { + "id": "sharding_indexed", + "chunk_shape": chunk_shape, + "codecs": tuple(c.to_json(zarr_format=2) for c in codecs), + "index_codecs": tuple(c.to_json(zarr_format=2) for c in index_codecs), + "index_location": index_location, + } + expected_v3: ShardingJSON_V3 = { + "name": "sharding_indexed", + "configuration": { + "chunk_shape": chunk_shape, + "codecs": tuple(c.to_json(zarr_format=3) for c in codecs), + "index_codecs": tuple(c.to_json(zarr_format=3) for c in index_codecs), + "index_location": index_location, + }, + } + assert codec.to_json(zarr_format=2) == expected_v2 + assert codec.to_json(zarr_format=3) == 
expected_v3 + @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) @pytest.mark.parametrize("index_location", ["start", "end"]) diff --git a/tests/test_codecs/test_transpose.py b/tests/test_codecs/test_transpose.py index 06ec668ad3..0d48d3fddf 100644 --- a/tests/test_codecs/test_transpose.py +++ b/tests/test_codecs/test_transpose.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING + import numpy as np import pytest @@ -10,6 +12,21 @@ from .test_codecs import _AsyncArrayProxy +if TYPE_CHECKING: + from zarr.codecs.transpose import TransposeJSON_V2, TransposeJSON_V3 + + +@pytest.mark.parametrize("order", [(1, 2, 3), (2, 1, 0)]) +def test_transpose_to_json(order: tuple[int, ...]) -> None: + codec = TransposeCodec(order=order) + expected_v2: TransposeJSON_V2 = {"id": "transpose", "order": order} + expected_v3: TransposeJSON_V3 = { + "name": "transpose", + "configuration": {"order": order}, + } + assert codec.to_json(zarr_format=2) == expected_v2 + assert codec.to_json(zarr_format=3) == expected_v3 + @pytest.mark.parametrize("input_order", ["F", "C"]) @pytest.mark.parametrize("runtime_write_order", ["F", "C"]) diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index cf0905daca..50c69b6ae2 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -8,6 +8,7 @@ from zarr.abc.codec import Codec from zarr.abc.store import Store from zarr.codecs import ZstdCodec +from zarr.codecs.vlen_utf8 import VLenUTF8Codec, VLenUTF8JSON_V2, VLenUTF8JSON_V3 from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING from zarr.core.metadata.v3 import ArrayV3Metadata @@ -22,6 +23,16 @@ expected_array_string_dtype = np.dtype("O") +def test_vlen_utf8_to_json() -> None: + codec = VLenUTF8Codec() + expected_v2: VLenUTF8JSON_V2 = {"id": "vlen-utf8"} + expected_v3: VLenUTF8JSON_V3 = { + "name": "vlen-utf8", + } + assert codec.to_json(zarr_format=2) == expected_v2 + assert codec.to_json(zarr_format=3) == expected_v3 + + @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) @pytest.mark.parametrize("dtype", numpy_str_dtypes) diff --git a/tests/test_codecs/test_zstd.py b/tests/test_codecs/test_zstd.py index 6068f53443..7c18a25103 100644 --- a/tests/test_codecs/test_zstd.py +++ b/tests/test_codecs/test_zstd.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING + import numpy as np import pytest @@ -6,6 +8,25 @@ from zarr.codecs import ZstdCodec from zarr.storage import StorePath +if TYPE_CHECKING: + from zarr.codecs.zstd import ZstdJSON_V2, ZstdJSON_V3 + + +@pytest.mark.parametrize("level", [1, 5, 9]) +@pytest.mark.parametrize("checksum", [True, False]) +def test_json(level: int, checksum: bool) -> None: + codec = ZstdCodec(level=level, checksum=checksum) + expected_v2: ZstdJSON_V2 = { + "id": "zstd", + "level": level, + } + expected_v3: ZstdJSON_V3 = { + "name": "zstd", + "configuration": {"level": level, "checksum": checksum}, + } + assert codec.to_json(zarr_format=2) == expected_v2 + assert codec.to_json(zarr_format=3) == expected_v3 + @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) @pytest.mark.parametrize("checksum", [True, False]) diff --git a/tests/test_config.py b/tests/test_config.py index 0c029dda3a..e740fb84ce 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -181,7 +181,18 @@ async def _encode_single(self, chunk_bytes: 
Buffer, chunk_spec: ArraySpec) -> Bu chunks=(10,), zarr_format=3, dtype="i4", - compressors=[{"name": "blosc", "configuration": {}}], + compressors=[ + { + "name": "blosc", + "configuration": { + "cname": "lz4", + "clevel": 1, + "shuffle": "noshuffle", + "blocksize": 1, + "typesize": 1, + }, + }, + ], ) arr[:] = range(100) _mock.call.assert_called() diff --git a/tests/test_group.py b/tests/test_group.py index e5cfe82daa..bc6398761a 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -12,7 +12,6 @@ import numpy as np import pytest -from numcodecs import Blosc import zarr import zarr.api.asynchronous @@ -22,6 +21,7 @@ from zarr.abc.store import Store from zarr.core import sync_group from zarr.core._info import GroupInfo +from zarr.core.array import default_compressor_v2, default_compressors_v3, default_serializer_v3 from zarr.core.buffer import default_buffer_prototype from zarr.core.config import config as zarr_config from zarr.core.dtype.common import unpack_dtype_json @@ -597,7 +597,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat "chunks": (1,), "order": "C", "filters": None, - "compressor": Blosc(), + "compressor": default_compressor_v2(dtype).to_json(zarr_format=zarr_format), "zarr_format": zarr_format, }, "subgroup": { @@ -624,8 +624,11 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat "name": "default", }, "codecs": ( - {"configuration": {"endian": "little"}, "name": "bytes"}, - {"configuration": {}, "name": "zstd"}, + default_serializer_v3(dtype).to_json(zarr_format=zarr_format), + *[ + c.to_json(zarr_format=zarr_format) + for c in default_compressors_v3(dtype) + ], ), "data_type": unpack_dtype_json(dtype.to_json(zarr_format=zarr_format)), "fill_value": fill_value, diff --git a/tests/test_info.py b/tests/test_info.py index 28c8803c83..08f2318dc2 100644 --- a/tests/test_info.py +++ b/tests/test_info.py @@ -74,7 +74,7 @@ def test_array_info(zarr_format: ZarrFormat) -> None: Read-only : True Store type : MemoryStore Filters : () - Serializer : BytesCodec(endian=) + Serializer : BytesCodec(endian='little') Compressors : ()""") @@ -117,7 +117,7 @@ def test_array_info_complete( Read-only : True Store type : MemoryStore Filters : () - Serializer : BytesCodec(endian=) + Serializer : BytesCodec(endian='little') Compressors : () No. bytes : {count_bytes} ({count_bytes_formatted}) No. bytes stored : {count_bytes_stored} ({count_bytes_stored_formatted})
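# --- Editor's sketch (illustrative, not part of the patch) ---
# Taken together, the registry and codec changes above let codec JSON
# round-trip through get_codec for both Zarr formats. A minimal sketch,
# assuming the gzip codec implements from_json/to_json as exercised by the
# tests above:

from zarr.registry import get_codec

codec_v2 = get_codec({"id": "gzip", "level": 1}, zarr_format=2)
assert codec_v2.to_json(zarr_format=2) == {"id": "gzip", "level": 1}

codec_v3 = get_codec({"name": "gzip", "configuration": {"level": 1}}, zarr_format=3)
assert codec_v3.to_json(zarr_format=3) == {"name": "gzip", "configuration": {"level": 1}}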