From a36726846bf569163d17ace65e85e5db2dd5d622 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 31 Jul 2025 15:44:18 +0200 Subject: [PATCH 01/29] add numcodec protocol --- src/zarr/abc/codec.py | 28 ++++++++++++- src/zarr/api/asynchronous.py | 5 +-- src/zarr/api/synchronous.py | 4 +- src/zarr/codecs/_numcodecs.py | 37 ++++++++++++++++ src/zarr/codecs/_v2.py | 77 ++++++++++++++++++++++++++++------ src/zarr/core/array.py | 16 +++---- tests/test_api.py | 2 +- tests/test_array.py | 2 +- tests/test_codecs/test_vlen.py | 2 +- 9 files changed, 142 insertions(+), 31 deletions(-) create mode 100644 src/zarr/codecs/_numcodecs.py diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index f8a5447a70..d5c995d2ca 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -1,11 +1,14 @@ from __future__ import annotations from abc import abstractmethod -from typing import TYPE_CHECKING, Generic, TypeVar +from collections.abc import Mapping +from typing import TYPE_CHECKING, Generic, TypeGuard, TypeVar + +from typing_extensions import ReadOnly, TypedDict from zarr.abc.metadata import Metadata from zarr.core.buffer import Buffer, NDBuffer -from zarr.core.common import ChunkCoords, concurrent_map +from zarr.core.common import ChunkCoords, NamedConfig, concurrent_map from zarr.core.config import config if TYPE_CHECKING: @@ -34,6 +37,27 @@ CodecInput = TypeVar("CodecInput", bound=NDBuffer | Buffer) CodecOutput = TypeVar("CodecOutput", bound=NDBuffer | Buffer) +TName = TypeVar("TName", bound=str, covariant=True) + + +class CodecJSON_V2(TypedDict, Generic[TName]): + """The JSON representation of a codec for Zarr V2""" + + id: ReadOnly[TName] + + +def _check_codecjson_v2(data: object) -> TypeGuard[CodecJSON_V2[str]]: + return isinstance(data, Mapping) and "id" in data and isinstance(data["id"], str) + + +CodecJSON_V3 = str | NamedConfig[str, Mapping[str, object]] +"""The JSON representation of a codec for Zarr V3.""" + +# The widest type we will *accept* for a 
codec JSON +# This covers v2 and v3 +CodecJSON = str | Mapping[str, object] +"""The widest type of JSON-like input that could specify a codec.""" + class BaseCodec(Metadata, Generic[CodecInput, CodecOutput]): """Generic base class for codecs. diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 9a380082b0..044d881c22 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -46,9 +46,8 @@ if TYPE_CHECKING: from collections.abc import Iterable - import numcodecs.abc - from zarr.abc.codec import Codec + from zarr.codecs._v2 import Numcodec from zarr.core.buffer import NDArrayLikeOrScalar from zarr.core.chunk_key_encodings import ChunkKeyEncoding from zarr.storage import StoreLike @@ -871,7 +870,7 @@ async def create( overwrite: bool = False, path: PathLike | None = None, chunk_store: StoreLike | None = None, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[dict[str, JSON] | Numcodec] | None = None, cache_metadata: bool | None = None, cache_attrs: bool | None = None, read_only: bool | None = None, diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 4ce02e7b6d..df667c0b23 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -14,12 +14,12 @@ if TYPE_CHECKING: from collections.abc import Iterable - import numcodecs.abc import numpy as np import numpy.typing as npt from zarr.abc.codec import Codec from zarr.api.asynchronous import ArrayLike, PathLike + from zarr.codecs._v2 import Numcodec from zarr.core.array import ( CompressorsLike, FiltersLike, @@ -609,7 +609,7 @@ def create( overwrite: bool = False, path: PathLike | None = None, chunk_store: StoreLike | None = None, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[dict[str, JSON] | Numcodec] | None = None, cache_metadata: bool | None = None, cache_attrs: bool | None = None, read_only: bool | None = None, diff --git 
a/src/zarr/codecs/_numcodecs.py b/src/zarr/codecs/_numcodecs.py new file mode 100644 index 0000000000..4a8f43b5c6 --- /dev/null +++ b/src/zarr/codecs/_numcodecs.py @@ -0,0 +1,37 @@ +import numcodecs.registry as numcodecs_registry + +from zarr.abc.codec import CodecJSON_V2 +from zarr.codecs._v2 import Numcodec + + +def get_numcodec(data: CodecJSON_V2[str]) -> Numcodec: + """ + Resolve a numcodec codec from the numcodecs registry. + + This requires the Numcodecs package to be installed. + + Parameters + ---------- + data : CodecJSON_V2 + The JSON metadata for the codec. + + Returns + ------- + codec : Numcodec + + Examples + -------- + + >>> codec = get_numcodec({'id': 'zlib', 'level': 1}) + >>> codec + Zlib(level=1) + """ + + codec_id = data["id"] + cls = numcodecs_registry.codec_registry.get(codec_id) + if cls is None and data in numcodecs_registry.entries: + cls = numcodecs_registry.entries[data].load() + numcodecs_registry.register_codec(cls, codec_id=data) + if cls is not None: + return cls.from_config({k: v for k, v in data.items() if k != "id"}) # type: ignore[no-any-return] + raise KeyError(data) diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 08853f27f1..8deae99a6d 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -2,26 +2,77 @@ import asyncio from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, ClassVar, Self, TypeGuard -import numcodecs import numpy as np from numcodecs.compat import ensure_bytes, ensure_ndarray_like +from typing_extensions import Protocol -from zarr.abc.codec import ArrayBytesCodec +from zarr.abc.codec import ArrayBytesCodec, CodecJSON_V2 from zarr.registry import get_ndbuffer_class if TYPE_CHECKING: - import numcodecs.abc - from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, NDBuffer +class Numcodec(Protocol): + """ + A protocol that models the ``numcodecs.abc.Codec`` interface. 
+ """ + + codec_id: ClassVar[str] + + def encode(self, buf: Buffer | NDBuffer) -> Buffer | NDBuffer: ... + + def decode( + self, buf: Buffer | NDBuffer, out: Buffer | NDBuffer | None = None + ) -> Buffer | NDBuffer: ... + + def get_config(self) -> CodecJSON_V2[str]: ... + + @classmethod + def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... + + +def _is_numcodec(obj: object) -> TypeGuard[Numcodec]: + """ + Check if the given object implements the Numcodec protocol. + + The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method + members (i.e., attributes), so we use this function to manually check for the presence of the + required attributes and methods on a given object. + """ + return _is_numcodec_cls(type(obj)) + + +def _is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: + """ + Check if the given object is a class implements the Numcodec protocol. + + The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method + members (i.e., attributes), so we use this function to manually check for the presence of the + required attributes and methods on a given object. + """ + return ( + isinstance(obj, type) + and hasattr(obj, "codec_id") + and isinstance(obj.codec_id, str) + and hasattr(obj, "encode") + and callable(obj.encode) + and hasattr(obj, "decode") + and callable(obj.decode) + and hasattr(obj, "get_config") + and callable(obj.get_config) + and hasattr(obj, "from_config") + and callable(obj.from_config) + ) + + @dataclass(frozen=True) class V2Codec(ArrayBytesCodec): - filters: tuple[numcodecs.abc.Codec, ...] | None - compressor: numcodecs.abc.Codec | None + filters: tuple[Numcodec, ...] 
| None + compressor: Numcodec | None is_fixed_size = False @@ -33,9 +84,9 @@ async def _decode_single( cdata = chunk_bytes.as_array_like() # decompress if self.compressor: - chunk = await asyncio.to_thread(self.compressor.decode, cdata) + chunk = await asyncio.to_thread(self.compressor.decode, cdata) # type: ignore[arg-type] else: - chunk = cdata + chunk = cdata # type: ignore[assignment] # apply filters if self.filters: @@ -56,7 +107,7 @@ async def _decode_single( # is an object array. In this case, we need to convert the object # array to the correct dtype. - chunk = np.array(chunk).astype(chunk_spec.dtype.to_native_dtype()) + chunk = np.array(chunk).astype(chunk_spec.dtype.to_native_dtype()) # type: ignore[assignment] elif chunk.dtype != object: # If we end up here, someone must have hacked around with the filters. @@ -85,7 +136,7 @@ async def _encode_single( # apply filters if self.filters: for f in self.filters: - chunk = await asyncio.to_thread(f.encode, chunk) + chunk = await asyncio.to_thread(f.encode, chunk) # type: ignore[arg-type] # check object encoding if ensure_ndarray_like(chunk).dtype == object: @@ -93,9 +144,9 @@ async def _encode_single( # compress if self.compressor: - cdata = await asyncio.to_thread(self.compressor.encode, chunk) + cdata = await asyncio.to_thread(self.compressor.encode, chunk) # type: ignore[arg-type] else: - cdata = chunk + cdata = chunk # type: ignore[assignment] cdata = ensure_bytes(cdata) return chunk_spec.prototype.buffer.from_bytes(cdata) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 260e94bc88..68a4694a55 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -27,7 +27,7 @@ import zarr from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.store import Store, set_or_delete -from zarr.codecs._v2 import V2Codec +from zarr.codecs._v2 import Numcodec, V2Codec from zarr.codecs.bytes import BytesCodec from zarr.codecs.vlen_utf8 import VLenBytesCodec, 
VLenUTF8Codec from zarr.codecs.zstd import ZstdCodec @@ -607,7 +607,7 @@ async def _create( chunks: ShapeLike | None = None, dimension_separator: Literal[".", "/"] | None = None, order: MemoryOrder | None = None, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[dict[str, JSON] | Numcodec] | None = None, compressor: CompressorLike = "auto", # runtime overwrite: bool = False, @@ -818,7 +818,7 @@ def _create_metadata_v2( order: MemoryOrder, dimension_separator: Literal[".", "/"] | None = None, fill_value: Any | None = DEFAULT_FILL_VALUE, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[dict[str, JSON] | Numcodec] | None = None, compressor: CompressorLikev2 = None, attributes: dict[str, JSON] | None = None, ) -> ArrayV2Metadata: @@ -856,7 +856,7 @@ async def _create_v2( config: ArrayConfig, dimension_separator: Literal[".", "/"] | None = None, fill_value: Any | None = DEFAULT_FILL_VALUE, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[dict[str, JSON] | Numcodec] | None = None, compressor: CompressorLike = "auto", attributes: dict[str, JSON] | None = None, overwrite: bool = False, @@ -3898,7 +3898,7 @@ def _build_parents( FiltersLike: TypeAlias = ( - Iterable[dict[str, JSON] | ArrayArrayCodec | numcodecs.abc.Codec] + Iterable[dict[str, JSON] | ArrayArrayCodec | Numcodec] | ArrayArrayCodec | Iterable[numcodecs.abc.Codec] | numcodecs.abc.Codec @@ -3911,10 +3911,10 @@ def _build_parents( ) CompressorsLike: TypeAlias = ( - Iterable[dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec] + Iterable[dict[str, JSON] | BytesBytesCodec | Numcodec] | dict[str, JSON] | BytesBytesCodec - | numcodecs.abc.Codec + | Numcodec | Literal["auto"] | None ) @@ -4944,7 +4944,7 @@ def _parse_deprecated_compressor( # "no compression" compressors = () else: - compressors = (compressor,) + compressors = (compressor,) # type: ignore[assignment] elif zarr_format 
== 2 and compressor == compressors == "auto": compressors = ({"id": "blosc"},) return compressors diff --git a/tests/test_api.py b/tests/test_api.py index 01fb40f050..b245685e30 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1282,7 +1282,7 @@ def test_gpu_basic(store: Store, zarr_format: ZarrFormat | None) -> None: dtype=src.dtype, overwrite=True, zarr_format=zarr_format, - compressors=compressors, + compressors=compressors, # type: ignore[arg-type] ) z[:10, :10] = src[:10, :10] diff --git a/tests/test_array.py b/tests/test_array.py index f672006f9a..6342ce6430 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1684,7 +1684,7 @@ def test_roundtrip_numcodecs() -> None: shape=(720, 1440), chunks=(720, 1440), dtype="float64", - compressors=compressors, + compressors=compressors, # type: ignore[arg-type] filters=filters, fill_value=-9.99, dimension_names=["lat", "lon"], diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index 6fe1863464..cf0905daca 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -40,7 +40,7 @@ def test_vlen_string( chunks=data.shape, dtype=data.dtype, fill_value="", - compressors=compressor, + compressors=compressor, # type: ignore[arg-type] ) assert isinstance(a.metadata, ArrayV3Metadata) # needed for mypy From 1d424c0ba32bb535d9f787435980e6e87bc802c4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 31 Jul 2025 23:25:36 +0200 Subject: [PATCH 02/29] add tests for numcodecs compatibility --- tests/test_codecs/test_numcodecs.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 tests/test_codecs/test_numcodecs.py diff --git a/tests/test_codecs/test_numcodecs.py b/tests/test_codecs/test_numcodecs.py new file mode 100644 index 0000000000..a3824cc386 --- /dev/null +++ b/tests/test_codecs/test_numcodecs.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +from numcodecs import GZip + +from zarr.codecs._numcodecs import 
get_numcodec +from zarr.codecs._v2 import _is_numcodec, _is_numcodec_cls + + +def test_get_numcodec() -> None: + assert get_numcodec({"id": "gzip", "level": 2}) == GZip(level=2) + + +def test_is_numcodec() -> None: + """ + Test the _is_numcodec function + """ + assert _is_numcodec(GZip()) + + +def test_is_numcodec_cls() -> None: + """ + Test the _is_numcodec_cls function + """ + assert _is_numcodec_cls(GZip) From 41dd6ff53664f7b33db24e2a56562a9ddc53484b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 31 Jul 2025 23:32:31 +0200 Subject: [PATCH 03/29] changelog --- changes/3318.misc.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 changes/3318.misc.rst diff --git a/changes/3318.misc.rst b/changes/3318.misc.rst new file mode 100644 index 0000000000..f8308e6b97 --- /dev/null +++ b/changes/3318.misc.rst @@ -0,0 +1,2 @@ +Define a ``Protocol`` to model the ``numcodecs.abc.Codec`` interface. This is groundwork toward +making ``numcodecs`` an optional dependency for ``zarr-python``. 
\ No newline at end of file From c435a59728b6671250f14e4b512527bcbf329ea6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 31 Jul 2025 23:43:41 +0200 Subject: [PATCH 04/29] ignore unknown key --- tests/test_codecs/test_numcodecs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_codecs/test_numcodecs.py b/tests/test_codecs/test_numcodecs.py index a3824cc386..bb381c615a 100644 --- a/tests/test_codecs/test_numcodecs.py +++ b/tests/test_codecs/test_numcodecs.py @@ -7,7 +7,7 @@ def test_get_numcodec() -> None: - assert get_numcodec({"id": "gzip", "level": 2}) == GZip(level=2) + assert get_numcodec({"id": "gzip", "level": 2}) == GZip(level=2) # type: ignore[typeddict-unknown-key] def test_is_numcodec() -> None: From 8e50ef8606bc0ada69127841befe179717d2a0c4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 1 Aug 2025 11:05:43 +0200 Subject: [PATCH 05/29] remove re-implementation of get_codec --- src/zarr/codecs/_numcodecs.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/zarr/codecs/_numcodecs.py b/src/zarr/codecs/_numcodecs.py index 4a8f43b5c6..b00f258db5 100644 --- a/src/zarr/codecs/_numcodecs.py +++ b/src/zarr/codecs/_numcodecs.py @@ -1,5 +1,3 @@ -import numcodecs.registry as numcodecs_registry - from zarr.abc.codec import CodecJSON_V2 from zarr.codecs._v2 import Numcodec @@ -22,16 +20,11 @@ def get_numcodec(data: CodecJSON_V2[str]) -> Numcodec: Examples -------- - >>> codec = get_numcodec({'id': 'zlib', 'level': 1}) + >>> codec = get_codec({'id': 'zlib', 'level': 1}) >>> codec Zlib(level=1) """ - codec_id = data["id"] - cls = numcodecs_registry.codec_registry.get(codec_id) - if cls is None and data in numcodecs_registry.entries: - cls = numcodecs_registry.entries[data].load() - numcodecs_registry.register_codec(cls, codec_id=data) - if cls is not None: - return cls.from_config({k: v for k, v in data.items() if k != "id"}) # type: ignore[no-any-return] - raise KeyError(data) + 
from numcodecs.registry import get_codec + + return get_codec(data) # type: ignore[no-any-return] From 84c9780616399512412c838ac54d8d68dcb65216 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Aug 2025 16:15:37 +0200 Subject: [PATCH 06/29] avoid circular imports by importing lower-level routines exactly where needed --- src/zarr/abc/store.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index 1fbdb3146c..31e9728f8a 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -6,10 +6,6 @@ from itertools import starmap from typing import TYPE_CHECKING, Protocol, runtime_checkable -from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.common import concurrent_map -from zarr.core.config import config - if TYPE_CHECKING: from collections.abc import AsyncGenerator, AsyncIterator, Iterable from types import TracebackType @@ -438,6 +434,8 @@ async def getsize(self, key: str) -> int: # Note to implementers: this default implementation is very inefficient since # it requires reading the entire object. Many systems will have ways to get the # size of an object without reading it. + from zarr.core.buffer.core import default_buffer_prototype + value = await self.get(key, prototype=default_buffer_prototype()) if value is None: raise FileNotFoundError(key) @@ -476,6 +474,9 @@ async def getsize_prefix(self, prefix: str) -> int: # on to getting sizes. Ideally we would overlap those two, which should # improve tail latency and might reduce memory pressure (since not all keys # would be in memory at once). 
+ from zarr.core.common import concurrent_map + from zarr.core.config import config + keys = [(x,) async for x in self.list_prefix(prefix)] limit = config.get("async.concurrency") sizes = await concurrent_map(keys, self.getsize, limit=limit) From 9a2f35ba7efd639caa9bd7d92414bfa82f7b509b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Aug 2025 16:16:26 +0200 Subject: [PATCH 07/29] push numcodec protocol into abcs; remove all numcodecs.abc.Codec type annotations --- src/zarr/abc/numcodec.py | 59 +++++++++++++++++++ src/zarr/api/asynchronous.py | 2 +- src/zarr/api/synchronous.py | 2 +- src/zarr/codecs/_numcodecs.py | 30 ---------- src/zarr/codecs/_v2.py | 59 +------------------ src/zarr/core/_info.py | 7 +-- src/zarr/core/array.py | 47 +++++++-------- src/zarr/core/metadata/v2.py | 39 ++++++------ src/zarr/registry.py | 30 ++++++++++ tests/test_array.py | 18 +++--- tests/test_codecs/test_numcodecs.py | 4 +- .../test_v2_dtype_regression.py | 8 +-- 12 files changed, 154 insertions(+), 151 deletions(-) create mode 100644 src/zarr/abc/numcodec.py delete mode 100644 src/zarr/codecs/_numcodecs.py diff --git a/src/zarr/abc/numcodec.py b/src/zarr/abc/numcodec.py new file mode 100644 index 0000000000..d2c9380146 --- /dev/null +++ b/src/zarr/abc/numcodec.py @@ -0,0 +1,59 @@ +from typing import Self, TypeGuard + +from typing_extensions import Protocol + +from zarr.abc.codec import CodecJSON_V2 +from zarr.core.buffer import Buffer, NDBuffer + + +class Numcodec(Protocol): + """ + A protocol that models the ``numcodecs.abc.Codec`` interface. + """ + + codec_id: str + + def encode(self, buf: Buffer | NDBuffer) -> Buffer | NDBuffer: ... + + def decode( + self, buf: Buffer | NDBuffer, out: Buffer | NDBuffer | None = None + ) -> Buffer | NDBuffer: ... + + def get_config(self) -> CodecJSON_V2[str]: ... + + @classmethod + def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... 
+ + +def _is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: + """ + Check if the given object is a class implements the Numcodec protocol. + + The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method + members (i.e., attributes), so we use this function to manually check for the presence of the + required attributes and methods on a given object. + """ + return ( + isinstance(obj, type) + and hasattr(obj, "codec_id") + and isinstance(obj.codec_id, str) + and hasattr(obj, "encode") + and callable(obj.encode) + and hasattr(obj, "decode") + and callable(obj.decode) + and hasattr(obj, "get_config") + and callable(obj.get_config) + and hasattr(obj, "from_config") + and callable(obj.from_config) + ) + + +def _is_numcodec(obj: object) -> TypeGuard[Numcodec]: + """ + Check if the given object implements the Numcodec protocol. + + The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method + members (i.e., attributes), so we use this function to manually check for the presence of the + required attributes and methods on a given object. 
+ """ + return _is_numcodec_cls(type(obj)) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index d8d3a5f21d..9137d6395c 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -47,7 +47,7 @@ from collections.abc import Iterable from zarr.abc.codec import Codec - from zarr.codecs._v2 import Numcodec + from zarr.abc.numcodec import Numcodec from zarr.core.buffer import NDArrayLikeOrScalar from zarr.core.chunk_key_encodings import ChunkKeyEncoding from zarr.storage import StoreLike diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index df667c0b23..a368d37a5b 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -18,8 +18,8 @@ import numpy.typing as npt from zarr.abc.codec import Codec + from zarr.abc.numcodec import Numcodec from zarr.api.asynchronous import ArrayLike, PathLike - from zarr.codecs._v2 import Numcodec from zarr.core.array import ( CompressorsLike, FiltersLike, diff --git a/src/zarr/codecs/_numcodecs.py b/src/zarr/codecs/_numcodecs.py deleted file mode 100644 index b00f258db5..0000000000 --- a/src/zarr/codecs/_numcodecs.py +++ /dev/null @@ -1,30 +0,0 @@ -from zarr.abc.codec import CodecJSON_V2 -from zarr.codecs._v2 import Numcodec - - -def get_numcodec(data: CodecJSON_V2[str]) -> Numcodec: - """ - Resolve a numcodec codec from the numcodecs registry. - - This requires the Numcodecs package to be installed. - - Parameters - ---------- - data : CodecJSON_V2 - The JSON metadata for the codec. 
- - Returns - ------- - codec : Numcodec - - Examples - -------- - - >>> codec = get_codec({'id': 'zlib', 'level': 1}) - >>> codec - Zlib(level=1) - """ - - from numcodecs.registry import get_codec - - return get_codec(data) # type: ignore[no-any-return] diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 8deae99a6d..92eda38226 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -2,73 +2,20 @@ import asyncio from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Self, TypeGuard +from typing import TYPE_CHECKING import numpy as np from numcodecs.compat import ensure_bytes, ensure_ndarray_like -from typing_extensions import Protocol -from zarr.abc.codec import ArrayBytesCodec, CodecJSON_V2 +from zarr.abc.codec import ArrayBytesCodec from zarr.registry import get_ndbuffer_class if TYPE_CHECKING: + from zarr.abc.numcodec import Numcodec from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, NDBuffer -class Numcodec(Protocol): - """ - A protocol that models the ``numcodecs.abc.Codec`` interface. - """ - - codec_id: ClassVar[str] - - def encode(self, buf: Buffer | NDBuffer) -> Buffer | NDBuffer: ... - - def decode( - self, buf: Buffer | NDBuffer, out: Buffer | NDBuffer | None = None - ) -> Buffer | NDBuffer: ... - - def get_config(self) -> CodecJSON_V2[str]: ... - - @classmethod - def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... - - -def _is_numcodec(obj: object) -> TypeGuard[Numcodec]: - """ - Check if the given object implements the Numcodec protocol. - - The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method - members (i.e., attributes), so we use this function to manually check for the presence of the - required attributes and methods on a given object. 
- """ - return _is_numcodec_cls(type(obj)) - - -def _is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: - """ - Check if the given object is a class implements the Numcodec protocol. - - The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method - members (i.e., attributes), so we use this function to manually check for the presence of the - required attributes and methods on a given object. - """ - return ( - isinstance(obj, type) - and hasattr(obj, "codec_id") - and isinstance(obj.codec_id, str) - and hasattr(obj, "encode") - and callable(obj.encode) - and hasattr(obj, "decode") - and callable(obj.decode) - and hasattr(obj, "get_config") - and callable(obj.get_config) - and hasattr(obj, "from_config") - and callable(obj.from_config) - ) - - @dataclass(frozen=True) class V2Codec(ArrayBytesCodec): filters: tuple[Numcodec, ...] | None diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index a5b14d573a..fef424346a 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -5,9 +5,8 @@ from typing import TYPE_CHECKING, Literal if TYPE_CHECKING: - import numcodecs.abc - from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec + from zarr.abc.numcodec import Numcodec from zarr.core.common import ZarrFormat from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType @@ -88,9 +87,9 @@ class ArrayInfo: _order: Literal["C", "F"] _read_only: bool _store_type: str - _filters: tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...] = () + _filters: tuple[Numcodec, ...] | tuple[ArrayArrayCodec, ...] = () _serializer: ArrayBytesCodec | None = None - _compressors: tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...] = () + _compressors: tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...] 
= () _count_bytes: int | None = None _count_bytes_stored: int | None = None _count_chunks_initialized: int | None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 68a4694a55..85cd84ae1b 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -19,15 +19,14 @@ ) from warnings import warn -import numcodecs -import numcodecs.abc import numpy as np from typing_extensions import deprecated import zarr from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec +from zarr.abc.numcodec import Numcodec, _is_numcodec from zarr.abc.store import Store, set_or_delete -from zarr.codecs._v2 import Numcodec, V2Codec +from zarr.codecs._v2 import V2Codec from zarr.codecs.bytes import BytesCodec from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec from zarr.codecs.zstd import ZstdCodec @@ -1033,7 +1032,7 @@ def size(self) -> int: return np.prod(self.metadata.shape).item() @property - def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...]: + def filters(self) -> tuple[Numcodec, ...] | tuple[ArrayArrayCodec, ...]: """ Filters that are applied to each chunk of the array, in order, before serializing that chunk to bytes. @@ -1062,7 +1061,7 @@ def serializer(self) -> ArrayBytesCodec | None: @property @deprecated("Use AsyncArray.compressors instead.") - def compressor(self) -> numcodecs.abc.Codec | None: + def compressor(self) -> Numcodec | None: """ Compressor that is applied to each chunk of the array. @@ -1075,7 +1074,7 @@ def compressor(self) -> numcodecs.abc.Codec | None: raise TypeError("`compressor` is not available for Zarr format 3 arrays.") @property - def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]: + def compressors(self) -> tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...]: """ Compressors that are applied to each chunk of the array. 
Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. @@ -2227,7 +2226,7 @@ def fill_value(self) -> Any: return self.metadata.fill_value @property - def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...]: + def filters(self) -> tuple[Numcodec, ...] | tuple[ArrayArrayCodec, ...]: """ Filters that are applied to each chunk of the array, in order, before serializing that chunk to bytes. @@ -2243,7 +2242,7 @@ def serializer(self) -> None | ArrayBytesCodec: @property @deprecated("Use Array.compressors instead.") - def compressor(self) -> numcodecs.abc.Codec | None: + def compressor(self) -> Numcodec | None: """ Compressor that is applied to each chunk of the array. @@ -2254,7 +2253,7 @@ def compressor(self) -> numcodecs.abc.Codec | None: return self._async_array.compressor @property - def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]: + def compressors(self) -> tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...]: """ Compressors that are applied to each chunk of the array. Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. 
@@ -3900,15 +3899,13 @@ def _build_parents( FiltersLike: TypeAlias = ( Iterable[dict[str, JSON] | ArrayArrayCodec | Numcodec] | ArrayArrayCodec - | Iterable[numcodecs.abc.Codec] - | numcodecs.abc.Codec + | Iterable[Numcodec] + | Numcodec | Literal["auto"] | None ) # Union of acceptable types for users to pass in for both v2 and v3 compressors -CompressorLike: TypeAlias = ( - dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec | Literal["auto"] | None -) +CompressorLike: TypeAlias = dict[str, JSON] | BytesBytesCodec | Numcodec | Literal["auto"] | None CompressorsLike: TypeAlias = ( Iterable[dict[str, JSON] | BytesBytesCodec | Numcodec] @@ -4775,7 +4772,7 @@ def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec: return serializer -def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[numcodecs.abc.Codec] | None: +def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[Numcodec] | None: """ Given a data type, return the default filters for that data type. @@ -4797,7 +4794,7 @@ def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[numcodecs.abc.Codec] | return None -def default_compressor_v2(dtype: ZDType[Any, Any]) -> numcodecs.abc.Codec: +def default_compressor_v2(dtype: ZDType[Any, Any]) -> Numcodec: """ Given a data type, return the default compressors for that data type. @@ -4805,7 +4802,7 @@ def default_compressor_v2(dtype: ZDType[Any, Any]) -> numcodecs.abc.Codec: """ from numcodecs import Zstd - return Zstd(level=0, checksum=False) + return Zstd(level=0, checksum=False) # type: ignore[no-any-return] def _parse_chunk_encoding_v2( @@ -4813,12 +4810,12 @@ def _parse_chunk_encoding_v2( compressor: CompressorsLike, filters: FiltersLike, dtype: ZDType[TBaseDType, TBaseScalar], -) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: +) -> tuple[tuple[Numcodec, ...] | None, Numcodec | None]: """ Generate chunk encoding classes for Zarr format 2 arrays with optional defaults. 
""" - _filters: tuple[numcodecs.abc.Codec, ...] | None - _compressor: numcodecs.abc.Codec | None + _filters: tuple[Numcodec, ...] | None + _compressor: Numcodec | None if compressor is None or compressor == (): _compressor = None @@ -4839,7 +4836,7 @@ def _parse_chunk_encoding_v2( else: if isinstance(filters, Iterable): for idx, f in enumerate(filters): - if not isinstance(f, numcodecs.abc.Codec): + if not _is_numcodec(f): msg = ( "For Zarr format 2 arrays, all elements of `filters` must be numcodecs codecs. " f"Element at index {idx} has type {type(f)}, which is not a numcodecs codec." @@ -4852,12 +4849,12 @@ def _parse_chunk_encoding_v2( if _compressor is None: object_codec_id = None else: - object_codec_id = get_object_codec_id((_compressor.get_config(),)) + object_codec_id = get_object_codec_id((_compressor.get_config(),)) # type: ignore[arg-type] else: object_codec_id = get_object_codec_id( ( - *[f.get_config() for f in _filters], - _compressor.get_config() if _compressor is not None else None, + *[f.get_config() for f in _filters], # type: ignore[arg-type] + _compressor.get_config() if _compressor is not None else None, # type: ignore[arg-type] ) ) if object_codec_id is None: @@ -4944,7 +4941,7 @@ def _parse_deprecated_compressor( # "no compression" compressors = () else: - compressors = (compressor,) # type: ignore[assignment] + compressors = (compressor,) elif zarr_format == 2 and compressor == compressors == "auto": compressors = ({"id": "blosc"},) return compressors diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 17af3538a9..0c5d09583f 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -5,12 +5,12 @@ from functools import cached_property from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict, cast -import numcodecs.abc - from zarr.abc.metadata import Metadata +from zarr.abc.numcodec import Numcodec, _is_numcodec from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.dtype import 
get_data_type_from_json from zarr.core.dtype.common import OBJECT_CODEC_IDS, DTypeSpec_V2 +from zarr.registry import get_numcodec if TYPE_CHECKING: from typing import Literal, Self @@ -30,7 +30,6 @@ import json from dataclasses import dataclass, field, fields, replace -import numcodecs import numpy as np from zarr.core.array_spec import ArrayConfig, ArraySpec @@ -56,7 +55,7 @@ class ArrayV2MetadataDict(TypedDict): # Union of acceptable types for v2 compressors -CompressorLikev2: TypeAlias = dict[str, JSON] | numcodecs.abc.Codec | None +CompressorLikev2: TypeAlias = dict[str, JSON] | Numcodec | None @dataclass(frozen=True, kw_only=True) @@ -66,9 +65,9 @@ class ArrayV2Metadata(Metadata): dtype: ZDType[TBaseDType, TBaseScalar] fill_value: int | float | str | bytes | None = None order: MemoryOrder = "C" - filters: tuple[numcodecs.abc.Codec, ...] | None = None + filters: tuple[Numcodec, ...] | None = None dimension_separator: Literal[".", "/"] = "." - compressor: numcodecs.abc.Codec | None + compressor: Numcodec | None attributes: dict[str, JSON] = field(default_factory=dict) zarr_format: Literal[2] = field(init=False, default=2) @@ -82,7 +81,7 @@ def __init__( order: MemoryOrder, dimension_separator: Literal[".", "/"] = ".", compressor: CompressorLikev2 = None, - filters: Iterable[numcodecs.abc.Codec | dict[str, JSON]] | None = None, + filters: Iterable[Numcodec | dict[str, JSON]] | None = None, attributes: dict[str, JSON] | None = None, ) -> None: """ @@ -197,12 +196,12 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: def to_dict(self) -> dict[str, JSON]: zarray_dict = super().to_dict() - if isinstance(zarray_dict["compressor"], numcodecs.abc.Codec): + if _is_numcodec(zarray_dict["compressor"]): codec_config = zarray_dict["compressor"].get_config() # Hotfix for https://github.com/zarr-developers/zarr-python/issues/2647 if codec_config["id"] == "zstd" and not codec_config.get("checksum", False): - codec_config.pop("checksum") - zarray_dict["compressor"] 
= codec_config + codec_config.pop("checksum") # type: ignore[typeddict-item] + zarray_dict["compressor"] = codec_config # type: ignore[assignment] if zarray_dict["filters"] is not None: raw_filters = zarray_dict["filters"] @@ -212,11 +211,11 @@ def to_dict(self) -> dict[str, JSON]: raise TypeError("Invalid type for filters. Expected a list or tuple.") new_filters = [] for f in raw_filters: - if isinstance(f, numcodecs.abc.Codec): + if _is_numcodec(f): new_filters.append(f.get_config()) else: new_filters.append(f) - zarray_dict["filters"] = new_filters + zarray_dict["filters"] = new_filters # type: ignore[assignment] # serialize the fill value after dtype-specific JSON encoding if self.fill_value is not None: @@ -262,20 +261,20 @@ def parse_zarr_format(data: object) -> Literal[2]: raise ValueError(f"Invalid value. Expected 2. Got {data}.") -def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: +def parse_filters(data: object) -> tuple[Numcodec, ...] | None: """ Parse a potential tuple of filters """ - out: list[numcodecs.abc.Codec] = [] + out: list[Numcodec] = [] if data is None: return data if isinstance(data, Iterable): for idx, val in enumerate(data): - if isinstance(val, numcodecs.abc.Codec): + if _is_numcodec(val): out.append(val) elif isinstance(val, dict): - out.append(numcodecs.get_codec(val)) + out.append(get_numcodec(val)) # type: ignore[arg-type] else: msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead." raise TypeError(msg) @@ -285,20 +284,20 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: else: return tuple(out) # take a single codec instance and wrap it in a tuple - if isinstance(data, numcodecs.abc.Codec): + if _is_numcodec(data): return (data,) msg = f"Invalid filters. Expected None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead." 
raise TypeError(msg) -def parse_compressor(data: object) -> numcodecs.abc.Codec | None: +def parse_compressor(data: object) -> Numcodec | None: """ Parse a potential compressor. """ - if data is None or isinstance(data, numcodecs.abc.Codec): + if data is None or _is_numcodec(data): return data if isinstance(data, dict): - return numcodecs.get_codec(data) + return get_numcodec(data) # type: ignore[arg-type] msg = f"Invalid compressor. Expected None, a numcodecs.abc.Codec, or a dict representation of a numcodecs.abc.Codec. Got {type(data)} instead." raise ValueError(msg) diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 189d42abed..879adb2058 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -16,8 +16,10 @@ ArrayBytesCodec, BytesBytesCodec, Codec, + CodecJSON_V2, CodecPipeline, ) + from zarr.abc.numcodec import Numcodec from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON @@ -278,3 +280,31 @@ def get_ndbuffer_class(reload_config: bool = False) -> type[NDBuffer]: _collect_entrypoints() + + +def get_numcodec(data: CodecJSON_V2[str]) -> Numcodec: + """ + Resolve a numcodec codec from the numcodecs registry. + + This requires the Numcodecs package to be installed. + + Parameters + ---------- + data : CodecJSON_V2 + The JSON metadata for the codec. 
+ + Returns + ------- + codec : Numcodec + + Examples + -------- + + >>> codec = get_codec({'id': 'zlib', 'level': 1}) + >>> codec + Zlib(level=1) + """ + + from numcodecs.registry import get_codec + + return get_codec(data) # type: ignore[no-any-return] diff --git a/tests/test_array.py b/tests/test_array.py index 6342ce6430..a6b829041a 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -69,6 +69,7 @@ from .test_dtype.conftest import zdtype_examples if TYPE_CHECKING: + from zarr.abc.codec import CodecJSON_V3 from zarr.core.metadata.v3 import ArrayV3Metadata @@ -1315,11 +1316,11 @@ async def test_v2_chunk_encoding( assert arr.metadata.filters == filters_expected # Normalize for property getters - compressor_expected = () if compressor_expected is None else (compressor_expected,) - filters_expected = () if filters_expected is None else filters_expected + arr_compressors_expected = () if compressor_expected is None else (compressor_expected,) + arr_filters_expected = () if filters_expected is None else filters_expected - assert arr.compressors == compressor_expected - assert arr.filters == filters_expected + assert arr.compressors == arr_compressors_expected + assert arr.filters == arr_filters_expected @staticmethod @pytest.mark.parametrize("dtype", [UInt8(), Float32(), VariableLengthUTF8()]) @@ -1357,11 +1358,12 @@ async def test_default_filters_compressors( if default_filters is None: expected_filters = () else: - expected_filters = default_filters + expected_filters = default_filters # type: ignore[assignment] + if default_compressors is None: expected_compressors = () else: - expected_compressors = (default_compressors,) + expected_compressors = (default_compressors,) # type: ignore[assignment] expected_serializer = None else: raise ValueError(f"Invalid zarr_format: {zarr_format}") @@ -1665,7 +1667,7 @@ def test_roundtrip_numcodecs() -> None: {"name": "numcodecs.shuffle", "configuration": {"elementsize": 2}}, {"name": "numcodecs.zlib", "configuration": 
{"level": 4}}, ] - filters = [ + filters: list[CodecJSON_V3] = [ { "name": "numcodecs.fixedscaleoffset", "configuration": { @@ -1685,7 +1687,7 @@ def test_roundtrip_numcodecs() -> None: chunks=(720, 1440), dtype="float64", compressors=compressors, # type: ignore[arg-type] - filters=filters, + filters=filters, # type: ignore[arg-type] fill_value=-9.99, dimension_names=["lat", "lon"], ) diff --git a/tests/test_codecs/test_numcodecs.py b/tests/test_codecs/test_numcodecs.py index bb381c615a..1c4d550587 100644 --- a/tests/test_codecs/test_numcodecs.py +++ b/tests/test_codecs/test_numcodecs.py @@ -2,8 +2,8 @@ from numcodecs import GZip -from zarr.codecs._numcodecs import get_numcodec -from zarr.codecs._v2 import _is_numcodec, _is_numcodec_cls +from zarr.abc.numcodec import _is_numcodec, _is_numcodec_cls +from zarr.registry import get_numcodec def test_get_numcodec() -> None: diff --git a/tests/test_regression/test_v2_dtype_regression.py b/tests/test_regression/test_v2_dtype_regression.py index 9702ca7d23..ffe273490d 100644 --- a/tests/test_regression/test_v2_dtype_regression.py +++ b/tests/test_regression/test_v2_dtype_regression.py @@ -4,7 +4,6 @@ from pathlib import Path from typing import TYPE_CHECKING, Literal -import numcodecs import numpy as np import pytest from numcodecs import LZ4, LZMA, Blosc, GZip, VLenBytes, VLenUTF8, Zstd @@ -13,6 +12,7 @@ import zarr.abc import zarr.abc.codec import zarr.codecs as zarrcodecs +from zarr.abc.numcodec import Numcodec from zarr.core.array import Array from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding from zarr.core.dtype.npy.bytes import VariableLengthBytes @@ -40,12 +40,12 @@ def runner_installed() -> bool: class ArrayParams: values: np.ndarray[tuple[int], np.dtype[np.generic]] fill_value: np.generic | str | int | bytes - filters: tuple[numcodecs.abc.Codec, ...] = () + filters: tuple[Numcodec, ...] 
= () serializer: str | None = None - compressor: numcodecs.abc.Codec + compressor: Numcodec -basic_codecs = GZip(), Blosc(), LZ4(), LZMA(), Zstd() +basic_codecs: tuple[Numcodec, ...] = GZip(), Blosc(), LZ4(), LZMA(), Zstd() basic_dtypes = "|b", ">i2", ">i4", ">f4", ">f8", "c8", "c16", "M8[10us]", "m8[4ps]" string_dtypes = "U4" From 0d0712f0974c243e376ba2fd9de0078be1978bae Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Aug 2025 16:32:56 +0200 Subject: [PATCH 08/29] add tests for codecjson typeguard --- tests/test_abc/__init__.py | 0 tests/test_abc/test_codec.py | 12 ++++++++++++ 2 files changed, 12 insertions(+) create mode 100644 tests/test_abc/__init__.py create mode 100644 tests/test_abc/test_codec.py diff --git a/tests/test_abc/__init__.py b/tests/test_abc/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_abc/test_codec.py b/tests/test_abc/test_codec.py new file mode 100644 index 0000000000..e0f9ddb7bb --- /dev/null +++ b/tests/test_abc/test_codec.py @@ -0,0 +1,12 @@ +from __future__ import annotations + +from zarr.abc.codec import _check_codecjson_v2 + + +def test_check_codecjson_v2_valid() -> None: + """ + Test that the _check_codecjson_v2 function works + """ + assert _check_codecjson_v2({"id": "gzip"}) + assert not _check_codecjson_v2({"id": 10}) + assert not _check_codecjson_v2([10, 11]) From 931bf2fd35f909a9eb2458fba25cb795207798de Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Aug 2025 17:01:52 +0200 Subject: [PATCH 09/29] avoid using zarr's buffer / ndbuffer for numcodec encode / decode --- src/zarr/abc/numcodec.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/zarr/abc/numcodec.py b/src/zarr/abc/numcodec.py index d2c9380146..db6ff4655a 100644 --- a/src/zarr/abc/numcodec.py +++ b/src/zarr/abc/numcodec.py @@ -1,9 +1,8 @@ from typing import Self, TypeGuard -from typing_extensions import Protocol +from typing_extensions import Buffer, Protocol from 
zarr.abc.codec import CodecJSON_V2 -from zarr.core.buffer import Buffer, NDBuffer class Numcodec(Protocol): @@ -13,11 +12,9 @@ class Numcodec(Protocol): codec_id: str - def encode(self, buf: Buffer | NDBuffer) -> Buffer | NDBuffer: ... + def encode(self, buf: Buffer) -> Buffer: ... - def decode( - self, buf: Buffer | NDBuffer, out: Buffer | NDBuffer | None = None - ) -> Buffer | NDBuffer: ... + def decode(self, buf: Buffer, out: Buffer | None = None) -> Buffer: ... def get_config(self) -> CodecJSON_V2[str]: ... From 01bd4b71d132f09673aeca5b8f9d983b1f2ea6b6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Aug 2025 19:12:35 +0200 Subject: [PATCH 10/29] use Any to model input / output types of numcodec protocol --- src/zarr/abc/numcodec.py | 14 ++++++-------- src/zarr/codecs/_v2.py | 14 ++++++-------- src/zarr/core/array.py | 6 +++--- src/zarr/core/metadata/v2.py | 6 +++--- 4 files changed, 18 insertions(+), 22 deletions(-) diff --git a/src/zarr/abc/numcodec.py b/src/zarr/abc/numcodec.py index db6ff4655a..c671428388 100644 --- a/src/zarr/abc/numcodec.py +++ b/src/zarr/abc/numcodec.py @@ -1,8 +1,6 @@ -from typing import Self, TypeGuard +from typing import Any, Self, TypeGuard -from typing_extensions import Buffer, Protocol - -from zarr.abc.codec import CodecJSON_V2 +from typing_extensions import Protocol class Numcodec(Protocol): @@ -12,14 +10,14 @@ class Numcodec(Protocol): codec_id: str - def encode(self, buf: Buffer) -> Buffer: ... + def encode(self, buf: Any) -> Any: ... - def decode(self, buf: Buffer, out: Buffer | None = None) -> Buffer: ... + def decode(self, buf: Any, out: Any | None = None) -> Any: ... - def get_config(self) -> CodecJSON_V2[str]: ... + def get_config(self) -> Any: ... @classmethod - def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... + def from_config(cls, config: Any) -> Self: ... 
def _is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 92eda38226..3c6c99c21c 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -31,9 +31,9 @@ async def _decode_single( cdata = chunk_bytes.as_array_like() # decompress if self.compressor: - chunk = await asyncio.to_thread(self.compressor.decode, cdata) # type: ignore[arg-type] + chunk = await asyncio.to_thread(self.compressor.decode, cdata) else: - chunk = cdata # type: ignore[assignment] + chunk = cdata # apply filters if self.filters: @@ -54,7 +54,7 @@ async def _decode_single( # is an object array. In this case, we need to convert the object # array to the correct dtype. - chunk = np.array(chunk).astype(chunk_spec.dtype.to_native_dtype()) # type: ignore[assignment] + chunk = np.array(chunk).astype(chunk_spec.dtype.to_native_dtype()) elif chunk.dtype != object: # If we end up here, someone must have hacked around with the filters. @@ -83,18 +83,16 @@ async def _encode_single( # apply filters if self.filters: for f in self.filters: - chunk = await asyncio.to_thread(f.encode, chunk) # type: ignore[arg-type] - + chunk = await asyncio.to_thread(f.encode, chunk) # check object encoding if ensure_ndarray_like(chunk).dtype == object: raise RuntimeError("cannot write object array without object codec") # compress if self.compressor: - cdata = await asyncio.to_thread(self.compressor.encode, chunk) # type: ignore[arg-type] + cdata = await asyncio.to_thread(self.compressor.encode, chunk) else: - cdata = chunk # type: ignore[assignment] - + cdata = chunk cdata = ensure_bytes(cdata) return chunk_spec.prototype.buffer.from_bytes(cdata) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 85cd84ae1b..02a97aee0f 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4849,12 +4849,12 @@ def _parse_chunk_encoding_v2( if _compressor is None: object_codec_id = None else: - object_codec_id = 
get_object_codec_id((_compressor.get_config(),)) # type: ignore[arg-type] + object_codec_id = get_object_codec_id((_compressor.get_config(),)) else: object_codec_id = get_object_codec_id( ( - *[f.get_config() for f in _filters], # type: ignore[arg-type] - _compressor.get_config() if _compressor is not None else None, # type: ignore[arg-type] + *[f.get_config() for f in _filters], + _compressor.get_config() if _compressor is not None else None, ) ) if object_codec_id is None: diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 0c5d09583f..ae1d44c44d 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -200,8 +200,8 @@ def to_dict(self) -> dict[str, JSON]: codec_config = zarray_dict["compressor"].get_config() # Hotfix for https://github.com/zarr-developers/zarr-python/issues/2647 if codec_config["id"] == "zstd" and not codec_config.get("checksum", False): - codec_config.pop("checksum") # type: ignore[typeddict-item] - zarray_dict["compressor"] = codec_config # type: ignore[assignment] + codec_config.pop("checksum") + zarray_dict["compressor"] = codec_config if zarray_dict["filters"] is not None: raw_filters = zarray_dict["filters"] @@ -215,7 +215,7 @@ def to_dict(self) -> dict[str, JSON]: new_filters.append(f.get_config()) else: new_filters.append(f) - zarray_dict["filters"] = new_filters # type: ignore[assignment] + zarray_dict["filters"] = new_filters # serialize the fill value after dtype-specific JSON encoding if self.fill_value is not None: From f06c6aa4bf30db5a768a374376d25b2a99289901 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 31 Jul 2025 15:44:18 +0200 Subject: [PATCH 11/29] add numcodec protocol --- src/zarr/abc/codec.py | 28 ++++++++++++- src/zarr/api/asynchronous.py | 5 +-- src/zarr/api/synchronous.py | 4 +- src/zarr/codecs/_numcodecs.py | 37 ++++++++++++++++ src/zarr/codecs/_v2.py | 77 ++++++++++++++++++++++++++++------ src/zarr/core/array.py | 16 +++---- tests/test_api.py | 2 +- 
tests/test_codecs/test_vlen.py | 2 +- 8 files changed, 141 insertions(+), 30 deletions(-) create mode 100644 src/zarr/codecs/_numcodecs.py diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index f8a5447a70..d5c995d2ca 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -1,11 +1,14 @@ from __future__ import annotations from abc import abstractmethod -from typing import TYPE_CHECKING, Generic, TypeVar +from collections.abc import Mapping +from typing import TYPE_CHECKING, Generic, TypeGuard, TypeVar + +from typing_extensions import ReadOnly, TypedDict from zarr.abc.metadata import Metadata from zarr.core.buffer import Buffer, NDBuffer -from zarr.core.common import ChunkCoords, concurrent_map +from zarr.core.common import ChunkCoords, NamedConfig, concurrent_map from zarr.core.config import config if TYPE_CHECKING: @@ -34,6 +37,27 @@ CodecInput = TypeVar("CodecInput", bound=NDBuffer | Buffer) CodecOutput = TypeVar("CodecOutput", bound=NDBuffer | Buffer) +TName = TypeVar("TName", bound=str, covariant=True) + + +class CodecJSON_V2(TypedDict, Generic[TName]): + """The JSON representation of a codec for Zarr V2""" + + id: ReadOnly[TName] + + +def _check_codecjson_v2(data: object) -> TypeGuard[CodecJSON_V2[str]]: + return isinstance(data, Mapping) and "id" in data and isinstance(data["id"], str) + + +CodecJSON_V3 = str | NamedConfig[str, Mapping[str, object]] +"""The JSON representation of a codec for Zarr V3.""" + +# The widest type we will *accept* for a codec JSON +# This covers v2 and v3 +CodecJSON = str | Mapping[str, object] +"""The widest type of JSON-like input that could specify a codec.""" + class BaseCodec(Metadata, Generic[CodecInput, CodecOutput]): """Generic base class for codecs. 
diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 78b68caf73..861557b5f3 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -52,9 +52,8 @@ if TYPE_CHECKING: from collections.abc import Iterable - import numcodecs.abc - from zarr.abc.codec import Codec + from zarr.codecs._v2 import Numcodec from zarr.core.buffer import NDArrayLikeOrScalar from zarr.core.chunk_key_encodings import ChunkKeyEncoding from zarr.storage import StoreLike @@ -877,7 +876,7 @@ async def create( overwrite: bool = False, path: PathLike | None = None, chunk_store: StoreLike | None = None, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[dict[str, JSON] | Numcodec] | None = None, cache_metadata: bool | None = None, cache_attrs: bool | None = None, read_only: bool | None = None, diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index ed1ae2cf2a..6db173d5d4 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -15,12 +15,12 @@ if TYPE_CHECKING: from collections.abc import Iterable - import numcodecs.abc import numpy as np import numpy.typing as npt from zarr.abc.codec import Codec from zarr.api.asynchronous import ArrayLike, PathLike + from zarr.codecs._v2 import Numcodec from zarr.core.array import ( CompressorsLike, FiltersLike, @@ -610,7 +610,7 @@ def create( overwrite: bool = False, path: PathLike | None = None, chunk_store: StoreLike | None = None, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[dict[str, JSON] | Numcodec] | None = None, cache_metadata: bool | None = None, cache_attrs: bool | None = None, read_only: bool | None = None, diff --git a/src/zarr/codecs/_numcodecs.py b/src/zarr/codecs/_numcodecs.py new file mode 100644 index 0000000000..4a8f43b5c6 --- /dev/null +++ b/src/zarr/codecs/_numcodecs.py @@ -0,0 +1,37 @@ +import numcodecs.registry as numcodecs_registry + +from zarr.abc.codec import 
CodecJSON_V2 +from zarr.codecs._v2 import Numcodec + + +def get_numcodec(data: CodecJSON_V2[str]) -> Numcodec: + """ + Resolve a numcodec codec from the numcodecs registry. + + This requires the Numcodecs package to be installed. + + Parameters + ---------- + data : CodecJSON_V2 + The JSON metadata for the codec. + + Returns + ------- + codec : Numcodec + + Examples + -------- + + >>> codec = get_numcodec({'id': 'zlib', 'level': 1}) + >>> codec + Zlib(level=1) + """ + + codec_id = data["id"] + cls = numcodecs_registry.codec_registry.get(codec_id) + if cls is None and data in numcodecs_registry.entries: + cls = numcodecs_registry.entries[data].load() + numcodecs_registry.register_codec(cls, codec_id=data) + if cls is not None: + return cls.from_config({k: v for k, v in data.items() if k != "id"}) # type: ignore[no-any-return] + raise KeyError(data) diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 08853f27f1..8deae99a6d 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -2,26 +2,77 @@ import asyncio from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, ClassVar, Self, TypeGuard -import numcodecs import numpy as np from numcodecs.compat import ensure_bytes, ensure_ndarray_like +from typing_extensions import Protocol -from zarr.abc.codec import ArrayBytesCodec +from zarr.abc.codec import ArrayBytesCodec, CodecJSON_V2 from zarr.registry import get_ndbuffer_class if TYPE_CHECKING: - import numcodecs.abc - from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, NDBuffer +class Numcodec(Protocol): + """ + A protocol that models the ``numcodecs.abc.Codec`` interface. + """ + + codec_id: ClassVar[str] + + def encode(self, buf: Buffer | NDBuffer) -> Buffer | NDBuffer: ... + + def decode( + self, buf: Buffer | NDBuffer, out: Buffer | NDBuffer | None = None + ) -> Buffer | NDBuffer: ... + + def get_config(self) -> CodecJSON_V2[str]: ... 
+ + @classmethod + def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... + + +def _is_numcodec(obj: object) -> TypeGuard[Numcodec]: + """ + Check if the given object implements the Numcodec protocol. + + The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method + members (i.e., attributes), so we use this function to manually check for the presence of the + required attributes and methods on a given object. + """ + return _is_numcodec_cls(type(obj)) + + +def _is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: + """ + Check if the given object is a class implements the Numcodec protocol. + + The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method + members (i.e., attributes), so we use this function to manually check for the presence of the + required attributes and methods on a given object. + """ + return ( + isinstance(obj, type) + and hasattr(obj, "codec_id") + and isinstance(obj.codec_id, str) + and hasattr(obj, "encode") + and callable(obj.encode) + and hasattr(obj, "decode") + and callable(obj.decode) + and hasattr(obj, "get_config") + and callable(obj.get_config) + and hasattr(obj, "from_config") + and callable(obj.from_config) + ) + + @dataclass(frozen=True) class V2Codec(ArrayBytesCodec): - filters: tuple[numcodecs.abc.Codec, ...] | None - compressor: numcodecs.abc.Codec | None + filters: tuple[Numcodec, ...] | None + compressor: Numcodec | None is_fixed_size = False @@ -33,9 +84,9 @@ async def _decode_single( cdata = chunk_bytes.as_array_like() # decompress if self.compressor: - chunk = await asyncio.to_thread(self.compressor.decode, cdata) + chunk = await asyncio.to_thread(self.compressor.decode, cdata) # type: ignore[arg-type] else: - chunk = cdata + chunk = cdata # type: ignore[assignment] # apply filters if self.filters: @@ -56,7 +107,7 @@ async def _decode_single( # is an object array. In this case, we need to convert the object # array to the correct dtype. 
- chunk = np.array(chunk).astype(chunk_spec.dtype.to_native_dtype()) + chunk = np.array(chunk).astype(chunk_spec.dtype.to_native_dtype()) # type: ignore[assignment] elif chunk.dtype != object: # If we end up here, someone must have hacked around with the filters. @@ -85,7 +136,7 @@ async def _encode_single( # apply filters if self.filters: for f in self.filters: - chunk = await asyncio.to_thread(f.encode, chunk) + chunk = await asyncio.to_thread(f.encode, chunk) # type: ignore[arg-type] # check object encoding if ensure_ndarray_like(chunk).dtype == object: @@ -93,9 +144,9 @@ async def _encode_single( # compress if self.compressor: - cdata = await asyncio.to_thread(self.compressor.encode, chunk) + cdata = await asyncio.to_thread(self.compressor.encode, chunk) # type: ignore[arg-type] else: - cdata = chunk + cdata = chunk # type: ignore[assignment] cdata = ensure_bytes(cdata) return chunk_spec.prototype.buffer.from_bytes(cdata) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index ca8bc414cc..1d939771cb 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -27,7 +27,7 @@ import zarr from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.store import Store, set_or_delete -from zarr.codecs._v2 import V2Codec +from zarr.codecs._v2 import Numcodec, V2Codec from zarr.codecs.bytes import BytesCodec from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec from zarr.codecs.zstd import ZstdCodec @@ -607,7 +607,7 @@ async def _create( chunks: ShapeLike | None = None, dimension_separator: Literal[".", "/"] | None = None, order: MemoryOrder | None = None, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[dict[str, JSON] | Numcodec] | None = None, compressor: CompressorLike = "auto", # runtime overwrite: bool = False, @@ -818,7 +818,7 @@ def _create_metadata_v2( order: MemoryOrder, dimension_separator: Literal[".", "/"] | None = None, fill_value: Any | None = 
DEFAULT_FILL_VALUE, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[dict[str, JSON] | Numcodec] | None = None, compressor: CompressorLikev2 = None, attributes: dict[str, JSON] | None = None, ) -> ArrayV2Metadata: @@ -856,7 +856,7 @@ async def _create_v2( config: ArrayConfig, dimension_separator: Literal[".", "/"] | None = None, fill_value: Any | None = DEFAULT_FILL_VALUE, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[dict[str, JSON] | Numcodec] | None = None, compressor: CompressorLike = "auto", attributes: dict[str, JSON] | None = None, overwrite: bool = False, @@ -3898,7 +3898,7 @@ def _build_parents( FiltersLike: TypeAlias = ( - Iterable[dict[str, JSON] | ArrayArrayCodec | numcodecs.abc.Codec] + Iterable[dict[str, JSON] | ArrayArrayCodec | Numcodec] | ArrayArrayCodec | Iterable[numcodecs.abc.Codec] | numcodecs.abc.Codec @@ -3911,10 +3911,10 @@ def _build_parents( ) CompressorsLike: TypeAlias = ( - Iterable[dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec] + Iterable[dict[str, JSON] | BytesBytesCodec | Numcodec] | dict[str, JSON] | BytesBytesCodec - | numcodecs.abc.Codec + | Numcodec | Literal["auto"] | None ) @@ -4944,7 +4944,7 @@ def _parse_deprecated_compressor( # "no compression" compressors = () else: - compressors = (compressor,) + compressors = (compressor,) # type: ignore[assignment] elif zarr_format == 2 and compressor == compressors == "auto": compressors = ({"id": "blosc"},) return compressors diff --git a/tests/test_api.py b/tests/test_api.py index 12acf80589..69fc9b5b16 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1283,7 +1283,7 @@ def test_gpu_basic(store: Store, zarr_format: ZarrFormat | None) -> None: dtype=src.dtype, overwrite=True, zarr_format=zarr_format, - compressors=compressors, + compressors=compressors, # type: ignore[arg-type] ) z[:10, :10] = src[:10, :10] diff --git a/tests/test_codecs/test_vlen.py 
b/tests/test_codecs/test_vlen.py index 6fe1863464..cf0905daca 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -40,7 +40,7 @@ def test_vlen_string( chunks=data.shape, dtype=data.dtype, fill_value="", - compressors=compressor, + compressors=compressor, # type: ignore[arg-type] ) assert isinstance(a.metadata, ArrayV3Metadata) # needed for mypy From b71e8ac1c41c7cb81195d22e72ef1a611f5cc815 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 31 Jul 2025 23:25:36 +0200 Subject: [PATCH 12/29] add tests for numcodecs compatibility --- tests/test_codecs/test_numcodecs.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 tests/test_codecs/test_numcodecs.py diff --git a/tests/test_codecs/test_numcodecs.py b/tests/test_codecs/test_numcodecs.py new file mode 100644 index 0000000000..a3824cc386 --- /dev/null +++ b/tests/test_codecs/test_numcodecs.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +from numcodecs import GZip + +from zarr.codecs._numcodecs import get_numcodec +from zarr.codecs._v2 import _is_numcodec, _is_numcodec_cls + + +def test_get_numcodec() -> None: + assert get_numcodec({"id": "gzip", "level": 2}) == GZip(level=2) + + +def test_is_numcodec() -> None: + """ + Test the _is_numcodec function + """ + assert _is_numcodec(GZip()) + + +def test_is_numcodec_cls() -> None: + """ + Test the _is_numcodec_cls function + """ + assert _is_numcodec_cls(GZip) From bcaa9ee5428312f03c44fb10d1d9ad7b3970d475 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 31 Jul 2025 23:32:31 +0200 Subject: [PATCH 13/29] changelog --- changes/3318.misc.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 changes/3318.misc.rst diff --git a/changes/3318.misc.rst b/changes/3318.misc.rst new file mode 100644 index 0000000000..f8308e6b97 --- /dev/null +++ b/changes/3318.misc.rst @@ -0,0 +1,2 @@ +Define a ``Protocol`` to model the ``numcodecs.abc.Codec`` interface. 
This is groundwork toward +making ``numcodecs`` an optional dependency for ``zarr-python``. \ No newline at end of file From 7e49f39f46ebe262f4027ec45c9c5ed839872989 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 31 Jul 2025 23:43:41 +0200 Subject: [PATCH 14/29] ignore unknown key --- tests/test_codecs/test_numcodecs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_codecs/test_numcodecs.py b/tests/test_codecs/test_numcodecs.py index a3824cc386..bb381c615a 100644 --- a/tests/test_codecs/test_numcodecs.py +++ b/tests/test_codecs/test_numcodecs.py @@ -7,7 +7,7 @@ def test_get_numcodec() -> None: - assert get_numcodec({"id": "gzip", "level": 2}) == GZip(level=2) + assert get_numcodec({"id": "gzip", "level": 2}) == GZip(level=2) # type: ignore[typeddict-unknown-key] def test_is_numcodec() -> None: From 4b53f5df298c3c6156685ac960ac502f501a49b4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 1 Aug 2025 11:05:43 +0200 Subject: [PATCH 15/29] remove re-implementation of get_codec --- src/zarr/codecs/_numcodecs.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/zarr/codecs/_numcodecs.py b/src/zarr/codecs/_numcodecs.py index 4a8f43b5c6..b00f258db5 100644 --- a/src/zarr/codecs/_numcodecs.py +++ b/src/zarr/codecs/_numcodecs.py @@ -1,5 +1,3 @@ -import numcodecs.registry as numcodecs_registry - from zarr.abc.codec import CodecJSON_V2 from zarr.codecs._v2 import Numcodec @@ -22,16 +20,11 @@ def get_numcodec(data: CodecJSON_V2[str]) -> Numcodec: Examples -------- - >>> codec = get_numcodec({'id': 'zlib', 'level': 1}) + >>> codec = get_codec({'id': 'zlib', 'level': 1}) >>> codec Zlib(level=1) """ - codec_id = data["id"] - cls = numcodecs_registry.codec_registry.get(codec_id) - if cls is None and data in numcodecs_registry.entries: - cls = numcodecs_registry.entries[data].load() - numcodecs_registry.register_codec(cls, codec_id=data) - if cls is not None: - return cls.from_config({k: v for 
k, v in data.items() if k != "id"}) # type: ignore[no-any-return] - raise KeyError(data) + from numcodecs.registry import get_codec + + return get_codec(data) # type: ignore[no-any-return] From b35e6c96ac2b4ee48457ec5d5482dd1e813696c7 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Aug 2025 16:15:37 +0200 Subject: [PATCH 16/29] avoid circular imports by importing lower-level routines exactly where needed --- src/zarr/abc/store.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index 1fbdb3146c..31e9728f8a 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -6,10 +6,6 @@ from itertools import starmap from typing import TYPE_CHECKING, Protocol, runtime_checkable -from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.common import concurrent_map -from zarr.core.config import config - if TYPE_CHECKING: from collections.abc import AsyncGenerator, AsyncIterator, Iterable from types import TracebackType @@ -438,6 +434,8 @@ async def getsize(self, key: str) -> int: # Note to implementers: this default implementation is very inefficient since # it requires reading the entire object. Many systems will have ways to get the # size of an object without reading it. + from zarr.core.buffer.core import default_buffer_prototype + value = await self.get(key, prototype=default_buffer_prototype()) if value is None: raise FileNotFoundError(key) @@ -476,6 +474,9 @@ async def getsize_prefix(self, prefix: str) -> int: # on to getting sizes. Ideally we would overlap those two, which should # improve tail latency and might reduce memory pressure (since not all keys # would be in memory at once). 
+ from zarr.core.common import concurrent_map + from zarr.core.config import config + keys = [(x,) async for x in self.list_prefix(prefix)] limit = config.get("async.concurrency") sizes = await concurrent_map(keys, self.getsize, limit=limit) From deef94ae9ffc7505fb8b496fa140ee669f7b23a1 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Aug 2025 16:16:26 +0200 Subject: [PATCH 17/29] push numcodec prototol into abcs; remove all numcodecs.abc.Codec type annotations --- src/zarr/abc/numcodec.py | 59 +++++++++++++++++++ src/zarr/api/asynchronous.py | 2 +- src/zarr/api/synchronous.py | 2 +- src/zarr/codecs/_numcodecs.py | 30 ---------- src/zarr/codecs/_v2.py | 59 +------------------ src/zarr/core/_info.py | 7 +-- src/zarr/core/array.py | 47 +++++++-------- src/zarr/core/metadata/v2.py | 39 ++++++------ src/zarr/registry.py | 30 ++++++++++ tests/test_array.py | 16 ++--- tests/test_codecs/test_numcodecs.py | 4 +- .../test_v2_dtype_regression.py | 8 +-- 12 files changed, 153 insertions(+), 150 deletions(-) create mode 100644 src/zarr/abc/numcodec.py delete mode 100644 src/zarr/codecs/_numcodecs.py diff --git a/src/zarr/abc/numcodec.py b/src/zarr/abc/numcodec.py new file mode 100644 index 0000000000..d2c9380146 --- /dev/null +++ b/src/zarr/abc/numcodec.py @@ -0,0 +1,59 @@ +from typing import Self, TypeGuard + +from typing_extensions import Protocol + +from zarr.abc.codec import CodecJSON_V2 +from zarr.core.buffer import Buffer, NDBuffer + + +class Numcodec(Protocol): + """ + A protocol that models the ``numcodecs.abc.Codec`` interface. + """ + + codec_id: str + + def encode(self, buf: Buffer | NDBuffer) -> Buffer | NDBuffer: ... + + def decode( + self, buf: Buffer | NDBuffer, out: Buffer | NDBuffer | None = None + ) -> Buffer | NDBuffer: ... + + def get_config(self) -> CodecJSON_V2[str]: ... + + @classmethod + def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... 
+ + +def _is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: + """ + Check if the given object is a class implements the Numcodec protocol. + + The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method + members (i.e., attributes), so we use this function to manually check for the presence of the + required attributes and methods on a given object. + """ + return ( + isinstance(obj, type) + and hasattr(obj, "codec_id") + and isinstance(obj.codec_id, str) + and hasattr(obj, "encode") + and callable(obj.encode) + and hasattr(obj, "decode") + and callable(obj.decode) + and hasattr(obj, "get_config") + and callable(obj.get_config) + and hasattr(obj, "from_config") + and callable(obj.from_config) + ) + + +def _is_numcodec(obj: object) -> TypeGuard[Numcodec]: + """ + Check if the given object implements the Numcodec protocol. + + The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method + members (i.e., attributes), so we use this function to manually check for the presence of the + required attributes and methods on a given object. 
+ """ + return _is_numcodec_cls(type(obj)) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 861557b5f3..a044ba8594 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -53,7 +53,7 @@ from collections.abc import Iterable from zarr.abc.codec import Codec - from zarr.codecs._v2 import Numcodec + from zarr.abc.numcodec import Numcodec from zarr.core.buffer import NDArrayLikeOrScalar from zarr.core.chunk_key_encodings import ChunkKeyEncoding from zarr.storage import StoreLike diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 6db173d5d4..50a1c0fa20 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -19,8 +19,8 @@ import numpy.typing as npt from zarr.abc.codec import Codec + from zarr.abc.numcodec import Numcodec from zarr.api.asynchronous import ArrayLike, PathLike - from zarr.codecs._v2 import Numcodec from zarr.core.array import ( CompressorsLike, FiltersLike, diff --git a/src/zarr/codecs/_numcodecs.py b/src/zarr/codecs/_numcodecs.py deleted file mode 100644 index b00f258db5..0000000000 --- a/src/zarr/codecs/_numcodecs.py +++ /dev/null @@ -1,30 +0,0 @@ -from zarr.abc.codec import CodecJSON_V2 -from zarr.codecs._v2 import Numcodec - - -def get_numcodec(data: CodecJSON_V2[str]) -> Numcodec: - """ - Resolve a numcodec codec from the numcodecs registry. - - This requires the Numcodecs package to be installed. - - Parameters - ---------- - data : CodecJSON_V2 - The JSON metadata for the codec. 
- - Returns - ------- - codec : Numcodec - - Examples - -------- - - >>> codec = get_codec({'id': 'zlib', 'level': 1}) - >>> codec - Zlib(level=1) - """ - - from numcodecs.registry import get_codec - - return get_codec(data) # type: ignore[no-any-return] diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 8deae99a6d..92eda38226 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -2,73 +2,20 @@ import asyncio from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Self, TypeGuard +from typing import TYPE_CHECKING import numpy as np from numcodecs.compat import ensure_bytes, ensure_ndarray_like -from typing_extensions import Protocol -from zarr.abc.codec import ArrayBytesCodec, CodecJSON_V2 +from zarr.abc.codec import ArrayBytesCodec from zarr.registry import get_ndbuffer_class if TYPE_CHECKING: + from zarr.abc.numcodec import Numcodec from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, NDBuffer -class Numcodec(Protocol): - """ - A protocol that models the ``numcodecs.abc.Codec`` interface. - """ - - codec_id: ClassVar[str] - - def encode(self, buf: Buffer | NDBuffer) -> Buffer | NDBuffer: ... - - def decode( - self, buf: Buffer | NDBuffer, out: Buffer | NDBuffer | None = None - ) -> Buffer | NDBuffer: ... - - def get_config(self) -> CodecJSON_V2[str]: ... - - @classmethod - def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... - - -def _is_numcodec(obj: object) -> TypeGuard[Numcodec]: - """ - Check if the given object implements the Numcodec protocol. - - The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method - members (i.e., attributes), so we use this function to manually check for the presence of the - required attributes and methods on a given object. 
- """ - return _is_numcodec_cls(type(obj)) - - -def _is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: - """ - Check if the given object is a class implements the Numcodec protocol. - - The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method - members (i.e., attributes), so we use this function to manually check for the presence of the - required attributes and methods on a given object. - """ - return ( - isinstance(obj, type) - and hasattr(obj, "codec_id") - and isinstance(obj.codec_id, str) - and hasattr(obj, "encode") - and callable(obj.encode) - and hasattr(obj, "decode") - and callable(obj.decode) - and hasattr(obj, "get_config") - and callable(obj.get_config) - and hasattr(obj, "from_config") - and callable(obj.from_config) - ) - - @dataclass(frozen=True) class V2Codec(ArrayBytesCodec): filters: tuple[Numcodec, ...] | None diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index a5b14d573a..fef424346a 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -5,9 +5,8 @@ from typing import TYPE_CHECKING, Literal if TYPE_CHECKING: - import numcodecs.abc - from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec + from zarr.abc.numcodec import Numcodec from zarr.core.common import ZarrFormat from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType @@ -88,9 +87,9 @@ class ArrayInfo: _order: Literal["C", "F"] _read_only: bool _store_type: str - _filters: tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...] = () + _filters: tuple[Numcodec, ...] | tuple[ArrayArrayCodec, ...] = () _serializer: ArrayBytesCodec | None = None - _compressors: tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...] = () + _compressors: tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...] 
= () _count_bytes: int | None = None _count_bytes_stored: int | None = None _count_chunks_initialized: int | None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 1d939771cb..71de2f58f5 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -19,15 +19,14 @@ ) from warnings import warn -import numcodecs -import numcodecs.abc import numpy as np from typing_extensions import deprecated import zarr from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec +from zarr.abc.numcodec import Numcodec, _is_numcodec from zarr.abc.store import Store, set_or_delete -from zarr.codecs._v2 import Numcodec, V2Codec +from zarr.codecs._v2 import V2Codec from zarr.codecs.bytes import BytesCodec from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec from zarr.codecs.zstd import ZstdCodec @@ -1033,7 +1032,7 @@ def size(self) -> int: return np.prod(self.metadata.shape).item() @property - def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...]: + def filters(self) -> tuple[Numcodec, ...] | tuple[ArrayArrayCodec, ...]: """ Filters that are applied to each chunk of the array, in order, before serializing that chunk to bytes. @@ -1062,7 +1061,7 @@ def serializer(self) -> ArrayBytesCodec | None: @property @deprecated("Use AsyncArray.compressors instead.", category=ZarrDeprecationWarning) - def compressor(self) -> numcodecs.abc.Codec | None: + def compressor(self) -> Numcodec | None: """ Compressor that is applied to each chunk of the array. @@ -1075,7 +1074,7 @@ def compressor(self) -> numcodecs.abc.Codec | None: raise TypeError("`compressor` is not available for Zarr format 3 arrays.") @property - def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]: + def compressors(self) -> tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...]: """ Compressors that are applied to each chunk of the array. 
Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. @@ -2227,7 +2226,7 @@ def fill_value(self) -> Any: return self.metadata.fill_value @property - def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...]: + def filters(self) -> tuple[Numcodec, ...] | tuple[ArrayArrayCodec, ...]: """ Filters that are applied to each chunk of the array, in order, before serializing that chunk to bytes. @@ -2243,7 +2242,7 @@ def serializer(self) -> None | ArrayBytesCodec: @property @deprecated("Use Array.compressors instead.", category=ZarrDeprecationWarning) - def compressor(self) -> numcodecs.abc.Codec | None: + def compressor(self) -> Numcodec | None: """ Compressor that is applied to each chunk of the array. @@ -2254,7 +2253,7 @@ def compressor(self) -> numcodecs.abc.Codec | None: return self._async_array.compressor @property - def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]: + def compressors(self) -> tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...]: """ Compressors that are applied to each chunk of the array. Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. 
@@ -3900,15 +3899,13 @@ def _build_parents( FiltersLike: TypeAlias = ( Iterable[dict[str, JSON] | ArrayArrayCodec | Numcodec] | ArrayArrayCodec - | Iterable[numcodecs.abc.Codec] - | numcodecs.abc.Codec + | Iterable[Numcodec] + | Numcodec | Literal["auto"] | None ) # Union of acceptable types for users to pass in for both v2 and v3 compressors -CompressorLike: TypeAlias = ( - dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec | Literal["auto"] | None -) +CompressorLike: TypeAlias = dict[str, JSON] | BytesBytesCodec | Numcodec | Literal["auto"] | None CompressorsLike: TypeAlias = ( Iterable[dict[str, JSON] | BytesBytesCodec | Numcodec] @@ -4775,7 +4772,7 @@ def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec: return serializer -def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[numcodecs.abc.Codec] | None: +def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[Numcodec] | None: """ Given a data type, return the default filters for that data type. @@ -4797,7 +4794,7 @@ def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[numcodecs.abc.Codec] | return None -def default_compressor_v2(dtype: ZDType[Any, Any]) -> numcodecs.abc.Codec: +def default_compressor_v2(dtype: ZDType[Any, Any]) -> Numcodec: """ Given a data type, return the default compressors for that data type. @@ -4805,7 +4802,7 @@ def default_compressor_v2(dtype: ZDType[Any, Any]) -> numcodecs.abc.Codec: """ from numcodecs import Zstd - return Zstd(level=0, checksum=False) + return Zstd(level=0, checksum=False) # type: ignore[no-any-return] def _parse_chunk_encoding_v2( @@ -4813,12 +4810,12 @@ def _parse_chunk_encoding_v2( compressor: CompressorsLike, filters: FiltersLike, dtype: ZDType[TBaseDType, TBaseScalar], -) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: +) -> tuple[tuple[Numcodec, ...] | None, Numcodec | None]: """ Generate chunk encoding classes for Zarr format 2 arrays with optional defaults. 
""" - _filters: tuple[numcodecs.abc.Codec, ...] | None - _compressor: numcodecs.abc.Codec | None + _filters: tuple[Numcodec, ...] | None + _compressor: Numcodec | None if compressor is None or compressor == (): _compressor = None @@ -4839,7 +4836,7 @@ def _parse_chunk_encoding_v2( else: if isinstance(filters, Iterable): for idx, f in enumerate(filters): - if not isinstance(f, numcodecs.abc.Codec): + if not _is_numcodec(f): msg = ( "For Zarr format 2 arrays, all elements of `filters` must be numcodecs codecs. " f"Element at index {idx} has type {type(f)}, which is not a numcodecs codec." @@ -4852,12 +4849,12 @@ def _parse_chunk_encoding_v2( if _compressor is None: object_codec_id = None else: - object_codec_id = get_object_codec_id((_compressor.get_config(),)) + object_codec_id = get_object_codec_id((_compressor.get_config(),)) # type: ignore[arg-type] else: object_codec_id = get_object_codec_id( ( - *[f.get_config() for f in _filters], - _compressor.get_config() if _compressor is not None else None, + *[f.get_config() for f in _filters], # type: ignore[arg-type] + _compressor.get_config() if _compressor is not None else None, # type: ignore[arg-type] ) ) if object_codec_id is None: @@ -4944,7 +4941,7 @@ def _parse_deprecated_compressor( # "no compression" compressors = () else: - compressors = (compressor,) # type: ignore[assignment] + compressors = (compressor,) elif zarr_format == 2 and compressor == compressors == "auto": compressors = ({"id": "blosc"},) return compressors diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 9ad6b3bc42..934befec91 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -5,13 +5,13 @@ from functools import cached_property from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict, cast -import numcodecs.abc - from zarr.abc.metadata import Metadata +from zarr.abc.numcodec import Numcodec, _is_numcodec from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.dtype import 
get_data_type_from_json from zarr.core.dtype.common import OBJECT_CODEC_IDS, DTypeSpec_V2 from zarr.errors import ZarrUserWarning +from zarr.registry import get_numcodec if TYPE_CHECKING: from typing import Literal, Self @@ -31,7 +31,6 @@ import json from dataclasses import dataclass, field, fields, replace -import numcodecs import numpy as np from zarr.core.array_spec import ArrayConfig, ArraySpec @@ -57,7 +56,7 @@ class ArrayV2MetadataDict(TypedDict): # Union of acceptable types for v2 compressors -CompressorLikev2: TypeAlias = dict[str, JSON] | numcodecs.abc.Codec | None +CompressorLikev2: TypeAlias = dict[str, JSON] | Numcodec | None @dataclass(frozen=True, kw_only=True) @@ -67,9 +66,9 @@ class ArrayV2Metadata(Metadata): dtype: ZDType[TBaseDType, TBaseScalar] fill_value: int | float | str | bytes | None = None order: MemoryOrder = "C" - filters: tuple[numcodecs.abc.Codec, ...] | None = None + filters: tuple[Numcodec, ...] | None = None dimension_separator: Literal[".", "/"] = "." - compressor: numcodecs.abc.Codec | None + compressor: Numcodec | None attributes: dict[str, JSON] = field(default_factory=dict) zarr_format: Literal[2] = field(init=False, default=2) @@ -83,7 +82,7 @@ def __init__( order: MemoryOrder, dimension_separator: Literal[".", "/"] = ".", compressor: CompressorLikev2 = None, - filters: Iterable[numcodecs.abc.Codec | dict[str, JSON]] | None = None, + filters: Iterable[Numcodec | dict[str, JSON]] | None = None, attributes: dict[str, JSON] | None = None, ) -> None: """ @@ -198,12 +197,12 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: def to_dict(self) -> dict[str, JSON]: zarray_dict = super().to_dict() - if isinstance(zarray_dict["compressor"], numcodecs.abc.Codec): + if _is_numcodec(zarray_dict["compressor"]): codec_config = zarray_dict["compressor"].get_config() # Hotfix for https://github.com/zarr-developers/zarr-python/issues/2647 if codec_config["id"] == "zstd" and not codec_config.get("checksum", False): - 
codec_config.pop("checksum") - zarray_dict["compressor"] = codec_config + codec_config.pop("checksum") # type: ignore[typeddict-item] + zarray_dict["compressor"] = codec_config # type: ignore[assignment] if zarray_dict["filters"] is not None: raw_filters = zarray_dict["filters"] @@ -213,11 +212,11 @@ def to_dict(self) -> dict[str, JSON]: raise TypeError("Invalid type for filters. Expected a list or tuple.") new_filters = [] for f in raw_filters: - if isinstance(f, numcodecs.abc.Codec): + if _is_numcodec(f): new_filters.append(f.get_config()) else: new_filters.append(f) - zarray_dict["filters"] = new_filters + zarray_dict["filters"] = new_filters # type: ignore[assignment] # serialize the fill value after dtype-specific JSON encoding if self.fill_value is not None: @@ -263,20 +262,20 @@ def parse_zarr_format(data: object) -> Literal[2]: raise ValueError(f"Invalid value. Expected 2. Got {data}.") -def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: +def parse_filters(data: object) -> tuple[Numcodec, ...] | None: """ Parse a potential tuple of filters """ - out: list[numcodecs.abc.Codec] = [] + out: list[Numcodec] = [] if data is None: return data if isinstance(data, Iterable): for idx, val in enumerate(data): - if isinstance(val, numcodecs.abc.Codec): + if _is_numcodec(val): out.append(val) elif isinstance(val, dict): - out.append(numcodecs.get_codec(val)) + out.append(get_numcodec(val)) # type: ignore[arg-type] else: msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead." raise TypeError(msg) @@ -286,20 +285,20 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: else: return tuple(out) # take a single codec instance and wrap it in a tuple - if isinstance(data, numcodecs.abc.Codec): + if _is_numcodec(data): return (data,) msg = f"Invalid filters. 
Expected None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead." raise TypeError(msg) -def parse_compressor(data: object) -> numcodecs.abc.Codec | None: +def parse_compressor(data: object) -> Numcodec | None: """ Parse a potential compressor. """ - if data is None or isinstance(data, numcodecs.abc.Codec): + if data is None or _is_numcodec(data): return data if isinstance(data, dict): - return numcodecs.get_codec(data) + return get_numcodec(data) # type: ignore[arg-type] msg = f"Invalid compressor. Expected None, a numcodecs.abc.Codec, or a dict representation of a numcodecs.abc.Codec. Got {type(data)} instead." raise ValueError(msg) diff --git a/src/zarr/registry.py b/src/zarr/registry.py index fc3ffd7f7c..46216205f7 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -17,8 +17,10 @@ ArrayBytesCodec, BytesBytesCodec, Codec, + CodecJSON_V2, CodecPipeline, ) + from zarr.abc.numcodec import Numcodec from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON @@ -280,3 +282,31 @@ def get_ndbuffer_class(reload_config: bool = False) -> type[NDBuffer]: _collect_entrypoints() + + +def get_numcodec(data: CodecJSON_V2[str]) -> Numcodec: + """ + Resolve a numcodec codec from the numcodecs registry. + + This requires the Numcodecs package to be installed. + + Parameters + ---------- + data : CodecJSON_V2 + The JSON metadata for the codec. 
+ + Returns + ------- + codec : Numcodec + + Examples + -------- + + >>> codec = get_codec({'id': 'zlib', 'level': 1}) + >>> codec + Zlib(level=1) + """ + + from numcodecs.registry import get_codec + + return get_codec(data) # type: ignore[no-any-return] diff --git a/tests/test_array.py b/tests/test_array.py index 46b78de7bf..6a11682f98 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -73,6 +73,7 @@ from .test_dtype.conftest import zdtype_examples if TYPE_CHECKING: + from zarr.abc.codec import CodecJSON_V3 from zarr.core.metadata.v3 import ArrayV3Metadata @@ -1322,11 +1323,11 @@ async def test_v2_chunk_encoding( assert arr.metadata.filters == filters_expected # Normalize for property getters - compressor_expected = () if compressor_expected is None else (compressor_expected,) - filters_expected = () if filters_expected is None else filters_expected + arr_compressors_expected = () if compressor_expected is None else (compressor_expected,) + arr_filters_expected = () if filters_expected is None else filters_expected - assert arr.compressors == compressor_expected - assert arr.filters == filters_expected + assert arr.compressors == arr_compressors_expected + assert arr.filters == arr_filters_expected @staticmethod @pytest.mark.parametrize("dtype", [UInt8(), Float32(), VariableLengthUTF8()]) @@ -1364,11 +1365,12 @@ async def test_default_filters_compressors( if default_filters is None: expected_filters = () else: - expected_filters = default_filters + expected_filters = default_filters # type: ignore[assignment] + if default_compressors is None: expected_compressors = () else: - expected_compressors = (default_compressors,) + expected_compressors = (default_compressors,) # type: ignore[assignment] expected_serializer = None else: raise ValueError(f"Invalid zarr_format: {zarr_format}") @@ -1672,7 +1674,7 @@ def test_roundtrip_numcodecs() -> None: {"name": "numcodecs.shuffle", "configuration": {"elementsize": 2}}, {"name": "numcodecs.zlib", "configuration": 
{"level": 4}}, ] - filters = [ + filters: list[CodecJSON_V3] = [ { "name": "numcodecs.fixedscaleoffset", "configuration": { diff --git a/tests/test_codecs/test_numcodecs.py b/tests/test_codecs/test_numcodecs.py index bb381c615a..1c4d550587 100644 --- a/tests/test_codecs/test_numcodecs.py +++ b/tests/test_codecs/test_numcodecs.py @@ -2,8 +2,8 @@ from numcodecs import GZip -from zarr.codecs._numcodecs import get_numcodec -from zarr.codecs._v2 import _is_numcodec, _is_numcodec_cls +from zarr.abc.numcodec import _is_numcodec, _is_numcodec_cls +from zarr.registry import get_numcodec def test_get_numcodec() -> None: diff --git a/tests/test_regression/test_v2_dtype_regression.py b/tests/test_regression/test_v2_dtype_regression.py index 9702ca7d23..ffe273490d 100644 --- a/tests/test_regression/test_v2_dtype_regression.py +++ b/tests/test_regression/test_v2_dtype_regression.py @@ -4,7 +4,6 @@ from pathlib import Path from typing import TYPE_CHECKING, Literal -import numcodecs import numpy as np import pytest from numcodecs import LZ4, LZMA, Blosc, GZip, VLenBytes, VLenUTF8, Zstd @@ -13,6 +12,7 @@ import zarr.abc import zarr.abc.codec import zarr.codecs as zarrcodecs +from zarr.abc.numcodec import Numcodec from zarr.core.array import Array from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding from zarr.core.dtype.npy.bytes import VariableLengthBytes @@ -40,12 +40,12 @@ def runner_installed() -> bool: class ArrayParams: values: np.ndarray[tuple[int], np.dtype[np.generic]] fill_value: np.generic | str | int | bytes - filters: tuple[numcodecs.abc.Codec, ...] = () + filters: tuple[Numcodec, ...] = () serializer: str | None = None - compressor: numcodecs.abc.Codec + compressor: Numcodec -basic_codecs = GZip(), Blosc(), LZ4(), LZMA(), Zstd() +basic_codecs: tuple[Numcodec, ...] 
= GZip(), Blosc(), LZ4(), LZMA(), Zstd() basic_dtypes = "|b", ">i2", ">i4", ">f4", ">f8", "c8", "c16", "M8[10us]", "m8[4ps]" string_dtypes = "U4" From f057525adf2bbcad643d6fe9bfc6d8328f299130 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Aug 2025 16:32:56 +0200 Subject: [PATCH 18/29] add tests for codecjson typeguard --- tests/test_abc/__init__.py | 0 tests/test_abc/test_codec.py | 12 ++++++++++++ 2 files changed, 12 insertions(+) create mode 100644 tests/test_abc/__init__.py create mode 100644 tests/test_abc/test_codec.py diff --git a/tests/test_abc/__init__.py b/tests/test_abc/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_abc/test_codec.py b/tests/test_abc/test_codec.py new file mode 100644 index 0000000000..e0f9ddb7bb --- /dev/null +++ b/tests/test_abc/test_codec.py @@ -0,0 +1,12 @@ +from __future__ import annotations + +from zarr.abc.codec import _check_codecjson_v2 + + +def test_check_codecjson_v2_valid() -> None: + """ + Test that the _check_codecjson_v2 function works + """ + assert _check_codecjson_v2({"id": "gzip"}) + assert not _check_codecjson_v2({"id": 10}) + assert not _check_codecjson_v2([10, 11]) From 190e1b2744e6305db6fe0c54001d48d76d57b18f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Aug 2025 17:01:52 +0200 Subject: [PATCH 19/29] avoid using zarr's buffer / ndbuffer for numcodec encode / decode --- src/zarr/abc/numcodec.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/zarr/abc/numcodec.py b/src/zarr/abc/numcodec.py index d2c9380146..db6ff4655a 100644 --- a/src/zarr/abc/numcodec.py +++ b/src/zarr/abc/numcodec.py @@ -1,9 +1,8 @@ from typing import Self, TypeGuard -from typing_extensions import Protocol +from typing_extensions import Buffer, Protocol from zarr.abc.codec import CodecJSON_V2 -from zarr.core.buffer import Buffer, NDBuffer class Numcodec(Protocol): @@ -13,11 +12,9 @@ class Numcodec(Protocol): codec_id: str - def encode(self, buf: 
Buffer | NDBuffer) -> Buffer | NDBuffer: ... + def encode(self, buf: Buffer) -> Buffer: ... - def decode( - self, buf: Buffer | NDBuffer, out: Buffer | NDBuffer | None = None - ) -> Buffer | NDBuffer: ... + def decode(self, buf: Buffer, out: Buffer | None = None) -> Buffer: ... def get_config(self) -> CodecJSON_V2[str]: ... From 82992c5710e9a83a266dffdce1a20d3f5a18606c Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Aug 2025 19:12:35 +0200 Subject: [PATCH 20/29] use Any to model input / output types of numcodec protocol --- src/zarr/abc/numcodec.py | 14 ++++++-------- src/zarr/codecs/_v2.py | 14 ++++++-------- src/zarr/core/array.py | 6 +++--- src/zarr/core/metadata/v2.py | 6 +++--- 4 files changed, 18 insertions(+), 22 deletions(-) diff --git a/src/zarr/abc/numcodec.py b/src/zarr/abc/numcodec.py index db6ff4655a..c671428388 100644 --- a/src/zarr/abc/numcodec.py +++ b/src/zarr/abc/numcodec.py @@ -1,8 +1,6 @@ -from typing import Self, TypeGuard +from typing import Any, Self, TypeGuard -from typing_extensions import Buffer, Protocol - -from zarr.abc.codec import CodecJSON_V2 +from typing_extensions import Protocol class Numcodec(Protocol): @@ -12,14 +10,14 @@ class Numcodec(Protocol): codec_id: str - def encode(self, buf: Buffer) -> Buffer: ... + def encode(self, buf: Any) -> Any: ... - def decode(self, buf: Buffer, out: Buffer | None = None) -> Buffer: ... + def decode(self, buf: Any, out: Any | None = None) -> Any: ... - def get_config(self) -> CodecJSON_V2[str]: ... + def get_config(self) -> Any: ... @classmethod - def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... + def from_config(cls, config: Any) -> Self: ... 
def _is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 92eda38226..3c6c99c21c 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -31,9 +31,9 @@ async def _decode_single( cdata = chunk_bytes.as_array_like() # decompress if self.compressor: - chunk = await asyncio.to_thread(self.compressor.decode, cdata) # type: ignore[arg-type] + chunk = await asyncio.to_thread(self.compressor.decode, cdata) else: - chunk = cdata # type: ignore[assignment] + chunk = cdata # apply filters if self.filters: @@ -54,7 +54,7 @@ async def _decode_single( # is an object array. In this case, we need to convert the object # array to the correct dtype. - chunk = np.array(chunk).astype(chunk_spec.dtype.to_native_dtype()) # type: ignore[assignment] + chunk = np.array(chunk).astype(chunk_spec.dtype.to_native_dtype()) elif chunk.dtype != object: # If we end up here, someone must have hacked around with the filters. @@ -83,18 +83,16 @@ async def _encode_single( # apply filters if self.filters: for f in self.filters: - chunk = await asyncio.to_thread(f.encode, chunk) # type: ignore[arg-type] - + chunk = await asyncio.to_thread(f.encode, chunk) # check object encoding if ensure_ndarray_like(chunk).dtype == object: raise RuntimeError("cannot write object array without object codec") # compress if self.compressor: - cdata = await asyncio.to_thread(self.compressor.encode, chunk) # type: ignore[arg-type] + cdata = await asyncio.to_thread(self.compressor.encode, chunk) else: - cdata = chunk # type: ignore[assignment] - + cdata = chunk cdata = ensure_bytes(cdata) return chunk_spec.prototype.buffer.from_bytes(cdata) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 71de2f58f5..9e84e68d46 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4849,12 +4849,12 @@ def _parse_chunk_encoding_v2( if _compressor is None: object_codec_id = None else: - object_codec_id = 
get_object_codec_id((_compressor.get_config(),)) # type: ignore[arg-type] + object_codec_id = get_object_codec_id((_compressor.get_config(),)) else: object_codec_id = get_object_codec_id( ( - *[f.get_config() for f in _filters], # type: ignore[arg-type] - _compressor.get_config() if _compressor is not None else None, # type: ignore[arg-type] + *[f.get_config() for f in _filters], + _compressor.get_config() if _compressor is not None else None, ) ) if object_codec_id is None: diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 934befec91..efc6bd7949 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -201,8 +201,8 @@ def to_dict(self) -> dict[str, JSON]: codec_config = zarray_dict["compressor"].get_config() # Hotfix for https://github.com/zarr-developers/zarr-python/issues/2647 if codec_config["id"] == "zstd" and not codec_config.get("checksum", False): - codec_config.pop("checksum") # type: ignore[typeddict-item] - zarray_dict["compressor"] = codec_config # type: ignore[assignment] + codec_config.pop("checksum") + zarray_dict["compressor"] = codec_config if zarray_dict["filters"] is not None: raw_filters = zarray_dict["filters"] @@ -216,7 +216,7 @@ def to_dict(self) -> dict[str, JSON]: new_filters.append(f.get_config()) else: new_filters.append(f) - zarray_dict["filters"] = new_filters # type: ignore[assignment] + zarray_dict["filters"] = new_filters # serialize the fill value after dtype-specific JSON encoding if self.fill_value is not None: From c86be013663f69413bfea48615b2232aedd0c37e Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Sun, 10 Aug 2025 19:49:02 +0200 Subject: [PATCH 21/29] Update src/zarr/abc/numcodec.py Co-authored-by: Max Jones <14077947+maxrjones@users.noreply.github.com> --- src/zarr/abc/numcodec.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/zarr/abc/numcodec.py b/src/zarr/abc/numcodec.py index c671428388..5a0638204d 100644 --- a/src/zarr/abc/numcodec.py +++ 
b/src/zarr/abc/numcodec.py @@ -18,6 +18,9 @@ def get_config(self) -> Any: ... @classmethod def from_config(cls, config: Any) -> Self: ... + """ + Instantiate codec from a configuration object. + """ def _is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: From dba39f551d8e9ce5cdf08af81c9560972b44af7d Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Sun, 10 Aug 2025 19:49:13 +0200 Subject: [PATCH 22/29] Update src/zarr/abc/numcodec.py Co-authored-by: Max Jones <14077947+maxrjones@users.noreply.github.com> --- src/zarr/abc/numcodec.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/zarr/abc/numcodec.py b/src/zarr/abc/numcodec.py index 5a0638204d..6d40803e65 100644 --- a/src/zarr/abc/numcodec.py +++ b/src/zarr/abc/numcodec.py @@ -6,6 +6,8 @@ class Numcodec(Protocol): """ A protocol that models the ``numcodecs.abc.Codec`` interface. + + This protocol should be considered experimental; expect the typing for `buf`, and `out` to become stricter. """ codec_id: str From a857fc2ebf975a336660f766481b4e2c19b8e3dc Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Sun, 10 Aug 2025 19:49:26 +0200 Subject: [PATCH 23/29] Update src/zarr/abc/numcodec.py Co-authored-by: Max Jones <14077947+maxrjones@users.noreply.github.com> --- src/zarr/abc/numcodec.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/zarr/abc/numcodec.py b/src/zarr/abc/numcodec.py index 6d40803e65..91de8fed99 100644 --- a/src/zarr/abc/numcodec.py +++ b/src/zarr/abc/numcodec.py @@ -15,6 +15,24 @@ class Numcodec(Protocol): def encode(self, buf: Any) -> Any: ... def decode(self, buf: Any, out: Any | None = None) -> Any: ... + """ + Decode data in `buf`. + + Parameters + ---------- + buf : Any + Encoded data. May be any object supporting the new-style buffer + protocol. + out : Any + Writeable buffer to store decoded data. N.B. if provided, this buffer must + be exactly the right size to store the decoded data. + + Returns + ------- + dec : Any + Decoded data. 
May be any object supporting the new-style + buffer protocol. + """ def get_config(self) -> Any: ... From a082222de788dd0e9744b4edf550275ae0d767bc Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Sun, 10 Aug 2025 19:49:36 +0200 Subject: [PATCH 24/29] Update src/zarr/abc/numcodec.py Co-authored-by: Max Jones <14077947+maxrjones@users.noreply.github.com> --- src/zarr/abc/numcodec.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/zarr/abc/numcodec.py b/src/zarr/abc/numcodec.py index 91de8fed99..c12662eb7d 100644 --- a/src/zarr/abc/numcodec.py +++ b/src/zarr/abc/numcodec.py @@ -13,6 +13,20 @@ class Numcodec(Protocol): codec_id: str def encode(self, buf: Any) -> Any: ... + """Encode data in `buf`. + + Parameters + ---------- + buf + Data to be encoded. May be any object supporting the new-style + buffer protocol. + + Returns + ------- + enc + Encoded data. May be any object supporting the new-style buffer + protocol. + """ def decode(self, buf: Any, out: Any | None = None) -> Any: ... """ From ccaaa656286364fd9d02ebe006ee1ed16eac48ad Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Sun, 10 Aug 2025 19:49:50 +0200 Subject: [PATCH 25/29] Update src/zarr/abc/numcodec.py Co-authored-by: Max Jones <14077947+maxrjones@users.noreply.github.com> --- src/zarr/abc/numcodec.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/zarr/abc/numcodec.py b/src/zarr/abc/numcodec.py index c12662eb7d..b369f03cdd 100644 --- a/src/zarr/abc/numcodec.py +++ b/src/zarr/abc/numcodec.py @@ -49,6 +49,11 @@ def decode(self, buf: Any, out: Any | None = None) -> Any: ... """ def get_config(self) -> Any: ... + """ + Return a dictionary holding configuration parameters for this + codec. Must include an 'id' field with the codec identifier. All + values must be compatible with JSON encoding. + """ @classmethod def from_config(cls, config: Any) -> Self: ... 
From bb28d1d4830971e1b870a883dc21e9eec33dcb79 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 13 Aug 2025 10:48:30 +0200 Subject: [PATCH 26/29] fix docstrings --- src/zarr/abc/numcodec.py | 49 ++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/src/zarr/abc/numcodec.py b/src/zarr/abc/numcodec.py index b369f03cdd..c5cb50a868 100644 --- a/src/zarr/abc/numcodec.py +++ b/src/zarr/abc/numcodec.py @@ -7,59 +7,64 @@ class Numcodec(Protocol): """ A protocol that models the ``numcodecs.abc.Codec`` interface. - This protocol should be considered experimental; expect the typing for `buf`, and `out` to become stricter. + This protocol should be considered experimental. Expect the type annotations for ``buf`` and + ``out`` to narrow in the future. """ codec_id: str - def encode(self, buf: Any) -> Any: ... - """Encode data in `buf`. + def encode(self, buf: Any) -> Any: + """Encode data from ``buf``. Parameters ---------- - buf - Data to be encoded. May be any object supporting the new-style - buffer protocol. + buf : Any + Data to be encoded. Returns ------- - enc - Encoded data. May be any object supporting the new-style buffer - protocol. + enc: Any + Encoded data. """ + ... - def decode(self, buf: Any, out: Any | None = None) -> Any: ... + def decode(self, buf: Any, out: Any | None = None) -> Any: """ - Decode data in `buf`. + Decode data in ``buf``. Parameters ---------- buf : Any - Encoded data. May be any object supporting the new-style buffer - protocol. + Encoded data. out : Any - Writeable buffer to store decoded data. N.B. if provided, this buffer must + Writeable buffer to store decoded data. If provided, this buffer must be exactly the right size to store the decoded data. Returns ------- dec : Any - Decoded data. May be any object supporting the new-style - buffer protocol. + Decoded data. """ + ... - def get_config(self) -> Any: ... 
+ def get_config(self) -> Any: """ - Return a dictionary holding configuration parameters for this - codec. Must include an 'id' field with the codec identifier. All - values must be compatible with JSON encoding. + Return a JSON-serializable configuration dictionary for this + codec. Must include an ``'id'`` field with the codec identifier. """ + ... @classmethod - def from_config(cls, config: Any) -> Self: ... + def from_config(cls, config: Any) -> Self: """ - Instantiate codec from a configuration object. + Instantiate a codec from a configuration dictionary. + + Parameters + ---------- + config : Any + A configuration dictionary for this codec. """ + ... def _is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: From eedea844cf5e044f82d8a20fd6870982a311c6d9 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 13 Aug 2025 10:56:17 +0200 Subject: [PATCH 27/29] revert changes to store imports --- src/zarr/abc/store.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index 31e9728f8a..6c6d9538ab 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -6,6 +6,10 @@ from itertools import starmap from typing import TYPE_CHECKING, Protocol, runtime_checkable +from zarr.core.buffer.core import default_buffer_prototype +from zarr.core.common import concurrent_map +from zarr.core.config import config + if TYPE_CHECKING: from collections.abc import AsyncGenerator, AsyncIterator, Iterable from types import TracebackType @@ -434,7 +438,6 @@ async def getsize(self, key: str) -> int: # Note to implementers: this default implementation is very inefficient since # it requires reading the entire object. Many systems will have ways to get the # size of an object without reading it. 
- from zarr.core.buffer.core import default_buffer_prototype value = await self.get(key, prototype=default_buffer_prototype()) if value is None: @@ -474,8 +477,6 @@ async def getsize_prefix(self, prefix: str) -> int: # on to getting sizes. Ideally we would overlap those two, which should # improve tail latency and might reduce memory pressure (since not all keys # would be in memory at once). - from zarr.core.common import concurrent_map - from zarr.core.config import config keys = [(x,) async for x in self.list_prefix(prefix)] limit = config.get("async.concurrency") From fcc010b45598fef494f3b4fa1f8e71b2b99a24a4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 13 Aug 2025 11:12:26 +0200 Subject: [PATCH 28/29] remove whitespace --- src/zarr/abc/store.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index 6c6d9538ab..1fbdb3146c 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -438,7 +438,6 @@ async def getsize(self, key: str) -> int: # Note to implementers: this default implementation is very inefficient since # it requires reading the entire object. Many systems will have ways to get the # size of an object without reading it. - value = await self.get(key, prototype=default_buffer_prototype()) if value is None: raise FileNotFoundError(key) @@ -477,7 +476,6 @@ async def getsize_prefix(self, prefix: str) -> int: # on to getting sizes. Ideally we would overlap those two, which should # improve tail latency and might reduce memory pressure (since not all keys # would be in memory at once). 
- keys = [(x,) async for x in self.list_prefix(prefix)] limit = config.get("async.concurrency") sizes = await concurrent_map(keys, self.getsize, limit=limit) From 0166d4489600fbe8892fb128c34570c070db1e35 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 13 Aug 2025 11:56:10 +0200 Subject: [PATCH 29/29] fix docstring --- src/zarr/abc/numcodec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/abc/numcodec.py b/src/zarr/abc/numcodec.py index c5cb50a868..76eac1d898 100644 --- a/src/zarr/abc/numcodec.py +++ b/src/zarr/abc/numcodec.py @@ -7,7 +7,7 @@ class Numcodec(Protocol): """ A protocol that models the ``numcodecs.abc.Codec`` interface. - This protocol should be considered experimental. Expect the type annotations for ``buf`` and + This protocol should be considered experimental. Expect the type annotations for ``buf`` and ``out`` to narrow in the future. """