Skip to content

Commit 90fb2bd

Browse files
d-v-b and maxrjones authored
add numcodec protocol (#3318)
* add numcodec protocol * add tests for numcodecs compatibility * changelog * ignore unknown key * remove re-implementation of get_codec * avoid circular imports by importing lower-level routines exactly where needed * push numcodec protocol into abcs; remove all numcodecs.abc.Codec type annotations * add tests for codecjson typeguard * avoid using zarr's buffer / ndbuffer for numcodec encode / decode * use Any to model input / output types of numcodec protocol * add numcodec protocol * add tests for numcodecs compatibility * changelog * ignore unknown key * remove re-implementation of get_codec * avoid circular imports by importing lower-level routines exactly where needed * push numcodec protocol into abcs; remove all numcodecs.abc.Codec type annotations * add tests for codecjson typeguard * avoid using zarr's buffer / ndbuffer for numcodec encode / decode * use Any to model input / output types of numcodec protocol * Update src/zarr/abc/numcodec.py Co-authored-by: Max Jones <[email protected]> * Update src/zarr/abc/numcodec.py Co-authored-by: Max Jones <[email protected]> * Update src/zarr/abc/numcodec.py Co-authored-by: Max Jones <[email protected]> * Update src/zarr/abc/numcodec.py Co-authored-by: Max Jones <[email protected]> * Update src/zarr/abc/numcodec.py Co-authored-by: Max Jones <[email protected]> * fix docstrings * revert changes to store imports * remove whitespace * fix docstring --------- Co-authored-by: Max Jones <[email protected]>
1 parent 18419f0 commit 90fb2bd

File tree

17 files changed

+261
-76
lines changed

17 files changed

+261
-76
lines changed

changes/3318.misc.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Define a ``Protocol`` to model the ``numcodecs.abc.Codec`` interface. This is groundwork toward
2+
making ``numcodecs`` an optional dependency for ``zarr-python``.

src/zarr/abc/codec.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
from __future__ import annotations
22

33
from abc import abstractmethod
4-
from typing import TYPE_CHECKING, Generic, TypeVar
4+
from collections.abc import Mapping
5+
from typing import TYPE_CHECKING, Generic, TypeGuard, TypeVar
6+
7+
from typing_extensions import ReadOnly, TypedDict
58

69
from zarr.abc.metadata import Metadata
710
from zarr.core.buffer import Buffer, NDBuffer
8-
from zarr.core.common import ChunkCoords, concurrent_map
11+
from zarr.core.common import ChunkCoords, NamedConfig, concurrent_map
912
from zarr.core.config import config
1013

1114
if TYPE_CHECKING:
@@ -34,6 +37,27 @@
3437
CodecInput = TypeVar("CodecInput", bound=NDBuffer | Buffer)
3538
CodecOutput = TypeVar("CodecOutput", bound=NDBuffer | Buffer)
3639

40+
TName = TypeVar("TName", bound=str, covariant=True)
41+
42+
43+
class CodecJSON_V2(TypedDict, Generic[TName]):
44+
"""The JSON representation of a codec for Zarr V2"""
45+
46+
id: ReadOnly[TName]
47+
48+
49+
def _check_codecjson_v2(data: object) -> TypeGuard[CodecJSON_V2[str]]:
50+
return isinstance(data, Mapping) and "id" in data and isinstance(data["id"], str)
51+
52+
53+
CodecJSON_V3 = str | NamedConfig[str, Mapping[str, object]]
"""The JSON representation of a codec for Zarr V3."""

# The widest type we will *accept* for a codec JSON.
# This covers both v2 and v3: a plain string, or any string-keyed mapping.
CodecJSON = str | Mapping[str, object]
"""The widest type of JSON-like input that could specify a codec."""
60+
3761

3862
class BaseCodec(Metadata, Generic[CodecInput, CodecOutput]):
3963
"""Generic base class for codecs.

src/zarr/abc/numcodec.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
from typing import Any, Self, TypeGuard
2+
3+
from typing_extensions import Protocol
4+
5+
6+
class Numcodec(Protocol):
7+
"""
8+
A protocol that models the ``numcodecs.abc.Codec`` interface.
9+
10+
This protocol should be considered experimental. Expect the type annotations for ``buf`` and
11+
``out`` to narrow in the future.
12+
"""
13+
14+
codec_id: str
15+
16+
def encode(self, buf: Any) -> Any:
17+
"""Encode data from ``buf``.
18+
19+
Parameters
20+
----------
21+
buf : Any
22+
Data to be encoded.
23+
24+
Returns
25+
-------
26+
enc: Any
27+
Encoded data.
28+
"""
29+
...
30+
31+
def decode(self, buf: Any, out: Any | None = None) -> Any:
32+
"""
33+
Decode data in ``buf``.
34+
35+
Parameters
36+
----------
37+
buf : Any
38+
Encoded data.
39+
out : Any
40+
Writeable buffer to store decoded data. If provided, this buffer must
41+
be exactly the right size to store the decoded data.
42+
43+
Returns
44+
-------
45+
dec : Any
46+
Decoded data.
47+
"""
48+
...
49+
50+
def get_config(self) -> Any:
51+
"""
52+
Return a JSON-serializable configuration dictionary for this
53+
codec. Must include an ``'id'`` field with the codec identifier.
54+
"""
55+
...
56+
57+
@classmethod
58+
def from_config(cls, config: Any) -> Self:
59+
"""
60+
Instantiate a codec from a configuration dictionary.
61+
62+
Parameters
63+
----------
64+
config : Any
65+
A configuration dictionary for this codec.
66+
"""
67+
...
68+
69+
70+
def _is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]:
71+
"""
72+
Check if the given object is a class implements the Numcodec protocol.
73+
74+
The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method
75+
members (i.e., attributes), so we use this function to manually check for the presence of the
76+
required attributes and methods on a given object.
77+
"""
78+
return (
79+
isinstance(obj, type)
80+
and hasattr(obj, "codec_id")
81+
and isinstance(obj.codec_id, str)
82+
and hasattr(obj, "encode")
83+
and callable(obj.encode)
84+
and hasattr(obj, "decode")
85+
and callable(obj.decode)
86+
and hasattr(obj, "get_config")
87+
and callable(obj.get_config)
88+
and hasattr(obj, "from_config")
89+
and callable(obj.from_config)
90+
)
91+
92+
93+
def _is_numcodec(obj: object) -> TypeGuard[Numcodec]:
    """
    Check if the given object implements the Numcodec protocol.

    The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method
    members (i.e., attributes), so we use this function to manually check for the presence of the
    required attributes and methods on a given object.

    The check is performed on ``type(obj)``, not on ``obj`` itself, so only
    attributes visible on the class are considered.

    Parameters
    ----------
    obj : object
        The object to check.

    Returns
    -------
    bool
        ``True`` if the class of ``obj`` passes ``_is_numcodec_cls``;
        ``False`` otherwise.
    """
    return _is_numcodec_cls(type(obj))

src/zarr/api/asynchronous.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,8 @@
5252
if TYPE_CHECKING:
5353
from collections.abc import Iterable
5454

55-
import numcodecs.abc
56-
5755
from zarr.abc.codec import Codec
56+
from zarr.abc.numcodec import Numcodec
5857
from zarr.core.buffer import NDArrayLikeOrScalar
5958
from zarr.core.chunk_key_encodings import ChunkKeyEncoding
6059
from zarr.storage import StoreLike
@@ -877,7 +876,7 @@ async def create(
877876
overwrite: bool = False,
878877
path: PathLike | None = None,
879878
chunk_store: StoreLike | None = None,
880-
filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None,
879+
filters: Iterable[dict[str, JSON] | Numcodec] | None = None,
881880
cache_metadata: bool | None = None,
882881
cache_attrs: bool | None = None,
883882
read_only: bool | None = None,

src/zarr/api/synchronous.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,11 @@
1515
if TYPE_CHECKING:
1616
from collections.abc import Iterable
1717

18-
import numcodecs.abc
1918
import numpy as np
2019
import numpy.typing as npt
2120

2221
from zarr.abc.codec import Codec
22+
from zarr.abc.numcodec import Numcodec
2323
from zarr.api.asynchronous import ArrayLike, PathLike
2424
from zarr.core.array import (
2525
CompressorsLike,
@@ -610,7 +610,7 @@ def create(
610610
overwrite: bool = False,
611611
path: PathLike | None = None,
612612
chunk_store: StoreLike | None = None,
613-
filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None,
613+
filters: Iterable[dict[str, JSON] | Numcodec] | None = None,
614614
cache_metadata: bool | None = None,
615615
cache_attrs: bool | None = None,
616616
read_only: bool | None = None,

src/zarr/codecs/_v2.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,24 +4,22 @@
44
from dataclasses import dataclass
55
from typing import TYPE_CHECKING
66

7-
import numcodecs
87
import numpy as np
98
from numcodecs.compat import ensure_bytes, ensure_ndarray_like
109

1110
from zarr.abc.codec import ArrayBytesCodec
1211
from zarr.registry import get_ndbuffer_class
1312

1413
if TYPE_CHECKING:
15-
import numcodecs.abc
16-
14+
from zarr.abc.numcodec import Numcodec
1715
from zarr.core.array_spec import ArraySpec
1816
from zarr.core.buffer import Buffer, NDBuffer
1917

2018

2119
@dataclass(frozen=True)
2220
class V2Codec(ArrayBytesCodec):
23-
filters: tuple[numcodecs.abc.Codec, ...] | None
24-
compressor: numcodecs.abc.Codec | None
21+
filters: tuple[Numcodec, ...] | None
22+
compressor: Numcodec | None
2523

2624
is_fixed_size = False
2725

@@ -86,7 +84,6 @@ async def _encode_single(
8684
if self.filters:
8785
for f in self.filters:
8886
chunk = await asyncio.to_thread(f.encode, chunk)
89-
9087
# check object encoding
9188
if ensure_ndarray_like(chunk).dtype == object:
9289
raise RuntimeError("cannot write object array without object codec")
@@ -96,7 +93,6 @@ async def _encode_single(
9693
cdata = await asyncio.to_thread(self.compressor.encode, chunk)
9794
else:
9895
cdata = chunk
99-
10096
cdata = ensure_bytes(cdata)
10197
return chunk_spec.prototype.buffer.from_bytes(cdata)
10298

src/zarr/core/_info.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,8 @@
55
from typing import TYPE_CHECKING, Literal
66

77
if TYPE_CHECKING:
8-
import numcodecs.abc
9-
108
from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec
9+
from zarr.abc.numcodec import Numcodec
1110
from zarr.core.common import ZarrFormat
1211
from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType
1312

@@ -88,9 +87,9 @@ class ArrayInfo:
8887
_order: Literal["C", "F"]
8988
_read_only: bool
9089
_store_type: str
91-
_filters: tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...] = ()
90+
_filters: tuple[Numcodec, ...] | tuple[ArrayArrayCodec, ...] = ()
9291
_serializer: ArrayBytesCodec | None = None
93-
_compressors: tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...] = ()
92+
_compressors: tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...] = ()
9493
_count_bytes: int | None = None
9594
_count_bytes_stored: int | None = None
9695
_count_chunks_initialized: int | None = None

0 commit comments

Comments
 (0)