Skip to content

Commit f1ca290

Browse files
committed
Wip implementation of v2 / v3 codec behavior
1 parent 94ba77a commit f1ca290

File tree

11 files changed

+494
-231
lines changed

11 files changed

+494
-231
lines changed

src/zarr/abc/codec.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from abc import abstractmethod
44
from collections.abc import Mapping
5+
from dataclasses import dataclass
56
from typing import (
67
TYPE_CHECKING,
78
ClassVar,
@@ -525,3 +526,5 @@ def get_config(self) -> CodecJSON_V2[str]: ...
525526

526527
@classmethod
527528
def from_config(cls, config: CodecJSON_V2[str]) -> Self: ...
529+
530+

src/zarr/abc/store.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,6 @@
66
from itertools import starmap
77
from typing import TYPE_CHECKING, Protocol, runtime_checkable
88

9-
from zarr.core.buffer.core import default_buffer_prototype
10-
from zarr.core.common import concurrent_map
11-
from zarr.core.config import config
12-
139
if TYPE_CHECKING:
1410
from collections.abc import AsyncGenerator, AsyncIterator, Iterable
1511
from types import TracebackType
@@ -435,6 +431,7 @@ async def getsize(self, key: str) -> int:
435431
FileNotFoundError
436432
When the given key does not exist in the store.
437433
"""
434+
from zarr.core.buffer.core import default_buffer_prototype
438435
# Note to implementers: this default implementation is very inefficient since
439436
# it requires reading the entire object. Many systems will have ways to get the
440437
# size of an object without reading it.
@@ -476,6 +473,8 @@ async def getsize_prefix(self, prefix: str) -> int:
476473
# on to getting sizes. Ideally we would overlap those two, which should
477474
# improve tail latency and might reduce memory pressure (since not all keys
478475
# would be in memory at once).
476+
from zarr.core.common import concurrent_map
477+
from zarr.core.config import config
479478
keys = [(x,) async for x in self.list_prefix(prefix)]
480479
limit = config.get("async.concurrency")
481480
sizes = await concurrent_map(keys, self.getsize, limit=limit)

src/zarr/codecs/_v2.py

Lines changed: 102 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,15 @@
22

33
import asyncio
44
from dataclasses import dataclass
5-
from typing import TYPE_CHECKING, TypeGuard
5+
from typing import TYPE_CHECKING, Self, TypeGuard, overload
6+
from typing_extensions import Literal
67

78
import numpy as np
89
from numcodecs.compat import ensure_bytes, ensure_ndarray_like
910

10-
from zarr.abc.codec import ArrayBytesCodec, Numcodec
11+
from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BaseCodec, BytesBytesCodec, CodecJSON, CodecJSON_V2, Numcodec
12+
from zarr.core.buffer.core import BufferPrototype
13+
from zarr.core.common import BaseConfig, NamedConfig, ZarrFormat
1114
from zarr.registry import get_ndbuffer_class
1215

1316
if TYPE_CHECKING:
@@ -133,3 +136,100 @@ async def _encode_single(
133136

134137
def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int:
135138
raise NotImplementedError
139+
140+
141+
@dataclass(frozen=True, kw_only=True)
142+
class NumcodecsWrapper:
143+
codec: Numcodec
144+
145+
@overload
146+
def to_json(self, zarr_format: Literal[2]) -> CodecJSON_V2[str]: ...
147+
@overload
148+
def to_json(self, zarr_format: Literal[3]) -> NamedConfig[str, BaseConfig]: ...
149+
150+
def to_json(self, zarr_format: ZarrFormat) -> CodecJSON_V2[str] | NamedConfig[str, BaseConfig]:
151+
if zarr_format == 2:
152+
return self.codec.get_config()
153+
elif zarr_format == 3:
154+
config = self.codec.get_config()
155+
config_no_id = {k: v for k, v in config.items() if k != "id"}
156+
return {"name": config["id"], "configuration": config_no_id}
157+
raise ValueError(f"Unsupported zarr format: {zarr_format}") # pragma: no cover
158+
159+
@classmethod
160+
def _from_json_v2(cls, data: CodecJSON) -> Self:
161+
raise NotADirectoryError(
162+
"This class does not support creating instances from JSON data for Zarr format 2."
163+
)
164+
165+
@classmethod
166+
def _from_json_v3(cls, data: CodecJSON) -> Self:
167+
raise NotImplementedError(
168+
"This class does not support creating instances from JSON data for Zarr format 3."
169+
)
170+
171+
def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int:
172+
raise NotImplementedError
173+
174+
def to_array_array(self) -> NumcodecsArrayArrayCodec:
175+
"""
176+
Use the ``_codec`` attribute to create a NumcodecsArrayArrayCodec.
177+
"""
178+
return NumcodecsArrayArrayCodec(codec=self.codec)
179+
180+
def to_bytes_bytes(self) -> NumcodecsBytesBytesCodec:
181+
"""
182+
Use the ``_codec`` attribute to create a NumcodecsBytesBytesCodec.
183+
"""
184+
return NumcodecsBytesBytesCodec(codec=self.codec)
185+
186+
def to_array_bytes(self) -> NumcodecsArrayBytesCodec:
187+
"""
188+
Use the ``_codec`` attribute to create a NumcodecsArrayBytesCodec.
189+
"""
190+
return NumcodecsArrayBytesCodec(codec=self.codec)
191+
192+
193+
class NumcodecsBytesBytesCodec(NumcodecsWrapper, BytesBytesCodec):
194+
async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer:
195+
return await asyncio.to_thread(
196+
as_numpy_array_wrapper,
197+
self.codec.decode,
198+
chunk_data,
199+
chunk_spec.prototype,
200+
)
201+
202+
def _encode(self, chunk_bytes: Buffer, prototype: BufferPrototype) -> Buffer:
203+
encoded = self.codec.encode(chunk_bytes.as_array_like())
204+
if isinstance(encoded, np.ndarray): # Required for checksum codecs
205+
return prototype.buffer.from_bytes(encoded.tobytes())
206+
return prototype.buffer.from_bytes(encoded)
207+
208+
async def _encode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer:
209+
return await asyncio.to_thread(self._encode, chunk_data, chunk_spec.prototype)
210+
211+
212+
@dataclass(kw_only=True, frozen=True)
213+
class NumcodecsArrayArrayCodec(NumcodecsWrapper, ArrayArrayCodec):
214+
async def _decode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer:
215+
chunk_ndarray = chunk_data.as_ndarray_like()
216+
out = await asyncio.to_thread(self.codec.decode, chunk_ndarray)
217+
return chunk_spec.prototype.nd_buffer.from_ndarray_like(out.reshape(chunk_spec.shape)) # type: ignore[union-attr]
218+
219+
async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer:
220+
chunk_ndarray = chunk_data.as_ndarray_like()
221+
out = await asyncio.to_thread(self.codec.encode, chunk_ndarray)
222+
return chunk_spec.prototype.nd_buffer.from_ndarray_like(out) # type: ignore[arg-type]
223+
224+
225+
@dataclass(kw_only=True, frozen=True)
226+
class NumcodecsArrayBytesCodec(NumcodecsWrapper, ArrayBytesCodec):
227+
async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> NDBuffer:
228+
chunk_bytes = chunk_data.to_bytes()
229+
out = await asyncio.to_thread(self.codec.decode, chunk_bytes)
230+
return chunk_spec.prototype.nd_buffer.from_ndarray_like(out.reshape(chunk_spec.shape))
231+
232+
async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> Buffer:
233+
chunk_ndarray = chunk_data.as_ndarray_like()
234+
out = await asyncio.to_thread(self.codec.encode, chunk_ndarray)
235+
return chunk_spec.prototype.buffer.from_bytes(out)

src/zarr/codecs/blosc.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
from typing_extensions import ReadOnly
2121

2222
from zarr.abc.codec import BytesBytesCodec, CodecJSON
23-
from zarr.core.buffer.cpu import as_numpy_array_wrapper
2423
from zarr.core.common import (
2524
JSON,
2625
NamedRequiredConfig,
@@ -266,6 +265,7 @@ async def _decode_single(
266265
chunk_bytes: Buffer,
267266
chunk_spec: ArraySpec,
268267
) -> Buffer:
268+
from zarr.core.buffer.cpu import as_numpy_array_wrapper
269269
return await asyncio.to_thread(
270270
as_numpy_array_wrapper, self._blosc_codec.decode, chunk_bytes, chunk_spec.prototype
271271
)

src/zarr/core/_info.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,8 @@
44
import textwrap
55
from typing import TYPE_CHECKING, Literal
66

7-
from zarr.abc.codec import Numcodec
8-
97
if TYPE_CHECKING:
10-
from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec
8+
from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Numcodec
119
from zarr.core.common import ZarrFormat
1210
from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType
1311

0 commit comments

Comments
 (0)