Skip to content

Add v2 and v3 metadata support to codecs #3332

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 46 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
a367268
add numcodec protocol
d-v-b Jul 31, 2025
1d424c0
add tests for numcodecs compatibility
d-v-b Jul 31, 2025
41dd6ff
changelog
d-v-b Jul 31, 2025
c435a59
ignore unknown key
d-v-b Jul 31, 2025
8e50ef8
remove re-implementation of get_codec
d-v-b Aug 1, 2025
ef31c5b
Merge branch 'main' into feat/numcodecs-protocol
d-v-b Aug 1, 2025
4ba7914
Merge branch 'main' into feat/numcodecs-protocol
d-v-b Aug 4, 2025
ab52539
Merge branch 'main' into feat/numcodecs-protocol
d-v-b Aug 4, 2025
95c9c8b
Merge branch 'main' into feat/numcodecs-protocol
d-v-b Aug 4, 2025
156134f
add to_json methods to codecs
d-v-b Aug 4, 2025
486f837
add codecvalidationerror
d-v-b Aug 4, 2025
dd53981
fix v2 codec json models to avoid inheritance
d-v-b Aug 4, 2025
678889a
add blosc json test
d-v-b Aug 4, 2025
dfca3ec
distinguish namedconfig from namedrequiredconfig
d-v-b Aug 4, 2025
262e369
lint
d-v-b Aug 4, 2025
4c7fe8a
make codecvalidationerror effectively single-argument
d-v-b Aug 4, 2025
1e23a91
rename test_endian to test_bytes
d-v-b Aug 4, 2025
d7d4e02
bring in update codec abc
d-v-b Aug 4, 2025
9980823
add to_json_tests
d-v-b Aug 4, 2025
fcf84b3
Merge branch 'main' into feat/numcodecs-protocol
d-v-b Aug 4, 2025
cbb32d7
lint
d-v-b Aug 4, 2025
e2d4df8
fix broken tests that used invalid codec JSON
d-v-b Aug 4, 2025
d91b0e9
update test_info
d-v-b Aug 4, 2025
1eb5b3c
avoid circular imports by moving numcodec protocol to codec abc
d-v-b Aug 4, 2025
94ba77a
use Numcodec instead of numcodecs.abc.Codec
d-v-b Aug 4, 2025
f1ca290
Wip implementation of v2 / v3 codec behavior
d-v-b Aug 4, 2025
5b0c3ac
Merge branch 'main' into feat/numcodecs-protocol
d-v-b Aug 5, 2025
84c9780
avoid circular imports by importing lower-level routines exactly wher…
d-v-b Aug 5, 2025
9a2f35b
push numcodec prototol into abcs; remove all numcodecs.abc.Codec type…
d-v-b Aug 5, 2025
0d0712f
add tests for codecjson typeguard
d-v-b Aug 5, 2025
931bf2f
avoid using zarr's buffer / ndbuffer for numcodec encode / decode
d-v-b Aug 5, 2025
01bd4b7
use Any to model input / output types of numcodec protocol
d-v-b Aug 5, 2025
f06c6aa
add numcodec protocol
d-v-b Jul 31, 2025
b71e8ac
add tests for numcodecs compatibility
d-v-b Jul 31, 2025
bcaa9ee
changelog
d-v-b Jul 31, 2025
7e49f39
ignore unknown key
d-v-b Jul 31, 2025
4b53f5d
remove re-implementation of get_codec
d-v-b Aug 1, 2025
b35e6c9
avoid circular imports by importing lower-level routines exactly wher…
d-v-b Aug 5, 2025
deef94a
push numcodec prototol into abcs; remove all numcodecs.abc.Codec type…
d-v-b Aug 5, 2025
f057525
add tests for codecjson typeguard
d-v-b Aug 5, 2025
190e1b2
avoid using zarr's buffer / ndbuffer for numcodec encode / decode
d-v-b Aug 5, 2025
82992c5
use Any to model input / output types of numcodec protocol
d-v-b Aug 5, 2025
7ea7e91
Merge branch 'feat/numcodecs-protocol' of github.com:d-v-b/zarr-pytho…
d-v-b Aug 5, 2025
cee4389
Merge branch 'main' into feat/numcodecs-protocol
d-v-b Aug 6, 2025
51b9431
Merge branch 'feat/numcodecs-protocol' of github.com:d-v-b/zarr-pytho…
d-v-b Aug 6, 2025
a971e9c
Merge branch 'main' of https://github.com/zarr-developers/zarr-python…
d-v-b Aug 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 61 additions & 4 deletions src/zarr/abc/codec.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,23 @@

from abc import abstractmethod
from collections.abc import Mapping
from typing import TYPE_CHECKING, Generic, TypeGuard, TypeVar

from typing_extensions import ReadOnly, TypedDict
from typing import (
TYPE_CHECKING,
ClassVar,
Generic,
Literal,
Self,
TypedDict,
TypeGuard,
TypeVar,
overload,
)

from typing_extensions import Protocol, ReadOnly

from zarr.abc.metadata import Metadata
from zarr.core.buffer import Buffer, NDBuffer
from zarr.core.common import ChunkCoords, NamedConfig, concurrent_map
from zarr.core.common import ChunkCoords, NamedConfig, ZarrFormat, concurrent_map
from zarr.core.config import config

if TYPE_CHECKING:
Expand Down Expand Up @@ -181,6 +191,34 @@ async def encode(
"""
return await _batching_helper(self._encode_single, chunks_and_specs)

@overload
def to_json(self, zarr_format: Literal[2]) -> CodecJSON_V2[str]: ...
@overload
def to_json(self, zarr_format: Literal[3]) -> NamedConfig[str, Mapping[str, object]]: ...

def to_json(
    self, zarr_format: ZarrFormat
) -> CodecJSON_V2[str] | NamedConfig[str, Mapping[str, object]]:
    """
    Serialize this codec to JSON for the requested Zarr format.

    The overloads declare a ``CodecJSON_V2`` return for ``zarr_format=2`` and a
    ``NamedConfig`` return for ``zarr_format=3``. This base implementation provides
    no serialization of its own.

    Parameters
    ----------
    zarr_format : ZarrFormat
        The Zarr format version to serialize for.

    Raises
    ------
    NotImplementedError
        Always, in this base implementation.
    """
    raise NotImplementedError

@classmethod
def _from_json_v2(cls, data: CodecJSON) -> Self:
    """
    Build an instance from codec JSON; ``from_json`` calls this for ``zarr_format == 2``.

    Raises
    ------
    NotImplementedError
        Always, in this base implementation.
    """
    raise NotImplementedError

@classmethod
def _from_json_v3(cls, data: CodecJSON) -> Self:
    """
    Build an instance from codec JSON; ``from_json`` calls this for ``zarr_format == 3``.

    Raises
    ------
    NotImplementedError
        Always, in this base implementation.
    """
    raise NotImplementedError

@classmethod
def from_json(cls, data: CodecJSON, zarr_format: ZarrFormat) -> Self:
if zarr_format == 2:
return cls._from_json_v2(data)
elif zarr_format == 3:
return cls._from_json_v3(data)
raise ValueError(
f"Unsupported Zarr format {zarr_format}. Expected 2 or 3."
) # pragma: no cover


class ArrayArrayCodec(BaseCodec[NDBuffer, NDBuffer]):
"""Base class for array-to-array codecs."""
Expand Down Expand Up @@ -471,3 +509,22 @@ async def wrap(chunk: CodecInput | None, chunk_spec: ArraySpec) -> CodecOutput |
return await func(chunk, chunk_spec)

return wrap


class Numcodec(Protocol):
"""
A protocol that models the ``numcodecs.abc.Codec`` interface.
"""

codec_id: ClassVar[str]

def encode(self, buf: Buffer | NDBuffer) -> Buffer | NDBuffer: ...

def decode(
self, buf: Buffer | NDBuffer, out: Buffer | NDBuffer | None = None
) -> Buffer | NDBuffer: ...

def get_config(self) -> CodecJSON_V2[str]: ...

@classmethod
def from_config(cls, config: CodecJSON_V2[str]) -> Self: ...
1 change: 1 addition & 0 deletions src/zarr/abc/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,7 @@ async def getsize(self, key: str) -> int:
FileNotFoundError
When the given key does not exist in the store.
"""

# Note to implementers: this default implementation is very inefficient since
# it requires reading the entire object. Many systems will have ways to get the
# size of an object without reading it.
Expand Down
111 changes: 109 additions & 2 deletions src/zarr/codecs/_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,26 @@

import asyncio
from dataclasses import dataclass
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Literal, Self, overload

import numpy as np
from numcodecs.compat import ensure_bytes, ensure_ndarray_like

from zarr.abc.codec import ArrayBytesCodec
from zarr.abc.codec import (
ArrayArrayCodec,
ArrayBytesCodec,
BytesBytesCodec,
CodecJSON,
CodecJSON_V2,
)
from zarr.registry import get_ndbuffer_class

if TYPE_CHECKING:
from zarr.abc.numcodec import Numcodec
from zarr.core.array_spec import ArraySpec
from zarr.core.buffer import Buffer, NDBuffer
from zarr.core.buffer.core import BufferPrototype
from zarr.core.common import BaseConfig, NamedConfig, ZarrFormat


@dataclass(frozen=True)
Expand Down Expand Up @@ -98,3 +106,102 @@ async def _encode_single(

def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int:
    """
    Compute the encoded size for an input of the given byte length.

    Both arguments are unused: this implementation does not compute a size.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError


@dataclass(frozen=True, kw_only=True)
class NumcodecsWrapper:
codec: Numcodec

@overload
def to_json(self, zarr_format: Literal[2]) -> CodecJSON_V2[str]: ...
@overload
def to_json(self, zarr_format: Literal[3]) -> NamedConfig[str, BaseConfig]: ...

def to_json(self, zarr_format: ZarrFormat) -> CodecJSON_V2[str] | NamedConfig[str, BaseConfig]:
if zarr_format == 2:
return self.codec.get_config()
elif zarr_format == 3:
config = self.codec.get_config()
config_no_id = {k: v for k, v in config.items() if k != "id"}
return {"name": config["id"], "configuration": config_no_id}
raise ValueError(f"Unsupported zarr format: {zarr_format}") # pragma: no cover

@classmethod
def _from_json_v2(cls, data: CodecJSON) -> Self:
raise NotADirectoryError(
"This class does not support creating instances from JSON data for Zarr format 2."
)

@classmethod
def _from_json_v3(cls, data: CodecJSON) -> Self:
raise NotImplementedError(
"This class does not support creating instances from JSON data for Zarr format 3."
)

def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int:
raise NotImplementedError

def to_array_array(self) -> NumcodecsArrayArrayCodec:
"""
Use the ``_codec`` attribute to create a NumcodecsArrayArrayCodec.
"""
return NumcodecsArrayArrayCodec(codec=self.codec)

def to_bytes_bytes(self) -> NumcodecsBytesBytesCodec:
"""
Use the ``_codec`` attribute to create a NumcodecsBytesBytesCodec.
"""
return NumcodecsBytesBytesCodec(codec=self.codec)

def to_array_bytes(self) -> NumcodecsArrayBytesCodec:
"""
Use the ``_codec`` attribute to create a NumcodecsArrayBytesCodec.
"""
return NumcodecsArrayBytesCodec(codec=self.codec)


@dataclass(kw_only=True, frozen=True)
class NumcodecsBytesBytesCodec(NumcodecsWrapper, BytesBytesCodec):
    """
    Bytes-to-bytes codec that delegates encoding and decoding to the wrapped ``codec``.

    Declared as a frozen, keyword-only dataclass like the array-array and
    array-bytes wrappers in this module.
    """

    async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer:
        """
        Decode one chunk by running ``codec.decode`` in a worker thread.

        The call goes through ``as_numpy_array_wrapper``, which receives the decode
        function, the chunk and the buffer prototype from ``chunk_spec``.
        """
        # Imported at call time rather than at module level.
        # NOTE(review): presumably to avoid a circular import; confirm.
        from zarr.core.buffer.cpu import as_numpy_array_wrapper

        return await asyncio.to_thread(
            as_numpy_array_wrapper,
            self.codec.decode,
            chunk_data,
            chunk_spec.prototype,
        )

    def _encode(self, chunk_bytes: Buffer, prototype: BufferPrototype) -> Buffer:
        """
        Synchronously encode ``chunk_bytes`` and wrap the result in a ``prototype`` buffer.
        """
        encoded = self.codec.encode(chunk_bytes.as_array_like())
        if isinstance(encoded, np.ndarray):  # Required for checksum codecs
            return prototype.buffer.from_bytes(encoded.tobytes())
        return prototype.buffer.from_bytes(encoded)

    async def _encode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer:
        """
        Encode one chunk by running ``_encode`` in a worker thread.
        """
        return await asyncio.to_thread(self._encode, chunk_data, chunk_spec.prototype)


@dataclass(kw_only=True, frozen=True)
class NumcodecsArrayArrayCodec(NumcodecsWrapper, ArrayArrayCodec):
    """
    Array-to-array codec that delegates encoding and decoding to the wrapped ``codec``.
    """

    async def _decode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer:
        """
        Decode one chunk in a worker thread and reshape the result to ``chunk_spec.shape``.
        """
        source = chunk_data.as_ndarray_like()
        decoded = await asyncio.to_thread(self.codec.decode, source)
        nd_buffer_cls = chunk_spec.prototype.nd_buffer
        return nd_buffer_cls.from_ndarray_like(decoded.reshape(chunk_spec.shape))  # type: ignore[union-attr]

    async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer:
        """
        Encode one chunk in a worker thread and wrap the result in an NDBuffer.
        """
        source = chunk_data.as_ndarray_like()
        encoded = await asyncio.to_thread(self.codec.encode, source)
        nd_buffer_cls = chunk_spec.prototype.nd_buffer
        return nd_buffer_cls.from_ndarray_like(encoded)  # type: ignore[arg-type]


@dataclass(kw_only=True, frozen=True)
class NumcodecsArrayBytesCodec(NumcodecsWrapper, ArrayBytesCodec):
    """
    Array-to-bytes codec that delegates encoding and decoding to the wrapped ``codec``.
    """

    async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> NDBuffer:
        """
        Decode the chunk's bytes in a worker thread and reshape to ``chunk_spec.shape``.
        """
        raw = chunk_data.to_bytes()
        decoded = await asyncio.to_thread(self.codec.decode, raw)
        nd_buffer_cls = chunk_spec.prototype.nd_buffer
        return nd_buffer_cls.from_ndarray_like(decoded.reshape(chunk_spec.shape))

    async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> Buffer:
        """
        Encode the chunk's array in a worker thread and wrap the result in a Buffer.
        """
        source = chunk_data.as_ndarray_like()
        encoded = await asyncio.to_thread(self.codec.encode, source)
        buffer_cls = chunk_spec.prototype.buffer
        return buffer_cls.from_bytes(encoded)
Loading
Loading