From 0289779dc9636818ed3a6a7d4f7f30727725b702 Mon Sep 17 00:00:00 2001 From: Remco Leijenaar <55834815+RFLeijenaar@users.noreply.github.com> Date: Fri, 5 Sep 2025 12:25:07 +0200 Subject: [PATCH 01/11] Add registry for chunk key encodings. --- src/zarr/core/array.py | 11 +++-- src/zarr/core/chunk_key_encodings.py | 60 ++++++++++++++++------------ src/zarr/core/metadata/v3.py | 8 +++- src/zarr/registry.py | 30 +++++++++++++- 4 files changed, 74 insertions(+), 35 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index f31b0cc0a4..0b6fedeeff 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -47,6 +47,7 @@ ChunkKeyEncodingLike, DefaultChunkKeyEncoding, V2ChunkKeyEncoding, + parse_chunk_key_encoding, ) from zarr.core.common import ( JSON, @@ -4934,13 +4935,11 @@ def _parse_chunk_key_encoding( """ if data is None: if zarr_format == 2: - result = ChunkKeyEncoding.from_dict({"name": "v2", "separator": "."}) + data = {"name": "v2", "configuration": {"separator": "."}} else: - result = ChunkKeyEncoding.from_dict({"name": "default", "separator": "/"}) - elif isinstance(data, ChunkKeyEncoding): - result = data - else: - result = ChunkKeyEncoding.from_dict(data) + data = {"name": "default", "configuration": {"separator": "/"}} + result = parse_chunk_key_encoding(data) + if zarr_format == 2 and result.name != "v2": msg = ( "Invalid chunk key encoding. 
For Zarr format 2 arrays, the `name` field of the " diff --git a/src/zarr/core/chunk_key_encodings.py b/src/zarr/core/chunk_key_encodings.py index 89a34e6052..64442dad28 100644 --- a/src/zarr/core/chunk_key_encodings.py +++ b/src/zarr/core/chunk_key_encodings.py @@ -5,13 +5,14 @@ from typing import TYPE_CHECKING, Literal, TypeAlias, TypedDict, cast if TYPE_CHECKING: - from typing import NotRequired + from typing import NotRequired, Self from zarr.abc.metadata import Metadata from zarr.core.common import ( JSON, parse_named_configuration, ) +from zarr.registry import get_chunk_key_encoding_class, register_chunk_key_encoding SeparatorLiteral = Literal[".", "/"] @@ -38,31 +39,9 @@ def __init__(self, *, separator: SeparatorLiteral) -> None: object.__setattr__(self, "separator", separator_parsed) @classmethod - def from_dict(cls, data: dict[str, JSON] | ChunkKeyEncodingLike) -> ChunkKeyEncoding: - if isinstance(data, ChunkKeyEncoding): - return data - - # handle ChunkKeyEncodingParams - if "name" in data and "separator" in data: - data = {"name": data["name"], "configuration": {"separator": data["separator"]}} - - # TODO: remove this cast when we are statically typing the JSON metadata completely. - data = cast("dict[str, JSON]", data) - - # configuration is optional for chunk key encodings + def from_dict(cls, data: dict[str, JSON]) -> Self: name_parsed, config_parsed = parse_named_configuration(data, require_configuration=False) - if name_parsed == "default": - if config_parsed is None: - # for default, normalize missing configuration to use the "/" separator. - config_parsed = {"separator": "/"} - return DefaultChunkKeyEncoding(**config_parsed) # type: ignore[arg-type] - if name_parsed == "v2": - if config_parsed is None: - # for v2, normalize missing configuration to use the "." separator. - config_parsed = {"separator": "."} - return V2ChunkKeyEncoding(**config_parsed) # type: ignore[arg-type] - msg = f"Unknown chunk key encoding. 
Got {name_parsed}, expected one of ('v2', 'default')." - raise ValueError(msg) + return cls(**config_parsed if config_parsed else {}) # type: ignore[arg-type] def to_dict(self) -> dict[str, JSON]: return {"name": self.name, "configuration": {"separator": self.separator}} @@ -76,12 +55,13 @@ def encode_chunk_key(self, chunk_coords: tuple[int, ...]) -> str: pass -ChunkKeyEncodingLike: TypeAlias = ChunkKeyEncodingParams | ChunkKeyEncoding +ChunkKeyEncodingLike: TypeAlias = dict[str, JSON] | ChunkKeyEncodingParams | ChunkKeyEncoding @dataclass(frozen=True) class DefaultChunkKeyEncoding(ChunkKeyEncoding): name: Literal["default"] = "default" + separator: SeparatorLiteral = "/" # default def decode_chunk_key(self, chunk_key: str) -> tuple[int, ...]: if chunk_key == "c": @@ -95,6 +75,7 @@ def encode_chunk_key(self, chunk_coords: tuple[int, ...]) -> str: @dataclass(frozen=True) class V2ChunkKeyEncoding(ChunkKeyEncoding): name: Literal["v2"] = "v2" + separator: SeparatorLiteral = "." # default def decode_chunk_key(self, chunk_key: str) -> tuple[int, ...]: return tuple(map(int, chunk_key.split(self.separator))) @@ -102,3 +83,30 @@ def decode_chunk_key(self, chunk_key: str) -> tuple[int, ...]: def encode_chunk_key(self, chunk_coords: tuple[int, ...]) -> str: chunk_identifier = self.separator.join(map(str, chunk_coords)) return "0" if chunk_identifier == "" else chunk_identifier + + +def parse_chunk_key_encoding(data: ChunkKeyEncodingLike) -> ChunkKeyEncoding: + """ + Take an implicit specification of a chunk key encoding and parse it into a ChunkKeyEncoding object. 
+ """ + if isinstance(data, ChunkKeyEncoding): + return data + + # handle ChunkKeyEncodingParams + if "name" in data and "separator" in data: + data = {"name": data["name"], "configuration": {"separator": data["separator"]}} + + # Now must be a named config + data = cast("dict[str, JSON]", data) + + name_parsed, _ = parse_named_configuration(data, require_configuration=False) + try: + chunk_key_encoding = get_chunk_key_encoding_class(name_parsed).from_dict(data) + except KeyError as e: + raise ValueError(f"Unknown chunk key encoding: {e.args[0]!r}") from e + + return chunk_key_encoding + + +register_chunk_key_encoding(DefaultChunkKeyEncoding, qualname="default") +register_chunk_key_encoding(V2ChunkKeyEncoding, qualname="v2") diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 649a490409..cafcb99281 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -24,7 +24,11 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.core.array_spec import ArrayConfig, ArraySpec from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid -from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingLike +from zarr.core.chunk_key_encodings import ( + ChunkKeyEncoding, + ChunkKeyEncodingLike, + parse_chunk_key_encoding, +) from zarr.core.common import ( JSON, ZARR_JSON, @@ -174,7 +178,7 @@ def __init__( shape_parsed = parse_shapelike(shape) chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) - chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) + chunk_key_encoding_parsed = parse_chunk_key_encoding(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) # Note: relying on a type method is numpy-specific fill_value_parsed = data_type.cast_scalar(fill_value) diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 46216205f7..2ae4ff9dac 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -22,15 +22,19 @@ ) 
from zarr.abc.numcodec import Numcodec from zarr.core.buffer import Buffer, NDBuffer + from zarr.core.chunk_key_encodings import ChunkKeyEncoding from zarr.core.common import JSON +# CHANGE: Consider adding here __all__ = [ "Registry", "get_buffer_class", + "get_chunk_key_encoding_class", "get_codec_class", "get_ndbuffer_class", "get_pipeline_class", "register_buffer", + "register_chunk_key_encoding", "register_codec", "register_ndbuffer", "register_pipeline", @@ -60,10 +64,12 @@ def register(self, cls: type[T], qualname: str | None = None) -> None: __pipeline_registry: Registry[CodecPipeline] = Registry() __buffer_registry: Registry[Buffer] = Registry() __ndbuffer_registry: Registry[NDBuffer] = Registry() +__chunk_key_encoding_registry: Registry[ChunkKeyEncoding] = Registry() +# CHANGE: Consider updating docstring """ The registry module is responsible for managing implementations of codecs, -pipelines, buffers and ndbuffers and collecting them from entrypoints. +pipelines, buffers, ndbuffers, and chunk key encodings and collecting them from entrypoints. The implementation used is determined by the config. The registry module is also responsible for managing dtypes. 
@@ -99,6 +105,13 @@ def _collect_entrypoints() -> list[Registry[Any]]: data_type_registry._lazy_load_list.extend(entry_points.select(group="zarr.data_type")) data_type_registry._lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) + __chunk_key_encoding_registry.lazy_load_list.extend( + entry_points.select(group="zarr.chunk_key_encoding") + ) + __chunk_key_encoding_registry.lazy_load_list.extend( + entry_points.select(group="zarr", name="chunk_key_encoding") + ) + __pipeline_registry.lazy_load_list.extend(entry_points.select(group="zarr.codec_pipeline")) __pipeline_registry.lazy_load_list.extend( entry_points.select(group="zarr", name="codec_pipeline") @@ -114,6 +127,7 @@ def _collect_entrypoints() -> list[Registry[Any]]: __pipeline_registry, __buffer_registry, __ndbuffer_registry, + __chunk_key_encoding_registry, ] @@ -144,6 +158,10 @@ def register_buffer(cls: type[Buffer], qualname: str | None = None) -> None: __buffer_registry.register(cls, qualname) +def register_chunk_key_encoding(cls: type, qualname: str | None = None) -> None: + __chunk_key_encoding_registry.register(cls, qualname) + + def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: if reload_config: _reload_config() @@ -281,6 +299,16 @@ def get_ndbuffer_class(reload_config: bool = False) -> type[NDBuffer]: ) +def get_chunk_key_encoding_class(key: str) -> type[ChunkKeyEncoding]: + __chunk_key_encoding_registry.lazy_load() + if key not in __chunk_key_encoding_registry: + raise KeyError( + f"Chunk key encoding '{key}' not found in registered chunk key encodings: {list(__chunk_key_encoding_registry)}." 
+ ) + + return __chunk_key_encoding_registry[key] + + _collect_entrypoints() From 9d781661b277e6f2de4b34b3efaa403ab7ce9e0b Mon Sep 17 00:00:00 2001 From: Remco Leijenaar <55834815+RFLeijenaar@users.noreply.github.com> Date: Fri, 5 Sep 2025 12:30:32 +0200 Subject: [PATCH 02/11] Fix error message for unknown chunk key encoding in create_array test --- tests/test_array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_array.py b/tests/test_array.py index cf201ce0c7..957ca1b474 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1247,11 +1247,11 @@ async def test_chunk_key_encoding( chunk_key_encoding = ChunkKeyEncodingParams(name=name, separator=separator) # type: ignore[typeddict-item] error_msg = "" if name == "invalid": - error_msg = "Unknown chunk key encoding." + error_msg = r'Unknown chunk key encoding: "Chunk key encoding \'invalid\' not found in registered chunk key encodings: \[.*\]."' if zarr_format == 2 and name == "default": error_msg = "Invalid chunk key encoding. For Zarr format 2 arrays, the `name` field of the chunk key encoding must be 'v2'." 
+ if error_msg: - with pytest.raises(ValueError, match=re.escape(error_msg)): + with pytest.raises(ValueError, match=error_msg): arr = await create_array( store=store, dtype="uint8", From c6ee5c6bedc9299b353a863a037658c2a447fdd4 Mon Sep 17 00:00:00 2001 From: Remco Leijenaar <55834815+RFLeijenaar@users.noreply.github.com> Date: Fri, 5 Sep 2025 12:56:04 +0200 Subject: [PATCH 03/11] Removed unnecessary type ignore --- tests/test_codecs/test_codecs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_codecs/test_codecs.py b/tests/test_codecs/test_codecs.py index dfedbb83de..1884d501a5 100644 --- a/tests/test_codecs/test_codecs.py +++ b/tests/test_codecs/test_codecs.py @@ -308,7 +308,7 @@ def test_invalid_metadata(codecs: tuple[Codec, ...]) -> None: ArrayV3Metadata( shape=shape, chunk_grid={"name": "regular", "configuration": {"chunk_shape": chunks}}, - chunk_key_encoding={"name": "default", "configuration": {"separator": "/"}}, # type: ignore[arg-type] + chunk_key_encoding={"name": "default", "configuration": {"separator": "/"}}, fill_value=0, data_type=data_type, codecs=codecs, From ec607aeb0120f44f226ad20a4d70efd35b03fe3b Mon Sep 17 00:00:00 2001 From: Remco Leijenaar <55834815+RFLeijenaar@users.noreply.github.com> Date: Sat, 6 Sep 2025 14:01:37 +0200 Subject: [PATCH 04/11] Use entrypoint.name as the key for registering chunk key encodings. - Change register_chunk_key_encoding function to take key as first arg similar to codec. 
--- src/zarr/core/chunk_key_encodings.py | 4 ++-- src/zarr/registry.py | 12 +++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/zarr/core/chunk_key_encodings.py b/src/zarr/core/chunk_key_encodings.py index 64442dad28..ebefc5157c 100644 --- a/src/zarr/core/chunk_key_encodings.py +++ b/src/zarr/core/chunk_key_encodings.py @@ -108,5 +108,5 @@ def parse_chunk_key_encoding(data: ChunkKeyEncodingLike) -> ChunkKeyEncoding: return chunk_key_encoding -register_chunk_key_encoding(DefaultChunkKeyEncoding, qualname="default") -register_chunk_key_encoding(V2ChunkKeyEncoding, qualname="v2") +register_chunk_key_encoding("default", DefaultChunkKeyEncoding) +register_chunk_key_encoding("v2", V2ChunkKeyEncoding) diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 2ae4ff9dac..c1d60e561c 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -25,7 +25,6 @@ from zarr.core.chunk_key_encodings import ChunkKeyEncoding from zarr.core.common import JSON -# CHANGE: Consider adding here __all__ = [ "Registry", "get_buffer_class", @@ -48,9 +47,9 @@ def __init__(self) -> None: super().__init__() self.lazy_load_list: list[EntryPoint] = [] - def lazy_load(self) -> None: + def lazy_load(self, use_entrypoint_name: bool = False) -> None: for e in self.lazy_load_list: - self.register(e.load()) + self.register(e.load(), qualname=e.name if use_entrypoint_name else None) self.lazy_load_list.clear() @@ -158,8 +157,8 @@ def register_buffer(cls: type[Buffer], qualname: str | None = None) -> None: __buffer_registry.register(cls, qualname) -def register_chunk_key_encoding(cls: type, qualname: str | None = None) -> None: - __chunk_key_encoding_registry.register(cls, qualname) +def register_chunk_key_encoding(key: str, cls: type) -> None: + __chunk_key_encoding_registry.register(cls, key) def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: @@ -300,12 +299,11 @@ def get_ndbuffer_class(reload_config: bool = False) -> type[NDBuffer]: def 
get_chunk_key_encoding_class(key: str) -> type[ChunkKeyEncoding]: - __chunk_key_encoding_registry.lazy_load() + __chunk_key_encoding_registry.lazy_load(use_entrypoint_name=True) if key not in __chunk_key_encoding_registry: raise KeyError( f"Chunk key encoding '{key}' not found in registered chunk key encodings: {list(__chunk_key_encoding_registry)}." ) - return __chunk_key_encoding_registry[key] From c4ce5df2f60d3ecf2391555c5d58123477f0f8c7 Mon Sep 17 00:00:00 2001 From: Remco Leijenaar <55834815+RFLeijenaar@users.noreply.github.com> Date: Sat, 6 Sep 2025 14:25:25 +0200 Subject: [PATCH 05/11] Move parsing of init args in CKE to __post_init__. This enables users to add additional fields to a custom ChunkKeyEncoding without having to override __init__ and take care of the immutability of the attrs. --- src/zarr/core/chunk_key_encodings.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/chunk_key_encodings.py b/src/zarr/core/chunk_key_encodings.py index ebefc5157c..78995e9c03 100644 --- a/src/zarr/core/chunk_key_encodings.py +++ b/src/zarr/core/chunk_key_encodings.py @@ -33,9 +33,8 @@ class ChunkKeyEncoding(Metadata): name: str separator: SeparatorLiteral = "." - def __init__(self, *, separator: SeparatorLiteral) -> None: - separator_parsed = parse_separator(separator) - + def __post_init__(self) -> None: + separator_parsed = parse_separator(self.separator) object.__setattr__(self, "separator", separator_parsed) @classmethod From 056613157bd4985a1095746b55a6d2ebfdf54746 Mon Sep 17 00:00:00 2001 From: Remco Leijenaar <55834815+RFLeijenaar@users.noreply.github.com> Date: Sat, 13 Sep 2025 14:33:24 +0200 Subject: [PATCH 06/11] Clarify ChunkKeyEncoding base class - Enforce encode_chunk_key to be implemented (abstractmethod in ABC) - Make decode_chunk_key optional (raise NotImplementedError by default) Note, the latter is never raised by the current zarr implementation. 
--- src/zarr/core/chunk_key_encodings.py | 16 +++++++++++----- src/zarr/registry.py | 1 - 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/zarr/core/chunk_key_encodings.py b/src/zarr/core/chunk_key_encodings.py index 78995e9c03..3a25052d06 100644 --- a/src/zarr/core/chunk_key_encodings.py +++ b/src/zarr/core/chunk_key_encodings.py @@ -1,6 +1,6 @@ from __future__ import annotations -from abc import abstractmethod +from abc import ABC, abstractmethod from dataclasses import dataclass from typing import TYPE_CHECKING, Literal, TypeAlias, TypedDict, cast @@ -29,7 +29,7 @@ class ChunkKeyEncodingParams(TypedDict): @dataclass(frozen=True) -class ChunkKeyEncoding(Metadata): +class ChunkKeyEncoding(ABC, Metadata): name: str separator: SeparatorLiteral = "." @@ -45,13 +45,19 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: def to_dict(self) -> dict[str, JSON]: return {"name": self.name, "configuration": {"separator": self.separator}} - @abstractmethod def decode_chunk_key(self, chunk_key: str) -> tuple[int, ...]: - pass + """ + Optional: decode a chunk key string into chunk coordinates. + Not required for normal operation; override if needed for testing or debugging. + """ + raise NotImplementedError(f"{self.__class__.__name__} does not implement decode_chunk_key.") @abstractmethod def encode_chunk_key(self, chunk_coords: tuple[int, ...]) -> str: - pass + """ + Encode chunk coordinates into a chunk key string. + Must be implemented by subclasses. 
+ """ ChunkKeyEncodingLike: TypeAlias = dict[str, JSON] | ChunkKeyEncodingParams | ChunkKeyEncoding diff --git a/src/zarr/registry.py b/src/zarr/registry.py index c1d60e561c..417b2202a7 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -65,7 +65,6 @@ def register(self, cls: type[T], qualname: str | None = None) -> None: __ndbuffer_registry: Registry[NDBuffer] = Registry() __chunk_key_encoding_registry: Registry[ChunkKeyEncoding] = Registry() -# CHANGE: Consider updating docstring """ The registry module is responsible for managing implementations of codecs, pipelines, buffers, ndbuffers, and chunk key encodings and collecting them from entrypoints. From 1f09f76409cd40c02ae11448d195ab06df72e6cb Mon Sep 17 00:00:00 2001 From: Remco Leijenaar <55834815+RFLeijenaar@users.noreply.github.com> Date: Sun, 14 Sep 2025 11:58:40 +0200 Subject: [PATCH 07/11] Make `name` a ClassVar in ChunkKeyEncoding. This automatically removes it as an init argument. --- src/zarr/core/chunk_key_encodings.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/zarr/core/chunk_key_encodings.py b/src/zarr/core/chunk_key_encodings.py index 3a25052d06..974c4baa5d 100644 --- a/src/zarr/core/chunk_key_encodings.py +++ b/src/zarr/core/chunk_key_encodings.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import TYPE_CHECKING, Literal, TypeAlias, TypedDict, cast +from typing import TYPE_CHECKING, ClassVar, Literal, TypeAlias, TypedDict, cast if TYPE_CHECKING: from typing import NotRequired, Self @@ -30,7 +30,7 @@ class ChunkKeyEncodingParams(TypedDict): @dataclass(frozen=True) class ChunkKeyEncoding(ABC, Metadata): - name: str + name: ClassVar[str] separator: SeparatorLiteral = "." 
def __post_init__(self) -> None: @@ -65,7 +65,7 @@ def encode_chunk_key(self, chunk_coords: tuple[int, ...]) -> str: @dataclass(frozen=True) class DefaultChunkKeyEncoding(ChunkKeyEncoding): - name: Literal["default"] = "default" + name: ClassVar[Literal["default"]] = "default" separator: SeparatorLiteral = "/" # default def decode_chunk_key(self, chunk_key: str) -> tuple[int, ...]: @@ -79,7 +79,7 @@ def encode_chunk_key(self, chunk_coords: tuple[int, ...]) -> str: @dataclass(frozen=True) class V2ChunkKeyEncoding(ChunkKeyEncoding): - name: Literal["v2"] = "v2" + name: ClassVar[Literal["v2"]] = "v2" separator: SeparatorLiteral = "." # default def decode_chunk_key(self, chunk_key: str) -> tuple[int, ...]: From e89af81e05808fec43497f48b30375ca1efd81ad Mon Sep 17 00:00:00 2001 From: Remco Leijenaar <55834815+RFLeijenaar@users.noreply.github.com> Date: Sun, 14 Sep 2025 12:14:46 +0200 Subject: [PATCH 08/11] Remove `separator` from ChunkKeyEncoding base. --- src/zarr/core/chunk_key_encodings.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/zarr/core/chunk_key_encodings.py b/src/zarr/core/chunk_key_encodings.py index 974c4baa5d..75e8ccc9e3 100644 --- a/src/zarr/core/chunk_key_encodings.py +++ b/src/zarr/core/chunk_key_encodings.py @@ -30,20 +30,21 @@ class ChunkKeyEncodingParams(TypedDict): @dataclass(frozen=True) class ChunkKeyEncoding(ABC, Metadata): - name: ClassVar[str] - separator: SeparatorLiteral = "." + """ + Defines how chunk coordinates are mapped to store keys. - def __post_init__(self) -> None: - separator_parsed = parse_separator(self.separator) - object.__setattr__(self, "separator", separator_parsed) + Subclasses must define a class variable `name` and implement `encode_chunk_key`. 
+ """ + + name: ClassVar[str] @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - name_parsed, config_parsed = parse_named_configuration(data, require_configuration=False) + _, config_parsed = parse_named_configuration(data, require_configuration=False) return cls(**config_parsed if config_parsed else {}) # type: ignore[arg-type] def to_dict(self) -> dict[str, JSON]: - return {"name": self.name, "configuration": {"separator": self.separator}} + return {"name": self.name, "configuration": super().to_dict()} def decode_chunk_key(self, chunk_key: str) -> tuple[int, ...]: """ @@ -66,7 +67,11 @@ def encode_chunk_key(self, chunk_coords: tuple[int, ...]) -> str: @dataclass(frozen=True) class DefaultChunkKeyEncoding(ChunkKeyEncoding): name: ClassVar[Literal["default"]] = "default" - separator: SeparatorLiteral = "/" # default + separator: SeparatorLiteral = "/" + + def __post_init__(self) -> None: + separator_parsed = parse_separator(self.separator) + object.__setattr__(self, "separator", separator_parsed) def decode_chunk_key(self, chunk_key: str) -> tuple[int, ...]: if chunk_key == "c": @@ -80,7 +85,11 @@ def encode_chunk_key(self, chunk_coords: tuple[int, ...]) -> str: @dataclass(frozen=True) class V2ChunkKeyEncoding(ChunkKeyEncoding): name: ClassVar[Literal["v2"]] = "v2" - separator: SeparatorLiteral = "." # default + separator: SeparatorLiteral = "." + + def __post_init__(self) -> None: + separator_parsed = parse_separator(self.separator) + object.__setattr__(self, "separator", separator_parsed) def decode_chunk_key(self, chunk_key: str) -> tuple[int, ...]: return tuple(map(int, chunk_key.split(self.separator))) From 9a1de9516a69093015d7ab8e92646acd67f5aa6e Mon Sep 17 00:00:00 2001 From: Remco Leijenaar <55834815+RFLeijenaar@users.noreply.github.com> Date: Sun, 14 Sep 2025 13:27:59 +0200 Subject: [PATCH 09/11] Fix typing errors. 
--- src/zarr/core/array.py | 1 + src/zarr/core/chunk_key_encodings.py | 2 +- tests/conftest.py | 4 +++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 0b6fedeeff..cc41f236fa 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4588,6 +4588,7 @@ async def init_array( order_parsed = zarr_config.get("array.order") else: order_parsed = order + chunk_key_encoding_parsed = cast("V2ChunkKeyEncoding", chunk_key_encoding_parsed) meta = AsyncArray._create_metadata_v2( shape=shape_parsed, diff --git a/src/zarr/core/chunk_key_encodings.py b/src/zarr/core/chunk_key_encodings.py index 75e8ccc9e3..42d7615c61 100644 --- a/src/zarr/core/chunk_key_encodings.py +++ b/src/zarr/core/chunk_key_encodings.py @@ -41,7 +41,7 @@ class ChunkKeyEncoding(ABC, Metadata): @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: _, config_parsed = parse_named_configuration(data, require_configuration=False) - return cls(**config_parsed if config_parsed else {}) # type: ignore[arg-type] + return cls(**config_parsed if config_parsed else {}) def to_dict(self) -> dict[str, JSON]: return {"name": self.name, "configuration": super().to_dict()} diff --git a/tests/conftest.py b/tests/conftest.py index 839be34e01..c18a221f46 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,7 +5,7 @@ import pathlib from collections.abc import Mapping, Sequence from dataclasses import dataclass, field -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import numpy as np import numpy.typing as npt @@ -50,6 +50,7 @@ from zarr.core.chunk_key_encodings import ( ChunkKeyEncoding, ChunkKeyEncodingLike, + V2ChunkKeyEncoding, ) from zarr.core.dtype.wrapper import ZDType @@ -316,6 +317,7 @@ def create_array_metadata( filters_parsed, compressor_parsed = _parse_chunk_encoding_v2( compressor=compressors, filters=filters, dtype=dtype_parsed ) + chunk_key_encoding_parsed = cast("V2ChunkKeyEncoding", 
chunk_key_encoding_parsed) return ArrayV2Metadata( shape=shape_parsed, dtype=dtype_parsed, From 472899f2306b72b267fbad4d707471c876295736 Mon Sep 17 00:00:00 2001 From: Remco Leijenaar <55834815+RFLeijenaar@users.noreply.github.com> Date: Sun, 14 Sep 2025 13:52:12 +0200 Subject: [PATCH 10/11] Update docs output to match code changes. --- docs/user-guide/consolidated_metadata.rst | 73 +++++++++++------------ 1 file changed, 35 insertions(+), 38 deletions(-) diff --git a/docs/user-guide/consolidated_metadata.rst b/docs/user-guide/consolidated_metadata.rst index 9d05231f4a..ae50c602ca 100644 --- a/docs/user-guide/consolidated_metadata.rst +++ b/docs/user-guide/consolidated_metadata.rst @@ -49,44 +49,41 @@ that can be used.: >>> from pprint import pprint >>> pprint(dict(consolidated_metadata.items())) {'a': ArrayV3Metadata(shape=(1,), - data_type=Float64(endianness='little'), - chunk_grid=RegularChunkGrid(chunk_shape=(1,)), - chunk_key_encoding=DefaultChunkKeyEncoding(name='default', - separator='/'), - fill_value=np.float64(0.0), - codecs=(BytesCodec(endian=), - ZstdCodec(level=0, checksum=False)), - attributes={}, - dimension_names=None, - zarr_format=3, - node_type='array', - storage_transformers=()), - 'b': ArrayV3Metadata(shape=(2, 2), - data_type=Float64(endianness='little'), - chunk_grid=RegularChunkGrid(chunk_shape=(2, 2)), - chunk_key_encoding=DefaultChunkKeyEncoding(name='default', - separator='/'), - fill_value=np.float64(0.0), - codecs=(BytesCodec(endian=), - ZstdCodec(level=0, checksum=False)), - attributes={}, - dimension_names=None, - zarr_format=3, - node_type='array', - storage_transformers=()), - 'c': ArrayV3Metadata(shape=(3, 3, 3), - data_type=Float64(endianness='little'), - chunk_grid=RegularChunkGrid(chunk_shape=(3, 3, 3)), - chunk_key_encoding=DefaultChunkKeyEncoding(name='default', - separator='/'), - fill_value=np.float64(0.0), - codecs=(BytesCodec(endian=), - ZstdCodec(level=0, checksum=False)), - attributes={}, - dimension_names=None, - 
zarr_format=3, - node_type='array', - storage_transformers=())} + data_type=Float64(endianness='little'), + chunk_grid=RegularChunkGrid(chunk_shape=(1,)), + chunk_key_encoding=DefaultChunkKeyEncoding(separator='/'), + fill_value=np.float64(0.0), + codecs=(BytesCodec(endian=), + ZstdCodec(level=0, checksum=False)), + attributes={}, + dimension_names=None, + zarr_format=3, + node_type='array', + storage_transformers=()), + 'b': ArrayV3Metadata(shape=(2, 2), + data_type=Float64(endianness='little'), + chunk_grid=RegularChunkGrid(chunk_shape=(2, 2)), + chunk_key_encoding=DefaultChunkKeyEncoding(separator='/'), + fill_value=np.float64(0.0), + codecs=(BytesCodec(endian=), + ZstdCodec(level=0, checksum=False)), + attributes={}, + dimension_names=None, + zarr_format=3, + node_type='array', + storage_transformers=()), + 'c': ArrayV3Metadata(shape=(3, 3, 3), + data_type=Float64(endianness='little'), + chunk_grid=RegularChunkGrid(chunk_shape=(3, 3, 3)), + chunk_key_encoding=DefaultChunkKeyEncoding(separator='/'), + fill_value=np.float64(0.0), + codecs=(BytesCodec(endian=), + ZstdCodec(level=0, checksum=False)), + attributes={}, + dimension_names=None, + zarr_format=3, + node_type='array', + storage_transformers=())} Operations on the group to get children automatically use the consolidated metadata.: From 46d17804ce007e385b42cfe8d34db8db2f1cc085 Mon Sep 17 00:00:00 2001 From: Remco Leijenaar <55834815+RFLeijenaar@users.noreply.github.com> Date: Wed, 17 Sep 2025 20:02:13 +0200 Subject: [PATCH 11/11] Add release notes --- changes/3436.feature.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 changes/3436.feature.rst diff --git a/changes/3436.feature.rst b/changes/3436.feature.rst new file mode 100644 index 0000000000..85e28bb8b1 --- /dev/null +++ b/changes/3436.feature.rst @@ -0,0 +1,2 @@ +Adds a registry for chunk key encodings for extensibility. 
+This allows users to implement a custom `ChunkKeyEncoding`, which can be registered via `register_chunk_key_encoding` or as an entry point under `zarr.chunk_key_encoding`.