-
-
Notifications
You must be signed in to change notification settings - Fork 364
Add registry for chunk key encodings for extensibility #3436
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
0289779
9d78166
c6ee5c6
ec607ae
c4ce5df
0566131
70d99e4
1f09f76
e89af81
9a1de95
472899f
0c67e37
7b31ed4
46d1780
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,17 +1,18 @@ | ||
from __future__ import annotations | ||
|
||
from abc import abstractmethod | ||
from abc import ABC, abstractmethod | ||
from dataclasses import dataclass | ||
from typing import TYPE_CHECKING, Literal, TypeAlias, TypedDict, cast | ||
|
||
if TYPE_CHECKING: | ||
from typing import NotRequired | ||
from typing import NotRequired, Self | ||
|
||
from zarr.abc.metadata import Metadata | ||
from zarr.core.common import ( | ||
JSON, | ||
parse_named_configuration, | ||
) | ||
from zarr.registry import get_chunk_key_encoding_class, register_chunk_key_encoding | ||
|
||
SeparatorLiteral = Literal[".", "/"] | ||
|
||
|
@@ -28,60 +29,44 @@ class ChunkKeyEncodingParams(TypedDict): | |
|
||
|
||
@dataclass(frozen=True) | ||
class ChunkKeyEncoding(Metadata): | ||
class ChunkKeyEncoding(ABC, Metadata): | ||
name: str | ||
separator: SeparatorLiteral = "." | ||
|
||
def __init__(self, *, separator: SeparatorLiteral) -> None: | ||
separator_parsed = parse_separator(separator) | ||
|
||
def __post_init__(self) -> None: | ||
separator_parsed = parse_separator(self.separator) | ||
object.__setattr__(self, "separator", separator_parsed) | ||
|
||
@classmethod | ||
def from_dict(cls, data: dict[str, JSON] | ChunkKeyEncodingLike) -> ChunkKeyEncoding: | ||
if isinstance(data, ChunkKeyEncoding): | ||
return data | ||
|
||
# handle ChunkKeyEncodingParams | ||
if "name" in data and "separator" in data: | ||
data = {"name": data["name"], "configuration": {"separator": data["separator"]}} | ||
|
||
# TODO: remove this cast when we are statically typing the JSON metadata completely. | ||
data = cast("dict[str, JSON]", data) | ||
|
||
# configuration is optional for chunk key encodings | ||
def from_dict(cls, data: dict[str, JSON]) -> Self: | ||
name_parsed, config_parsed = parse_named_configuration(data, require_configuration=False) | ||
if name_parsed == "default": | ||
if config_parsed is None: | ||
# for default, normalize missing configuration to use the "/" separator. | ||
config_parsed = {"separator": "/"} | ||
return DefaultChunkKeyEncoding(**config_parsed) # type: ignore[arg-type] | ||
if name_parsed == "v2": | ||
if config_parsed is None: | ||
# for v2, normalize missing configuration to use the "." separator. | ||
config_parsed = {"separator": "."} | ||
return V2ChunkKeyEncoding(**config_parsed) # type: ignore[arg-type] | ||
msg = f"Unknown chunk key encoding. Got {name_parsed}, expected one of ('v2', 'default')." | ||
raise ValueError(msg) | ||
return cls(**config_parsed if config_parsed else {}) # type: ignore[arg-type] | ||
|
||
def to_dict(self) -> dict[str, JSON]: | ||
return {"name": self.name, "configuration": {"separator": self.separator}} | ||
|
||
@abstractmethod | ||
def decode_chunk_key(self, chunk_key: str) -> tuple[int, ...]: | ||
pass | ||
""" | ||
Optional: decode a chunk key string into chunk coordinates. | ||
Not required for normal operation; override if needed for testing or debugging. | ||
""" | ||
raise NotImplementedError(f"{self.__class__.__name__} does not implement decode_chunk_key.") | ||
|
||
@abstractmethod | ||
def encode_chunk_key(self, chunk_coords: tuple[int, ...]) -> str: | ||
pass | ||
""" | ||
Encode chunk coordinates into a chunk key string. | ||
Must be implemented by subclasses. | ||
""" | ||
|
||
|
||
ChunkKeyEncodingLike: TypeAlias = ChunkKeyEncodingParams | ChunkKeyEncoding | ||
ChunkKeyEncodingLike: TypeAlias = dict[str, JSON] | ChunkKeyEncodingParams | ChunkKeyEncoding | ||
|
||
|
||
@dataclass(frozen=True) | ||
class DefaultChunkKeyEncoding(ChunkKeyEncoding): | ||
name: Literal["default"] = "default" | ||
separator: SeparatorLiteral = "/" # default | ||
|
||
def decode_chunk_key(self, chunk_key: str) -> tuple[int, ...]: | ||
if chunk_key == "c": | ||
|
@@ -95,10 +80,38 @@ def encode_chunk_key(self, chunk_coords: tuple[int, ...]) -> str: | |
@dataclass(frozen=True) | ||
class V2ChunkKeyEncoding(ChunkKeyEncoding): | ||
name: Literal["v2"] = "v2" | ||
|
||
separator: SeparatorLiteral = "." # default | ||
|
||
def decode_chunk_key(self, chunk_key: str) -> tuple[int, ...]: | ||
return tuple(map(int, chunk_key.split(self.separator))) | ||
|
||
def encode_chunk_key(self, chunk_coords: tuple[int, ...]) -> str: | ||
chunk_identifier = self.separator.join(map(str, chunk_coords)) | ||
return "0" if chunk_identifier == "" else chunk_identifier | ||
|
||
|
||
def parse_chunk_key_encoding(data: ChunkKeyEncodingLike) -> ChunkKeyEncoding: | ||
""" | ||
Take an implicit specification of a chunk key encoding and parse it into a ChunkKeyEncoding object. | ||
""" | ||
if isinstance(data, ChunkKeyEncoding): | ||
return data | ||
|
||
# handle ChunkKeyEncodingParams | ||
if "name" in data and "separator" in data: | ||
data = {"name": data["name"], "configuration": {"separator": data["separator"]}} | ||
|
||
# Now must be a named config | ||
data = cast("dict[str, JSON]", data) | ||
|
||
name_parsed, _ = parse_named_configuration(data, require_configuration=False) | ||
try: | ||
chunk_key_encoding = get_chunk_key_encoding_class(name_parsed).from_dict(data) | ||
except KeyError as e: | ||
raise ValueError(f"Unknown chunk key encoding: {e.args[0]!r}") from e | ||
|
||
return chunk_key_encoding | ||
|
||
|
||
register_chunk_key_encoding("default", DefaultChunkKeyEncoding) | ||
register_chunk_key_encoding("v2", V2ChunkKeyEncoding) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It might be better to remove `separator` as a standard configuration parameter. For example, a hash-based encoding does not need a separator.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, the base ABC probably only needs to require a `name` attribute, and everything else can be methods.