Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
d3b19cf
prepare
brokkoli71 Feb 26, 2025
50cd5e0
Merge branch 'main' into zarr-extensions
brokkoli71 Feb 26, 2025
786669c
check for unexpected zarr metadata keys and codec configuration
brokkoli71 Feb 28, 2025
26b658a
format
brokkoli71 Feb 28, 2025
1e50587
Merge branch 'main' into zarr-extensions
brokkoli71 Feb 28, 2025
4967003
Merge branch 'main' into zarr-extensions
brokkoli71 Apr 2, 2025
72a28e2
if data type has endianness, then codecs must specify endian attribute
brokkoli71 Apr 2, 2025
ac9f8d5
codec.from_dict does not select endian automatically
brokkoli71 Apr 4, 2025
36c4d33
Merge branch 'main' into zarr-extensions
brokkoli71 Apr 4, 2025
54c13a0
fix for single byte data types
brokkoli71 Apr 4, 2025
d46176e
fix test_fail_on_invalid_key
brokkoli71 Apr 4, 2025
a38f25e
add testcase for test_codec_requires_endian
brokkoli71 Apr 4, 2025
0c82437
metadata: unknown configuration keys will get rejected except must_un…
brokkoli71 Apr 4, 2025
de8f5b1
codecs: unknown configuration keys will get rejected except must_unde…
brokkoli71 Apr 4, 2025
151796f
fix test_special_float_fill_values
brokkoli71 Apr 4, 2025
0353ae9
fix kwargs typing
brokkoli71 Apr 7, 2025
08fa7f5
objects for datatype, chunk_key_encodings, chunk_grid
brokkoli71 Apr 9, 2025
0cef5e6
document changes
brokkoli71 Apr 9, 2025
5ad18ec
Merge branch 'main' into zarr-extensions
brokkoli71 Apr 9, 2025
385b7ca
extract helper reject_must_understand_metadata
brokkoli71 Apr 10, 2025
8812479
Merge remote-tracking branch 'origin/zarr-extensions' into zarr-exten…
brokkoli71 Apr 10, 2025
0d91dc6
fix circular import
brokkoli71 Apr 10, 2025
833f408
set kwargs type
brokkoli71 Apr 10, 2025
bc80e51
fix test_fail_on_invalid_metadata_key
brokkoli71 Apr 10, 2025
c801088
Merge branch 'main' into zarr-extensions
brokkoli71 Apr 10, 2025
c9f6ae4
Merge branch 'main' into zarr-extensions
brokkoli71 Apr 11, 2025
9048e03
Merge branch 'main' into zarr-extensions
brokkoli71 May 8, 2025
2f67624
Merge branch 'main' into zarr-extensions
brokkoli71 May 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions changes/2866.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
If the metadata JSON, or any object nested within it, contains unexpected keys, the Zarr array will be rejected unless the value of every unexpected key is an object containing the key-value pair ``"must_understand": false``.
New codecs are required to validate their metadata and raise an error if it is invalid.
3 changes: 3 additions & 0 deletions docs/user-guide/extending.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ Custom codecs should also implement the following methods:
- ``evolve_from_array_spec`` (optional), which can be useful for automatically filling in
codec configuration metadata from the array metadata.

On initialization, custom codecs should reject unexpected metadata keys, except for keys whose
values are objects marked with ``"must_understand": false``.

To use custom codecs in Zarr, they need to be registered using the
`entrypoint mechanism <https://packaging.python.org/en/latest/specifications/entry-points/>`_.
Commonly, entrypoints are declared in the ``pyproject.toml`` of your package under the
Expand Down
10 changes: 9 additions & 1 deletion src/zarr/codecs/blosc.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,12 @@

from zarr.abc.codec import BytesBytesCodec
from zarr.core.buffer.cpu import as_numpy_array_wrapper
from zarr.core.common import JSON, parse_enum, parse_named_configuration
from zarr.core.common import (
JSON,
parse_enum,
parse_named_configuration,
reject_must_understand_metadata,
)
from zarr.registry import register_codec

if TYPE_CHECKING:
Expand Down Expand Up @@ -102,7 +107,10 @@ def __init__(
clevel: int = 5,
shuffle: BloscShuffle | str | None = None,
blocksize: int = 0,
**kwargs: JSON,
) -> None:
reject_must_understand_metadata(kwargs, "`blosc` codec configuration")

typesize_parsed = parse_typesize(typesize) if typesize is not None else None
cname_parsed = parse_enum(cname, BloscCname)
clevel_parsed = parse_clevel(clevel)
Expand Down
20 changes: 16 additions & 4 deletions src/zarr/codecs/bytes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,12 @@

from zarr.abc.codec import ArrayBytesCodec
from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer
from zarr.core.common import JSON, parse_enum, parse_named_configuration
from zarr.core.common import (
JSON,
parse_enum,
parse_named_configuration,
reject_must_understand_metadata,
)
from zarr.registry import register_codec

if TYPE_CHECKING:
Expand All @@ -36,9 +41,15 @@ class BytesCodec(ArrayBytesCodec):

endian: Endian | None

def __init__(self, *, endian: Endian | str | None = default_system_endian) -> None:
endian_parsed = None if endian is None else parse_enum(endian, Endian)
def __init__(
self,
*,
endian: Endian | str | None = default_system_endian,
**kwargs: JSON,
) -> None:
reject_must_understand_metadata(kwargs, "`bytes` codec configuration")

endian_parsed = None if endian is None else parse_enum(endian, Endian)
object.__setattr__(self, "endian", endian_parsed)

@classmethod
Expand All @@ -47,6 +58,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self:
data, "bytes", require_configuration=False
)
configuration_parsed = configuration_parsed or {}
configuration_parsed.setdefault("endian", None)
return cls(**configuration_parsed) # type: ignore[arg-type]

def to_dict(self) -> dict[str, JSON]:
Expand All @@ -59,7 +71,7 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
if array_spec.dtype.itemsize == 0:
if self.endian is not None:
return replace(self, endian=None)
elif self.endian is None:
elif self.endian is None and array_spec.dtype.itemsize > 1:
raise ValueError(
"The `endian` configuration needs to be specified for multi-byte data types."
)
Expand Down
7 changes: 5 additions & 2 deletions src/zarr/codecs/crc32c_.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from crc32c import crc32c

from zarr.abc.codec import BytesBytesCodec
from zarr.core.common import JSON, parse_named_configuration
from zarr.core.common import JSON, parse_named_configuration, reject_must_understand_metadata
from zarr.registry import register_codec

if TYPE_CHECKING:
Expand All @@ -24,7 +24,10 @@

@classmethod
def from_dict(cls, data: dict[str, JSON]) -> Self:
parse_named_configuration(data, "crc32c", require_configuration=False)
_, configuration_parsed = parse_named_configuration(

Check warning on line 27 in src/zarr/codecs/crc32c_.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/codecs/crc32c_.py#L27

Added line #L27 was not covered by tests
data, "crc32c", require_configuration=False
)
reject_must_understand_metadata(configuration_parsed, "`crc32c` codec configuration")

Check warning on line 30 in src/zarr/codecs/crc32c_.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/codecs/crc32c_.py#L30

Added line #L30 was not covered by tests
return cls()

def to_dict(self) -> dict[str, JSON]:
Expand Down
5 changes: 3 additions & 2 deletions src/zarr/codecs/gzip.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from zarr.abc.codec import BytesBytesCodec
from zarr.core.buffer.cpu import as_numpy_array_wrapper
from zarr.core.common import JSON, parse_named_configuration
from zarr.core.common import JSON, parse_named_configuration, reject_must_understand_metadata
from zarr.registry import register_codec

if TYPE_CHECKING:
Expand All @@ -34,7 +34,8 @@ class GzipCodec(BytesBytesCodec):

level: int = 5

def __init__(self, *, level: int = 5) -> None:
def __init__(self, *, level: int = 5, **kwargs: JSON) -> None:
reject_must_understand_metadata(kwargs, "`gzip` codec configuration")
level_parsed = parse_gzip_level(level)

object.__setattr__(self, "level", level_parsed)
Expand Down
6 changes: 6 additions & 0 deletions src/zarr/codecs/sharding.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
parse_named_configuration,
parse_shapelike,
product,
reject_must_understand_metadata,
)
from zarr.core.indexing import (
BasicIndexer,
Expand Down Expand Up @@ -343,7 +344,10 @@
codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(),),
index_codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(), Crc32cCodec()),
index_location: ShardingCodecIndexLocation | str = ShardingCodecIndexLocation.end,
**kwargs: JSON,
) -> None:
reject_must_understand_metadata(kwargs, "`sharding` codec configuration")

chunk_shape_parsed = parse_shapelike(chunk_shape)
codecs_parsed = parse_codecs(codecs)
index_codecs_parsed = parse_codecs(index_codecs)
Expand Down Expand Up @@ -378,6 +382,8 @@
@classmethod
def from_dict(cls, data: dict[str, JSON]) -> Self:
_, configuration_parsed = parse_named_configuration(data, "sharding_indexed")
configuration_parsed.setdefault("codecs", "bytes")
configuration_parsed.setdefault("index_codecs", ("bytes", "crc32c"))

Check warning on line 386 in src/zarr/codecs/sharding.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/codecs/sharding.py#L385-L386

Added lines #L385 - L386 were not covered by tests
return cls(**configuration_parsed) # type: ignore[arg-type]

@property
Expand Down
10 changes: 8 additions & 2 deletions src/zarr/codecs/transpose.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@

from zarr.abc.codec import ArrayArrayCodec
from zarr.core.array_spec import ArraySpec
from zarr.core.common import JSON, ChunkCoordsLike, parse_named_configuration
from zarr.core.common import (
JSON,
ChunkCoordsLike,
parse_named_configuration,
reject_must_understand_metadata,
)
from zarr.registry import register_codec

if TYPE_CHECKING:
Expand All @@ -32,7 +37,8 @@ class TransposeCodec(ArrayArrayCodec):

order: tuple[int, ...]

def __init__(self, *, order: ChunkCoordsLike) -> None:
def __init__(self, *, order: ChunkCoordsLike, **kwargs: JSON) -> None:
reject_must_understand_metadata(kwargs, "`transpose` codec configuration")
order_parsed = parse_transpose_order(order)

object.__setattr__(self, "order", order_parsed)
Expand Down
8 changes: 5 additions & 3 deletions src/zarr/codecs/vlen_utf8.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from zarr.abc.codec import ArrayBytesCodec
from zarr.core.buffer import Buffer, NDBuffer
from zarr.core.common import JSON, parse_named_configuration
from zarr.core.common import JSON, parse_named_configuration, reject_must_understand_metadata
from zarr.core.strings import cast_to_string_dtype
from zarr.registry import register_codec

Expand All @@ -26,7 +26,8 @@

@dataclass(frozen=True)
class VLenUTF8Codec(ArrayBytesCodec):
def __init__(self) -> None:
def __init__(self, **kwargs: JSON) -> None:
reject_must_understand_metadata(kwargs, "`vlen-utf8` codec configuration")

Check warning on line 30 in src/zarr/codecs/vlen_utf8.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/codecs/vlen_utf8.py#L30

Added line #L30 was not covered by tests
warn(
"The codec `vlen-utf8` is currently not part in the Zarr format 3 specification. It "
"may not be supported by other zarr implementations and may change in the future.",
Expand Down Expand Up @@ -81,7 +82,8 @@

@dataclass(frozen=True)
class VLenBytesCodec(ArrayBytesCodec):
def __init__(self) -> None:
def __init__(self, **kwargs: JSON) -> None:
reject_must_understand_metadata(kwargs, "`vlen-bytes` codec configuration")

Check warning on line 86 in src/zarr/codecs/vlen_utf8.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/codecs/vlen_utf8.py#L86

Added line #L86 was not covered by tests
warn(
"The codec `vlen-bytes` is currently not part in the Zarr format 3 specification. It "
"may not be supported by other zarr implementations and may change in the future.",
Expand Down
6 changes: 4 additions & 2 deletions src/zarr/codecs/zstd.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from zarr.abc.codec import BytesBytesCodec
from zarr.core.buffer.cpu import as_numpy_array_wrapper
from zarr.core.common import JSON, parse_named_configuration
from zarr.core.common import JSON, parse_named_configuration, reject_must_understand_metadata
from zarr.registry import register_codec

if TYPE_CHECKING:
Expand Down Expand Up @@ -42,7 +42,9 @@ class ZstdCodec(BytesBytesCodec):
level: int = 0
checksum: bool = False

def __init__(self, *, level: int = 0, checksum: bool = False) -> None:
def __init__(self, *, level: int = 0, checksum: bool = False, **kwargs: JSON) -> None:
reject_must_understand_metadata(kwargs, "`zstd` codec configuration")

# numcodecs 0.13.0 introduces the checksum attribute for the zstd codec
_numcodecs_version = Version(numcodecs.__version__)
if _numcodecs_version < Version("0.13.0"):
Expand Down
8 changes: 5 additions & 3 deletions src/zarr/core/chunk_grids.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
ShapeLike,
parse_named_configuration,
parse_shapelike,
reject_must_understand_metadata,
)
from zarr.core.indexing import ceildiv

Expand Down Expand Up @@ -179,9 +180,10 @@

@classmethod
def _from_dict(cls, data: dict[str, JSON]) -> Self:
_, configuration_parsed = parse_named_configuration(data, "regular")

return cls(**configuration_parsed) # type: ignore[arg-type]
_, config_parsed = parse_named_configuration(data, "regular")
chunk_shape = config_parsed.pop("chunk_shape")
reject_must_understand_metadata(config_parsed, "chunk grid configuration")
return cls(chunk_shape=chunk_shape) # type: ignore[arg-type]

Check warning on line 186 in src/zarr/core/chunk_grids.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/core/chunk_grids.py#L183-L186

Added lines #L183 - L186 were not covered by tests

def to_dict(self) -> dict[str, JSON]:
return {"name": "regular", "configuration": {"chunk_shape": tuple(self.chunk_shape)}}
Expand Down
25 changes: 15 additions & 10 deletions src/zarr/core/chunk_key_encodings.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
JSON,
ChunkCoords,
parse_named_configuration,
reject_must_understand_metadata,
)

SeparatorLiteral = Literal[".", "/"]
Expand Down Expand Up @@ -44,24 +45,28 @@ def from_dict(cls, data: dict[str, JSON] | ChunkKeyEncodingLike) -> ChunkKeyEnco
return data

# handle ChunkKeyEncodingParams
if "name" in data and "separator" in data:
if isinstance(data, dict) and data.keys() == {"name", "separator"}:
data = {"name": data["name"], "configuration": {"separator": data["separator"]}}

# TODO: remove this cast when we are statically typing the JSON metadata completely.
data = cast(dict[str, JSON], data)

# configuration is optional for chunk key encodings
name_parsed, config_parsed = parse_named_configuration(data, require_configuration=False)
separator = config_parsed.pop("separator", None) if config_parsed else None
reject_must_understand_metadata(config_parsed, "chunk key encoding configuration")

if name_parsed == "default":
if config_parsed is None:
# for default, normalize missing configuration to use the "/" separator.
config_parsed = {"separator": "/"}
return DefaultChunkKeyEncoding(**config_parsed) # type: ignore[arg-type]
# for default, normalize missing configuration to use the "/" separator.
return DefaultChunkKeyEncoding(
separator=separator or "/" # type: ignore[arg-type]
)
if name_parsed == "v2":
if config_parsed is None:
# for v2, normalize missing configuration to use the "." separator.
config_parsed = {"separator": "."}
return V2ChunkKeyEncoding(**config_parsed) # type: ignore[arg-type]
# for v2, normalize missing configuration to use the "." separator.
return V2ChunkKeyEncoding(
separator=separator or "." # type: ignore[arg-type]
)

msg = f"Unknown chunk key encoding. Got {name_parsed}, expected one of ('v2', 'default')."
raise ValueError(msg)

Expand All @@ -77,7 +82,7 @@ def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
pass


ChunkKeyEncodingLike: TypeAlias = ChunkKeyEncodingParams | ChunkKeyEncoding
ChunkKeyEncodingLike: TypeAlias = ChunkKeyEncodingParams | ChunkKeyEncoding | str


@dataclass(frozen=True)
Expand Down
33 changes: 26 additions & 7 deletions src/zarr/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@
def parse_configuration(data: JSON) -> JSON:
if not isinstance(data, dict):
raise TypeError(f"Expected dict, got {type(data)}")
return data
return data.copy()


@overload
Expand All @@ -115,19 +115,31 @@


def parse_named_configuration(
data: JSON, expected_name: str | None = None, *, require_configuration: bool = True
data: JSON,
expected_name: str | None = None,
*,
require_configuration: bool = True,
) -> tuple[str, JSON | None]:
if not isinstance(data, dict):
if isinstance(data, str):
data = {"name": data}

Check warning on line 124 in src/zarr/core/common.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/core/common.py#L124

Added line #L124 was not covered by tests
elif not isinstance(data, dict):
raise TypeError(f"Expected dict, got {type(data)}")

_data = data.copy()
if "name" not in data:
raise ValueError(f"Named configuration does not have a 'name' key. Got {data}.")
name_parsed = parse_name(data["name"], expected_name)
if "configuration" in data:
configuration_parsed = parse_configuration(data["configuration"])
name_parsed = parse_name(_data.pop("name"), expected_name)

if "configuration" in _data:
configuration_parsed = parse_configuration(_data.pop("configuration"))
elif require_configuration:
raise ValueError(f"Named configuration does not have a 'configuration' key. Got {data}.")
raise ValueError(

Check warning on line 136 in src/zarr/core/common.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/core/common.py#L136

Added line #L136 was not covered by tests
f"Named configuration with name='{name_parsed}' requires a 'configuration' key. Got keys {list(data.keys())}."
)
else:
configuration_parsed = None

reject_must_understand_metadata(_data, "named configuration")
return name_parsed, configuration_parsed


Expand Down Expand Up @@ -203,3 +215,10 @@
def _default_zarr_format() -> ZarrFormat:
    """Look up the Zarr format version configured as the default.

    Reads ``default_zarr_format`` from ``zarr_config`` (passing ``3`` as the
    fallback argument), coerces the result to ``int`` and returns it typed as
    ``ZarrFormat``.
    """
    configured = zarr_config.get("default_zarr_format", 3)
    format_number = int(configured)
    # ``cast`` only informs the type checker; it performs no runtime validation.
    return cast(ZarrFormat, format_number)


def reject_must_understand_metadata(data: dict[str, Any] | None, dict_name: str) -> None:
if data and not all(
isinstance(value, dict) and value.get("must_understand") is False for value in data.values()
):
raise ValueError(f"Unexpected {dict_name} keys: {list(data.keys())}")

Check warning on line 224 in src/zarr/core/common.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/core/common.py#L224

Added line #L224 was not covered by tests
Loading