Skip to content
Draft
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
d3b19cf
prepare
brokkoli71 Feb 26, 2025
50cd5e0
Merge branch 'main' into zarr-extensions
brokkoli71 Feb 26, 2025
786669c
check for unexpected zarr metadata keys and codec configuration
brokkoli71 Feb 28, 2025
26b658a
format
brokkoli71 Feb 28, 2025
1e50587
Merge branch 'main' into zarr-extensions
brokkoli71 Feb 28, 2025
4967003
Merge branch 'main' into zarr-extensions
brokkoli71 Apr 2, 2025
72a28e2
if data type has endianness, then codecs must specify endian attribute
brokkoli71 Apr 2, 2025
ac9f8d5
codec.from_dict does not select endian automatically
brokkoli71 Apr 4, 2025
36c4d33
Merge branch 'main' into zarr-extensions
brokkoli71 Apr 4, 2025
54c13a0
fix for single byte data types
brokkoli71 Apr 4, 2025
d46176e
fix test_fail_on_invalid_key
brokkoli71 Apr 4, 2025
a38f25e
add testcase for test_codec_requires_endian
brokkoli71 Apr 4, 2025
0c82437
metadata: unknown configuration keys will get rejected except must_un…
brokkoli71 Apr 4, 2025
de8f5b1
codecs: unknown configuration keys will get rejected except must_unde…
brokkoli71 Apr 4, 2025
151796f
fix test_special_float_fill_values
brokkoli71 Apr 4, 2025
0353ae9
fix kwargs typing
brokkoli71 Apr 7, 2025
08fa7f5
objects for datatype, chunk_key_encodings, chunk_grid
brokkoli71 Apr 9, 2025
0cef5e6
document changes
brokkoli71 Apr 9, 2025
5ad18ec
Merge branch 'main' into zarr-extensions
brokkoli71 Apr 9, 2025
385b7ca
extract helper reject_must_understand_metadata
brokkoli71 Apr 10, 2025
8812479
Merge remote-tracking branch 'origin/zarr-extensions' into zarr-exten…
brokkoli71 Apr 10, 2025
0d91dc6
fix circular import
brokkoli71 Apr 10, 2025
833f408
set kwargs type
brokkoli71 Apr 10, 2025
bc80e51
fix test_fail_on_invalid_metadata_key
brokkoli71 Apr 10, 2025
c801088
Merge branch 'main' into zarr-extensions
brokkoli71 Apr 10, 2025
c9f6ae4
Merge branch 'main' into zarr-extensions
brokkoli71 Apr 11, 2025
9048e03
Merge branch 'main' into zarr-extensions
brokkoli71 May 8, 2025
2f67624
Merge branch 'main' into zarr-extensions
brokkoli71 May 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions changes/2866.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
If the metadata JSON, or any object nested within it, contains an unknown key, the Zarr array will be rejected unless the value of that key is an object containing the key-value pair ``"must_understand": false``.
New codecs are required to validate their metadata and raise an error if it is invalid.
3 changes: 3 additions & 0 deletions docs/user-guide/extending.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ Custom codecs should also implement the following methods:
- ``evolve_from_array_spec`` (optional), which can be useful for automatically filling in
codec configuration metadata from the array metadata.

On initialization, custom codecs should reject unexpected configuration keys, except for
keys whose value is an object marked with ``"must_understand": false``.

To use custom codecs in Zarr, they need to be registered using the
`entrypoint mechanism <https://packaging.python.org/en/latest/specifications/entry-points/>`_.
Commonly, entrypoints are declared in the ``pyproject.toml`` of your package under the
Expand Down
9 changes: 8 additions & 1 deletion src/zarr/codecs/blosc.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from dataclasses import dataclass, replace
from enum import Enum
from functools import cached_property
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any

import numcodecs
from numcodecs.blosc import Blosc
Expand Down Expand Up @@ -101,7 +101,14 @@ def __init__(
clevel: int = 5,
shuffle: BloscShuffle | str | None = None,
blocksize: int = 0,
**kwargs: Any,
) -> None:
if not all(
isinstance(value, dict) and value.get("must_understand") is False
for value in kwargs.values()
):
raise ValueError(f"The `blosc` codec got an unexpected configuration: {kwargs}")

typesize_parsed = parse_typesize(typesize) if typesize is not None else None
cname_parsed = parse_enum(cname, BloscCname)
clevel_parsed = parse_clevel(clevel)
Expand Down
18 changes: 14 additions & 4 deletions src/zarr/codecs/bytes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import sys
from dataclasses import dataclass, replace
from enum import Enum
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any

import numpy as np

Expand Down Expand Up @@ -36,9 +36,18 @@ class BytesCodec(ArrayBytesCodec):

endian: Endian | None

def __init__(self, *, endian: Endian | str | None = default_system_endian) -> None:
def __init__(
self,
*,
endian: Endian | str | None = default_system_endian,
**kwargs: Any,
) -> None:
if not all(
isinstance(value, dict) and value.get("must_understand") is False
for value in kwargs.values()
):
raise ValueError(f"The `bytes` codec got an unexpected configuration: {kwargs}")
endian_parsed = None if endian is None else parse_enum(endian, Endian)

object.__setattr__(self, "endian", endian_parsed)

@classmethod
Expand All @@ -47,6 +56,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self:
data, "bytes", require_configuration=False
)
configuration_parsed = configuration_parsed or {}
configuration_parsed.setdefault("endian", None)
return cls(**configuration_parsed) # type: ignore[arg-type]

def to_dict(self) -> dict[str, JSON]:
Expand All @@ -59,7 +69,7 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
if array_spec.dtype.itemsize == 0:
if self.endian is not None:
return replace(self, endian=None)
elif self.endian is None:
elif self.endian is None and array_spec.dtype.itemsize > 1:
raise ValueError(
"The `endian` configuration needs to be specified for multi-byte data types."
)
Expand Down
9 changes: 8 additions & 1 deletion src/zarr/codecs/crc32c_.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,14 @@ class Crc32cCodec(BytesBytesCodec):

@classmethod
def from_dict(cls, data: dict[str, JSON]) -> Self:
parse_named_configuration(data, "crc32c", require_configuration=False)
_, configuration_parsed = parse_named_configuration(
data, "crc32c", require_configuration=False
)
if configuration_parsed and not all(
isinstance(value, dict) and value.get("must_understand") is False
for value in configuration_parsed.values()
):
raise ValueError(f"The `crc32c` codec got an unexpected configuration: {data}")
return cls()

def to_dict(self) -> dict[str, JSON]:
Expand Down
10 changes: 8 additions & 2 deletions src/zarr/codecs/gzip.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import asyncio
from dataclasses import dataclass
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any

from numcodecs.gzip import GZip

Expand Down Expand Up @@ -34,7 +34,13 @@ class GzipCodec(BytesBytesCodec):

level: int = 5

def __init__(self, *, level: int = 5) -> None:
def __init__(self, *, level: int = 5, **kwargs: Any) -> None:
if not all(
isinstance(value, dict) and value.get("must_understand") is False
for value in kwargs.values()
):
raise ValueError(f"The `gzip` codec got an unexpected configuration: {kwargs}")

level_parsed = parse_gzip_level(level)

object.__setattr__(self, "level", level_parsed)
Expand Down
9 changes: 9 additions & 0 deletions src/zarr/codecs/sharding.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,14 @@ def __init__(
codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(),),
index_codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(), Crc32cCodec()),
index_location: ShardingCodecIndexLocation | str = ShardingCodecIndexLocation.end,
**kwargs: Any,
) -> None:
if not all(
isinstance(value, dict) and value.get("must_understand") is False
for value in kwargs.values()
):
raise ValueError(f"The `sharding` codec got an unexpected configuration: {kwargs}")

chunk_shape_parsed = parse_shapelike(chunk_shape)
codecs_parsed = parse_codecs(codecs)
index_codecs_parsed = parse_codecs(index_codecs)
Expand Down Expand Up @@ -378,6 +385,8 @@ def __setstate__(self, state: dict[str, Any]) -> None:
@classmethod
def from_dict(cls, data: dict[str, JSON]) -> Self:
_, configuration_parsed = parse_named_configuration(data, "sharding_indexed")
configuration_parsed.setdefault("codecs", "bytes")
configuration_parsed.setdefault("index_codecs", ("bytes", "crc32c"))
return cls(**configuration_parsed) # type: ignore[arg-type]

@property
Expand Down
8 changes: 7 additions & 1 deletion src/zarr/codecs/transpose.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,13 @@ class TransposeCodec(ArrayArrayCodec):

order: tuple[int, ...]

def __init__(self, *, order: ChunkCoordsLike) -> None:
def __init__(self, *, order: ChunkCoordsLike, **kwargs: Any) -> None:
if not all(
isinstance(value, dict) and value.get("must_understand") is False
for value in kwargs.values()
):
raise ValueError(f"The `transpose` codec got an unexpected configuration: {kwargs}")

order_parsed = parse_transpose_order(order)

object.__setattr__(self, "order", order_parsed)
Expand Down
18 changes: 15 additions & 3 deletions src/zarr/codecs/vlen_utf8.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any
from warnings import warn

import numpy as np
Expand All @@ -26,7 +26,13 @@

@dataclass(frozen=True)
class VLenUTF8Codec(ArrayBytesCodec):
def __init__(self) -> None:
def __init__(self, **kwargs: Any) -> None:
if not all(
isinstance(value, dict) and value.get("must_understand") is False
for value in kwargs.values()
):
raise ValueError(f"The `vlen-utf8` codec got an unexpected configuration: {kwargs}")

warn(
"The codec `vlen-utf8` is currently not part in the Zarr format 3 specification. It "
"may not be supported by other zarr implementations and may change in the future.",
Expand Down Expand Up @@ -81,7 +87,13 @@ def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -

@dataclass(frozen=True)
class VLenBytesCodec(ArrayBytesCodec):
def __init__(self) -> None:
def __init__(self, **kwargs: Any) -> None:
if not all(
isinstance(value, dict) and value.get("must_understand") is False
for value in kwargs.values()
):
raise ValueError(f"The `vlen-bytes` codec got an unexpected configuration: {kwargs}")

warn(
"The codec `vlen-bytes` is currently not part in the Zarr format 3 specification. It "
"may not be supported by other zarr implementations and may change in the future.",
Expand Down
10 changes: 8 additions & 2 deletions src/zarr/codecs/zstd.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import asyncio
from dataclasses import dataclass
from functools import cached_property
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any

import numcodecs
from numcodecs.zstd import Zstd
Expand Down Expand Up @@ -42,7 +42,13 @@ class ZstdCodec(BytesBytesCodec):
level: int = 0
checksum: bool = False

def __init__(self, *, level: int = 0, checksum: bool = False) -> None:
def __init__(self, *, level: int = 0, checksum: bool = False, **kwargs: Any) -> None:
if not all(
isinstance(value, dict) and value.get("must_understand") is False
for value in kwargs.values()
):
raise ValueError(f"The `zstd` codec got an unexpected configuration: {kwargs}")

# numcodecs 0.13.0 introduces the checksum attribute for the zstd codec
_numcodecs_version = Version(numcodecs.__version__)
if _numcodecs_version < Version("0.13.0"):
Expand Down
12 changes: 10 additions & 2 deletions src/zarr/core/chunk_grids.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,9 +179,17 @@ def __init__(self, *, chunk_shape: ChunkCoordsLike) -> None:

@classmethod
def _from_dict(cls, data: dict[str, JSON]) -> Self:
_, configuration_parsed = parse_named_configuration(data, "regular")
_, config_parsed = parse_named_configuration(data, "regular")

if config_parsed and not all(
k == "chunk_shape" or (isinstance(v, dict) and v.get("must_understand") is False)
for k, v in config_parsed.items()
):
raise ValueError(
f"The chunk grid expects a 'chunk_shape' key. Got {list(config_parsed.keys())}."
)

return cls(**configuration_parsed) # type: ignore[arg-type]
return cls(chunk_shape=config_parsed.get("chunk_shape")) # type: ignore[arg-type]

def to_dict(self) -> dict[str, JSON]:
return {"name": "regular", "configuration": {"chunk_shape": tuple(self.chunk_shape)}}
Expand Down
30 changes: 20 additions & 10 deletions src/zarr/core/chunk_key_encodings.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,24 +44,34 @@ def from_dict(cls, data: dict[str, JSON] | ChunkKeyEncodingLike) -> ChunkKeyEnco
return data

# handle ChunkKeyEncodingParams
if "name" in data and "separator" in data:
if isinstance(data, dict) and data.keys() == {"name", "separator"}:
data = {"name": data["name"], "configuration": {"separator": data["separator"]}}

# TODO: remove this cast when we are statically typing the JSON metadata completely.
data = cast(dict[str, JSON], data)

# configuration is optional for chunk key encodings
name_parsed, config_parsed = parse_named_configuration(data, require_configuration=False)

if config_parsed and not all(
k == "separator" or (isinstance(v, dict) and v.get("must_understand") is False)
for k, v in config_parsed.items()
):
raise ValueError(
f"The chunk key encoding expects a 'separator' key. Got {list(config_parsed.keys())}."
)

if name_parsed == "default":
if config_parsed is None:
# for default, normalize missing configuration to use the "/" separator.
config_parsed = {"separator": "/"}
return DefaultChunkKeyEncoding(**config_parsed) # type: ignore[arg-type]
# for default, normalize missing configuration to use the "/" separator.
return DefaultChunkKeyEncoding(
separator=config_parsed.get("separator") if config_parsed else "/" # type: ignore[arg-type]
)
if name_parsed == "v2":
if config_parsed is None:
# for v2, normalize missing configuration to use the "." separator.
config_parsed = {"separator": "."}
return V2ChunkKeyEncoding(**config_parsed) # type: ignore[arg-type]
# for v2, normalize missing configuration to use the "." separator.
return V2ChunkKeyEncoding(
separator=config_parsed.get("separator") if config_parsed else "." # type: ignore[arg-type]
)

msg = f"Unknown chunk key encoding. Got {name_parsed}, expected one of ('v2', 'default')."
raise ValueError(msg)

Expand All @@ -77,7 +87,7 @@ def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
pass


ChunkKeyEncodingLike: TypeAlias = ChunkKeyEncodingParams | ChunkKeyEncoding
ChunkKeyEncodingLike: TypeAlias = ChunkKeyEncodingParams | ChunkKeyEncoding | str


@dataclass(frozen=True)
Expand Down
22 changes: 19 additions & 3 deletions src/zarr/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,17 +114,33 @@ def parse_named_configuration(


def parse_named_configuration(
data: JSON, expected_name: str | None = None, *, require_configuration: bool = True
data: JSON,
expected_name: str | None = None,
*,
require_configuration: bool = True,
) -> tuple[str, JSON | None]:
if isinstance(data, str):
data = {"name": data}
if not isinstance(data, dict):
raise TypeError(f"Expected dict, got {type(data)}")
if "name" not in data:
elif not all(
k in {"name", "configuration"}
or (isinstance(v, dict) and (v.get("must_understand") is False))
for k, v in data.items()
):
raise ValueError(
f"Named configuration expects keys 'name' and 'configuration'. Got {list(data.keys())}."
)
elif "name" not in data:
raise ValueError(f"Named configuration does not have a 'name' key. Got {data}.")

name_parsed = parse_name(data["name"], expected_name)
if "configuration" in data:
configuration_parsed = parse_configuration(data["configuration"])
elif require_configuration:
raise ValueError(f"Named configuration does not have a 'configuration' key. Got {data}.")
raise ValueError(
f"Named configuration with name='{name_parsed}' requires a 'configuration' key. Got keys {list(data.keys())}."
)
else:
configuration_parsed = None
return name_parsed, configuration_parsed
Expand Down
22 changes: 18 additions & 4 deletions src/zarr/core/metadata/v3.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,15 +69,19 @@ def parse_codecs(data: object) -> tuple[Codec, ...]:

if not isinstance(data, Iterable):
raise TypeError(f"Expected iterable, got {type(data)}")

if isinstance(data, str):
data = [data]
for c in data:
if isinstance(
c, ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec
): # Can't use Codec here because of mypy limitation
out += (c,)
else:
name_parsed, _ = parse_named_configuration(c, require_configuration=False)
out += (get_codec_class(name_parsed).from_dict(c),)
if isinstance(c, str):
c = {"name": c}
name_parsed, config_parsed = parse_named_configuration(c, require_configuration=False)
codec = get_codec_class(name_parsed).from_dict(c)
out += (codec,)

return out

Expand Down Expand Up @@ -259,10 +263,17 @@ def __init__(
attributes: dict[str, JSON] | None,
dimension_names: Iterable[str] | None,
storage_transformers: Iterable[dict[str, JSON]] | None = None,
**kwargs: Any,
) -> None:
"""
Because the class is a frozen dataclass, we set attributes using object.__setattr__
"""
if not all(
isinstance(value, dict) and value.get("must_understand") is False
for value in kwargs.values()
):
raise ValueError(f"Unexpected zarr metadata keys: {list(kwargs.keys())}")

shape_parsed = parse_shapelike(shape)
data_type_parsed = DataType.parse(data_type)
chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid)
Expand Down Expand Up @@ -402,7 +413,10 @@ def from_dict(cls, data: dict[str, JSON]) -> Self:
_ = parse_node_type_array(_data.pop("node_type"))

# check that the data_type attribute is valid
data_type = DataType.parse(_data.pop("data_type"))
dt = _data.pop("data_type")
if isinstance(dt, dict):
dt, _ = parse_named_configuration(dt, require_configuration=False)
data_type = DataType.parse(dt)

# dimension_names key is optional, normalize missing to `None`
_data["dimension_names"] = _data.pop("dimension_names", None)
Expand Down
Loading