zarr-developers · brokkoli71 · Feb 26, 2025 · Feb 26, 2025 · Feb 28, 2025 · Feb 28, 2025
diff --git a/changes/2866.feature.rst b/changes/2866.feature.rst
@@ -0,0 +1,2 @@
+If metadata JSON contains an unexpected key, either at the top level or inside a nested object, the zarr array will be rejected unless the value of that key is an object containing the key-value pair ``"must_understand": false``.
+New codecs are required to validate their metadata and raise an error if it is invalid.
diff --git a/docs/user-guide/extending.rst b/docs/user-guide/extending.rst
@@ -50,6 +50,9 @@ Custom codecs should also implement the following methods:
 - ``evolve_from_array_spec`` (optional), which can be useful for automatically filling in
   codec configuration metadata from the array metadata.
 
+On initialization, custom codecs should reject unexpected metadata keys, except for keys
+whose value is an object containing ``"must_understand": false``.
+
 To use custom codecs in Zarr, they need to be registered using the
 `entrypoint mechanism <https://packaging.python.org/en/latest/specifications/entry-points/>`_.
 Commonly, entrypoints are declared in the ``pyproject.toml`` of your package under the

diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py
@@ -4,7 +4,7 @@
 from dataclasses import dataclass, replace
 from enum import Enum
 from functools import cached_property
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 
 import numcodecs
 from numcodecs.blosc import Blosc
@@ -101,7 +101,14 @@ def __init__(
         clevel: int = 5,
         shuffle: BloscShuffle | str | None = None,
         blocksize: int = 0,
+        **kwargs: Any,
     ) -> None:
+        if not all(
+            isinstance(value, dict) and value.get("must_understand") is False
+            for value in kwargs.values()
+        ):
+            raise ValueError(f"The `blosc` codec got an unexpected configuration: {kwargs}")
+
         typesize_parsed = parse_typesize(typesize) if typesize is not None else None
         cname_parsed = parse_enum(cname, BloscCname)
         clevel_parsed = parse_clevel(clevel)

diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py
@@ -3,7 +3,7 @@
 import sys
 from dataclasses import dataclass, replace
 from enum import Enum
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 
 import numpy as np
 
@@ -36,9 +36,18 @@ class BytesCodec(ArrayBytesCodec):
 
     endian: Endian | None
 
-    def __init__(self, *, endian: Endian | str | None = default_system_endian) -> None:
+    def __init__(
+        self,
+        *,
+        endian: Endian | str | None = default_system_endian,
+        **kwargs: Any,
+    ) -> None:
+        if not all(
+            isinstance(value, dict) and value.get("must_understand") is False
+            for value in kwargs.values()
+        ):
+            raise ValueError(f"The `bytes` codec got an unexpected configuration: {kwargs}")
         endian_parsed = None if endian is None else parse_enum(endian, Endian)
-
         object.__setattr__(self, "endian", endian_parsed)
 
     @classmethod
@@ -47,6 +56,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self:
             data, "bytes", require_configuration=False
         )
         configuration_parsed = configuration_parsed or {}
+        configuration_parsed.setdefault("endian", None)
         return cls(**configuration_parsed)  # type: ignore[arg-type]
 
     def to_dict(self) -> dict[str, JSON]:
@@ -59,7 +69,7 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
         if array_spec.dtype.itemsize == 0:
             if self.endian is not None:
                 return replace(self, endian=None)
-        elif self.endian is None:
+        elif self.endian is None and array_spec.dtype.itemsize > 1:
             raise ValueError(
                 "The `endian` configuration needs to be specified for multi-byte data types."
             )

diff --git a/src/zarr/codecs/crc32c_.py b/src/zarr/codecs/crc32c_.py
@@ -24,7 +24,14 @@ class Crc32cCodec(BytesBytesCodec):
 
     @classmethod
     def from_dict(cls, data: dict[str, JSON]) -> Self:
-        parse_named_configuration(data, "crc32c", require_configuration=False)
+        _, configuration_parsed = parse_named_configuration(
+            data, "crc32c", require_configuration=False
+        )
+        if configuration_parsed and not all(
+            isinstance(value, dict) and value.get("must_understand") is False
+            for value in configuration_parsed.values()
+        ):
+            raise ValueError(f"The `crc32c` codec got an unexpected configuration: {data}")
         return cls()
 
     def to_dict(self) -> dict[str, JSON]:

diff --git a/src/zarr/codecs/gzip.py b/src/zarr/codecs/gzip.py
@@ -2,7 +2,7 @@
 
 import asyncio
 from dataclasses import dataclass
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 
 from numcodecs.gzip import GZip
 
@@ -34,7 +34,13 @@ class GzipCodec(BytesBytesCodec):
 
     level: int = 5
 
-    def __init__(self, *, level: int = 5) -> None:
+    def __init__(self, *, level: int = 5, **kwargs: Any) -> None:
+        if not all(
+            isinstance(value, dict) and value.get("must_understand") is False
+            for value in kwargs.values()
+        ):
+            raise ValueError(f"The `gzip` codec got an unexpected configuration: {kwargs}")
+
         level_parsed = parse_gzip_level(level)
 
         object.__setattr__(self, "level", level_parsed)

diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py
@@ -343,7 +343,14 @@ def __init__(
         codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(),),
         index_codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(), Crc32cCodec()),
         index_location: ShardingCodecIndexLocation | str = ShardingCodecIndexLocation.end,
+        **kwargs: Any,
     ) -> None:
+        if not all(
+            isinstance(value, dict) and value.get("must_understand") is False
+            for value in kwargs.values()
+        ):
+            raise ValueError(f"The `sharding` codec got an unexpected configuration: {kwargs}")
+
         chunk_shape_parsed = parse_shapelike(chunk_shape)
         codecs_parsed = parse_codecs(codecs)
         index_codecs_parsed = parse_codecs(index_codecs)
@@ -378,6 +385,8 @@ def __setstate__(self, state: dict[str, Any]) -> None:
     @classmethod
     def from_dict(cls, data: dict[str, JSON]) -> Self:
         _, configuration_parsed = parse_named_configuration(data, "sharding_indexed")
+        configuration_parsed.setdefault("codecs", "bytes")
+        configuration_parsed.setdefault("index_codecs", ("bytes", "crc32c"))
         return cls(**configuration_parsed)  # type: ignore[arg-type]
 
     @property

diff --git a/src/zarr/codecs/transpose.py b/src/zarr/codecs/transpose.py
@@ -32,7 +32,13 @@ class TransposeCodec(ArrayArrayCodec):
 
     order: tuple[int, ...]
 
-    def __init__(self, *, order: ChunkCoordsLike) -> None:
+    def __init__(self, *, order: ChunkCoordsLike, **kwargs: Any) -> None:
+        if not all(
+            isinstance(value, dict) and value.get("must_understand") is False
+            for value in kwargs.values()
+        ):
+            raise ValueError(f"The `transpose` codec got an unexpected configuration: {kwargs}")
+
         order_parsed = parse_transpose_order(order)
 
         object.__setattr__(self, "order", order_parsed)

diff --git a/src/zarr/codecs/vlen_utf8.py b/src/zarr/codecs/vlen_utf8.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 from warnings import warn
 
 import numpy as np
@@ -26,7 +26,13 @@
 
 @dataclass(frozen=True)
 class VLenUTF8Codec(ArrayBytesCodec):
-    def __init__(self) -> None:
+    def __init__(self, **kwargs: Any) -> None:
+        if not all(
+            isinstance(value, dict) and value.get("must_understand") is False
+            for value in kwargs.values()
+        ):
+            raise ValueError(f"The `vlen-utf8` codec got an unexpected configuration: {kwargs}")
+
         warn(
             "The codec `vlen-utf8` is currently not part in the Zarr format 3 specification. It "
             "may not be supported by other zarr implementations and may change in the future.",
@@ -81,7 +87,13 @@ def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -
 
 @dataclass(frozen=True)
 class VLenBytesCodec(ArrayBytesCodec):
-    def __init__(self) -> None:
+    def __init__(self, **kwargs: Any) -> None:
+        if not all(
+            isinstance(value, dict) and value.get("must_understand") is False
+            for value in kwargs.values()
+        ):
+            raise ValueError(f"The `vlen-bytes` codec got an unexpected configuration: {kwargs}")
+
         warn(
             "The codec `vlen-bytes` is currently not part in the Zarr format 3 specification. It "
             "may not be supported by other zarr implementations and may change in the future.",

diff --git a/src/zarr/codecs/zstd.py b/src/zarr/codecs/zstd.py
@@ -3,7 +3,7 @@
 import asyncio
 from dataclasses import dataclass
 from functools import cached_property
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 
 import numcodecs
 from numcodecs.zstd import Zstd
@@ -42,7 +42,13 @@ class ZstdCodec(BytesBytesCodec):
     level: int = 0
     checksum: bool = False
 
-    def __init__(self, *, level: int = 0, checksum: bool = False) -> None:
+    def __init__(self, *, level: int = 0, checksum: bool = False, **kwargs: Any) -> None:
+        if not all(
+            isinstance(value, dict) and value.get("must_understand") is False
+            for value in kwargs.values()
+        ):
+            raise ValueError(f"The `zstd` codec got an unexpected configuration: {kwargs}")
+
         # numcodecs 0.13.0 introduces the checksum attribute for the zstd codec
         _numcodecs_version = Version(numcodecs.__version__)
         if _numcodecs_version < Version("0.13.0"):

diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py
@@ -179,9 +179,17 @@ def __init__(self, *, chunk_shape: ChunkCoordsLike) -> None:
 
     @classmethod
     def _from_dict(cls, data: dict[str, JSON]) -> Self:
-        _, configuration_parsed = parse_named_configuration(data, "regular")
+        _, config_parsed = parse_named_configuration(data, "regular")
+
+        if config_parsed and not all(
+            k == "chunk_shape" or (isinstance(v, dict) and v.get("must_understand") is False)
+            for k, v in config_parsed.items()
+        ):
+            raise ValueError(
+                f"The chunk grid expects a 'chunk_shape' key. Got {list(config_parsed.keys())}."
+            )
 
-        return cls(**configuration_parsed)  # type: ignore[arg-type]
+        return cls(chunk_shape=config_parsed.get("chunk_shape"))  # type: ignore[arg-type]
 
     def to_dict(self) -> dict[str, JSON]:
         return {"name": "regular", "configuration": {"chunk_shape": tuple(self.chunk_shape)}}

diff --git a/src/zarr/core/chunk_key_encodings.py b/src/zarr/core/chunk_key_encodings.py
@@ -44,24 +44,34 @@ def from_dict(cls, data: dict[str, JSON] | ChunkKeyEncodingLike) -> ChunkKeyEnco
             return data
 
         # handle ChunkKeyEncodingParams
-        if "name" in data and "separator" in data:
+        if isinstance(data, dict) and data.keys() == {"name", "separator"}:
             data = {"name": data["name"], "configuration": {"separator": data["separator"]}}
 
         # TODO: remove this cast when we are statically typing the JSON metadata completely.
         data = cast(dict[str, JSON], data)
 
         # configuration is optional for chunk key encodings
         name_parsed, config_parsed = parse_named_configuration(data, require_configuration=False)
+
+        if config_parsed and not all(
+            k == "separator" or (isinstance(v, dict) and v.get("must_understand") is False)
+            for k, v in config_parsed.items()
+        ):
+            raise ValueError(
+                f"The chunk key encoding expects a 'separator' key. Got {list(config_parsed.keys())}."
+            )
+
         if name_parsed == "default":
-            if config_parsed is None:
-                # for default, normalize missing configuration to use the "/" separator.
-                config_parsed = {"separator": "/"}
-            return DefaultChunkKeyEncoding(**config_parsed)  # type: ignore[arg-type]
+            # for default, normalize missing configuration to use the "/" separator.
+            return DefaultChunkKeyEncoding(
+                separator=config_parsed.get("separator") if config_parsed else "/"  # type: ignore[arg-type]
+            )
         if name_parsed == "v2":
-            if config_parsed is None:
-                # for v2, normalize missing configuration to use the "." separator.
-                config_parsed = {"separator": "."}
-            return V2ChunkKeyEncoding(**config_parsed)  # type: ignore[arg-type]
+            # for v2, normalize missing configuration to use the "." separator.
+            return V2ChunkKeyEncoding(
+                separator=config_parsed.get("separator") if config_parsed else "."  # type: ignore[arg-type]
+            )
+
         msg = f"Unknown chunk key encoding. Got {name_parsed}, expected one of ('v2', 'default')."
         raise ValueError(msg)
 
@@ -77,7 +87,7 @@ def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
         pass
 
 
-ChunkKeyEncodingLike: TypeAlias = ChunkKeyEncodingParams | ChunkKeyEncoding
+ChunkKeyEncodingLike: TypeAlias = ChunkKeyEncodingParams | ChunkKeyEncoding | str
 
 
 @dataclass(frozen=True)

diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py
@@ -114,17 +114,33 @@ def parse_named_configuration(
 
 
 def parse_named_configuration(
-    data: JSON, expected_name: str | None = None, *, require_configuration: bool = True
+    data: JSON,
+    expected_name: str | None = None,
+    *,
+    require_configuration: bool = True,
 ) -> tuple[str, JSON | None]:
+    if isinstance(data, str):
+        data = {"name": data}
     if not isinstance(data, dict):
         raise TypeError(f"Expected dict, got {type(data)}")
-    if "name" not in data:
+    elif not all(
+        k in {"name", "configuration"}
+        or (isinstance(v, dict) and (v.get("must_understand") is False))
+        for k, v in data.items()
+    ):
+        raise ValueError(
+            f"Named configuration expects keys 'name' and 'configuration'. Got {list(data.keys())}."
+        )
+    elif "name" not in data:
         raise ValueError(f"Named configuration does not have a 'name' key. Got {data}.")
+
     name_parsed = parse_name(data["name"], expected_name)
     if "configuration" in data:
         configuration_parsed = parse_configuration(data["configuration"])
     elif require_configuration:
-        raise ValueError(f"Named configuration does not have a 'configuration' key. Got {data}.")
+        raise ValueError(
+            f"Named configuration with name='{name_parsed}' requires a 'configuration' key. Got keys {list(data.keys())}."
+        )
     else:
         configuration_parsed = None
     return name_parsed, configuration_parsed

diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py
@@ -69,15 +69,19 @@ def parse_codecs(data: object) -> tuple[Codec, ...]:
 
     if not isinstance(data, Iterable):
         raise TypeError(f"Expected iterable, got {type(data)}")
-
+    if isinstance(data, str):
+        data = [data]
     for c in data:
         if isinstance(
             c, ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec
         ):  # Can't use Codec here because of mypy limitation
             out += (c,)
         else:
-            name_parsed, _ = parse_named_configuration(c, require_configuration=False)
-            out += (get_codec_class(name_parsed).from_dict(c),)
+            if isinstance(c, str):
+                c = {"name": c}
+            name_parsed, config_parsed = parse_named_configuration(c, require_configuration=False)
+            codec = get_codec_class(name_parsed).from_dict(c)
+            out += (codec,)
 
     return out
 
@@ -259,10 +263,17 @@ def __init__(
         attributes: dict[str, JSON] | None,
         dimension_names: Iterable[str] | None,
         storage_transformers: Iterable[dict[str, JSON]] | None = None,
+        **kwargs: Any,
     ) -> None:
         """
         Because the class is a frozen dataclass, we set attributes using object.__setattr__
         """
+        if not all(
+            isinstance(value, dict) and value.get("must_understand") is False
+            for value in kwargs.values()
+        ):
+            raise ValueError(f"Unexpected zarr metadata keys: {list(kwargs.keys())}")
+
         shape_parsed = parse_shapelike(shape)
         data_type_parsed = DataType.parse(data_type)
         chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid)
@@ -402,7 +413,10 @@ def from_dict(cls, data: dict[str, JSON]) -> Self:
         _ = parse_node_type_array(_data.pop("node_type"))
 
         # check that the data_type attribute is valid
-        data_type = DataType.parse(_data.pop("data_type"))
+        dt = _data.pop("data_type")
+        if isinstance(dt, dict):
+            dt, _ = parse_named_configuration(dt, require_configuration=False)
+        data_type = DataType.parse(dt)
 
         # dimension_names key is optional, normalize missing to `None`
         _data["dimension_names"] = _data.pop("dimension_names", None)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		If metadata JSON contains an unexpected key, either at the top level or inside a nested object, the zarr array will be rejected unless the value of that key is an object containing the key-value pair ``"must_understand": false``.
		New codecs are required to validate their metadata and raise an error if it is invalid.