Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
b2f4ff0
working type checker
d-v-b Aug 22, 2025
7adff59
working type check that fails for recursive parameters in generic typ…
d-v-b Aug 22, 2025
35c7203
working recursive parameters for generic typeddict
d-v-b Aug 22, 2025
07e2315
all but notrequired
d-v-b Aug 22, 2025
85b48df
switch up imports
d-v-b Aug 23, 2025
4fe9ae4
remove cache poisoning bug, and deploy type checker throughout the cod…
d-v-b Aug 23, 2025
2125153
restore dimension name normalization
d-v-b Aug 24, 2025
32cd309
fix array metadata dicts and refactor to_dict test
d-v-b Aug 24, 2025
21d6188
lint
d-v-b Aug 25, 2025
6125c1b
fan out v3 metadata test
d-v-b Aug 25, 2025
cf0615b
update from_dict
d-v-b Aug 25, 2025
7fce136
overloads for parse_array_metadata
d-v-b Aug 25, 2025
b467747
Merge branch 'main' of github.com:zarr-developers/zarr-python into fe…
d-v-b Aug 25, 2025
943e148
fix missing imports
d-v-b Aug 25, 2025
d1be08c
add more type information
d-v-b Aug 25, 2025
ea3ed12
fix bugs, refine structured data type json representation
d-v-b Aug 25, 2025
a098cc2
remove unnecessary test case
d-v-b Aug 25, 2025
fc06ab4
changelog
d-v-b Aug 25, 2025
1d4bd72
bump minimal typing_extensions version to the release that included e…
d-v-b Aug 25, 2025
d061fe1
Merge branch 'main' of github.com:zarr-developers/zarr-python into fe…
d-v-b Aug 25, 2025
bbd8ba7
Merge branch 'main' into feat/type-checker
d-v-b Aug 25, 2025
11f7499
improve error messages, compactify tests, add special case for disamb…
d-v-b Aug 29, 2025
1892df1
Merge branch 'main' into feat/type-checker
d-v-b Aug 29, 2025
eda19ec
Merge branch 'main' into feat/type-checker
d-v-b Aug 29, 2025
bb7e84e
Merge branch 'main' of github.com:zarr-developers/zarr-python into fe…
d-v-b Aug 29, 2025
4cc0385
remove dead code and consolidate tests
d-v-b Aug 29, 2025
be71a87
Merge branch 'main' of github.com:zarr-developers/zarr-python into fe…
d-v-b Sep 1, 2025
971945b
remove redundant imports
d-v-b Sep 1, 2025
30d48a8
re-export codecjson type
d-v-b Sep 1, 2025
a483c73
more re-exports
d-v-b Sep 1, 2025
c7096b1
narrow input type of type_check
d-v-b Sep 1, 2025
9eb287b
Merge branch 'main' into feat/type-checker
d-v-b Sep 4, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions changes/3400.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Add a runtime type checker for ``JSON`` types, and a variety of ``TypedDict`` classes necessary
for modelling Zarr metadata documents. This increases the type-safety of our internal metadata
routines, and provides Zarr users with types they can use to model Zarr metadata.
8 changes: 3 additions & 5 deletions examples/custom_dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
DataTypeValidationError,
DTypeConfig_V2,
DTypeJSON,
check_dtype_spec_v2,
)
from zarr.core.type_check import guard_type

# This is the int2 array data type
int2_dtype_cls = type(np.dtype("int2"))
Expand Down Expand Up @@ -67,7 +67,7 @@ def to_native_dtype(self: Self) -> int2_dtype_cls:
return self.dtype_cls()

@classmethod
def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[Literal["|b1"], None]]:
def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[Literal["int2"], None]]:
"""
Type check for Zarr v2-flavored JSON.

Expand All @@ -84,9 +84,7 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[Literal["|b

See the Zarr docs for more information about the JSON encoding for data types.
"""
return (
check_dtype_spec_v2(data) and data["name"] == "int2" and data["object_codec_id"] is None
)
return guard_type(data, DTypeConfig_V2[Literal["int2"], None])

@classmethod
def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["int2"]]:
Expand Down
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ dependencies = [
'packaging>=22.0',
'numpy>=1.26',
'numcodecs[crc32c]>=0.14',
'typing_extensions>=4.9',
'typing_extensions>=4.13',
'donfig>=0.8',
]

Expand Down Expand Up @@ -226,7 +226,6 @@ dependencies = [
'fsspec==2023.10.0',
's3fs==2023.10.0',
'universal_pathlib==0.0.22',
'typing_extensions==4.9.*',
'donfig==0.8.*',
'obstore==0.5.*',
# test deps
Expand Down
33 changes: 7 additions & 26 deletions src/zarr/abc/codec.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from __future__ import annotations

from abc import abstractmethod
from collections.abc import Mapping
from typing import TYPE_CHECKING, Generic, TypeGuard, TypeVar

from typing_extensions import ReadOnly, TypedDict
from typing import TYPE_CHECKING, Generic, TypeVar

from zarr.abc.metadata import Metadata
from zarr.core.buffer import Buffer, NDBuffer
from zarr.core.common import NamedConfig, concurrent_map
from zarr.core.common import ( # noqa: F401 CodecJSON re-exported for backwards compatibility
CodecJSON,
CodecJSON_V2,
CodecJSON_V3,
concurrent_map,
)
from zarr.core.config import config

if TYPE_CHECKING:
Expand Down Expand Up @@ -37,27 +39,6 @@
CodecInput = TypeVar("CodecInput", bound=NDBuffer | Buffer)
CodecOutput = TypeVar("CodecOutput", bound=NDBuffer | Buffer)

TName = TypeVar("TName", bound=str, covariant=True)


class CodecJSON_V2(TypedDict, Generic[TName]):
"""The JSON representation of a codec for Zarr V2"""
Comment on lines -43 to -44
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably should reexport these types here for backwards compatibility.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done in 30d48a8


id: ReadOnly[TName]


def _check_codecjson_v2(data: object) -> TypeGuard[CodecJSON_V2[str]]:
return isinstance(data, Mapping) and "id" in data and isinstance(data["id"], str)


CodecJSON_V3 = str | NamedConfig[str, Mapping[str, object]]
"""The JSON representation of a codec for Zarr V3."""

# The widest type we will *accept* for a codec JSON
# This covers v2 and v3
CodecJSON = str | Mapping[str, object]
"""The widest type of JSON-like input that could specify a codec."""


class BaseCodec(Metadata, Generic[CodecInput, CodecOutput]):
"""Generic base class for codecs.
Expand Down
11 changes: 5 additions & 6 deletions src/zarr/api/asynchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import asyncio
import dataclasses
import warnings
from typing import TYPE_CHECKING, Any, Literal, cast
from typing import TYPE_CHECKING, Any, Literal

import numpy as np
import numpy.typing as npt
Expand Down Expand Up @@ -37,7 +37,7 @@
GroupMetadata,
create_hierarchy,
)
from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata
from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata
from zarr.errors import (
ArrayNotFoundError,
GroupNotFoundError,
Expand Down Expand Up @@ -353,13 +353,12 @@ async def open(
try:
metadata_dict = await get_array_metadata(store_path, zarr_format=zarr_format)
# TODO: remove this cast when we fix typing for array metadata dicts
_metadata_dict = cast("ArrayMetadataDict", metadata_dict)
# for v2, the above would already have raised an exception if not an array
zarr_format = _metadata_dict["zarr_format"]
is_v3_array = zarr_format == 3 and _metadata_dict.get("node_type") == "array"
zarr_format = metadata_dict["zarr_format"]
is_v3_array = zarr_format == 3 and metadata_dict.get("node_type") == "array"
if is_v3_array or zarr_format == 2:
return AsyncArray(
store_path=store_path, metadata=_metadata_dict, config=kwargs.get("config")
store_path=store_path, metadata=metadata_dict, config=kwargs.get("config")
)
except (AssertionError, FileNotFoundError, NodeTypeValidationError):
pass
Expand Down
85 changes: 57 additions & 28 deletions src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@
ZARR_JSON,
ZARRAY_JSON,
ZATTRS_JSON,
ArrayMetadataJSON_V2,
ArrayMetadataJSON_V3,
DimensionNames,
MemoryOrder,
ShapeLike,
Expand Down Expand Up @@ -103,11 +105,8 @@
)
from zarr.core.metadata import (
ArrayMetadata,
ArrayMetadataDict,
ArrayV2Metadata,
ArrayV2MetadataDict,
ArrayV3Metadata,
ArrayV3MetadataDict,
T_ArrayMetadata,
)
from zarr.core.metadata.v2 import (
Expand All @@ -116,11 +115,12 @@
parse_compressor,
parse_filters,
)
from zarr.core.metadata.v3 import parse_node_type_array
from zarr.core.sync import sync
from zarr.core.type_check import check_type
from zarr.errors import (
ArrayNotFoundError,
MetadataValidationError,
NodeTypeValidationError,
ZarrDeprecationWarning,
ZarrUserWarning,
)
Expand Down Expand Up @@ -175,25 +175,32 @@ class DefaultFillValue:
DEFAULT_FILL_VALUE = DefaultFillValue()


def parse_array_metadata(data: Any) -> ArrayMetadata:
@overload
def parse_array_metadata(data: ArrayV2Metadata | ArrayMetadataJSON_V2) -> ArrayV2Metadata: ...


@overload
def parse_array_metadata(data: ArrayV3Metadata | ArrayMetadataJSON_V3) -> ArrayV3Metadata: ...


def parse_array_metadata(
data: ArrayV2Metadata | ArrayMetadataJSON_V2 | ArrayV3Metadata | ArrayMetadataJSON_V3,
) -> ArrayV2Metadata | ArrayV3Metadata:
"""
If the input is a dict representation of a Zarr metadata document, instantiate the right metadata
class from that dict. If the input is a metadata object, return it.
"""

if isinstance(data, ArrayMetadata):
return data
elif isinstance(data, dict):
zarr_format = data.get("zarr_format")
else:
zarr_format = data["zarr_format"]
if zarr_format == 3:
meta_out = ArrayV3Metadata.from_dict(data)
if len(meta_out.storage_transformers) > 0:
msg = (
f"Array metadata contains storage transformers: {meta_out.storage_transformers}."
"Arrays with storage transformers are not supported in zarr-python at this time."
)
raise ValueError(msg)
return meta_out
return ArrayV3Metadata.from_dict(data) # type: ignore[arg-type]
elif zarr_format == 2:
return ArrayV2Metadata.from_dict(data)
return ArrayV2Metadata.from_dict(data) # type: ignore[arg-type]
else:
raise ValueError(f"Invalid zarr_format: {zarr_format}. Expected 2 or 3")
raise TypeError # pragma: no cover


def create_codec_pipeline(metadata: ArrayMetadata, *, store: Store | None = None) -> CodecPipeline:
Expand All @@ -213,9 +220,27 @@ def create_codec_pipeline(metadata: ArrayMetadata, *, store: Store | None = None
raise TypeError # pragma: no cover


@overload
async def get_array_metadata(
store_path: StorePath, zarr_format: Literal[3]
) -> ArrayMetadataJSON_V3: ...


@overload
async def get_array_metadata(
store_path: StorePath, zarr_format: Literal[2]
) -> ArrayMetadataJSON_V2: ...


@overload
async def get_array_metadata(
store_path: StorePath, zarr_format: None
) -> ArrayMetadataJSON_V3 | ArrayMetadataJSON_V2: ...


async def get_array_metadata(
store_path: StorePath, zarr_format: ZarrFormat | None = 3
) -> dict[str, JSON]:
) -> ArrayMetadataJSON_V3 | ArrayMetadataJSON_V2:
if zarr_format == 2:
zarray_bytes, zattrs_bytes = await gather(
(store_path / ZARRAY_JSON).get(prototype=cpu_buffer_prototype),
Expand Down Expand Up @@ -260,19 +285,25 @@ async def get_array_metadata(
msg = f"Invalid value for 'zarr_format'. Expected 2, 3, or None. Got '{zarr_format}'." # type: ignore[unreachable]
raise MetadataValidationError(msg)

metadata_dict: dict[str, JSON]
metadata_dict: ArrayMetadataJSON_V2 | ArrayMetadataJSON_V3
if zarr_format == 2:
# V2 arrays are comprised of a .zarray and .zattrs objects
assert zarray_bytes is not None
metadata_dict = json.loads(zarray_bytes.to_bytes())
zattrs_dict = json.loads(zattrs_bytes.to_bytes()) if zattrs_bytes is not None else {}
metadata_dict["attributes"] = zattrs_dict
tycheck = check_type(metadata_dict, ArrayMetadataJSON_V2)
if not tycheck.success:
msg = "The .zarray object at {store_path} is not a valid Zarr array metadata object. "
raise NodeTypeValidationError("zarray", "Zarr array metadata object", metadata_dict)
else:
# V3 arrays are comprised of a zarr.json object
assert zarr_json_bytes is not None
metadata_dict = json.loads(zarr_json_bytes.to_bytes())

parse_node_type_array(metadata_dict.get("node_type"))
tycheck = check_type(metadata_dict, ArrayMetadataJSON_V3)
if not tycheck.success:
msg = "The zarr.json object at {store_path} is not a valid Zarr array metadata object. "
raise NodeTypeValidationError("zarr.json", "Zarr array metadata object", metadata_dict)

return metadata_dict

Expand Down Expand Up @@ -311,22 +342,22 @@ class AsyncArray(Generic[T_ArrayMetadata]):
@overload
def __init__(
self: AsyncArray[ArrayV2Metadata],
metadata: ArrayV2Metadata | ArrayV2MetadataDict,
metadata: ArrayV2Metadata | ArrayMetadataJSON_V2,
store_path: StorePath,
config: ArrayConfigLike | None = None,
) -> None: ...

@overload
def __init__(
self: AsyncArray[ArrayV3Metadata],
metadata: ArrayV3Metadata | ArrayV3MetadataDict,
metadata: ArrayV3Metadata | ArrayMetadataJSON_V3,
store_path: StorePath,
config: ArrayConfigLike | None = None,
) -> None: ...

def __init__(
self,
metadata: ArrayMetadata | ArrayMetadataDict,
metadata: ArrayMetadata | ArrayMetadataJSON_V2 | ArrayMetadataJSON_V3,
store_path: StorePath,
config: ArrayConfigLike | None = None,
) -> None:
Expand Down Expand Up @@ -945,7 +976,7 @@ def from_dict(
ValueError
If the dictionary data is invalid or incompatible with either Zarr format 2 or 3 array creation.
"""
metadata = parse_array_metadata(data)
metadata = parse_array_metadata(data) # type: ignore[call-overload]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the type: ignore needed due to the presence of ArrayMetadata in the signature for __init__?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need the `# type: ignore[call-overload]` because the `data` input to `from_dict` is typed as `dict[str, JSON]`. I could change that to the union of the two typeddict metadata types, but this would incompatibly override the base class implementation of `from_dict`, and so I would need another `# type: ignore` there. The only way to clean all of this up is to redo our base `Metadata` ABC, which I deemed out of scope for this PR.

return cls(metadata=metadata, store_path=store_path)

@classmethod
Expand Down Expand Up @@ -978,9 +1009,7 @@ async def open(
"""
store_path = await make_store_path(store)
metadata_dict = await get_array_metadata(store_path, zarr_format=zarr_format)
# TODO: remove this cast when we have better type hints
_metadata_dict = cast("ArrayV3MetadataDict", metadata_dict)
return cls(store_path=store_path, metadata=_metadata_dict)
return cls(store_path=store_path, metadata=metadata_dict)

@property
def store(self) -> Store:
Expand Down
Loading
Loading