Skip to content

Commit 49f0062

Browse files
committed
use a common function signature for from_json by packing the object_codec_id in a typeddict for zarr v2 metadata
1 parent d26b695 commit 49f0062

File tree

26 files changed

+801
-491
lines changed

26 files changed

+801
-491
lines changed

src/zarr/core/dtype/__init__.py

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22

33
from typing import TYPE_CHECKING, Final, TypeAlias
44

5-
from zarr.core.dtype.common import DataTypeValidationError
5+
from zarr.core.dtype.common import (
6+
DataTypeValidationError,
7+
DTypeJSON,
8+
)
69
from zarr.core.dtype.npy.bool import Bool
710
from zarr.core.dtype.npy.bytes import NullTerminatedBytes, RawBytes, VariableLengthBytes
811
from zarr.core.dtype.npy.complex import Complex64, Complex128
@@ -131,20 +134,20 @@ def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[TBaseDType,
131134
return data_type_registry.match_dtype(dtype=na_dtype)
132135

133136

134-
def get_data_type_from_json_v3(
135-
dtype_spec: JSON,
136-
) -> ZDType[TBaseDType, TBaseScalar]:
137-
return data_type_registry.match_json_v3(dtype_spec)
138-
139-
140-
def get_data_type_from_json_v2(
141-
dtype_spec: JSON, *, object_codec_id: str | None = None
137+
def get_data_type_from_json(
138+
dtype_spec: DTypeJSON, *, zarr_format: ZarrFormat
142139
) -> ZDType[TBaseDType, TBaseScalar]:
143-
return data_type_registry.match_json_v2(dtype_spec, object_codec_id=object_codec_id)
140+
"""
141+
Given a JSON representation of a data type and a Zarr format version,
142+
attempt to create a ZDType instance from the registered ZDType classes.
143+
"""
144+
return data_type_registry.match_json(dtype_spec, zarr_format=zarr_format)
144145

145146

146147
def parse_data_type(
147-
dtype_spec: ZDTypeLike, *, zarr_format: ZarrFormat, object_codec_id: str | None = None
148+
dtype_spec: ZDTypeLike,
149+
*,
150+
zarr_format: ZarrFormat,
148151
) -> ZDType[TBaseDType, TBaseScalar]:
149152
"""
150153
Interpret the input as a ZDType instance.
@@ -153,7 +156,7 @@ def parse_data_type(
153156
return dtype_spec
154157
# dict and zarr_format 3 means that we have a JSON object representation of the dtype
155158
if zarr_format == 3 and isinstance(dtype_spec, Mapping):
156-
return get_data_type_from_json_v3(dtype_spec) # type: ignore[arg-type]
159+
return get_data_type_from_json(dtype_spec, zarr_format=3)
157160
# otherwise, we have either a numpy dtype string, or a zarr v3 dtype string, and in either case
158161
# we can create a numpy dtype from it, and do the dtype inference from that
159162
return get_data_type_from_native_dtype(dtype_spec) # type: ignore[arg-type]

src/zarr/core/dtype/common.py

Lines changed: 134 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,149 @@
11
from __future__ import annotations
22

33
import warnings
4+
from collections.abc import Mapping, Sequence
45
from dataclasses import dataclass
5-
from typing import ClassVar, Final, Literal
6+
from typing import (
7+
ClassVar,
8+
Final,
9+
Generic,
10+
Literal,
11+
TypedDict,
12+
TypeGuard,
13+
TypeVar,
14+
)
15+
16+
from zarr.core.common import NamedConfig
617

718
EndiannessStr = Literal["little", "big"]
819
ENDIANNESS_STR: Final = "little", "big"
20+
921
SpecialFloatStrings = Literal["NaN", "Infinity", "-Infinity"]
1022
SPECIAL_FLOAT_STRINGS: Final = ("NaN", "Infinity", "-Infinity")
23+
1124
JSONFloatV2 = float | SpecialFloatStrings
1225
JSONFloatV3 = float | SpecialFloatStrings | str
1326

27+
ObjectCodecID = Literal["vlen-utf8", "vlen-bytes", "vlen-array", "pickle", "json2", "msgpack2"]
28+
# These are the ids of the known object codecs for zarr v2.
29+
OBJECT_CODEC_IDS: Final = ("vlen-utf8", "vlen-bytes", "vlen-array", "pickle", "json2", "msgpack2")
30+
31+
# This is a wider type than our standard JSON type because we need
32+
# to work with typeddict objects which are assignable to Mapping[str, object]
33+
DTypeJSON = str | int | float | Sequence["DTypeJSON"] | None | Mapping[str, object]
34+
35+
# The DTypeJSON_V2 type exists because ZDType.from_json takes a single argument, which must contain
36+
# all the information necessary to decode the data type. Zarr v2 supports multiple distinct
37+
# data types that all use the "|O" data type identifier. These data types can only be
38+
# discriminated on the basis of their "object codec", i.e. a special data type specific
39+
# compressor or filter. So to figure out what data type a zarr v2 array has, we need the
40+
# data type identifier from metadata, as well as an object codec id if the data type identifier
41+
# is "|O".
42+
# So we will pack the name of the dtype alongside the name of the object codec id, if applicable,
43+
# in a single dict, and pass that to the data type inference logic.
44+
# These type variables have a very wide bound because the individual zdtype
45+
# classes can perform a very specific type check.
46+
47+
# This is the JSON representation of a structured dtype in zarr v2
48+
StructuredName_V2 = Sequence["str | StructuredName_V2"]
49+
50+
# This models the type of the name a dtype might have in zarr v2 array metadata
51+
DTypeName_V2 = StructuredName_V2 | str
52+
53+
TDTypeNameV2_co = TypeVar("TDTypeNameV2_co", bound=DTypeName_V2, covariant=True)
54+
TObjectCodecID_co = TypeVar("TObjectCodecID_co", bound=None | str, covariant=True)
55+
56+
57+
class DTypeConfig_V2(TypedDict, Generic[TDTypeNameV2_co, TObjectCodecID_co]):
58+
name: TDTypeNameV2_co
59+
object_codec_id: TObjectCodecID_co
60+
61+
62+
DTypeSpec_V2 = DTypeConfig_V2[DTypeName_V2, None | str]
63+
64+
65+
def check_structured_dtype_v2_inner(data: object) -> TypeGuard[StructuredName_V2]:
66+
"""
67+
A type guard for the inner elements of a structured dtype. This is a recursive check because
68+
the type is itself recursive.
69+
70+
This check ensures that all the elements are 2-element sequences beginning with a string
71+
and ending with either another string or another 2-element sequence beginning with a string and
72+
ending with another instance of that type.
73+
"""
74+
if isinstance(data, (str, Mapping)):
75+
return False
76+
if not isinstance(data, Sequence):
77+
return False
78+
if len(data) != 2:
79+
return False
80+
if not (isinstance(data[0], str)):
81+
return False
82+
if isinstance(data[-1], str):
83+
return True
84+
elif isinstance(data[-1], Sequence):
85+
return check_structured_dtype_v2_inner(data[-1])
86+
return False
87+
88+
89+
def check_structured_dtype_name_v2(data: Sequence[object]) -> TypeGuard[StructuredName_V2]:
90+
return all(check_structured_dtype_v2_inner(d) for d in data)
91+
92+
93+
def check_dtype_name_v2(data: object) -> TypeGuard[DTypeName_V2]:
94+
"""
95+
Type guard for narrowing the type of a python object to a valid zarr v2 dtype name.
96+
"""
97+
if isinstance(data, str):
98+
return True
99+
elif isinstance(data, Sequence):
100+
return check_structured_dtype_name_v2(data)
101+
return False
102+
103+
104+
def check_dtype_spec_v2(data: object) -> TypeGuard[DTypeSpec_V2]:
105+
"""
106+
Type guard for narrowing a python object to an instance of DTypeSpec_V2
107+
"""
108+
if not isinstance(data, Mapping):
109+
return False
110+
if set(data.keys()) != {"name", "object_codec_id"}:
111+
return False
112+
if not check_dtype_name_v2(data["name"]):
113+
return False
114+
return isinstance(data["object_codec_id"], str | None)
115+
116+
117+
# By comparison, the JSON representation of a dtype in zarr v3 is much simpler.
118+
# It's either a string, or a structured dict
119+
DTypeSpec_V3 = str | NamedConfig[str, Mapping[str, object]]
120+
121+
122+
def check_dtype_spec_v3(data: object) -> TypeGuard[DTypeSpec_V3]:
123+
"""
124+
Type guard for narrowing the type of a python object to an instance of
125+
DTypeSpec_V3, i.e. either a string or a dict with a "name" field that's a string and a
126+
"configuration" field that's a mapping with string keys.
127+
"""
128+
if isinstance(data, str) or ( # noqa: SIM103
129+
isinstance(data, Mapping)
130+
and set(data.keys()) == {"name", "configuration"}
131+
and isinstance(data["configuration"], Mapping)
132+
and all(isinstance(k, str) for k in data["configuration"])
133+
):
134+
return True
135+
return False
136+
137+
138+
def unpack_dtype_json(data: DTypeSpec_V2 | DTypeSpec_V3) -> DTypeJSON:
139+
"""
140+
Return the array metadata form of the dtype JSON representation. For the Zarr V3 form of dtype
141+
metadata, this is a no-op. For the Zarr V2 form of dtype metadata, this unpacks the dtype name.
142+
"""
143+
if isinstance(data, Mapping) and set(data.keys()) == {"name", "object_codec_id"}:
144+
return data["name"]
145+
return data
146+
14147

15148
class DataTypeValidationError(ValueError): ...
16149

src/zarr/core/dtype/npy/bool.py

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,22 @@
1+
from __future__ import annotations
2+
13
from dataclasses import dataclass
2-
from typing import ClassVar, Literal, Self, TypeGuard, overload
4+
from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypeGuard, overload
35

46
import numpy as np
57

6-
from zarr.core.common import JSON, ZarrFormat
7-
from zarr.core.dtype.common import DataTypeValidationError, HasItemSize
8+
from zarr.core.dtype.common import (
9+
DataTypeValidationError,
10+
DTypeConfig_V2,
11+
DTypeJSON,
12+
HasItemSize,
13+
check_dtype_spec_v2,
14+
)
815
from zarr.core.dtype.wrapper import TBaseDType, ZDType
916

17+
if TYPE_CHECKING:
18+
from zarr.core.common import JSON, ZarrFormat
19+
1020

1121
@dataclass(frozen=True, kw_only=True, slots=True)
1222
class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize):
@@ -44,40 +54,47 @@ def to_native_dtype(self: Self) -> np.dtypes.BoolDType:
4454

4555
@classmethod
4656
def _check_json_v2(
47-
cls, data: JSON, *, object_codec_id: str | None = None
48-
) -> TypeGuard[Literal["|b1"]]:
57+
cls,
58+
data: DTypeJSON,
59+
) -> TypeGuard[DTypeConfig_V2[Literal["|b1"], None]]:
4960
"""
5061
Check that the input is a valid JSON representation of a Bool.
5162
"""
52-
return data == cls._zarr_v2_name
63+
return (
64+
check_dtype_spec_v2(data)
65+
and data["name"] == cls._zarr_v2_name
66+
and data["object_codec_id"] is None
67+
)
5368

5469
@classmethod
55-
def _check_json_v3(cls, data: JSON) -> TypeGuard[Literal["bool"]]:
70+
def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["bool"]]:
5671
return data == cls._zarr_v3_name
5772

5873
@classmethod
59-
def from_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> Self:
74+
def _from_json_v2(cls, data: DTypeJSON) -> Self:
6075
if cls._check_json_v2(data):
6176
return cls()
6277
msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_name!r}"
6378
raise DataTypeValidationError(msg)
6479

6580
@classmethod
66-
def from_json_v3(cls: type[Self], data: JSON) -> Self:
81+
def _from_json_v3(cls: type[Self], data: DTypeJSON) -> Self:
6782
if cls._check_json_v3(data):
6883
return cls()
6984
msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}"
7085
raise DataTypeValidationError(msg)
7186

72-
@overload
73-
def to_json(self, zarr_format: Literal[2]) -> Literal["|b1"]: ...
87+
@overload # type: ignore[override]
88+
def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["|b1"], None]: ...
7489

7590
@overload
7691
def to_json(self, zarr_format: Literal[3]) -> Literal["bool"]: ...
7792

78-
def to_json(self, zarr_format: ZarrFormat) -> Literal["|b1", "bool"]:
93+
def to_json(
94+
self, zarr_format: ZarrFormat
95+
) -> DTypeConfig_V2[Literal["|b1"], None] | Literal["bool"]:
7996
if zarr_format == 2:
80-
return self._zarr_v2_name
97+
return {"name": self._zarr_v2_name, "object_codec_id": None}
8198
elif zarr_format == 3:
8299
return self._zarr_v3_name
83100
raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover

0 commit comments

Comments
 (0)