Skip to content

Commit 5150d60

Browse files
committed
improve method names, refactor type hints with typeddictionaries, fix registry load frequency, add object_codec_id for v2 json deserialization
1 parent 807c585 commit 5150d60

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+1210
-685
lines changed

docs/user-guide/arrays.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -211,8 +211,8 @@ prints additional diagnostics, e.g.::
211211
Serializer : BytesCodec(endian=<Endian.little: 'little'>)
212212
Compressors : (BloscCodec(typesize=4, cname=<BloscCname.zstd: 'zstd'>, clevel=3, shuffle=<BloscShuffle.bitshuffle: 'bitshuffle'>, blocksize=0),)
213213
No. bytes : 400000000 (381.5M)
214-
No. bytes stored : 3558573
215-
Storage ratio : 112.4
214+
No. bytes stored : 9696520
215+
Storage ratio : 41.3
216216
Chunks Initialized : 100
217217

218218
.. note::

docs/user-guide/data_types.rst

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -128,20 +128,20 @@ Create a ``ZDType`` from a native data type:
128128
129129
>>> from zarr.core.dtype import Int8
130130
>>> import numpy as np
131-
>>> int8 = Int8.from_dtype(np.dtype('int8'))
131+
>>> int8 = Int8.from_native_dtype(np.dtype('int8'))
132132
133133
Convert back to native data type:
134134

135135
.. code-block:: python
136136
137-
>>> native_dtype = int8.to_dtype()
137+
>>> native_dtype = int8.to_native_dtype()
138138
>>> assert native_dtype == np.dtype('int8')
139139
140140
Get the default scalar value for the data type:
141141

142142
.. code-block:: python
143143
144-
>>> default_value = int8.default_value()
144+
>>> default_value = int8.default_scalar()
145145
>>> assert default_value == np.int8(0)
146146
147147
@@ -160,13 +160,13 @@ Serialize a scalar value to JSON:
160160

161161
.. code-block:: python
162162
163-
>>> json_value = int8.to_json_value(42, zarr_format=3)
163+
>>> json_value = int8.to_json_scalar(42, zarr_format=3)
164164
>>> json_value
165165
42
166166
167167
Deserialize a scalar value from JSON:
168168

169169
.. code-block:: python
170170
171-
>>> scalar_value = int8.from_json_value(42, zarr_format=3)
171+
>>> scalar_value = int8.from_json_scalar(42, zarr_format=3)
172172
>>> assert scalar_value == np.int8(42)

src/zarr/codecs/_v2.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,15 +48,15 @@ async def _decode_single(
4848
# segfaults and other bad things happening
4949
if chunk_spec.dtype.dtype_cls is not np.dtypes.ObjectDType:
5050
try:
51-
chunk = chunk.view(chunk_spec.dtype.to_dtype())
51+
chunk = chunk.view(chunk_spec.dtype.to_native_dtype())
5252
except TypeError:
5353
# this will happen if the dtype of the chunk
5454
# does not match the dtype of the array spec, e.g. if
5555
# the dtype of the chunk_spec is a string dtype, but the chunk
5656
# is an object array. In this case, we need to convert the object
5757
# array to the correct dtype.
5858

59-
chunk = np.array(chunk).astype(chunk_spec.dtype.to_dtype())
59+
chunk = np.array(chunk).astype(chunk_spec.dtype.to_native_dtype())
6060

6161
elif chunk.dtype != object:
6262
# If we end up here, someone must have hacked around with the filters.
@@ -80,7 +80,7 @@ async def _encode_single(
8080
chunk = chunk_array.as_ndarray_like()
8181

8282
# ensure contiguous and correct order
83-
chunk = chunk.astype(chunk_spec.dtype.to_dtype(), order=chunk_spec.order, copy=False)
83+
chunk = chunk.astype(chunk_spec.dtype.to_native_dtype(), order=chunk_spec.order, copy=False)
8484

8585
# apply filters
8686
if self.filters:

src/zarr/codecs/bytes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ async def _decode_single(
7979
"Endianness | None", self.endian.value if self.endian is not None else None
8080
)
8181
new_byte_order = endianness_to_numpy_str(endian_str)
82-
dtype = chunk_spec.dtype.to_dtype().newbyteorder(new_byte_order)
82+
dtype = chunk_spec.dtype.to_native_dtype().newbyteorder(new_byte_order)
8383

8484
as_array_like = chunk_bytes.as_array_like()
8585
if isinstance(as_array_like, NDArrayLike):

src/zarr/codecs/sharding.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -452,7 +452,7 @@ async def _decode_single(
452452
# setup output array
453453
out = chunk_spec.prototype.nd_buffer.create(
454454
shape=shard_shape,
455-
dtype=shard_spec.dtype.to_dtype(),
455+
dtype=shard_spec.dtype.to_native_dtype(),
456456
order=shard_spec.order,
457457
fill_value=0,
458458
)
@@ -499,7 +499,7 @@ async def _decode_partial_single(
499499
# setup output array
500500
out = shard_spec.prototype.nd_buffer.create(
501501
shape=indexer.shape,
502-
dtype=shard_spec.dtype.to_dtype(),
502+
dtype=shard_spec.dtype.to_native_dtype(),
503503
order=shard_spec.order,
504504
fill_value=0,
505505
)

src/zarr/codecs/vlen_utf8.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ async def _decode_single(
6060
decoded = _vlen_utf8_codec.decode(raw_bytes)
6161
assert decoded.dtype == np.object_
6262
decoded.shape = chunk_spec.shape
63-
as_string_dtype = decoded.astype(chunk_spec.dtype.to_dtype(), copy=False)
63+
as_string_dtype = decoded.astype(chunk_spec.dtype.to_native_dtype(), copy=False)
6464
return chunk_spec.prototype.nd_buffer.from_numpy_array(as_string_dtype)
6565

6666
async def _encode_single(

src/zarr/core/array.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -700,7 +700,7 @@ def _create_metadata_v3(
700700

701701
if fill_value is None:
702702
# v3 spec will not allow a null fill value
703-
fill_value_parsed = dtype.default_value()
703+
fill_value_parsed = dtype.default_scalar()
704704
else:
705705
fill_value_parsed = fill_value
706706

@@ -782,7 +782,7 @@ def _create_metadata_v2(
782782
if dimension_separator is None:
783783
dimension_separator = "."
784784
if fill_value is None:
785-
fill_value = dtype.default_value() # type: ignore[assignment]
785+
fill_value = dtype.default_scalar() # type: ignore[assignment]
786786
return ArrayV2Metadata(
787787
shape=shape,
788788
dtype=dtype,
@@ -1056,7 +1056,7 @@ def dtype(self) -> TBaseDType:
10561056
np.dtype
10571057
Data type of the array
10581058
"""
1059-
return self._zdtype.to_dtype()
1059+
return self._zdtype.to_native_dtype()
10601060

10611061
@property
10621062
def order(self) -> MemoryOrder:

src/zarr/core/codec_pipeline.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def fill_value_or_default(chunk_spec: ArraySpec) -> Any:
6262
# validated when decoding the metadata, but we support reading
6363
# Zarr V2 data and need to support the case where fill_value
6464
# is None.
65-
return chunk_spec.dtype.default_value()
65+
return chunk_spec.dtype.default_scalar()
6666
else:
6767
return fill_value
6868

@@ -296,7 +296,9 @@ def _merge_chunk_array(
296296
is_complete_chunk: bool,
297297
drop_axes: tuple[int, ...],
298298
) -> NDBuffer:
299-
if chunk_selection == () or is_scalar(value.as_ndarray_like(), chunk_spec.dtype.to_dtype()):
299+
if chunk_selection == () or is_scalar(
300+
value.as_ndarray_like(), chunk_spec.dtype.to_native_dtype()
301+
):
300302
chunk_value = value
301303
else:
302304
chunk_value = value[out_selection]
@@ -317,7 +319,7 @@ def _merge_chunk_array(
317319
if existing_chunk_array is None:
318320
chunk_array = chunk_spec.prototype.nd_buffer.create(
319321
shape=chunk_spec.shape,
320-
dtype=chunk_spec.dtype.to_dtype(),
322+
dtype=chunk_spec.dtype.to_native_dtype(),
321323
order=chunk_spec.order,
322324
fill_value=fill_value_or_default(chunk_spec),
323325
)

src/zarr/core/common.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010
from typing import (
1111
TYPE_CHECKING,
1212
Any,
13+
Generic,
1314
Literal,
15+
TypedDict,
1416
TypeVar,
1517
cast,
1618
overload,
@@ -39,6 +41,14 @@
3941
AccessModeLiteral = Literal["r", "r+", "a", "w", "w-"]
4042
DimensionNames = Iterable[str | None] | None
4143

44+
TName = TypeVar("TName", bound=str)
45+
TConfig = TypeVar("TConfig", bound=Mapping[str, object])
46+
47+
48+
class NamedConfig(TypedDict, Generic[TName, TConfig]):
49+
name: TName
50+
configuration: TConfig
51+
4252

4353
def product(tup: ChunkCoords) -> int:
4454
return functools.reduce(operator.mul, tup, 1)

src/zarr/core/dtype/__init__.py

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,13 @@
1616
if TYPE_CHECKING:
1717
from zarr.core.common import ZarrFormat
1818

19+
from collections.abc import Mapping
20+
1921
import numpy as np
2022
import numpy.typing as npt
2123

2224
from zarr.core.common import JSON
2325
from zarr.core.dtype.npy.string import (
24-
_NUMPY_SUPPORTS_VLEN_STRING,
2526
FixedLengthASCII,
2627
FixedLengthUTF32,
2728
VariableLengthString,
@@ -102,7 +103,7 @@
102103
)
103104

104105
# This type models inputs that can be coerced to a ZDType
105-
ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | dict[str, JSON] | str
106+
ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | Mapping[str, JSON] | str
106107

107108
for dtype in ANY_DTYPE:
108109
# mypy does not know that all the elements of ANY_DTYPE are subclasses of ZDType
@@ -114,42 +115,41 @@ def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[TBaseDType,
114115
"""
115116
Get a data type wrapper (an instance of ``ZDType``) from a native data type, e.g. a numpy dtype.
116117
"""
117-
data_type_registry.lazy_load()
118118
if not isinstance(dtype, np.dtype):
119-
# TODO: This check has a lot of assumptions in it! Chiefly, we assume that the
120-
# numpy object dtype contains variable length strings, which is not in general true
121-
# When / if zarr python supports ragged arrays, for example, this check will fail!
122-
if dtype in (str, "str", "|T16", "O", "|O", np.dtypes.ObjectDType()):
123-
if _NUMPY_SUPPORTS_VLEN_STRING:
124-
na_dtype = np.dtype("T")
125-
else:
126-
na_dtype = np.dtype("O")
127-
elif isinstance(dtype, list):
119+
na_dtype: np.dtype[np.generic]
120+
if isinstance(dtype, list):
128121
# this is a valid _VoidDTypeLike check
129122
na_dtype = np.dtype([tuple(d) for d in dtype])
130123
else:
131124
na_dtype = np.dtype(dtype)
132125
else:
133126
na_dtype = dtype
134-
return data_type_registry.match_dtype(na_dtype)
127+
return data_type_registry.match_dtype(dtype=na_dtype)
128+
129+
130+
def get_data_type_from_json_v3(
131+
dtype_spec: JSON,
132+
) -> ZDType[TBaseDType, TBaseScalar]:
133+
return data_type_registry.match_json_v3(dtype_spec)
135134

136135

137-
def get_data_type_from_json(
138-
dtype: JSON, zarr_format: ZarrFormat
136+
def get_data_type_from_json_v2(
137+
dtype_spec: JSON, *, object_codec_id: str | None = None
139138
) -> ZDType[TBaseDType, TBaseScalar]:
140-
return data_type_registry.match_json(dtype, zarr_format=zarr_format)
139+
return data_type_registry.match_json_v2(dtype_spec, object_codec_id=object_codec_id)
141140

142141

143-
def parse_data_type(dtype: ZDTypeLike, zarr_format: ZarrFormat) -> ZDType[TBaseDType, TBaseScalar]:
142+
def parse_data_type(
143+
dtype_spec: ZDTypeLike, *, zarr_format: ZarrFormat, object_codec_id: str | None = None
144+
) -> ZDType[TBaseDType, TBaseScalar]:
144145
"""
145146
Interpret the input as a ZDType instance.
146147
"""
147-
if isinstance(dtype, ZDType):
148-
return dtype
149-
elif isinstance(dtype, dict):
150-
# This branch assumes that the data type has been specified in the JSON form
151-
# but it's also possible for numpy data types to be specified as dictionaries, which will
152-
# cause an error in the `get_data_type_from_json`, but that's ok for now
153-
return get_data_type_from_json(dtype, zarr_format=zarr_format) # type: ignore[arg-type]
154-
else:
155-
return get_data_type_from_native_dtype(dtype)
148+
if isinstance(dtype_spec, ZDType):
149+
return dtype_spec
150+
# a Mapping together with zarr_format 3 means that we have a JSON object representation of the dtype
151+
if zarr_format == 3 and isinstance(dtype_spec, Mapping):
152+
return get_data_type_from_json_v3(dtype_spec) # type: ignore[arg-type]
153+
# otherwise, we have either a numpy dtype string, or a zarr v3 dtype string, and in either case
154+
# we can create a numpy dtype from it, and do the dtype inference from that
155+
return get_data_type_from_native_dtype(dtype_spec) # type: ignore[arg-type]

0 commit comments

Comments
 (0)