From 32e60d20a5352d1827a34dc71f7ddefb04442259 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 21 Feb 2025 13:43:34 +0100 Subject: [PATCH 001/129] modernize typing --- src/zarr/core/strings.py | 88 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 src/zarr/core/strings.py diff --git a/src/zarr/core/strings.py b/src/zarr/core/strings.py new file mode 100644 index 0000000000..5bc7ceece5 --- /dev/null +++ b/src/zarr/core/strings.py @@ -0,0 +1,88 @@ +"""This module contains utilities for working with string arrays across +different versions of Numpy. +""" + +from __future__ import annotations + +from typing import Any, cast +from warnings import warn + +import numpy as np + +# _STRING_DTYPE is the in-memory datatype that will be used for V3 string arrays +# when reading data back from Zarr. +# Any valid string-like datatype should be fine for *setting* data. + +_STRING_DTYPE: np.dtypes.StringDType | np.dtypes.ObjectDType +_NUMPY_SUPPORTS_VLEN_STRING: bool + + +def cast_array( + data: np.ndarray[Any, np.dtype[Any]], +) -> np.ndarray[Any, np.dtypes.StringDType | np.dtypes.ObjectDType]: + raise NotImplementedError + + +try: + # this new vlen string dtype was added in NumPy 2.0 + _STRING_DTYPE = np.dtypes.StringDType() + _NUMPY_SUPPORTS_VLEN_STRING = True + + def cast_array( + data: np.ndarray[Any, np.dtype[Any]], + ) -> np.ndarray[Any, np.dtypes.StringDType | np.dtypes.ObjectDType]: + out = data.astype(_STRING_DTYPE, copy=False) + return cast(np.ndarray[Any, np.dtypes.StringDType], out) + +except AttributeError: + # if not available, we fall back on an object array of strings, as in Zarr < 3 + _STRING_DTYPE = np.dtypes.ObjectDType() + _NUMPY_SUPPORTS_VLEN_STRING = False + + def cast_array( + data: np.ndarray[Any, np.dtype[Any]], + ) -> np.ndarray[Any, np.dtypes.StringDType | np.dtypes.ObjectDType]: + out = data.astype(_STRING_DTYPE, copy=False) + return cast(np.ndarray[Any, np.dtypes.ObjectDType], out) + + +def 
cast_to_string_dtype( + data: np.ndarray[Any, np.dtype[Any]], safe: bool = False +) -> np.ndarray[Any, np.dtypes.StringDType | np.dtypes.ObjectDType]: + """Take any data and attempt to cast to to our preferred string dtype. + + data : np.ndarray + The data to cast + + safe : bool + If True, do not issue a warning if the data is cast from object to string dtype. + + """ + if np.issubdtype(data.dtype, np.str_): + # legacy fixed-width string type (e.g. "= 2.", + stacklevel=2, + ) + return cast_array(data) + raise ValueError(f"Cannot cast dtype {data.dtype} to string dtype") From f104f275270c7748549a3f43d9b1862c71114afe Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 24 Feb 2025 14:41:25 +0100 Subject: [PATCH 002/129] lint --- src/zarr/core/common.py | 11 +++++++++++ src/zarr/core/metadata/v3.py | 2 ++ src/zarr/core/strings.py | 21 ++++++++++---------- tests/test_strings.py | 37 ++++++++++++++++++++++++++++++++++++ 4 files changed, 61 insertions(+), 10 deletions(-) create mode 100644 tests/test_strings.py diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index e86347d808..1ec7553802 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -22,6 +22,7 @@ from typing_extensions import ReadOnly from zarr.core.config import config as zarr_config +from zarr.core.strings import _VLEN_STRING_DTYPE if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Iterator @@ -190,6 +191,16 @@ def parse_bool(data: Any) -> bool: raise ValueError(f"Expected bool, got {data} instead.") +def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> np.dtype[Any]: + if dtype is str or dtype == "str": + if zarr_format == 2: + # special case as object + return np.dtype("object") + else: + return _VLEN_STRING_DTYPE + return np.dtype(dtype) + + def _warn_write_empty_chunks_kwarg() -> None: # TODO: link to docs page on array configuration in this message msg = ( diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 
84872d3dbd..a2efb0f9fd 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -35,6 +35,8 @@ ) from zarr.core.config import config from zarr.core.metadata.common import parse_attributes +from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING +from zarr.core.strings import _VLEN_STRING_DTYPE as STRING_NP_DTYPE from zarr.errors import MetadataValidationError, NodeTypeValidationError from zarr.registry import get_codec_class diff --git a/src/zarr/core/strings.py b/src/zarr/core/strings.py index 5bc7ceece5..f14b38840d 100644 --- a/src/zarr/core/strings.py +++ b/src/zarr/core/strings.py @@ -13,42 +13,43 @@ # when reading data back from Zarr. # Any valid string-like datatype should be fine for *setting* data. -_STRING_DTYPE: np.dtypes.StringDType | np.dtypes.ObjectDType +VLenStringType = np.dtypes.StringDType | np.dtypes.ObjectDType +_VLEN_STRING_DTYPE: VLenStringType _NUMPY_SUPPORTS_VLEN_STRING: bool def cast_array( data: np.ndarray[Any, np.dtype[Any]], -) -> np.ndarray[Any, np.dtypes.StringDType | np.dtypes.ObjectDType]: +) -> np.ndarray[Any, VLenStringType]: raise NotImplementedError try: # this new vlen string dtype was added in NumPy 2.0 - _STRING_DTYPE = np.dtypes.StringDType() + _VLEN_STRING_DTYPE = np.dtypes.StringDType() _NUMPY_SUPPORTS_VLEN_STRING = True def cast_array( data: np.ndarray[Any, np.dtype[Any]], - ) -> np.ndarray[Any, np.dtypes.StringDType | np.dtypes.ObjectDType]: - out = data.astype(_STRING_DTYPE, copy=False) + ) -> np.ndarray[Any, VLenStringType]: + out = data.astype(_VLEN_STRING_DTYPE, copy=False) return cast(np.ndarray[Any, np.dtypes.StringDType], out) except AttributeError: # if not available, we fall back on an object array of strings, as in Zarr < 3 - _STRING_DTYPE = np.dtypes.ObjectDType() + _VLEN_STRING_DTYPE = np.dtypes.ObjectDType() _NUMPY_SUPPORTS_VLEN_STRING = False def cast_array( data: np.ndarray[Any, np.dtype[Any]], - ) -> np.ndarray[Any, np.dtypes.StringDType | np.dtypes.ObjectDType]: - out = 
data.astype(_STRING_DTYPE, copy=False) + ) -> np.ndarray[Any, VLenStringType]: + out = data.astype(_VLEN_STRING_DTYPE, copy=False) return cast(np.ndarray[Any, np.dtypes.ObjectDType], out) def cast_to_string_dtype( data: np.ndarray[Any, np.dtype[Any]], safe: bool = False -) -> np.ndarray[Any, np.dtypes.StringDType | np.dtypes.ObjectDType]: +) -> np.ndarray[Any, VLenStringType]: """Take any data and attempt to cast to to our preferred string dtype. data : np.ndarray @@ -63,7 +64,7 @@ def cast_to_string_dtype( return cast_array(data) # out = data.astype(STRING_DTYPE, copy=False) # return cast(np.ndarray[Any, np.dtypes.StringDType | np.dtypes.ObjectDType], out) - if _NUMPY_SUPPORTS_VLEN_STRING and np.issubdtype(data.dtype, _STRING_DTYPE): + if _NUMPY_SUPPORTS_VLEN_STRING and np.issubdtype(data.dtype, _VLEN_STRING_DTYPE): # already a valid string variable length string dtype return cast_array(data) if np.issubdtype(data.dtype, np.object_): diff --git a/tests/test_strings.py b/tests/test_strings.py new file mode 100644 index 0000000000..963f2e305e --- /dev/null +++ b/tests/test_strings.py @@ -0,0 +1,37 @@ +"""Tests for the strings module.""" + +import numpy as np +import pytest + +from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING, _VLEN_STRING_DTYPE, cast_to_string_dtype + + +def test_string_defaults() -> None: + if _NUMPY_SUPPORTS_VLEN_STRING: + assert _VLEN_STRING_DTYPE == np.dtypes.StringDType() + else: + assert _VLEN_STRING_DTYPE == np.dtypes.ObjectDType() + + +def test_cast_to_string_dtype() -> None: + d1 = np.array(["a", "b", "c"]) + assert d1.dtype == np.dtype(" Date: Wed, 26 Feb 2025 09:35:37 +0100 Subject: [PATCH 003/129] new dtypes --- src/zarr/core/_info.py | 11 +- src/zarr/core/array.py | 13 +- src/zarr/core/dtype/__init__.py | 231 +------------------- src/zarr/core/dtype/core.py | 196 +++++++++++++++++ src/zarr/core/metadata/dtype.py | 372 ++++++++++++++++++++++++++++++++ src/zarr/core/metadata/v3.py | 301 +++++++++++++++++++++++++- 
src/zarr/core/strings.py | 4 +- src/zarr/registry.py | 98 ++++++++- tests/test_array.py | 63 +----- tests/test_codecs/test_vlen.py | 48 ++++- tests/test_metadata/test_v3.py | 1 + 11 files changed, 1028 insertions(+), 310 deletions(-) create mode 100644 src/zarr/core/dtype/core.py create mode 100644 src/zarr/core/metadata/dtype.py diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index a5b14d573a..f1803a45dd 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -7,9 +7,11 @@ if TYPE_CHECKING: import numcodecs.abc - from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec - from zarr.core.common import ZarrFormat - from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec +from zarr.core.common import ZarrFormat +from zarr.core.metadata.dtype import BaseDataType + +# from zarr.core.metadata.v3 import DataType @dataclasses.dataclass(kw_only=True) @@ -80,8 +82,7 @@ class ArrayInfo: _type: Literal["Array"] = "Array" _zarr_format: ZarrFormat - _data_type: ZDType[TBaseDType, TBaseScalar] - _fill_value: object + _data_type: np.dtype[Any] | BaseDataType _shape: tuple[int, ...] _shard_shape: tuple[int, ...] | None = None _chunk_shape: tuple[int, ...] 
| None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index b85c5aba4b..3e6716ccf9 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -109,6 +109,7 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) +from zarr.core.metadata.dtype import BaseDataType from zarr.core.metadata.v2 import ( CompressorLikev2, get_object_codec_id, @@ -122,6 +123,7 @@ _parse_array_array_codec, _parse_array_bytes_codec, _parse_bytes_bytes_codec, + get_data_type_from_numpy, get_pipeline_class, ) from zarr.storage._common import StorePath, ensure_no_existing_node, make_store_path @@ -1761,6 +1763,12 @@ async def info_complete(self) -> Any: def _info( self, count_chunks_initialized: int | None = None, count_bytes_stored: int | None = None ) -> Any: + _data_type: np.dtype[Any] | BaseDataType + if isinstance(self.metadata, ArrayV2Metadata): + _data_type = self.metadata.dtype + else: + _data_type = self.metadata.data_type + return ArrayInfo( _zarr_format=self.metadata.zarr_format, _data_type=self._zdtype, @@ -4653,8 +4661,11 @@ def _get_default_chunk_encoding_v3( """ Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype. 
""" + dtype = get_data_type_from_numpy(np_dtype) - dtype_category = categorize_data_type(dtype) + default_filters = zarr_config.get("array.v3_default_filters").get(dtype.type) + default_serializer = zarr_config.get("array.v3_default_serializer").get(dtype.type) + default_compressors = zarr_config.get("array.v3_default_compressors").get(dtype.type) filters = zarr_config.get("array.v3_default_filters").get(dtype_category) compressors = zarr_config.get("array.v3_default_compressors").get(dtype_category) diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index aadf127c9b..58b884ff23 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -1,230 +1,3 @@ -from __future__ import annotations +from zarr.core.dtype.core import ZarrDType -from typing import TYPE_CHECKING, Final, TypeAlias - -from zarr.core.dtype.common import ( - DataTypeValidationError, - DTypeJSON, -) -from zarr.core.dtype.npy.bool import Bool -from zarr.core.dtype.npy.bytes import ( - NullTerminatedBytes, - NullterminatedBytesJSON_V2, - NullTerminatedBytesJSON_V3, - RawBytes, - RawBytesJSON_V2, - RawBytesJSON_V3, - VariableLengthBytes, - VariableLengthBytesJSON_V2, -) -from zarr.core.dtype.npy.complex import Complex64, Complex128 -from zarr.core.dtype.npy.float import Float16, Float32, Float64 -from zarr.core.dtype.npy.int import Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 -from zarr.core.dtype.npy.structured import Structured, StructuredJSON_V2, StructuredJSON_V3 -from zarr.core.dtype.npy.time import ( - DateTime64, - DateTime64JSON_V2, - DateTime64JSON_V3, - TimeDelta64, - TimeDelta64JSON_V2, - TimeDelta64JSON_V3, -) - -if TYPE_CHECKING: - from zarr.core.common import ZarrFormat - -from collections.abc import Mapping - -import numpy as np -import numpy.typing as npt - -from zarr.core.common import JSON -from zarr.core.dtype.npy.string import ( - FixedLengthUTF32, - FixedLengthUTF32JSON_V2, - FixedLengthUTF32JSON_V3, - 
VariableLengthUTF8, - VariableLengthUTF8JSON_V2, -) -from zarr.core.dtype.registry import DataTypeRegistry -from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType - -__all__ = [ - "Bool", - "Complex64", - "Complex128", - "DataTypeRegistry", - "DataTypeValidationError", - "DateTime64", - "DateTime64JSON_V2", - "DateTime64JSON_V3", - "FixedLengthUTF32", - "FixedLengthUTF32JSON_V2", - "FixedLengthUTF32JSON_V3", - "Float16", - "Float32", - "Float64", - "Int8", - "Int16", - "Int32", - "Int64", - "NullTerminatedBytes", - "NullTerminatedBytesJSON_V3", - "NullterminatedBytesJSON_V2", - "RawBytes", - "RawBytesJSON_V2", - "RawBytesJSON_V3", - "Structured", - "StructuredJSON_V2", - "StructuredJSON_V3", - "TBaseDType", - "TBaseScalar", - "TimeDelta64", - "TimeDelta64", - "TimeDelta64JSON_V2", - "TimeDelta64JSON_V3", - "UInt8", - "UInt16", - "UInt32", - "UInt64", - "VariableLengthBytes", - "VariableLengthBytesJSON_V2", - "VariableLengthUTF8", - "VariableLengthUTF8JSON_V2", - "ZDType", - "data_type_registry", - "parse_data_type", -] - -data_type_registry = DataTypeRegistry() - -IntegerDType = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 -INTEGER_DTYPE: Final = Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 - -FloatDType = Float16 | Float32 | Float64 -FLOAT_DTYPE: Final = Float16, Float32, Float64 - -ComplexFloatDType = Complex64 | Complex128 -COMPLEX_FLOAT_DTYPE: Final = Complex64, Complex128 - -StringDType = FixedLengthUTF32 | VariableLengthUTF8 -STRING_DTYPE: Final = FixedLengthUTF32, VariableLengthUTF8 - -TimeDType = DateTime64 | TimeDelta64 -TIME_DTYPE: Final = DateTime64, TimeDelta64 - -BytesDType = RawBytes | NullTerminatedBytes | VariableLengthBytes -BYTES_DTYPE: Final = RawBytes, NullTerminatedBytes, VariableLengthBytes - -AnyDType = ( - Bool - | IntegerDType - | FloatDType - | ComplexFloatDType - | StringDType - | BytesDType - | Structured - | TimeDType - | VariableLengthBytes -) -# mypy has trouble inferring the type of 
variablelengthstring dtype, because its class definition -# depends on the installed numpy version. That's why the type: ignore statement is needed here. -ANY_DTYPE: Final = ( - Bool, - *INTEGER_DTYPE, - *FLOAT_DTYPE, - *COMPLEX_FLOAT_DTYPE, - *STRING_DTYPE, - *BYTES_DTYPE, - Structured, - *TIME_DTYPE, - VariableLengthBytes, -) - -# These are aliases for variable-length UTF-8 strings -# We handle them when a user requests a data type instead of using NumPy's dtype inferece because -# the default NumPy behavior -- to inspect the user-provided array data and choose -# an appropriately sized U dtype -- is unworkable for Zarr. -VLEN_UTF8_ALIAS: Final = ("str", str, "string") - -# This type models inputs that can be coerced to a ZDType -ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | Mapping[str, JSON] | str - -for dtype in ANY_DTYPE: - # mypy does not know that all the elements of ANY_DTYPE are subclasses of ZDType - data_type_registry.register(dtype._zarr_v3_name, dtype) # type: ignore[arg-type] - - -# TODO: find a better name for this function -def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[TBaseDType, TBaseScalar]: - """ - Get a data type wrapper (an instance of ``ZDType``) from a native data type, e.g. a numpy dtype. - """ - if not isinstance(dtype, np.dtype): - na_dtype: np.dtype[np.generic] - if isinstance(dtype, list): - # this is a valid _VoidDTypeLike check - na_dtype = np.dtype([tuple(d) for d in dtype]) - else: - na_dtype = np.dtype(dtype) - else: - na_dtype = dtype - return data_type_registry.match_dtype(dtype=na_dtype) - - -def get_data_type_from_json( - dtype_spec: DTypeJSON, *, zarr_format: ZarrFormat -) -> ZDType[TBaseDType, TBaseScalar]: - """ - Given a JSON representation of a data type and a Zarr format version, - attempt to create a ZDType instance from the registered ZDType classes. 
- """ - return data_type_registry.match_json(dtype_spec, zarr_format=zarr_format) - - -def parse_data_type( - dtype_spec: ZDTypeLike, - *, - zarr_format: ZarrFormat, -) -> ZDType[TBaseDType, TBaseScalar]: - """ - Interpret the input as a ZDType instance. - - Parameters - ---------- - dtype_spec : ZDTypeLike - The input to be interpreted as a ZDType instance. This could be a native data type - (e.g., a NumPy data type), a Python object that can be converted into a native data type, - a ZDType instance (in which case the input is returned unchanged), or a JSON object - representation of a data type. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - ZDType[TBaseDType, TBaseScalar] - The ZDType instance corresponding to the input. - - Examples - -------- - >>> from zarr.dtype import parse_data_type - >>> import numpy as np - >>> parse_data_type("int32", zarr_format=2) - Int32(endianness='little') - >>> parse_data_type(np.dtype('S10'), zarr_format=2) - NullTerminatedBytes(length=10) - >>> parse_data_type({"name": "numpy.datetime64", "configuration": {"unit": "s", "scale_factor": 10}}, zarr_format=3) - DateTime64(endianness='little', scale_factor=10, unit='s') - """ - if isinstance(dtype_spec, ZDType): - return dtype_spec - # dict and zarr_format 3 means that we have a JSON object representation of the dtype - if zarr_format == 3 and isinstance(dtype_spec, Mapping): - return get_data_type_from_json(dtype_spec, zarr_format=3) - if dtype_spec in VLEN_UTF8_ALIAS: - # If the dtype request is one of the aliases for variable-length UTF-8 strings, - # return that dtype. 
- return VariableLengthUTF8() # type: ignore[return-value] - # otherwise, we have either a numpy dtype string, or a zarr v3 dtype string, and in either case - # we can create a numpy dtype from it, and do the dtype inference from that - return get_data_type_from_native_dtype(dtype_spec) # type: ignore[arg-type] +__all__ = ["ZarrDType"] diff --git a/src/zarr/core/dtype/core.py b/src/zarr/core/dtype/core.py new file mode 100644 index 0000000000..c6460706aa --- /dev/null +++ b/src/zarr/core/dtype/core.py @@ -0,0 +1,196 @@ +""" +# Overview + +This module provides a proof-of-concept standalone interface for managing dtypes in the zarr-python codebase. + +The `ZarrDType` class introduced in this module effectively acts as a replacement for `np.dtype` throughout the +zarr-python codebase. It attempts to encapsulate all relevant runtime information necessary for working with +dtypes in the context of the Zarr V3 specification (e.g. is this a core dtype or not, how many bytes and what +endianness is the dtype etc). By providing this abstraction, the module aims to: + +- Simplify dtype management within zarr-python +- Support runtime flexibility and custom extensions +- Remove unnecessary dependencies on the numpy API + +## Extensibility + +The module attempts to support user-driven extensions, allowing developers to introduce custom dtypes +without requiring immediate changes to zarr-python. Extensions can leverage the current entrypoint mechanism, +enabling integration of experimental features. Over time, widely adopted extensions may be formalized through +inclusion in zarr-python or standardized via a Zarr Enhancement Proposal (ZEP), but this is not essential. 
+ +## Examples + +### Core `dtype` Registration + +The following example demonstrates how to register a built-in `dtype` in the core codebase: + +```python +from zarr.core.dtype import ZarrDType +from zarr.registry import register_v3dtype + +class Float16(ZarrDType): + zarr_spec_format = "3" + experimental = False + endianness = "little" + byte_count = 2 + to_numpy = np.dtype('float16') + +register_v3dtype(Float16) +``` + +### Entrypoint Extension + +The following example demonstrates how users can register a new `bfloat16` dtype for Zarr. +This approach adheres to the existing Zarr entrypoint pattern as much as possible, ensuring +consistency with other extensions. The code below would typically be part of a Python package +that specifies the entrypoints for the extension: + +```python +import ml_dtypes +from zarr.core.dtype import ZarrDType # User inherits from ZarrDType when creating their dtype + +class Bfloat16(ZarrDType): + zarr_spec_format = "3" + experimental = True + endianness = "little" + byte_count = 2 + to_numpy = np.dtype('bfloat16') # Enabled by importing ml_dtypes + configuration_v3 = { + "version": "example_value", + "author": "example_value", + "ml_dtypes_version": "example_value" + } +``` + +### dtype lookup + +The following examples demonstrate how to perform a lookup for the relevant ZarrDType, given +a string that matches the dtype Zarr specification ID, or a numpy dtype object: + +``` +from zarr.registry import get_v3dtype_class, get_v3dtype_class_from_numpy + +get_v3dtype_class('complex64') # returns little-endian Complex64 ZarrDType +get_v3dtype_class('not_registered_dtype') # ValueError + +get_v3dtype_class_from_numpy('>i2') # returns big-endian Int16 ZarrDType +get_v3dtype_class_from_numpy(np.dtype('float32')) # returns little-endian Float32 ZarrDType +get_v3dtype_class_from_numpy('i10') # ValueError +``` + +### String dtypes + +The following indicates one possibility for supporting variable-length strings. 
It is via the +entrypoint mechanism as in a previous example. The Apache Arrow specification does not currently +include a dtype for fixed-length strings (only for fixed-length bytes) and so I am using string +here to implicitly refer to a variable-length string data (there may be some subtleties with codecs +that means this needs to be refined further): + +```python +import numpy as np +from zarr.core.dtype import ZarrDType # User inherits from ZarrDType when creating their dtype + +try: + to_numpy = np.dtypes.StringDType() +except AttributeError: + to_numpy = np.dtypes.ObjectDType() + +class String(ZarrDType): + zarr_spec_format = "3" + experimental = True + endianness = 'little' + byte_count = None # None is defined to mean variable + to_numpy = to_numpy +``` + +### int4 dtype + +There is currently considerable interest in the AI community in 'quantising' models - storing +models at reduced precision, while minimising loss of information content. There are a number +of sub-byte dtypes that the community are using e.g. int4. Unfortunately numpy does not +currently have support for handling such sub-byte dtypes in an easy way. 
However, they can +still be held in a numpy array and then passed (in a zero-copy way) to something like pytorch +which can handle appropriately: + +```python +import numpy as np +from zarr.core.dtype import ZarrDType # User inherits from ZarrDType when creating their dtype + +class Int4(ZarrDType): + zarr_spec_format = "3" + experimental = True + endianness = 'little' + byte_count = 1 # this is ugly, but I could change this from byte_count to bit_count if there was consensus + to_numpy = np.dtype('B') # could also be np.dtype('V1'), but this would prevent bit-twiddling + configuration_v3 = { + "version": "example_value", + "author": "example_value", + } +``` +""" + +from __future__ import annotations + +from typing import Any, Literal + +import numpy as np + + +class FrozenClassVariables(type): + def __setattr__(cls, attr: str, value: object) -> None: + if hasattr(cls, attr): + raise ValueError(f"Attribute {attr} on ZarrDType class can not be changed once set.") + else: + raise AttributeError(f"'{cls}' object has no attribute '{attr}'") + + +class ZarrDType(metaclass=FrozenClassVariables): + zarr_spec_format: Literal["2", "3"] # the version of the zarr spec used + experimental: bool # is this in the core spec or not + endianness: Literal[ + "big", "little", None + ] # None indicates not defined i.e. single byte or byte strings + byte_count: int | None # None indicates variable count + to_numpy: np.dtype[Any] # may involve installing a a numpy extension e.g. 
ml_dtypes; + + configuration_v3: dict | None # TODO: understand better how this is recommended by the spec + + _zarr_spec_identifier: str # implementation detail used to map to core spec + + def __init_subclass__( # enforces all required fields are set and basic sanity checks + cls, + **kwargs, + ) -> None: + required_attrs = [ + "zarr_spec_format", + "experimental", + "endianness", + "byte_count", + "to_numpy", + ] + for attr in required_attrs: + if not hasattr(cls, attr): + raise ValueError(f"{attr} is a required attribute for a Zarr dtype.") + + if not hasattr(cls, "configuration_v3"): + cls.configuration_v3 = None + + cls._zarr_spec_identifier = ( + "big_" + cls.__qualname__.lower() + if cls.endianness == "big" + else cls.__qualname__.lower() + ) # how this dtype is identified in core spec; convention is prefix with big_ for big-endian + + cls._validate() # sanity check on basic requirements + + super().__init_subclass__(**kwargs) + + # TODO: add further checks + @classmethod + def _validate(cls): + if cls.byte_count is not None and cls.byte_count <= 0: + raise ValueError("byte_count must be a positive integer.") + + if cls.byte_count == 1 and cls.endianness is not None: + raise ValueError("Endianness must be None for single-byte types.") diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py new file mode 100644 index 0000000000..ab101f2fad --- /dev/null +++ b/src/zarr/core/metadata/dtype.py @@ -0,0 +1,372 @@ +from abc import ABC +from dataclasses import dataclass, field +from typing import Any, ClassVar, Literal, Self, get_args + +import numpy as np + +from zarr.abc.metadata import Metadata +from zarr.core.common import JSON +from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING +from zarr.registry import register_data_type + +Endianness = Literal["little", "big", "native"] +DataTypeFlavor = Literal["boolean", "numeric", "string", "bytes"] + + +def endianness_to_numpy_str(endianness: Endianness | None) -> Literal[">", "<", "=", 
"|"]: + match endianness: + case "little": + return "<" + case "big": + return ">" + case "native": + return "=" + case None: + return "|" + raise ValueError( + f"Invalid endianness: {endianness}. Expected one of {get_args(endianness)} or None" + ) + + +class BaseDataType(ABC, Metadata): + name: ClassVar[str] + numpy_character_code: ClassVar[str] + item_size: ClassVar[int | None] + type: ClassVar[DataTypeFlavor] + capacity: int + + def __init_subclass__(cls, **kwargs: object) -> None: + required_attrs = [ + "name", + "numpy_character_code", + "item_size", + "type", + ] + for attr in required_attrs: + if not hasattr(cls, attr): + raise ValueError(f"{attr} is a required attribute for a Zarr dtype.") + + return super().__init_subclass__(**kwargs) + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.name} + + def to_numpy(self: Self, *, endianness: Endianness | None = None) -> np.dtype[Any]: + endian_str = endianness_to_numpy_str(endianness) + return np.dtype(endian_str + self.numpy_character_code) + + +@dataclass(frozen=True, kw_only=True) +class Bool(BaseDataType): + name = "bool" + item_size = 1 + type = "boolean" + numpy_character_code = "?" 
+ capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.BoolDType: + return super().to_numpy(endianness=endianness) + + +register_data_type(Bool) + + +@dataclass(frozen=True, kw_only=True) +class Int8(BaseDataType): + name = "int8" + item_size = 1 + type = "numeric" + numpy_character_code = "b" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int8DType: + return super().to_numpy(endianness=endianness) + + +register_data_type(Int8) + + +@dataclass(frozen=True, kw_only=True) +class UInt8(BaseDataType): + name = "uint8" + item_size = 2 + type = "numeric" + numpy_character_code = "B" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt8DType: + return super().to_numpy(endianness=endianness) + + +register_data_type(UInt8) + + +@dataclass(frozen=True, kw_only=True) +class Int16(BaseDataType): + name = "int16" + item_size = 2 + type = "numeric" + numpy_character_code = "h" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int16DType: + return super().to_numpy(endianness=endianness) + + +register_data_type(Int16) + + +@dataclass(frozen=True, kw_only=True) +class UInt16(BaseDataType): + name = "uint16" + item_size = 2 + type = "numeric" + numpy_character_code = "H" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt16DType: + return super().to_numpy(endianness=endianness) + + +register_data_type(UInt16) + + +@dataclass(frozen=True, kw_only=True) +class Int32(BaseDataType): + name = "int32" + item_size = 4 + type = "numeric" + numpy_character_code = "i" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int32DType: + return 
super().to_numpy(endianness=endianness) + + +register_data_type(Int32) + + +@dataclass(frozen=True, kw_only=True) +class UInt32(BaseDataType): + name = "uint32" + item_size = 4 + type = "numeric" + numpy_character_code = "I" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt32DType: + return super().to_numpy(endianness=endianness) + + +register_data_type(UInt32) + + +@dataclass(frozen=True, kw_only=True) +class Int64(BaseDataType): + name = "int64" + item_size = 8 + type = "numeric" + numpy_character_code = "l" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int64DType: + return super().to_numpy(endianness=endianness) + + +register_data_type(Int64) + + +@dataclass(frozen=True, kw_only=True) +class UInt64(BaseDataType): + name = "uint64" + item_size = 8 + type = "numeric" + numpy_character_code = "L" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt64DType: + return super().to_numpy(endianness=endianness) + + +register_data_type(UInt64) + + +@dataclass(frozen=True, kw_only=True) +class Float16(BaseDataType): + name = "float16" + item_size = 2 + type = "numeric" + numpy_character_code = "e" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float16DType: + return super().to_numpy(endianness=endianness) + + +register_data_type(Float16) + + +@dataclass(frozen=True, kw_only=True) +class Float32(BaseDataType): + name = "float32" + item_size = 4 + type = "numeric" + numpy_character_code = "f" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float32DType: + return super().to_numpy(endianness=endianness) + + +register_data_type(Float32) + + +@dataclass(frozen=True, kw_only=True) +class 
Float64(BaseDataType): + name = "float64" + item_size = 8 + type = "numeric" + numpy_character_code = "d" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float64DType: + return super().to_numpy(endianness=endianness) + + +register_data_type(Float64) + + +@dataclass(frozen=True, kw_only=True) +class Complex64(BaseDataType): + name = "complex64" + item_size = 16 + type = "numeric" + numpy_character_code = "F" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex64DType: + return super().to_numpy(endianness=endianness) + + +register_data_type(Complex64) + + +@dataclass(frozen=True, kw_only=True) +class Complex128(BaseDataType): + name = "complex64" + item_size = 32 + type = "numeric" + numpy_character_code = "D" + capacity: int = field(default=1, init=False) + + def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex128DType: + return super().to_numpy(endianness=endianness) + + +register_data_type(Complex128) + + +@dataclass(frozen=True, kw_only=True) +class StaticByteString(BaseDataType): + name = "numpy/static_byte_string" + type = "string" + numpy_character_code = "S" + item_size = 1 + capacity: int + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.name, "configuration": {"capacity": self.capacity}} + + def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.bytes_]: + endianness_code = endianness_to_numpy_str(endianness) + return np.dtype(endianness_code + self.numpy_character_code + str(self.capacity)) + + +register_data_type(StaticByteString) + +if _NUMPY_SUPPORTS_VLEN_STRING: + + @dataclass(frozen=True, kw_only=True) + class VlenString(BaseDataType): + name = "numpy/vlen_string" + type = "string" + numpy_character_code = "T" + item_size = None + capacity: int + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.name, "configuration": 
{"capacity": self.capacity}} + + def to_numpy( + self, endianness: Endianness | None = "native" + ) -> np.dtype[np.dtypes.StringDType]: + endianness_code = endianness_to_numpy_str(endianness) + return np.dtype(endianness_code + self.numpy_character_code) + +else: + + @dataclass(frozen=True, kw_only=True) + class VlenString(BaseDataType): + name = "numpy/vlen_string" + type = "string" + numpy_character_code = "O" + item_size = None + capacity: int + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.name, "configuration": {"capacity": self.capacity}} + + def to_numpy( + self, endianness: Endianness | None = "native" + ) -> np.dtype[np.dtypes.ObjectDType]: + endianness_code = endianness_to_numpy_str(endianness) + return np.dtype(endianness_code + self.numpy_character_code) + + +register_data_type(VlenString) + + +@dataclass(frozen=True, kw_only=True) +class StaticUnicodeString(BaseDataType): + name = "numpy/static_unicode_string" + type = "string" + numpy_character_code = "U" + item_size = 4 + capacity: int + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.name, "configuration": {"capacity": self.capacity}} + + def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.str_]: + endianness_code = endianness_to_numpy_str(endianness) + return np.dtype(endianness_code + self.numpy_character_code + str(self.capacity)) + + +register_data_type(StaticUnicodeString) + + +@dataclass(frozen=True, kw_only=True) +class StaticRawBytes(BaseDataType): + name = "r*" + type = "bytes" + numpy_character_code = "V" + item_size = 1 + capacity: int + + def to_dict(self) -> dict[str, JSON]: + return {"name": f"r{self.capacity * 8}"} + + def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.void]: + endianness_code = endianness_to_numpy_str(endianness) + return np.dtype(endianness_code + self.numpy_character_code + str(self.capacity)) + + +def parse_dtype(dtype: npt.DtypeLike | BaseDataType) -> BaseDataType: + from 
zarr.registry import get_data_type_from_numpy + + if isinstance(dtype, BaseDataType): + return dtype + return get_data_type_from_numpy(dtype) + + +register_data_type(StaticRawBytes) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index a2efb0f9fd..797cd6ea7a 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -4,8 +4,7 @@ from zarr.abc.metadata import Metadata from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.dtype import VariableLengthUTF8, ZDType, get_data_type_from_json -from zarr.core.dtype.common import check_dtype_spec_v3 +from zarr.core.metadata.dtype import BaseDataType if TYPE_CHECKING: from typing import Self @@ -156,7 +155,7 @@ def __init__( self, *, shape: Iterable[int], - data_type: ZDType[TBaseDType, TBaseScalar], + data_type: npt.DTypeLike | BaseDataType, chunk_grid: dict[str, JSON] | ChunkGrid, chunk_key_encoding: ChunkKeyEncodingLike, fill_value: object, @@ -352,3 +351,299 @@ def update_shape(self, shape: ChunkCoords) -> Self: def update_attributes(self, attributes: dict[str, JSON]) -> Self: return replace(self, attributes=attributes) + + +# enum Literals can't be used in typing, so we have to restate all of the V3 dtypes as types +# https://github.com/python/typing/issues/781 + +BOOL_DTYPE = Literal["bool"] +BOOL = np.bool_ +INTEGER_DTYPE = Literal["int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64"] +INTEGER = np.int8 | np.int16 | np.int32 | np.int64 | np.uint8 | np.uint16 | np.uint32 | np.uint64 +FLOAT_DTYPE = Literal["float16", "float32", "float64"] +FLOAT = np.float16 | np.float32 | np.float64 +COMPLEX_DTYPE = Literal["complex64", "complex128"] +COMPLEX = np.complex64 | np.complex128 +STRING_DTYPE = Literal["string"] +STRING = np.str_ +BYTES_DTYPE = Literal["bytes"] +BYTES = np.bytes_ + +ALL_DTYPES = BOOL_DTYPE | INTEGER_DTYPE | FLOAT_DTYPE | COMPLEX_DTYPE | STRING_DTYPE | BYTES_DTYPE + + +@overload +def parse_fill_value( + fill_value: complex | 
str | bytes | np.generic | Sequence[Any] | bool, + dtype: BOOL_DTYPE, +) -> BOOL: ... + + +@overload +def parse_fill_value( + fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool, + dtype: INTEGER_DTYPE, +) -> INTEGER: ... + + +@overload +def parse_fill_value( + fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool, + dtype: FLOAT_DTYPE, +) -> FLOAT: ... + + +@overload +def parse_fill_value( + fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool, + dtype: COMPLEX_DTYPE, +) -> COMPLEX: ... + + +@overload +def parse_fill_value( + fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool, + dtype: STRING_DTYPE, +) -> STRING: ... + + +@overload +def parse_fill_value( + fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool, + dtype: BYTES_DTYPE, +) -> BYTES: ... + + +def parse_fill_value( + fill_value: Any, + dtype: ALL_DTYPES, +) -> Any: + """ + Parse `fill_value`, a potential fill value, into an instance of `dtype`, a data type. + If `fill_value` is `None`, then this function will return the result of casting the value 0 + to the provided data type. Otherwise, `fill_value` will be cast to the provided data type. + + Note that some numpy dtypes use very permissive casting rules. For example, + `np.bool_({'not remotely a bool'})` returns `True`. Thus this function should not be used for + validating that the provided fill value is a valid instance of the data type. + + Parameters + ---------- + fill_value : Any + A potential fill value. + dtype : str + A valid Zarr format 3 DataType. 
+ + Returns + ------- + A scalar instance of `dtype` + """ + data_type = DataType(dtype) + if fill_value is None: + raise ValueError("Fill value cannot be None") + if data_type == DataType.string: + return np.str_(fill_value) + if data_type == DataType.bytes: + return np.bytes_(fill_value) + + # the rest are numeric types + np_dtype = cast(np.dtype[Any], data_type.to_numpy()) + + if isinstance(fill_value, Sequence) and not isinstance(fill_value, str): + if data_type in (DataType.complex64, DataType.complex128): + if len(fill_value) == 2: + decoded_fill_value = tuple( + SPECIAL_FLOATS_ENCODED.get(value, value) for value in fill_value + ) + # complex datatypes serialize to JSON arrays with two elements + return np_dtype.type(complex(*decoded_fill_value)) + else: + msg = ( + f"Got an invalid fill value for complex data type {data_type.value}." + f"Expected a sequence with 2 elements, but {fill_value!r} has " + f"length {len(fill_value)}." + ) + raise ValueError(msg) + msg = f"Cannot parse non-string sequence {fill_value!r} as a scalar with type {data_type.value}." + raise TypeError(msg) + + # Cast the fill_value to the given dtype + try: + # This warning filter can be removed after Zarr supports numpy>=2.0 + # The warning is saying that the future behavior of out of bounds casting will be to raise + # an OverflowError. In the meantime, we allow overflow and catch cases where + # fill_value != casted_value below. 
+ with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=DeprecationWarning) + casted_value = np.dtype(np_dtype).type(fill_value) + except (ValueError, OverflowError, TypeError) as e: + raise ValueError(f"fill value {fill_value!r} is not valid for dtype {data_type}") from e + # Check if the value is still representable by the dtype + if (fill_value == "NaN" and np.isnan(casted_value)) or ( + fill_value in ["Infinity", "-Infinity"] and not np.isfinite(casted_value) + ): + pass + elif np_dtype.kind == "f": + # float comparison is not exact, especially when dtype str | bytes | np.generic: + if dtype == DataType.string: + return "" + elif dtype == DataType.bytes: + return b"" + else: + np_dtype = dtype.to_numpy() + np_dtype = cast(np.dtype[Any], np_dtype) + return np_dtype.type(0) # type: ignore[misc] + + +# For type checking +_bool = bool + + +class DataTypex(Enum): + bool = "bool" + int8 = "int8" + int16 = "int16" + int32 = "int32" + int64 = "int64" + uint8 = "uint8" + uint16 = "uint16" + uint32 = "uint32" + uint64 = "uint64" + float16 = "float16" + float32 = "float32" + float64 = "float64" + complex64 = "complex64" + complex128 = "complex128" + string = "string" + bytes = "bytes" + + @property + def byte_count(self) -> int | None: + data_type_byte_counts = { + DataType.bool: 1, + DataType.int8: 1, + DataType.int16: 2, + DataType.int32: 4, + DataType.int64: 8, + DataType.uint8: 1, + DataType.uint16: 2, + DataType.uint32: 4, + DataType.uint64: 8, + DataType.float16: 2, + DataType.float32: 4, + DataType.float64: 8, + DataType.complex64: 8, + DataType.complex128: 16, + } + try: + return data_type_byte_counts[self] + except KeyError: + # string and bytes have variable length + return None + + @property + def has_endianness(self) -> _bool: + return self.byte_count is not None and self.byte_count != 1 + + def to_numpy_shortname(self) -> str: + data_type_to_numpy = { + DataType.bool: "bool", + DataType.int8: "i1", + DataType.int16: "i2", + 
DataType.int32: "i4", + DataType.int64: "i8", + DataType.uint8: "u1", + DataType.uint16: "u2", + DataType.uint32: "u4", + DataType.uint64: "u8", + DataType.float16: "f2", + DataType.float32: "f4", + DataType.float64: "f8", + DataType.complex64: "c8", + DataType.complex128: "c16", + } + return data_type_to_numpy[self] + + def to_numpy(self) -> np.dtypes.StringDType | np.dtypes.ObjectDType | np.dtype[Any]: + # note: it is not possible to round trip DataType <-> np.dtype + # due to the fact that DataType.string and DataType.bytes both + # generally return np.dtype("O") from this function, even though + # they can originate as fixed-length types (e.g. " DataType: + if dtype.kind in "UT": + return DataType.string + elif dtype.kind == "S": + return DataType.bytes + elif not _NUMPY_SUPPORTS_VLEN_STRING and dtype.kind == "O": + # numpy < 2.0 does not support vlen string dtype + # so we fall back on object array of strings + return DataType.string + dtype_to_data_type = { + "|b1": "bool", + "bool": "bool", + "|i1": "int8", + " DataType: + if dtype is None: + return DataType[DEFAULT_DTYPE] + if isinstance(dtype, DataType): + return dtype + try: + return DataType(dtype) + except ValueError: + pass + try: + dtype = np.dtype(dtype) + except (ValueError, TypeError) as e: + raise ValueError(f"Invalid Zarr format 3 data_type: {dtype}") from e + # check that this is a valid v3 data_type + try: + data_type = DataType.from_numpy(dtype) + except KeyError as e: + raise ValueError(f"Invalid Zarr format 3 data_type: {dtype}") from e + return data_type diff --git a/src/zarr/core/strings.py b/src/zarr/core/strings.py index f14b38840d..15c30b6f9b 100644 --- a/src/zarr/core/strings.py +++ b/src/zarr/core/strings.py @@ -4,7 +4,7 @@ from __future__ import annotations -from typing import Any, cast +from typing import Any, Union, cast from warnings import warn import numpy as np @@ -13,7 +13,7 @@ # when reading data back from Zarr. 
# Any valid string-like datatype should be fine for *setting* data. -VLenStringType = np.dtypes.StringDType | np.dtypes.ObjectDType +VLenStringType = Union["np.dtypes.StringDType", "np.dtypes.ObjectDType"] _VLEN_STRING_DTYPE: VLenStringType _NUMPY_SUPPORTS_VLEN_STRING: bool diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 189d42abed..ad7bad40ae 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -5,12 +5,16 @@ from importlib.metadata import entry_points as get_entry_points from typing import TYPE_CHECKING, Any, Generic, TypeVar +import numpy as np + from zarr.core.config import BadConfigError, config from zarr.core.dtype import data_type_registry if TYPE_CHECKING: from importlib.metadata import EntryPoint + import numpy.typing as npt + from zarr.abc.codec import ( ArrayArrayCodec, ArrayBytesCodec, @@ -20,6 +24,8 @@ ) from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON + from zarr.core.dtype import ZarrDType + from zarr.core.metadata.dtype import BaseDataType __all__ = [ "Registry", @@ -27,10 +33,14 @@ "get_codec_class", "get_ndbuffer_class", "get_pipeline_class", + "get_v2dtype_class", + "get_v3dtype_class", "register_buffer", "register_codec", "register_ndbuffer", "register_pipeline", + "register_v2dtype", + "register_v3dtype", ] T = TypeVar("T") @@ -57,6 +67,9 @@ def register(self, cls: type[T], qualname: str | None = None) -> None: __pipeline_registry: Registry[CodecPipeline] = Registry() __buffer_registry: Registry[Buffer] = Registry() __ndbuffer_registry: Registry[NDBuffer] = Registry() +__data_type_registry: Registry[BaseDataType] = Registry() +__v3_dtype_registry: Registry[ZarrDType] = Registry() +__v2_dtype_registry: Registry[ZarrDType] = Registry() """ The registry module is responsible for managing implementations of codecs, @@ -93,9 +106,13 @@ def _collect_entrypoints() -> list[Registry[Any]]: __ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr.ndbuffer")) 
__ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="ndbuffer")) - data_type_registry._lazy_load_list.extend(entry_points.select(group="zarr.data_type")) - data_type_registry._lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) + __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr.data_type")) + __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) + __v3_dtype_registry.lazy_load_list.extend(entry_points.select(group="zarr.v3dtype")) + __v3_dtype_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="v3dtype")) + __v2_dtype_registry.lazy_load_list.extend(entry_points.select(group="zarr.v2dtype")) + __v2_dtype_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="v2dtype")) __pipeline_registry.lazy_load_list.extend(entry_points.select(group="zarr.codec_pipeline")) __pipeline_registry.lazy_load_list.extend( entry_points.select(group="zarr", name="codec_pipeline") @@ -141,6 +158,18 @@ def register_buffer(cls: type[Buffer], qualname: str | None = None) -> None: __buffer_registry.register(cls, qualname) +def register_data_type(cls: type[BaseDataType]) -> None: + __data_type_registry.register(cls) + + +def register_v3dtype(cls: type[ZarrDType]) -> None: + __v3_dtype_registry.register(cls) + + +def register_v2dtype(cls: type[ZarrDType]) -> None: + __v2_dtype_registry.register(cls) + + def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: if reload_config: _reload_config() @@ -277,4 +306,69 @@ def get_ndbuffer_class(reload_config: bool = False) -> type[NDBuffer]: ) +def get_data_type(dtype: str) -> type[BaseDataType]: + __data_type_registry.lazy_load() + maybe_dtype_cls = __data_type_registry.get(dtype) + if maybe_dtype_cls is None: + raise ValueError(f"No data type class matching name {dtype}") + return maybe_dtype_cls + + +def get_data_type_from_numpy(dtype: npt.DTypeLike) -> type[BaseDataType]: + 
np_dtype = np.dtype(dtype) + __data_type_registry.lazy_load() + for val in __data_type_registry.values(): + if val.numpy_character_code == np_dtype.char: + return val + raise ValueError( + f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(__data_type_registry)}." + ) + + +# TODO: merge the get_vXdtype_class_ functions +# these can be used instead of the various parse_X functions (hopefully) +def get_v3dtype_class(dtype: str) -> type[ZarrDType]: + __v3_dtype_registry.lazy_load() + v3dtype_class = __v3_dtype_registry.get(dtype) + if v3dtype_class: + return v3dtype_class + raise ValueError( + f"ZarrDType class '{dtype}' not found in registered buffers: {list(__v3_dtype_registry)}." + ) + + +def get_v3dtype_class_from_numpy(dtype: npt.DTypeLike) -> type[ZarrDType]: + __v3_dtype_registry.lazy_load() + + dtype = np.dtype(dtype) + for val in __v3_dtype_registry.values(): + if dtype == val.to_numpy: + return val + raise ValueError( + f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(__v3_dtype_registry)}." + ) + + +def get_v2dtype_class(dtype: str) -> type[ZarrDType]: + __v2_dtype_registry.lazy_load() + v2dtype_class = __v2_dtype_registry.get(dtype) + if v2dtype_class: + return v2dtype_class + raise ValueError( + f"ZarrDType class '{dtype}' not found in registered buffers: {list(__v2_dtype_registry)}." + ) + + +def get_v2dtype_class_from_numpy(dtype: npt.DTypeLike) -> type[ZarrDType]: + __v2_dtype_registry.lazy_load() + + dtype = np.dtype(dtype) + for val in __v2_dtype_registry.values(): + if dtype == val.to_numpy: + return val + raise ValueError( + f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(__v2_dtype_registry)}." 
+ ) + + _collect_entrypoints() diff --git a/tests/test_array.py b/tests/test_array.py index 3f8e61a2e3..3366c1cfa8 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1034,68 +1034,7 @@ def test_default_fill_value_None( assert a.fill_value is None @staticmethod - @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") - @pytest.mark.parametrize("dtype", zdtype_examples) - def test_dtype_forms(dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFormat) -> None: - """ - Test that the same array is produced from a ZDType instance, a numpy dtype, or a numpy string - """ - skip_object_dtype(dtype) - a = zarr.create_array( - store, name="a", shape=(5,), chunks=(5,), dtype=dtype, zarr_format=zarr_format - ) - - b = zarr.create_array( - store, - name="b", - shape=(5,), - chunks=(5,), - dtype=dtype.to_native_dtype(), - zarr_format=zarr_format, - ) - assert a.dtype == b.dtype - - # Structured dtypes do not have a numpy string representation that uniquely identifies them - if not isinstance(dtype, Structured): - if isinstance(dtype, VariableLengthUTF8): - # in numpy 2.3, StringDType().str becomes the string 'StringDType()' which numpy - # does not accept as a string representation of the dtype. - c = zarr.create_array( - store, - name="c", - shape=(5,), - chunks=(5,), - dtype=dtype.to_native_dtype().char, - zarr_format=zarr_format, - ) - else: - c = zarr.create_array( - store, - name="c", - shape=(5,), - chunks=(5,), - dtype=dtype.to_native_dtype().str, - zarr_format=zarr_format, - ) - assert a.dtype == c.dtype - - @staticmethod - @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") - @pytest.mark.parametrize("dtype", zdtype_examples) - def test_dtype_roundtrip( - dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFormat - ) -> None: - """ - Test that creating an array, then opening it, gets the same array. 
- """ - skip_object_dtype(dtype) - a = zarr.create_array(store, shape=(5,), chunks=(5,), dtype=dtype, zarr_format=zarr_format) - b = zarr.open_array(store) - assert a.dtype == b.dtype - - @staticmethod - @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") - @pytest.mark.parametrize("dtype", ["uint8", "float32", "U3", "S4", "V1"]) + @pytest.mark.parametrize("dtype", ["uint8", "float32", "str", "U3", "S4", "V1"]) @pytest.mark.parametrize( "compressors", [ diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index 6fe1863464..b5e8b60a8c 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -8,9 +8,9 @@ from zarr.abc.codec import Codec from zarr.abc.store import Store from zarr.codecs import ZstdCodec -from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING from zarr.core.metadata.v3 import ArrayV3Metadata +from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING +from zarr.registry import get_data_type_from_numpy from zarr.storage import StorePath numpy_str_dtypes: list[type | str | None] = [None, str, "str", np.dtypes.StrDType, "S", "U"] @@ -53,12 +53,48 @@ def test_vlen_string( else: a[:, :] = data assert np.array_equal(data, a[:, :]) - assert a.metadata.data_type == get_data_type_from_native_dtype(data.dtype) - assert a.dtype == data.dtype + assert a.metadata.data_type == get_data_type_from_numpy(dtype) + assert a.dtype == expected_array_string_dtype # test round trip b = Array.open(sp) assert isinstance(b.metadata, ArrayV3Metadata) # needed for mypy assert np.array_equal(data, b[:, :]) - assert b.metadata.data_type == get_data_type_from_native_dtype(data.dtype) - assert a.dtype == data.dtype + assert b.metadata.data_type == get_data_type_from_numpy(dtype) + assert a.dtype == expected_array_string_dtype + + +@pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) 
+@pytest.mark.parametrize("as_object_array", [False, True]) +@pytest.mark.parametrize("compressor", [None, ZstdCodec()]) +def test_vlen_bytes(store: Store, as_object_array: bool, compressor: Codec | None) -> None: + bstrings = [b"hello", b"world", b"this", b"is", b"a", b"test"] + data = np.array(bstrings).reshape((2, 3)) + assert data.dtype == "|S5" + + sp = StorePath(store, path="string") + a = zarr.create_array( + sp, + shape=data.shape, + chunks=data.shape, + dtype=data.dtype, + fill_value=b"", + compressors=compressor, + ) + assert isinstance(a.metadata, ArrayV3Metadata) # needed for mypy + + # should also work if input array is an object array, provided we explicitly specified + # a bytesting-like dtype when creating the Array + if as_object_array: + data = data.astype("O") + a[:, :] = data + assert np.array_equal(data, a[:, :]) + assert a.metadata.data_type == DataType.bytes + assert a.dtype == "O" + + # test round trip + b = Array.open(sp) + assert isinstance(b.metadata, ArrayV3Metadata) # needed for mypy + assert np.array_equal(data, b[:, :]) + assert b.metadata.data_type == DataType.bytes + assert a.dtype == "O" diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 4f385afa6d..1a2483fc9b 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -17,6 +17,7 @@ from zarr.core.group import GroupMetadata, parse_node_type from zarr.core.metadata.v3 import ( ArrayV3Metadata, + default_fill_value, parse_dimension_names, parse_zarr_format, ) From f0dfbbfa8b7bd1402b9ffac095eb929436c8ff07 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 26 Feb 2025 14:56:13 +0100 Subject: [PATCH 004/129] rename base dtype, change type to kind --- src/zarr/core/_info.py | 4 +- src/zarr/core/array.py | 20 +--- src/zarr/core/common.py | 9 +- src/zarr/core/metadata/dtype.py | 192 +++++++++++++++++++------------- src/zarr/core/metadata/v3.py | 7 +- src/zarr/registry.py | 10 +- 6 files changed, 130 insertions(+), 112 
deletions(-) diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index f1803a45dd..95cd50cc42 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -9,7 +9,7 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.core.common import ZarrFormat -from zarr.core.metadata.dtype import BaseDataType +from zarr.core.metadata.dtype import DtypeBase # from zarr.core.metadata.v3 import DataType @@ -82,7 +82,7 @@ class ArrayInfo: _type: Literal["Array"] = "Array" _zarr_format: ZarrFormat - _data_type: np.dtype[Any] | BaseDataType + _data_type: np.dtype[Any] | DtypeBase _shape: tuple[int, ...] _shard_shape: tuple[int, ...] | None = None _chunk_shape: tuple[int, ...] | None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 3e6716ccf9..fb9fc6bff7 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -109,7 +109,7 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) -from zarr.core.metadata.dtype import BaseDataType +from zarr.core.metadata.dtype import DtypeBase from zarr.core.metadata.v2 import ( CompressorLikev2, get_object_codec_id, @@ -725,15 +725,7 @@ def _create_metadata_v3( compressors: tuple[BytesBytesCodec, ...] 
shape = parse_shapelike(shape) - if codecs is None: - filters = default_filters_v3(dtype) - serializer = default_serializer_v3(dtype) - compressors = default_compressors_v3(dtype) - - codecs_parsed = (*filters, serializer, *compressors) - else: - codecs_parsed = tuple(codecs) - + codecs = list(codecs) if codecs is not None else _get_default_codecs(dtype) chunk_key_encoding_parsed: ChunkKeyEncodingLike if chunk_key_encoding is None: chunk_key_encoding_parsed = {"name": "default", "separator": "/"} @@ -1763,7 +1755,7 @@ async def info_complete(self) -> Any: def _info( self, count_chunks_initialized: int | None = None, count_bytes_stored: int | None = None ) -> Any: - _data_type: np.dtype[Any] | BaseDataType + _data_type: np.dtype[Any] | DtypeBase if isinstance(self.metadata, ArrayV2Metadata): _data_type = self.metadata.dtype else: @@ -4663,9 +4655,9 @@ def _get_default_chunk_encoding_v3( """ dtype = get_data_type_from_numpy(np_dtype) - default_filters = zarr_config.get("array.v3_default_filters").get(dtype.type) - default_serializer = zarr_config.get("array.v3_default_serializer").get(dtype.type) - default_compressors = zarr_config.get("array.v3_default_compressors").get(dtype.type) + default_filters = zarr_config.get("array.v3_default_filters").get(dtype.kind) + default_serializer = zarr_config.get("array.v3_default_serializer").get(dtype.kind) + default_compressors = zarr_config.get("array.v3_default_compressors").get(dtype.kind) filters = zarr_config.get("array.v3_default_filters").get(dtype_category) compressors = zarr_config.get("array.v3_default_compressors").get(dtype_category) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 1ec7553802..4b8f3e85cf 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -22,7 +22,6 @@ from typing_extensions import ReadOnly from zarr.core.config import config as zarr_config -from zarr.core.strings import _VLEN_STRING_DTYPE if TYPE_CHECKING: from collections.abc import Awaitable, Callable, 
Iterator @@ -191,13 +190,7 @@ def parse_bool(data: Any) -> bool: raise ValueError(f"Expected bool, got {data} instead.") -def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> np.dtype[Any]: - if dtype is str or dtype == "str": - if zarr_format == 2: - # special case as object - return np.dtype("object") - else: - return _VLEN_STRING_DTYPE +def parse_dtype(dtype: Any) -> np.dtype[Any]: return np.dtype(dtype) diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index ab101f2fad..f3a571b372 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -1,8 +1,9 @@ from abc import ABC -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Any, ClassVar, Literal, Self, get_args import numpy as np +import numpy.typing as npt from zarr.abc.metadata import Metadata from zarr.core.common import JSON @@ -28,19 +29,22 @@ def endianness_to_numpy_str(endianness: Endianness | None) -> Literal[">", "<", ) -class BaseDataType(ABC, Metadata): +class Flexible: + capacity: int + + +class DtypeBase(ABC, Metadata): name: ClassVar[str] numpy_character_code: ClassVar[str] item_size: ClassVar[int | None] - type: ClassVar[DataTypeFlavor] - capacity: int + kind: ClassVar[DataTypeFlavor] def __init_subclass__(cls, **kwargs: object) -> None: required_attrs = [ "name", "numpy_character_code", "item_size", - "type", + "kind", ] for attr in required_attrs: if not hasattr(cls, attr): @@ -51,18 +55,43 @@ def __init_subclass__(cls, **kwargs: object) -> None: def to_dict(self) -> dict[str, JSON]: return {"name": self.name} + @classmethod + def from_numpy(cls, dtype: npt.DTypeLike) -> Self: + """ + Create an instance of this dtype from a numpy dtype. + + Parameters + ---------- + dtype : npt.DTypeLike + The numpy dtype to create an instance from. + + Returns + ------- + Self + An instance of this dtype. + + Raises + ------ + ValueError + If the provided numpy dtype does not match this class. 
+ """ + if np.dtype(dtype).char != cls.numpy_character_code: + raise ValueError( + f"Invalid dtype {dtype}. Expected dtype with character code == {cls.numpy_character_code}." + ) + return cls() + def to_numpy(self: Self, *, endianness: Endianness | None = None) -> np.dtype[Any]: endian_str = endianness_to_numpy_str(endianness) return np.dtype(endian_str + self.numpy_character_code) @dataclass(frozen=True, kw_only=True) -class Bool(BaseDataType): +class Bool(DtypeBase): name = "bool" item_size = 1 - type = "boolean" + kind = "boolean" numpy_character_code = "?" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.BoolDType: return super().to_numpy(endianness=endianness) @@ -72,12 +101,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.BoolDTy @dataclass(frozen=True, kw_only=True) -class Int8(BaseDataType): +class Int8(DtypeBase): name = "int8" item_size = 1 - type = "numeric" + kind = "numeric" numpy_character_code = "b" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int8DType: return super().to_numpy(endianness=endianness) @@ -87,12 +115,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int8DTy @dataclass(frozen=True, kw_only=True) -class UInt8(BaseDataType): +class UInt8(DtypeBase): name = "uint8" item_size = 2 - type = "numeric" + kind = "numeric" numpy_character_code = "B" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt8DType: return super().to_numpy(endianness=endianness) @@ -102,12 +129,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt8DT @dataclass(frozen=True, kw_only=True) -class Int16(BaseDataType): +class Int16(DtypeBase): name = "int16" item_size = 2 - type = "numeric" + kind = "numeric" numpy_character_code = "h" - capacity: int = field(default=1, 
init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int16DType: return super().to_numpy(endianness=endianness) @@ -117,12 +143,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int16DT @dataclass(frozen=True, kw_only=True) -class UInt16(BaseDataType): +class UInt16(DtypeBase): name = "uint16" item_size = 2 - type = "numeric" + kind = "numeric" numpy_character_code = "H" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt16DType: return super().to_numpy(endianness=endianness) @@ -132,12 +157,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt16D @dataclass(frozen=True, kw_only=True) -class Int32(BaseDataType): +class Int32(DtypeBase): name = "int32" item_size = 4 - type = "numeric" + kind = "numeric" numpy_character_code = "i" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int32DType: return super().to_numpy(endianness=endianness) @@ -147,12 +171,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int32DT @dataclass(frozen=True, kw_only=True) -class UInt32(BaseDataType): +class UInt32(DtypeBase): name = "uint32" item_size = 4 - type = "numeric" + kind = "numeric" numpy_character_code = "I" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt32DType: return super().to_numpy(endianness=endianness) @@ -162,12 +185,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt32D @dataclass(frozen=True, kw_only=True) -class Int64(BaseDataType): +class Int64(DtypeBase): name = "int64" item_size = 8 - type = "numeric" + kind = "numeric" numpy_character_code = "l" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int64DType: return 
super().to_numpy(endianness=endianness) @@ -177,12 +199,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int64DT @dataclass(frozen=True, kw_only=True) -class UInt64(BaseDataType): +class UInt64(DtypeBase): name = "uint64" item_size = 8 - type = "numeric" + kind = "numeric" numpy_character_code = "L" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt64DType: return super().to_numpy(endianness=endianness) @@ -192,12 +213,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt64D @dataclass(frozen=True, kw_only=True) -class Float16(BaseDataType): +class Float16(DtypeBase): name = "float16" item_size = 2 - type = "numeric" + kind = "numeric" numpy_character_code = "e" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float16DType: return super().to_numpy(endianness=endianness) @@ -207,12 +227,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float16 @dataclass(frozen=True, kw_only=True) -class Float32(BaseDataType): +class Float32(DtypeBase): name = "float32" item_size = 4 - type = "numeric" + kind = "numeric" numpy_character_code = "f" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float32DType: return super().to_numpy(endianness=endianness) @@ -222,12 +241,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float32 @dataclass(frozen=True, kw_only=True) -class Float64(BaseDataType): +class Float64(DtypeBase): name = "float64" item_size = 8 - type = "numeric" + kind = "numeric" numpy_character_code = "d" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float64DType: return super().to_numpy(endianness=endianness) @@ -237,12 +255,11 @@ def to_numpy(self, *, endianness: 
Endianness | None = None) -> np.dtypes.Float64 @dataclass(frozen=True, kw_only=True) -class Complex64(BaseDataType): +class Complex64(DtypeBase): name = "complex64" item_size = 16 - type = "numeric" + kind = "numeric" numpy_character_code = "F" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex64DType: return super().to_numpy(endianness=endianness) @@ -252,12 +269,11 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex @dataclass(frozen=True, kw_only=True) -class Complex128(BaseDataType): +class Complex128(DtypeBase): name = "complex64" item_size = 32 - type = "numeric" + kind = "numeric" numpy_character_code = "D" - capacity: int = field(default=1, init=False) def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex128DType: return super().to_numpy(endianness=endianness) @@ -267,12 +283,17 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex @dataclass(frozen=True, kw_only=True) -class StaticByteString(BaseDataType): +class StaticByteString(DtypeBase, Flexible): name = "numpy/static_byte_string" - type = "string" + kind = "string" numpy_character_code = "S" item_size = 1 - capacity: int + + def from_numpy(cls: type[Self], dtype: npt.DTypeLike) -> Self: + dtype = np.dtype(dtype) + if dtype.kind != cls.numpy_character_code: + raise ValueError(f"Invalid dtype {dtype}. 
Expected a string dtype.") + return cls(capacity=dtype.itemsize) def to_dict(self) -> dict[str, JSON]: return {"name": self.name, "configuration": {"capacity": self.capacity}} @@ -282,20 +303,42 @@ def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.byte return np.dtype(endianness_code + self.numpy_character_code + str(self.capacity)) +@dataclass(frozen=True, kw_only=True) +class StaticRawBytes(DtypeBase, Flexible): + name = "r*" + kind = "bytes" + numpy_character_code = "V" + item_size = 1 + + def from_numpy(cls: type[Self], dtype: npt.DTypeLike) -> Self: + dtype = np.dtype(dtype) + if dtype.kind != "V": + raise ValueError(f"Invalid dtype {dtype}. Expected a bytes dtype.") + return cls(capacity=dtype.itemsize) + + def to_dict(self) -> dict[str, JSON]: + return {"name": f"r{self.capacity * 8}"} + + def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.void]: + endianness_code = endianness_to_numpy_str(endianness) + return np.dtype(endianness_code + self.numpy_character_code + str(self.capacity)) + + register_data_type(StaticByteString) if _NUMPY_SUPPORTS_VLEN_STRING: @dataclass(frozen=True, kw_only=True) - class VlenString(BaseDataType): + class VlenString(DtypeBase): name = "numpy/vlen_string" - type = "string" + kind = "string" numpy_character_code = "T" + # this uses UTF-8, so the encoding of a code point varies between + # 1 and 4 bytes item_size = None - capacity: int def to_dict(self) -> dict[str, JSON]: - return {"name": self.name, "configuration": {"capacity": self.capacity}} + return {"name": self.name} def to_numpy( self, endianness: Endianness | None = "native" @@ -306,15 +349,14 @@ def to_numpy( else: @dataclass(frozen=True, kw_only=True) - class VlenString(BaseDataType): + class VlenString(DtypeBase): name = "numpy/vlen_string" - type = "string" + kind = "string" numpy_character_code = "O" item_size = None - capacity: int def to_dict(self) -> dict[str, JSON]: - return {"name": self.name, "configuration": 
{"capacity": self.capacity}} + return {"name": self.name} def to_numpy( self, endianness: Endianness | None = "native" @@ -327,12 +369,17 @@ def to_numpy( @dataclass(frozen=True, kw_only=True) -class StaticUnicodeString(BaseDataType): +class StaticUnicodeString(DtypeBase, Flexible): name = "numpy/static_unicode_string" - type = "string" + kind = "string" numpy_character_code = "U" item_size = 4 - capacity: int + + def from_numpy(cls: type[Self], dtype: npt.DTypeLike) -> Self: + dtype = np.dtype(dtype) + if dtype.kind != "U": + raise ValueError(f"Invalid dtype {dtype}. Expected a string dtype.") + return cls(capacity=dtype.itemsize) def to_dict(self) -> dict[str, JSON]: return {"name": self.name, "configuration": {"capacity": self.capacity}} @@ -345,28 +392,13 @@ def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.str_ register_data_type(StaticUnicodeString) -@dataclass(frozen=True, kw_only=True) -class StaticRawBytes(BaseDataType): - name = "r*" - type = "bytes" - numpy_character_code = "V" - item_size = 1 - capacity: int - - def to_dict(self) -> dict[str, JSON]: - return {"name": f"r{self.capacity * 8}"} - - def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.void]: - endianness_code = endianness_to_numpy_str(endianness) - return np.dtype(endianness_code + self.numpy_character_code + str(self.capacity)) - - -def parse_dtype(dtype: npt.DtypeLike | BaseDataType) -> BaseDataType: +def resolve_dtype(dtype: npt.DTypeLike | DtypeBase) -> DtypeBase: from zarr.registry import get_data_type_from_numpy - if isinstance(dtype, BaseDataType): + if isinstance(dtype, DtypeBase): return dtype - return get_data_type_from_numpy(dtype) + cls = get_data_type_from_numpy(dtype) + return cls.from_numpy(dtype) register_data_type(StaticRawBytes) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 797cd6ea7a..206cda1de0 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -4,7 +4,7 @@ from 
zarr.abc.metadata import Metadata from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.metadata.dtype import BaseDataType +from zarr.core.metadata.dtype import DtypeBase, resolve_dtype if TYPE_CHECKING: from typing import Self @@ -140,7 +140,7 @@ class ArrayV3MetadataDict(TypedDict): @dataclass(frozen=True, kw_only=True) class ArrayV3Metadata(Metadata): shape: ChunkCoords - data_type: ZDType[TBaseDType, TBaseScalar] + data_type: DtypeBase chunk_grid: ChunkGrid chunk_key_encoding: ChunkKeyEncoding fill_value: Any @@ -155,7 +155,7 @@ def __init__( self, *, shape: Iterable[int], - data_type: npt.DTypeLike | BaseDataType, + data_type: npt.DTypeLike | DtypeBase, chunk_grid: dict[str, JSON] | ChunkGrid, chunk_key_encoding: ChunkKeyEncodingLike, fill_value: object, @@ -169,6 +169,7 @@ def __init__( """ shape_parsed = parse_shapelike(shape) + data_type_parsed = resolve_dtype(data_type) chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) diff --git a/src/zarr/registry.py b/src/zarr/registry.py index ad7bad40ae..966e143073 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -25,7 +25,7 @@ from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON from zarr.core.dtype import ZarrDType - from zarr.core.metadata.dtype import BaseDataType + from zarr.core.metadata.dtype import DtypeBase __all__ = [ "Registry", @@ -67,7 +67,7 @@ def register(self, cls: type[T], qualname: str | None = None) -> None: __pipeline_registry: Registry[CodecPipeline] = Registry() __buffer_registry: Registry[Buffer] = Registry() __ndbuffer_registry: Registry[NDBuffer] = Registry() -__data_type_registry: Registry[BaseDataType] = Registry() +__data_type_registry: Registry[DtypeBase] = Registry() __v3_dtype_registry: Registry[ZarrDType] = Registry() __v2_dtype_registry: Registry[ZarrDType] = Registry() @@ -158,7 
+158,7 @@ def register_buffer(cls: type[Buffer], qualname: str | None = None) -> None: __buffer_registry.register(cls, qualname) -def register_data_type(cls: type[BaseDataType]) -> None: +def register_data_type(cls: type[DtypeBase]) -> None: __data_type_registry.register(cls) @@ -306,7 +306,7 @@ def get_ndbuffer_class(reload_config: bool = False) -> type[NDBuffer]: ) -def get_data_type(dtype: str) -> type[BaseDataType]: +def get_data_type(dtype: str) -> type[DtypeBase]: __data_type_registry.lazy_load() maybe_dtype_cls = __data_type_registry.get(dtype) if maybe_dtype_cls is None: @@ -314,7 +314,7 @@ def get_data_type(dtype: str) -> type[BaseDataType]: return maybe_dtype_cls -def get_data_type_from_numpy(dtype: npt.DTypeLike) -> type[BaseDataType]: +def get_data_type_from_numpy(dtype: npt.DTypeLike) -> type[DtypeBase]: np_dtype = np.dtype(dtype) __data_type_registry.lazy_load() for val in __data_type_registry.values(): From 06db4f6c28f29bc79d2eb149ef92caec6d7dadd1 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 27 Feb 2025 09:57:23 +0100 Subject: [PATCH 005/129] start working on JSON serialization --- src/zarr/core/_info.py | 4 +- src/zarr/core/array.py | 6 +- src/zarr/core/metadata/dtype.py | 363 ++++++++++++++++++++++++++------ src/zarr/core/metadata/v3.py | 209 ++++-------------- src/zarr/registry.py | 10 +- 5 files changed, 350 insertions(+), 242 deletions(-) diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 95cd50cc42..2492728a27 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -9,7 +9,7 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.core.common import ZarrFormat -from zarr.core.metadata.dtype import DtypeBase +from zarr.core.metadata.dtype import DTypeBase # from zarr.core.metadata.v3 import DataType @@ -82,7 +82,7 @@ class ArrayInfo: _type: Literal["Array"] = "Array" _zarr_format: ZarrFormat - _data_type: np.dtype[Any] | DtypeBase + _data_type: np.dtype[Any] | DTypeBase 
_shape: tuple[int, ...] _shard_shape: tuple[int, ...] | None = None _chunk_shape: tuple[int, ...] | None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index fb9fc6bff7..46358e3a6a 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -109,7 +109,7 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) -from zarr.core.metadata.dtype import DtypeBase +from zarr.core.metadata.dtype import DTypeBase from zarr.core.metadata.v2 import ( CompressorLikev2, get_object_codec_id, @@ -1755,7 +1755,7 @@ async def info_complete(self) -> Any: def _info( self, count_chunks_initialized: int | None = None, count_bytes_stored: int | None = None ) -> Any: - _data_type: np.dtype[Any] | DtypeBase + _data_type: np.dtype[Any] | DTypeBase if isinstance(self.metadata, ArrayV2Metadata): _data_type = self.metadata.dtype else: @@ -4238,7 +4238,7 @@ async def init_array( from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation - zdtype = parse_data_type(dtype, zarr_format=zarr_format) + dtype_parsed = parse_dtype(dtype) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index f3a571b372..19a00343c8 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -1,17 +1,18 @@ -from abc import ABC +from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Any, ClassVar, Literal, Self, get_args +from typing import Any, ClassVar, Literal, Self, TypeGuard, cast, get_args import numpy as np import numpy.typing as npt from zarr.abc.metadata import Metadata -from zarr.core.common import JSON +from zarr.core.common import JSON, ZarrFormat from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING from zarr.registry import register_data_type Endianness = Literal["little", "big", "native"] DataTypeFlavor = Literal["boolean", "numeric", "string", 
"bytes"] +JSONFloat = float | Literal["NaN", "Infinity", "-Infinity"] def endianness_to_numpy_str(endianness: Endianness | None) -> Literal[">", "<", "=", "|"]: @@ -29,23 +30,121 @@ def endianness_to_numpy_str(endianness: Endianness | None) -> Literal[">", "<", ) +def check_json_bool(data: JSON) -> TypeGuard[bool]: + return bool(isinstance(data, bool)) + + +def check_json_int(data: JSON) -> TypeGuard[int]: + return bool(isinstance(data, int)) + + +def check_json_float(data: JSON) -> TypeGuard[float]: + if data == "NaN" or data == "Infinity" or data == "-Infinity": + return True + else: + return bool(isinstance(data, float)) + + +def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloat: + if np.isnan(data): + return "NaN" + elif np.isinf(data): + return "Infinity" if data > 0 else "-Infinity" + return float(data) + + +def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloat: + # v3 can in principle handle distinct NaN values, but numpy does not represent these explicitly + # so we just re-use the v2 routine here + return float_to_json_v2(data) + + +def float_to_json(data: float | np.floating[Any], zarr_format: ZarrFormat) -> JSONFloat: + """ + convert a float to JSON as per the zarr v3 spec + """ + if zarr_format == 2: + return float_to_json_v2(data) + else: + return float_to_json_v3(data) + raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") + + +def complex_to_json_v2(data: complex | np.complex_) -> JSONFloat: + return float_to_json_v2(data) + + +def complex_to_json_v3(data: complex | np.complex_) -> tuple[JSONFloat, JSONFloat]: + return float_to_json_v3(data.real), float_to_json_v3(data.imag) + + +def complex_to_json( + data: complex | np.complex_, zarr_format: ZarrFormat +) -> tuple[JSONFloat, JSONFloat] | JSONFloat: + if zarr_format == 2: + return complex_to_json_v2(data) + else: + return complex_to_json_v3(data) + raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") + + +def float_from_json_v2(data: JSONFloat, dtype: np.floating[Any]) -> np.float_: + if data == "NaN": + _data = np.nan + elif data == "Infinity": + _data = np.inf + elif data == "-Infinity": + _data = -np.inf + else: + _data = data + return dtype.type(_data) + + +def float_from_json_v3(data: JSONFloat, dtype: Any) -> np.floating[Any]: + # todo: support the v3-specific NaN handling + return float_from_json_v2(data, dtype) + + +def float_from_json(data: JSONFloat, dtype: Any, zarr_format: ZarrFormat) -> np.floating[Any]: + if zarr_format == 2: + return float_from_json_v2(data, dtype) + else: + return float_from_json_v3(data, dtype) + raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") + + +def complex_from_json_v2(data: JSONFloat, dtype: Any) -> np.complex_: + return dtype.type(data) + + +def complex_from_json_v3(data: tuple[JSONFloat, JSONFloat], dtype: Any) -> np.complex_: + return dtype.type(data[0] + 1j * data[1]) + + +def complex_from_json( + data: tuple[JSONFloat, JSONFloat], dtype: Any, zarr_format: ZarrFormat +) -> np.complex_: + if zarr_format == 2: + return complex_from_json_v2(data, dtype) + else: + return complex_from_json_v3(data, dtype) + raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") + + +@dataclass(frozen=True, kw_only=True) class Flexible: - capacity: int + length: int -class DtypeBase(ABC, Metadata): +class DTypeBase(ABC, Metadata): name: ClassVar[str] numpy_character_code: ClassVar[str] item_size: ClassVar[int | None] kind: ClassVar[DataTypeFlavor] + default: object def __init_subclass__(cls, **kwargs: object) -> None: - required_attrs = [ - "name", - "numpy_character_code", - "item_size", - "kind", - ] + required_attrs = ["name", "numpy_character_code", "item_size", "kind", "default"] for attr in required_attrs: if not hasattr(cls, attr): raise ValueError(f"{attr} is a required attribute for a Zarr dtype.") @@ -57,223 +156,356 @@ def to_dict(self) -> dict[str, JSON]: @classmethod def from_numpy(cls, dtype: npt.DTypeLike) -> Self: - """ - Create an instance of this dtype from a numpy dtype. - - Parameters - ---------- - dtype : npt.DTypeLike - The numpy dtype to create an instance from. - - Returns - ------- - Self - An instance of this dtype. - - Raises - ------ - ValueError - If the provided numpy dtype does not match this class. - """ if np.dtype(dtype).char != cls.numpy_character_code: raise ValueError( f"Invalid dtype {dtype}. Expected dtype with character code == {cls.numpy_character_code}." ) return cls() + def default_value(self: Self, *, endianness: Endianness | None = None) -> np.generic: + return cast(np.generic, self.to_numpy(endianness=endianness).type(self.default)) + def to_numpy(self: Self, *, endianness: Endianness | None = None) -> np.dtype[Any]: endian_str = endianness_to_numpy_str(endianness) return np.dtype(endian_str + self.numpy_character_code) + @abstractmethod + def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> JSON: + """ + Convert a single value to JSON-serializable format. Depends on the zarr format. 
+ """ + raise NotImplementedError + + @abstractmethod + def from_json_value( + self: Self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.generic: + """ + Read a JSON-serializable value as a numpy scalar + """ + raise NotImplementedError + @dataclass(frozen=True, kw_only=True) -class Bool(DtypeBase): +class Bool(DTypeBase): name = "bool" item_size = 1 kind = "boolean" numpy_character_code = "?" + default = False def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.BoolDType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> bool: + return bool(data) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.bool_: + if check_json_bool(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. Expected a boolean.") + register_data_type(Bool) @dataclass(frozen=True, kw_only=True) -class Int8(DtypeBase): +class Int8(DTypeBase): name = "int8" item_size = 1 kind = "numeric" numpy_character_code = "b" + default = 0 def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int8DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> int: + return int(data) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.int8: + if check_json_int(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. 
Expected an integer.") + register_data_type(Int8) @dataclass(frozen=True, kw_only=True) -class UInt8(DtypeBase): +class UInt8(DTypeBase): name = "uint8" item_size = 2 kind = "numeric" numpy_character_code = "B" + default = 0 def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt8DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> int: + return int(data) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.uint8: + if check_json_int(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. Expected an integer.") + register_data_type(UInt8) @dataclass(frozen=True, kw_only=True) -class Int16(DtypeBase): +class Int16(DTypeBase): name = "int16" item_size = 2 kind = "numeric" numpy_character_code = "h" + default = 0 def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int16DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> int: + return int(data) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.int16: + if check_json_int(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. 
Expected an integer.") + register_data_type(Int16) @dataclass(frozen=True, kw_only=True) -class UInt16(DtypeBase): +class UInt16(DTypeBase): name = "uint16" item_size = 2 kind = "numeric" numpy_character_code = "H" + default = 0 def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt16DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> int: + return int(data) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.uint16: + if check_json_int(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. Expected an integer.") + register_data_type(UInt16) @dataclass(frozen=True, kw_only=True) -class Int32(DtypeBase): +class Int32(DTypeBase): name = "int32" item_size = 4 kind = "numeric" numpy_character_code = "i" + default = 0 def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int32DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> int: + return int(data) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.int32: + if check_json_int(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. 
Expected an integer.") + register_data_type(Int32) @dataclass(frozen=True, kw_only=True) -class UInt32(DtypeBase): +class UInt32(DTypeBase): name = "uint32" item_size = 4 kind = "numeric" numpy_character_code = "I" + default = 0 def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt32DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> int: + return int(data) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.uint32: + if check_json_int(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. Expected an integer.") + register_data_type(UInt32) @dataclass(frozen=True, kw_only=True) -class Int64(DtypeBase): +class Int64(DTypeBase): name = "int64" item_size = 8 kind = "numeric" numpy_character_code = "l" + default = 0 def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int64DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> int: + return int(data) + + def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.int64: + if check_json_int(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. Expected an integer.") + register_data_type(Int64) @dataclass(frozen=True, kw_only=True) -class UInt64(DtypeBase): +class UInt64(DTypeBase): name = "uint64" item_size = 8 kind = "numeric" numpy_character_code = "L" + default = 0 def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt64DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> int: + return int(data) + + def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.uint64: + if check_json_int(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. 
Expected an integer.") + register_data_type(UInt64) @dataclass(frozen=True, kw_only=True) -class Float16(DtypeBase): +class Float16(DTypeBase): name = "float16" item_size = 2 kind = "numeric" numpy_character_code = "e" + default = 0.0 def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float16DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> float: + return float(data) + + def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.float16: + if check_json_float(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. Expected a float.") + register_data_type(Float16) @dataclass(frozen=True, kw_only=True) -class Float32(DtypeBase): +class Float32(DTypeBase): name = "float32" item_size = 4 kind = "numeric" numpy_character_code = "f" + default = 0.0 def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float32DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> float: + return float(data) + + def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.float32: + if check_json_float(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. 
Expected a float.") + register_data_type(Float32) @dataclass(frozen=True, kw_only=True) -class Float64(DtypeBase): +class Float64(DTypeBase): name = "float64" item_size = 8 kind = "numeric" numpy_character_code = "d" + default = 0.0 def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float64DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> float: + return float(data) + + def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.float64: + if check_json_float(data): + return float_from_json(data, dtype=self.to_numpy(endianness=endianness)) + raise TypeError(f"Invalid type: {data}. Expected a float.") + register_data_type(Float64) @dataclass(frozen=True, kw_only=True) -class Complex64(DtypeBase): +class Complex64(DTypeBase): name = "complex64" item_size = 16 kind = "numeric" numpy_character_code = "F" + default = 0.0 + 0.0j def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex64DType: return super().to_numpy(endianness=endianness) + def to_json_value(self, data: np.generic) -> float: + return float(data) + + def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.complex64: + if check_json_float(data): + return self.to_numpy(endianness=endianness).type(data) + raise TypeError(f"Invalid type: {data}. 
Expected a float.") + register_data_type(Complex64) @dataclass(frozen=True, kw_only=True) -class Complex128(DtypeBase): +class Complex128(DTypeBase): name = "complex64" item_size = 32 kind = "numeric" numpy_character_code = "D" + default = 0.0 + 0.0j def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex128DType: return super().to_numpy(endianness=endianness) @@ -283,45 +515,49 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex @dataclass(frozen=True, kw_only=True) -class StaticByteString(DtypeBase, Flexible): +class StaticByteString(DTypeBase, Flexible): name = "numpy/static_byte_string" kind = "string" numpy_character_code = "S" item_size = 1 + default = b"" + @classmethod def from_numpy(cls: type[Self], dtype: npt.DTypeLike) -> Self: dtype = np.dtype(dtype) if dtype.kind != cls.numpy_character_code: raise ValueError(f"Invalid dtype {dtype}. Expected a string dtype.") - return cls(capacity=dtype.itemsize) + return cls(length=dtype.itemsize) def to_dict(self) -> dict[str, JSON]: - return {"name": self.name, "configuration": {"capacity": self.capacity}} + return {"name": self.name, "configuration": {"capacity": self.length}} def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.bytes_]: endianness_code = endianness_to_numpy_str(endianness) - return np.dtype(endianness_code + self.numpy_character_code + str(self.capacity)) + return np.dtype(endianness_code + self.numpy_character_code + str(self.length)) @dataclass(frozen=True, kw_only=True) -class StaticRawBytes(DtypeBase, Flexible): +class StaticRawBytes(DTypeBase, Flexible): name = "r*" kind = "bytes" numpy_character_code = "V" item_size = 1 + default = b"" + @classmethod def from_numpy(cls: type[Self], dtype: npt.DTypeLike) -> Self: dtype = np.dtype(dtype) if dtype.kind != "V": raise ValueError(f"Invalid dtype {dtype}. 
Expected a bytes dtype.") - return cls(capacity=dtype.itemsize) + return cls(length=dtype.itemsize) def to_dict(self) -> dict[str, JSON]: - return {"name": f"r{self.capacity * 8}"} + return {"name": f"r{self.length * 8}"} def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.void]: endianness_code = endianness_to_numpy_str(endianness) - return np.dtype(endianness_code + self.numpy_character_code + str(self.capacity)) + return np.dtype(endianness_code + self.numpy_character_code + str(self.length)) register_data_type(StaticByteString) @@ -329,13 +565,14 @@ def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.void if _NUMPY_SUPPORTS_VLEN_STRING: @dataclass(frozen=True, kw_only=True) - class VlenString(DtypeBase): + class VlenString(DTypeBase): name = "numpy/vlen_string" kind = "string" numpy_character_code = "T" # this uses UTF-8, so the encoding of a code point varies between # 1 and 4 bytes item_size = None + default = "" def to_dict(self) -> dict[str, JSON]: return {"name": self.name} @@ -349,11 +586,12 @@ def to_numpy( else: @dataclass(frozen=True, kw_only=True) - class VlenString(DtypeBase): + class VlenString(DTypeBase): name = "numpy/vlen_string" kind = "string" numpy_character_code = "O" item_size = None + default = "" def to_dict(self) -> dict[str, JSON]: return {"name": self.name} @@ -369,36 +607,43 @@ def to_numpy( @dataclass(frozen=True, kw_only=True) -class StaticUnicodeString(DtypeBase, Flexible): +class StaticUnicodeString(DTypeBase, Flexible): name = "numpy/static_unicode_string" kind = "string" numpy_character_code = "U" item_size = 4 + default = "" + @classmethod def from_numpy(cls: type[Self], dtype: npt.DTypeLike) -> Self: dtype = np.dtype(dtype) if dtype.kind != "U": raise ValueError(f"Invalid dtype {dtype}. 
Expected a string dtype.") - return cls(capacity=dtype.itemsize) + return cls(length=dtype.itemsize) def to_dict(self) -> dict[str, JSON]: - return {"name": self.name, "configuration": {"capacity": self.capacity}} + return {"name": self.name, "configuration": {"capacity": self.length}} def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.str_]: endianness_code = endianness_to_numpy_str(endianness) - return np.dtype(endianness_code + self.numpy_character_code + str(self.capacity)) + return np.dtype(endianness_code + self.numpy_character_code + str(self.length)) register_data_type(StaticUnicodeString) -def resolve_dtype(dtype: npt.DTypeLike | DtypeBase) -> DtypeBase: +def resolve_dtype(dtype: npt.DTypeLike | DTypeBase) -> DTypeBase: from zarr.registry import get_data_type_from_numpy - if isinstance(dtype, DtypeBase): + if isinstance(dtype, DTypeBase): return dtype cls = get_data_type_from_numpy(dtype) return cls.from_numpy(dtype) register_data_type(StaticRawBytes) + +INTEGER_DTYPE = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 +FLOAT_DTYPE = Float16 | Float32 | Float64 +COMPLEX_DTYPE = Complex64 | Complex128 +STRING_DTYPE = StaticUnicodeString | VlenString | StaticByteString diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 206cda1de0..25638cf58c 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -4,7 +4,16 @@ from zarr.abc.metadata import Metadata from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.metadata.dtype import DtypeBase, resolve_dtype +from zarr.core.metadata.dtype import ( + COMPLEX_DTYPE, + FLOAT_DTYPE, + INTEGER_DTYPE, + STRING_DTYPE, + Bool, + DTypeBase, + StaticRawBytes, + resolve_dtype, +) if TYPE_CHECKING: from typing import Self @@ -18,8 +27,13 @@ import json from collections.abc import Iterable from dataclasses import dataclass, field, replace +from enum import Enum from typing import Any, Literal +import numcodecs.abc +import 
numpy as np +import numpy.typing as npt + from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.core.array_spec import ArrayConfig, ArraySpec from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid @@ -34,8 +48,6 @@ ) from zarr.core.config import config from zarr.core.metadata.common import parse_attributes -from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING -from zarr.core.strings import _VLEN_STRING_DTYPE as STRING_NP_DTYPE from zarr.errors import MetadataValidationError, NodeTypeValidationError from zarr.registry import get_codec_class @@ -81,7 +93,7 @@ def validate_array_bytes_codec(codecs: tuple[Codec, ...]) -> ArrayBytesCodec: return abcs[0] -def validate_codecs(codecs: tuple[Codec, ...], dtype: ZDType[TBaseDType, TBaseScalar]) -> None: +def validate_codecs(codecs: tuple[Codec, ...], dtype: DTypeBase) -> None: """Check that the codecs are valid for the given dtype""" from zarr.codecs.sharding import ShardingCodec @@ -94,11 +106,14 @@ def validate_codecs(codecs: tuple[Codec, ...], dtype: ZDType[TBaseDType, TBaseSc # we need to have special codecs if we are decoding vlen strings or bytestrings # TODO: use codec ID instead of class name codec_class_name = abc.__class__.__name__ - # TODO: Fix typing here - if isinstance(dtype, VariableLengthUTF8) and not codec_class_name == "VLenUTF8Codec": # type: ignore[unreachable] + if dtype.kind == "string" and not codec_class_name == "VLenUTF8Codec": raise ValueError( f"For string dtype, ArrayBytesCodec must be `VLenUTF8Codec`, got `{codec_class_name}`." ) + if dtype.kind == "bytes" and not codec_class_name == "VLenBytesCodec": + raise ValueError( + f"For bytes dtype, ArrayBytesCodec must be `VLenBytesCodec`, got `{codec_class_name}`." + ) def parse_dimension_names(data: object) -> tuple[str | None, ...] 
| None: @@ -140,7 +155,7 @@ class ArrayV3MetadataDict(TypedDict): @dataclass(frozen=True, kw_only=True) class ArrayV3Metadata(Metadata): shape: ChunkCoords - data_type: DtypeBase + data_type: DTypeBase chunk_grid: ChunkGrid chunk_key_encoding: ChunkKeyEncoding fill_value: Any @@ -155,7 +170,7 @@ def __init__( self, *, shape: Iterable[int], - data_type: npt.DTypeLike | DtypeBase, + data_type: npt.DTypeLike | DTypeBase, chunk_grid: dict[str, JSON] | ChunkGrid, chunk_key_encoding: ChunkKeyEncodingLike, fill_value: object, @@ -173,8 +188,8 @@ def __init__( chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) - # Note: relying on a type method is numpy-specific - fill_value_parsed = data_type.cast_scalar(fill_value) + # we pass a string here rather than an enum to make mypy happy + fill_value_parsed = parse_fill_value(fill_value, data_type_parsed) attributes_parsed = parse_attributes(attributes) codecs_parsed_partial = parse_codecs(codecs) storage_transformers_parsed = parse_storage_transformers(storage_transformers) @@ -357,26 +372,19 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self: # enum Literals can't be used in typing, so we have to restate all of the V3 dtypes as types # https://github.com/python/typing/issues/781 -BOOL_DTYPE = Literal["bool"] BOOL = np.bool_ -INTEGER_DTYPE = Literal["int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64"] INTEGER = np.int8 | np.int16 | np.int32 | np.int64 | np.uint8 | np.uint16 | np.uint32 | np.uint64 -FLOAT_DTYPE = Literal["float16", "float32", "float64"] FLOAT = np.float16 | np.float32 | np.float64 -COMPLEX_DTYPE = Literal["complex64", "complex128"] COMPLEX = np.complex64 | np.complex128 -STRING_DTYPE = Literal["string"] + STRING = np.str_ -BYTES_DTYPE = Literal["bytes"] BYTES = np.bytes_ -ALL_DTYPES = BOOL_DTYPE | INTEGER_DTYPE | FLOAT_DTYPE | 
COMPLEX_DTYPE | STRING_DTYPE | BYTES_DTYPE - @overload def parse_fill_value( fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool, - dtype: BOOL_DTYPE, + dtype: Bool, ) -> BOOL: ... @@ -411,14 +419,14 @@ def parse_fill_value( @overload def parse_fill_value( fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool, - dtype: BYTES_DTYPE, + dtype: StaticRawBytes, ) -> BYTES: ... def parse_fill_value( fill_value: Any, - dtype: ALL_DTYPES, -) -> Any: + dtype: DTypeBase, +) -> np.generic: """ Parse `fill_value`, a potential fill value, into an instance of `dtype`, a data type. If `fill_value` is `None`, then this function will return the result of casting the value 0 @@ -432,26 +440,26 @@ def parse_fill_value( ---------- fill_value : Any A potential fill value. - dtype : str + dtype : DTypeBase A valid Zarr format 3 DataType. Returns ------- A scalar instance of `dtype` """ - data_type = DataType(dtype) if fill_value is None: raise ValueError("Fill value cannot be None") - if data_type == DataType.string: + + if dtype.kind == "string": return np.str_(fill_value) - if data_type == DataType.bytes: + if dtype.kind == "bytes": return np.bytes_(fill_value) # the rest are numeric types - np_dtype = cast(np.dtype[Any], data_type.to_numpy()) + np_dtype = dtype.to_numpy() if isinstance(fill_value, Sequence) and not isinstance(fill_value, str): - if data_type in (DataType.complex64, DataType.complex128): + if isindata_type in (DataType.complex64, DataType.complex128): if len(fill_value) == 2: decoded_fill_value = tuple( SPECIAL_FLOATS_ENCODED.get(value, value) for value in fill_value @@ -503,148 +511,3 @@ def parse_fill_value( raise ValueError(f"fill value {fill_value!r} is not valid for dtype {data_type}") return casted_value - - -def default_fill_value(dtype: DataType) -> str | bytes | np.generic: - if dtype == DataType.string: - return "" - elif dtype == DataType.bytes: - return b"" - else: - np_dtype = dtype.to_numpy() - np_dtype = 
cast(np.dtype[Any], np_dtype) - return np_dtype.type(0) # type: ignore[misc] - - -# For type checking -_bool = bool - - -class DataTypex(Enum): - bool = "bool" - int8 = "int8" - int16 = "int16" - int32 = "int32" - int64 = "int64" - uint8 = "uint8" - uint16 = "uint16" - uint32 = "uint32" - uint64 = "uint64" - float16 = "float16" - float32 = "float32" - float64 = "float64" - complex64 = "complex64" - complex128 = "complex128" - string = "string" - bytes = "bytes" - - @property - def byte_count(self) -> int | None: - data_type_byte_counts = { - DataType.bool: 1, - DataType.int8: 1, - DataType.int16: 2, - DataType.int32: 4, - DataType.int64: 8, - DataType.uint8: 1, - DataType.uint16: 2, - DataType.uint32: 4, - DataType.uint64: 8, - DataType.float16: 2, - DataType.float32: 4, - DataType.float64: 8, - DataType.complex64: 8, - DataType.complex128: 16, - } - try: - return data_type_byte_counts[self] - except KeyError: - # string and bytes have variable length - return None - - @property - def has_endianness(self) -> _bool: - return self.byte_count is not None and self.byte_count != 1 - - def to_numpy_shortname(self) -> str: - data_type_to_numpy = { - DataType.bool: "bool", - DataType.int8: "i1", - DataType.int16: "i2", - DataType.int32: "i4", - DataType.int64: "i8", - DataType.uint8: "u1", - DataType.uint16: "u2", - DataType.uint32: "u4", - DataType.uint64: "u8", - DataType.float16: "f2", - DataType.float32: "f4", - DataType.float64: "f8", - DataType.complex64: "c8", - DataType.complex128: "c16", - } - return data_type_to_numpy[self] - - def to_numpy(self) -> np.dtypes.StringDType | np.dtypes.ObjectDType | np.dtype[Any]: - # note: it is not possible to round trip DataType <-> np.dtype - # due to the fact that DataType.string and DataType.bytes both - # generally return np.dtype("O") from this function, even though - # they can originate as fixed-length types (e.g. 
" DataType: - if dtype.kind in "UT": - return DataType.string - elif dtype.kind == "S": - return DataType.bytes - elif not _NUMPY_SUPPORTS_VLEN_STRING and dtype.kind == "O": - # numpy < 2.0 does not support vlen string dtype - # so we fall back on object array of strings - return DataType.string - dtype_to_data_type = { - "|b1": "bool", - "bool": "bool", - "|i1": "int8", - " DataType: - if dtype is None: - return DataType[DEFAULT_DTYPE] - if isinstance(dtype, DataType): - return dtype - try: - return DataType(dtype) - except ValueError: - pass - try: - dtype = np.dtype(dtype) - except (ValueError, TypeError) as e: - raise ValueError(f"Invalid Zarr format 3 data_type: {dtype}") from e - # check that this is a valid v3 data_type - try: - data_type = DataType.from_numpy(dtype) - except KeyError as e: - raise ValueError(f"Invalid Zarr format 3 data_type: {dtype}") from e - return data_type diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 966e143073..ecd85030aa 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -25,7 +25,7 @@ from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON from zarr.core.dtype import ZarrDType - from zarr.core.metadata.dtype import DtypeBase + from zarr.core.metadata.dtype import DTypeBase __all__ = [ "Registry", @@ -67,7 +67,7 @@ def register(self, cls: type[T], qualname: str | None = None) -> None: __pipeline_registry: Registry[CodecPipeline] = Registry() __buffer_registry: Registry[Buffer] = Registry() __ndbuffer_registry: Registry[NDBuffer] = Registry() -__data_type_registry: Registry[DtypeBase] = Registry() +__data_type_registry: Registry[DTypeBase] = Registry() __v3_dtype_registry: Registry[ZarrDType] = Registry() __v2_dtype_registry: Registry[ZarrDType] = Registry() @@ -158,7 +158,7 @@ def register_buffer(cls: type[Buffer], qualname: str | None = None) -> None: __buffer_registry.register(cls, qualname) -def register_data_type(cls: type[DtypeBase]) -> None: +def register_data_type(cls: 
type[DTypeBase]) -> None: __data_type_registry.register(cls) @@ -306,7 +306,7 @@ def get_ndbuffer_class(reload_config: bool = False) -> type[NDBuffer]: ) -def get_data_type(dtype: str) -> type[DtypeBase]: +def get_data_type(dtype: str) -> type[DTypeBase]: __data_type_registry.lazy_load() maybe_dtype_cls = __data_type_registry.get(dtype) if maybe_dtype_cls is None: @@ -314,7 +314,7 @@ def get_data_type(dtype: str) -> type[DtypeBase]: return maybe_dtype_cls -def get_data_type_from_numpy(dtype: npt.DTypeLike) -> type[DtypeBase]: +def get_data_type_from_numpy(dtype: npt.DTypeLike) -> type[DTypeBase]: np_dtype = np.dtype(dtype) __data_type_registry.lazy_load() for val in __data_type_registry.values(): From 2bb4707f4a81e9948c6813d56e4f88c18815b51b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 27 Feb 2025 18:06:14 +0100 Subject: [PATCH 006/129] get json de/serialization largely working, and start making tests pass --- src/zarr/api/asynchronous.py | 15 +++ src/zarr/codecs/sharding.py | 9 +- src/zarr/core/array.py | 25 ++-- src/zarr/core/common.py | 9 +- src/zarr/core/config.py | 32 +++++ src/zarr/core/metadata/dtype.py | 200 +++++++++++++++++++++++-------- src/zarr/core/metadata/v3.py | 204 +++----------------------------- src/zarr/registry.py | 56 +++++++-- tests/test_array.py | 13 +- tests/test_metadata/test_v3.py | 44 ++++--- 10 files changed, 321 insertions(+), 286 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 9a380082b0..83b473ed67 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -1010,6 +1010,21 @@ async def create( or _default_zarr_format() ) + if zarr_format == 2: + if chunks is None: + chunks = shape + dtype = parse_dtype(dtype, zarr_format=zarr_format) + if not filters: + filters = _default_filters(dtype) + if not compressor: + compressor = _default_compressor(dtype) + elif zarr_format == 3 and chunk_shape is None: # type: ignore[redundant-expr] + if chunks is not None: + 
chunk_shape = chunks + chunks = None + else: + chunk_shape = shape + if synchronizer is not None: warnings.warn("synchronizer is not yet implemented", RuntimeWarning, stacklevel=2) if chunk_store is not None: diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index cd8676b4d1..80b12856d6 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -357,13 +357,10 @@ def __init__( object.__setattr__(self, "index_location", index_location_parsed) # Use instance-local lru_cache to avoid memory leaks - - # numpy void scalars are not hashable, which means an array spec with a fill value that is - # a numpy void scalar will break the lru_cache. This is commented for now but should be - # fixed. See https://github.com/zarr-developers/zarr-python/issues/3054 + # TODO: fix these when we don't get hashability errors for certain numpy dtypes # object.__setattr__(self, "_get_chunk_spec", lru_cache()(self._get_chunk_spec)) - object.__setattr__(self, "_get_index_chunk_spec", lru_cache()(self._get_index_chunk_spec)) - object.__setattr__(self, "_get_chunks_per_shard", lru_cache()(self._get_chunks_per_shard)) + # object.__setattr__(self, "_get_index_chunk_spec", lru_cache()(self._get_index_chunk_spec)) + # object.__setattr__(self, "_get_chunks_per_shard", lru_cache()(self._get_chunks_per_shard)) # todo: typedict return type def __getstate__(self) -> dict[str, Any]: diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 46358e3a6a..8877e7ab02 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -622,6 +622,7 @@ async def _create( dtype_parsed = parse_data_type(dtype, zarr_format=zarr_format) store_path = await make_store_path(store) + dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) shape = parse_shapelike(shape) if chunks is not None and chunk_shape is not None: @@ -732,21 +733,31 @@ def _create_metadata_v3( else: chunk_key_encoding_parsed = chunk_key_encoding - if isinstance(fill_value, DefaultFillValue) or 
fill_value is None: - # Use dtype's default scalar for DefaultFillValue sentinel - # For v3, None is converted to DefaultFillValue behavior - fill_value_parsed = dtype.default_scalar() + if dtype.kind in "UTS": + warn( + f"The dtype `{dtype}` is currently not part in the Zarr format 3 specification. It " + "may not be supported by other zarr implementations and may change in the future.", + category=UserWarning, + stacklevel=2, + ) + + # resolve the numpy dtype into zarr v3 datatype + zarr_data_type = get_data_type_from_numpy(dtype) + + if fill_value is None: + # v3 spec will not allow a null fill value + fill_value_parsed = dtype.type(zarr_data_type.default) else: fill_value_parsed = fill_value chunk_grid_parsed = RegularChunkGrid(chunk_shape=chunk_shape) return ArrayV3Metadata( shape=shape, - data_type=dtype, + data_type=zarr_data_type, chunk_grid=chunk_grid_parsed, chunk_key_encoding=chunk_key_encoding_parsed, fill_value=fill_value_parsed, - codecs=codecs_parsed, # type: ignore[arg-type] + codecs=codecs, dimension_names=tuple(dimension_names) if dimension_names else None, attributes=attributes or {}, ) @@ -4238,7 +4249,7 @@ async def init_array( from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation - dtype_parsed = parse_dtype(dtype) + dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 4b8f3e85cf..1ec7553802 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -22,6 +22,7 @@ from typing_extensions import ReadOnly from zarr.core.config import config as zarr_config +from zarr.core.strings import _VLEN_STRING_DTYPE if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Iterator @@ -190,7 +191,13 @@ def parse_bool(data: Any) -> bool: raise ValueError(f"Expected bool, got {data} instead.") -def 
parse_dtype(dtype: Any) -> np.dtype[Any]: +def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> np.dtype[Any]: + if dtype is str or dtype == "str": + if zarr_format == 2: + # special case as object + return np.dtype("object") + else: + return _VLEN_STRING_DTYPE return np.dtype(dtype) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index cc3c33cd17..9a15bf17d2 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -106,6 +106,38 @@ def enable_gpu(self) -> ConfigSet: "array": { "order": "C", "write_empty_chunks": False, + "v2_default_compressor": { + "numeric": {"id": "zstd", "level": 0, "checksum": False}, + "string": {"id": "zstd", "level": 0, "checksum": False}, + "bytes": {"id": "zstd", "level": 0, "checksum": False}, + }, + "v2_default_filters": { + "numeric": None, + "string": [{"id": "vlen-utf8"}], + "bytes": [{"id": "vlen-bytes"}], + "raw": None, + }, + "v3_default_filters": {"boolean": [], "numeric": [], "string": [], "bytes": []}, + "v3_default_serializer": { + "boolean": {"name": "bytes", "configuration": {"endian": "little"}}, + "numeric": {"name": "bytes", "configuration": {"endian": "little"}}, + "string": {"name": "vlen-utf8"}, + "bytes": {"name": "vlen-bytes"}, + }, + "v3_default_compressors": { + "boolean": [ + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, + ], + "numeric": [ + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, + ], + "string": [ + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, + ], + "bytes": [ + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, + ], + }, }, "async": {"concurrency": 10, "timeout": None}, "threading": {"max_workers": None}, diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index 19a00343c8..8f940b0e0b 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +from collections.abc import Sequence from 
dataclasses import dataclass from typing import Any, ClassVar, Literal, Self, TypeGuard, cast, get_args @@ -8,7 +9,7 @@ from zarr.abc.metadata import Metadata from zarr.core.common import JSON, ZarrFormat from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING -from zarr.registry import register_data_type +from zarr.registry import get_data_type_from_dict, register_data_type Endianness = Literal["little", "big", "native"] DataTypeFlavor = Literal["boolean", "numeric", "string", "bytes"] @@ -30,11 +31,11 @@ def endianness_to_numpy_str(endianness: Endianness | None) -> Literal[">", "<", ) -def check_json_bool(data: JSON) -> TypeGuard[bool]: +def check_str(data: JSON) -> TypeGuard[bool]: return bool(isinstance(data, bool)) -def check_json_int(data: JSON) -> TypeGuard[int]: +def check_int(data: JSON) -> TypeGuard[int]: return bool(isinstance(data, int)) @@ -42,7 +43,21 @@ def check_json_float(data: JSON) -> TypeGuard[float]: if data == "NaN" or data == "Infinity" or data == "-Infinity": return True else: - return bool(isinstance(data, float)) + return bool(isinstance(data, float | int)) + + +def check_json_complex_float(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: + return ( + not isinstance(data, str) + and isinstance(data, Sequence) + and len(data) == 2 + and check_json_float(data[0]) + and check_json_float(data[1]) + ) + + +def check_str(data: JSON) -> TypeGuard[str]: + return bool(isinstance(data, str)) def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloat: @@ -70,16 +85,16 @@ def float_to_json(data: float | np.floating[Any], zarr_format: ZarrFormat) -> JS raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") -def complex_to_json_v2(data: complex | np.complex_) -> JSONFloat: +def complex_to_json_v2(data: complex | np.complexfloating) -> JSONFloat: return float_to_json_v2(data) -def complex_to_json_v3(data: complex | np.complex_) -> tuple[JSONFloat, JSONFloat]: +def complex_to_json_v3(data: complex | np.complexfloating) -> tuple[JSONFloat, JSONFloat]: return float_to_json_v3(data.real), float_to_json_v3(data.imag) def complex_to_json( - data: complex | np.complex_, zarr_format: ZarrFormat + data: complex | np.complexfloating, zarr_format: ZarrFormat ) -> tuple[JSONFloat, JSONFloat] | JSONFloat: if zarr_format == 2: return complex_to_json_v2(data) @@ -88,7 +103,7 @@ def complex_to_json( raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") -def float_from_json_v2(data: JSONFloat, dtype: np.floating[Any]) -> np.float_: +def float_from_json_v2(data: JSONFloat, dtype: np.floating[Any]) -> np.floating[Any]: if data == "NaN": _data = np.nan elif data == "Infinity": @@ -113,21 +128,24 @@ def float_from_json(data: JSONFloat, dtype: Any, zarr_format: ZarrFormat) -> np. raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") -def complex_from_json_v2(data: JSONFloat, dtype: Any) -> np.complex_: +def complex_from_json_v2(data: JSONFloat, dtype: Any) -> np.complexfloating: return dtype.type(data) -def complex_from_json_v3(data: tuple[JSONFloat, JSONFloat], dtype: Any) -> np.complex_: - return dtype.type(data[0] + 1j * data[1]) +def complex_from_json_v3(data: tuple[JSONFloat, JSONFloat], dtype: Any) -> np.complexfloating: + return dtype.type(complex(*data)) def complex_from_json( data: tuple[JSONFloat, JSONFloat], dtype: Any, zarr_format: ZarrFormat -) -> np.complex_: +) -> np.complexfloating: if zarr_format == 2: return complex_from_json_v2(data, dtype) else: - return complex_from_json_v3(data, dtype) + if check_json_complex_float(data): + return complex_from_json_v3(data, dtype) + else: + raise TypeError(f"Invalid type: {data}. Expected a sequence of two numbers.") raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") @@ -203,7 +221,7 @@ def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> bool: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.bool_: - if check_json_bool(data): + if check_str(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected a boolean.") @@ -222,13 +240,13 @@ class Int8(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int8DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> int: + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.int8: - if check_json_int(data): + if check_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. 
Expected an integer.") @@ -247,13 +265,13 @@ class UInt8(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt8DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> int: + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.uint8: - if check_json_int(data): + if check_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -272,13 +290,13 @@ class Int16(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int16DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> int: + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.int16: - if check_json_int(data): + if check_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -297,13 +315,13 @@ class UInt16(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt16DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> int: + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.uint16: - if check_json_int(data): + if check_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. 
Expected an integer.") @@ -322,13 +340,13 @@ class Int32(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int32DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> int: + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.int32: - if check_json_int(data): + if check_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -347,13 +365,13 @@ class UInt32(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt32DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> int: + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.uint32: - if check_json_int(data): + if check_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -372,11 +390,13 @@ class Int64(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int64DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> int: + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: return int(data) - def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.int64: - if check_json_int(data): + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.int64: + if check_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. 
Expected an integer.") @@ -395,11 +415,13 @@ class UInt64(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt64DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> int: + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: return int(data) - def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.uint64: - if check_json_int(data): + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.uint64: + if check_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -418,10 +440,12 @@ class Float16(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float16DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> float: + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> float: return float(data) - def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.float16: + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.float16: if check_json_float(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. 
Expected a float.") @@ -441,10 +465,12 @@ class Float32(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float32DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> float: + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> float: return float(data) - def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.float32: + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.float32: if check_json_float(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected a float.") @@ -464,10 +490,12 @@ class Float64(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float64DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> float: + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> float: return float(data) - def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.float64: + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.float64: if check_json_float(data): return float_from_json(data, dtype=self.to_numpy(endianness=endianness)) raise TypeError(f"Invalid type: {data}. 
Expected a float.") @@ -487,13 +515,19 @@ class Complex64(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex64DType: return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic) -> float: - return float(data) + def to_json_value( + self, data: np.generic, zarr_format: ZarrFormat + ) -> tuple[JSONFloat, JSONFloat]: + return complex_to_json(data, zarr_format) - def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) -> np.complex64: - if check_json_float(data): - return self.to_numpy(endianness=endianness).type(data) - raise TypeError(f"Invalid type: {data}. Expected a float.") + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.complex64: + if check_json_complex_float(data): + return complex_from_json( + data, dtype=self.to_numpy(endianness=endianness), zarr_format=zarr_format + ) + raise TypeError(f"Invalid type: {data}. Expected a complex float.") register_data_type(Complex64) @@ -501,7 +535,7 @@ def from_json_value(self, data: JSON, *, endianness: Endianness | None = None) - @dataclass(frozen=True, kw_only=True) class Complex128(DTypeBase): - name = "complex64" + name = "complex128" item_size = 32 kind = "numeric" numpy_character_code = "D" @@ -510,6 +544,20 @@ class Complex128(DTypeBase): def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex128DType: return super().to_numpy(endianness=endianness) + def to_json_value( + self, data: np.generic, zarr_format: ZarrFormat + ) -> tuple[JSONFloat, JSONFloat]: + return complex_to_json(data, zarr_format) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.complex128: + if check_json_complex_float(data): + return complex_from_json( + data, dtype=self.to_numpy(endianness=endianness), zarr_format=zarr_format + ) + raise TypeError(f"Invalid type: {data}. 
Expected a complex float.") + register_data_type(Complex128) @@ -536,6 +584,21 @@ def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.byte endianness_code = endianness_to_numpy_str(endianness) return np.dtype(endianness_code + self.numpy_character_code + str(self.length)) + def to_json_value( + self, data: np.generic, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> str: + return data.tobytes().decode("ascii") + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.bytes_: + if check_str(data): + return self.to_numpy(endianness=endianness).type(data.encode("ascii")) + raise TypeError(f"Invalid type: {data}. Expected a string.") + + +register_data_type(StaticByteString) + @dataclass(frozen=True, kw_only=True) class StaticRawBytes(DTypeBase, Flexible): @@ -559,8 +622,17 @@ def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.void endianness_code = endianness_to_numpy_str(endianness) return np.dtype(endianness_code + self.numpy_character_code + str(self.length)) + def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> tuple[int, ...]: + return tuple(*data.tobytes()) -register_data_type(StaticByteString) + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.void: + # todo: check that this is well-formed + return self.to_numpy(endianness=endianness).type(bytes(data)) + + +register_data_type(StaticRawBytes) if _NUMPY_SUPPORTS_VLEN_STRING: @@ -583,6 +655,14 @@ def to_numpy( endianness_code = endianness_to_numpy_str(endianness) return np.dtype(endianness_code + self.numpy_character_code) + def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: + return str(data) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> "np.dtypes.StringDType": + return 
self.to_numpy(endianness=endianness).type(data) + else: @dataclass(frozen=True, kw_only=True) @@ -602,6 +682,14 @@ def to_numpy( endianness_code = endianness_to_numpy_str(endianness) return np.dtype(endianness_code + self.numpy_character_code) + def to_json_value(self, data, *, zarr_format: ZarrFormat) -> str: + return str(data) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.dtypes.ObjectDType: + return self.to_numpy(endianness=endianness).type(data) + register_data_type(VlenString) @@ -628,20 +716,30 @@ def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.str_ endianness_code = endianness_to_numpy_str(endianness) return np.dtype(endianness_code + self.numpy_character_code + str(self.length)) + def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: + return str(data) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.str_: + if not check_str(data): + raise TypeError(f"Invalid type: {data}. 
Expected a string.") + return self.to_numpy(endianness=endianness).type(data) + register_data_type(StaticUnicodeString) -def resolve_dtype(dtype: npt.DTypeLike | DTypeBase) -> DTypeBase: +def resolve_dtype(dtype: npt.DTypeLike | DTypeBase | dict[str, JSON]) -> DTypeBase: from zarr.registry import get_data_type_from_numpy if isinstance(dtype, DTypeBase): return dtype - cls = get_data_type_from_numpy(dtype) - return cls.from_numpy(dtype) - + elif isinstance(dtype, dict): + return get_data_type_from_dict(dtype) + else: + return get_data_type_from_numpy(dtype) -register_data_type(StaticRawBytes) INTEGER_DTYPE = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 FLOAT_DTYPE = Float16 | Float32 | Float64 diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 25638cf58c..c64817842e 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -4,16 +4,6 @@ from zarr.abc.metadata import Metadata from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.metadata.dtype import ( - COMPLEX_DTYPE, - FLOAT_DTYPE, - INTEGER_DTYPE, - STRING_DTYPE, - Bool, - DTypeBase, - StaticRawBytes, - resolve_dtype, -) if TYPE_CHECKING: from typing import Self @@ -21,8 +11,9 @@ from zarr.core.buffer import Buffer, BufferPrototype from zarr.core.chunk_grids import ChunkGrid from zarr.core.common import JSON, ChunkCoords - from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar - + from zarr.core.metadata.dtype import ( + DTypeBase, + ) import json from collections.abc import Iterable @@ -32,7 +23,6 @@ import numcodecs.abc import numpy as np -import numpy.typing as npt from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.core.array_spec import ArrayConfig, ArraySpec @@ -49,7 +39,7 @@ from zarr.core.config import config from zarr.core.metadata.common import parse_attributes from zarr.errors import MetadataValidationError, NodeTypeValidationError -from zarr.registry import 
get_codec_class +from zarr.registry import get_codec_class, get_data_type_by_name, get_data_type_from_dict def parse_zarr_format(data: object) -> Literal[3]: @@ -170,7 +160,7 @@ def __init__( self, *, shape: Iterable[int], - data_type: npt.DTypeLike | DTypeBase, + data_type: DTypeBase, chunk_grid: dict[str, JSON] | ChunkGrid, chunk_key_encoding: ChunkKeyEncodingLike, fill_value: object, @@ -184,12 +174,12 @@ def __init__( """ shape_parsed = parse_shapelike(shape) - data_type_parsed = resolve_dtype(data_type) + data_type_parsed = data_type chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) # we pass a string here rather than an enum to make mypy happy - fill_value_parsed = parse_fill_value(fill_value, data_type_parsed) + fill_value_parsed = data_type_parsed.to_numpy().type(fill_value) attributes_parsed = parse_attributes(attributes) codecs_parsed_partial = parse_codecs(codecs) storage_transformers_parsed = parse_storage_transformers(storage_transformers) @@ -301,13 +291,9 @@ def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: return self.chunk_key_encoding.encode_chunk_key(chunk_coords) def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: - json_indent = config.get("json_indent") d = self.to_dict() - return { - ZARR_JSON: prototype.buffer.from_bytes( - json.dumps(d, allow_nan=False, indent=json_indent).encode() - ) - } + # d = _replace_special_floats(self.to_dict()) + return {ZARR_JSON: prototype.buffer.from_bytes(json.dumps(d, cls=V3JsonEncoder).encode())} @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: @@ -320,16 +306,12 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: _ = parse_node_type_array(_data.pop("node_type")) data_type_json = _data.pop("data_type") - if not check_dtype_spec_v3(data_type_json): - raise ValueError(f"Invalid data_type: {data_type_json!r}") - data_type 
= get_data_type_from_json(data_type_json, zarr_format=3) + if isinstance(data_type_json, str): + # check that the data_type attribute is valid + data_type = get_data_type_by_name(data_type_json) - # check that the fill value is consistent with the data type - try: - fill = _data.pop("fill_value") - fill_value_parsed = data_type.from_json_scalar(fill, zarr_format=3) - except ValueError as e: - raise TypeError(f"Invalid fill_value: {fill!r}") from e + else: + data_type = get_data_type_from_dict(data_type_json) # dimension_names key is optional, normalize missing to `None` _data["dimension_names"] = _data.pop("dimension_names", None) @@ -341,7 +323,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: def to_dict(self) -> dict[str, JSON]: out_dict = super().to_dict() - out_dict["fill_value"] = self.data_type.to_json_scalar( + out_dict["fill_value"] = self.data_type.to_json_value( self.fill_value, zarr_format=self.zarr_format ) if not isinstance(out_dict, dict): @@ -351,15 +333,9 @@ def to_dict(self) -> dict[str, JSON]: # the metadata document if out_dict["dimension_names"] is None: out_dict.pop("dimension_names") - - # TODO: replace the `to_dict` / `from_dict` on the `Metadata`` class with - # to_json, from_json, and have ZDType inherit from `Metadata` - # until then, we have this hack here, which relies on the fact that to_dict will pass through - # any non-`Metadata` fields as-is. 
- dtype_meta = out_dict["data_type"] - if isinstance(dtype_meta, ZDType): - out_dict["data_type"] = dtype_meta.to_json(zarr_format=3) # type: ignore[unreachable] - + # if data_type has no configuration, we just serialize the name + if "configuration" not in out_dict["data_type"]: + out_dict["data_type"] = out_dict["data_type"]["name"] return out_dict def update_shape(self, shape: ChunkCoords) -> Self: @@ -367,147 +343,3 @@ def update_shape(self, shape: ChunkCoords) -> Self: def update_attributes(self, attributes: dict[str, JSON]) -> Self: return replace(self, attributes=attributes) - - -# enum Literals can't be used in typing, so we have to restate all of the V3 dtypes as types -# https://github.com/python/typing/issues/781 - -BOOL = np.bool_ -INTEGER = np.int8 | np.int16 | np.int32 | np.int64 | np.uint8 | np.uint16 | np.uint32 | np.uint64 -FLOAT = np.float16 | np.float32 | np.float64 -COMPLEX = np.complex64 | np.complex128 - -STRING = np.str_ -BYTES = np.bytes_ - - -@overload -def parse_fill_value( - fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool, - dtype: Bool, -) -> BOOL: ... - - -@overload -def parse_fill_value( - fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool, - dtype: INTEGER_DTYPE, -) -> INTEGER: ... - - -@overload -def parse_fill_value( - fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool, - dtype: FLOAT_DTYPE, -) -> FLOAT: ... - - -@overload -def parse_fill_value( - fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool, - dtype: COMPLEX_DTYPE, -) -> COMPLEX: ... - - -@overload -def parse_fill_value( - fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool, - dtype: STRING_DTYPE, -) -> STRING: ... - - -@overload -def parse_fill_value( - fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool, - dtype: StaticRawBytes, -) -> BYTES: ... 
- - -def parse_fill_value( - fill_value: Any, - dtype: DTypeBase, -) -> np.generic: - """ - Parse `fill_value`, a potential fill value, into an instance of `dtype`, a data type. - If `fill_value` is `None`, then this function will return the result of casting the value 0 - to the provided data type. Otherwise, `fill_value` will be cast to the provided data type. - - Note that some numpy dtypes use very permissive casting rules. For example, - `np.bool_({'not remotely a bool'})` returns `True`. Thus this function should not be used for - validating that the provided fill value is a valid instance of the data type. - - Parameters - ---------- - fill_value : Any - A potential fill value. - dtype : DTypeBase - A valid Zarr format 3 DataType. - - Returns - ------- - A scalar instance of `dtype` - """ - if fill_value is None: - raise ValueError("Fill value cannot be None") - - if dtype.kind == "string": - return np.str_(fill_value) - if dtype.kind == "bytes": - return np.bytes_(fill_value) - - # the rest are numeric types - np_dtype = dtype.to_numpy() - - if isinstance(fill_value, Sequence) and not isinstance(fill_value, str): - if isindata_type in (DataType.complex64, DataType.complex128): - if len(fill_value) == 2: - decoded_fill_value = tuple( - SPECIAL_FLOATS_ENCODED.get(value, value) for value in fill_value - ) - # complex datatypes serialize to JSON arrays with two elements - return np_dtype.type(complex(*decoded_fill_value)) - else: - msg = ( - f"Got an invalid fill value for complex data type {data_type.value}." - f"Expected a sequence with 2 elements, but {fill_value!r} has " - f"length {len(fill_value)}." - ) - raise ValueError(msg) - msg = f"Cannot parse non-string sequence {fill_value!r} as a scalar with type {data_type.value}." 
- raise TypeError(msg) - - # Cast the fill_value to the given dtype - try: - # This warning filter can be removed after Zarr supports numpy>=2.0 - # The warning is saying that the future behavior of out of bounds casting will be to raise - # an OverflowError. In the meantime, we allow overflow and catch cases where - # fill_value != casted_value below. - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=DeprecationWarning) - casted_value = np.dtype(np_dtype).type(fill_value) - except (ValueError, OverflowError, TypeError) as e: - raise ValueError(f"fill value {fill_value!r} is not valid for dtype {data_type}") from e - # Check if the value is still representable by the dtype - if (fill_value == "NaN" and np.isnan(casted_value)) or ( - fill_value in ["Infinity", "-Infinity"] and not np.isfinite(casted_value) - ): - pass - elif np_dtype.kind == "f": - # float comparison is not exact, especially when dtype None: self[qualname] = cls +@dataclass(frozen=True, kw_only=True) +class DataTypeRegistry: + contents: dict[str, type[DTypeBase]] = field(default_factory=dict, init=False) + lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) + + def lazy_load(self) -> None: + for e in self.lazy_load_list: + self.register(e.load()) + + self.lazy_load_list.clear() + + def register(self: Self, cls: type[DTypeBase], clobber: bool = False) -> None: + if cls.name in self.contents and not clobber: + raise ValueError( + f"Data type {cls.name} already registered. Use clobber=True to overwrite." 
+ ) + self.contents[cls.name] = cls + + def get(self, key: str) -> type[DTypeBase]: + return self.contents[key] + + __codec_registries: dict[str, Registry[Codec]] = defaultdict(Registry) __pipeline_registry: Registry[CodecPipeline] = Registry() __buffer_registry: Registry[Buffer] = Registry() __ndbuffer_registry: Registry[NDBuffer] = Registry() -__data_type_registry: Registry[DTypeBase] = Registry() +__data_type_registry = DataTypeRegistry() __v3_dtype_registry: Registry[ZarrDType] = Registry() __v2_dtype_registry: Registry[ZarrDType] = Registry() @@ -106,8 +129,8 @@ def _collect_entrypoints() -> list[Registry[Any]]: __ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr.ndbuffer")) __ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="ndbuffer")) - __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr.data_type")) - __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) + # __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr.data_type")) + # __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) __v3_dtype_registry.lazy_load_list.extend(entry_points.select(group="zarr.v3dtype")) __v3_dtype_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="v3dtype")) @@ -306,22 +329,35 @@ def get_ndbuffer_class(reload_config: bool = False) -> type[NDBuffer]: ) -def get_data_type(dtype: str) -> type[DTypeBase]: +def get_data_type_by_name(dtype: str, configuration: dict[str, JSON] | None = None) -> DTypeBase: __data_type_registry.lazy_load() + if configuration is None: + _configuration = {} + else: + _configuration = configuration maybe_dtype_cls = __data_type_registry.get(dtype) if maybe_dtype_cls is None: raise ValueError(f"No data type class matching name {dtype}") - return maybe_dtype_cls + return maybe_dtype_cls.from_dict(_configuration) -def get_data_type_from_numpy(dtype: npt.DTypeLike) 
-> type[DTypeBase]: +def get_data_type_from_dict(dtype: dict[str, JSON]) -> DTypeBase: + __data_type_registry.lazy_load() + dtype_name = dtype["name"] + dtype_cls = __data_type_registry.get(dtype_name) + if dtype_cls is None: + raise ValueError(f"No data type class matching name {dtype_name}") + return dtype_cls.from_dict(dtype.get("configuration", {})) + + +def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeBase: np_dtype = np.dtype(dtype) __data_type_registry.lazy_load() - for val in __data_type_registry.values(): + for val in __data_type_registry.contents.values(): if val.numpy_character_code == np_dtype.char: - return val + return val.from_numpy(np_dtype) raise ValueError( - f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(__data_type_registry)}." + f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(__data_type_registry.contents)}." ) diff --git a/tests/test_array.py b/tests/test_array.py index 3366c1cfa8..0a10dff110 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -60,7 +60,6 @@ from zarr.core.dtype.npy.string import UTF8Base from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv -from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError @@ -481,8 +480,7 @@ def test_info_v3(self, chunks: tuple[int, int], shards: tuple[int, int] | None) result = arr.info expected = ArrayInfo( _zarr_format=3, - _data_type=arr._async_array._zdtype, - _fill_value=arr.fill_value, + _data_type=arr.metadata.data_type, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, @@ -507,8 +505,7 @@ def test_info_complete(self, chunks: tuple[int, int], shards: tuple[int, int] | result = arr.info_complete() expected = ArrayInfo( _zarr_format=3, - _data_type=arr._async_array._zdtype, - _fill_value=arr.fill_value, + _data_type=arr.metadata.data_type, 
_shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, @@ -569,8 +566,7 @@ async def test_info_v3_async( result = arr.info expected = ArrayInfo( _zarr_format=3, - _data_type=arr._zdtype, - _fill_value=arr.metadata.fill_value, + _data_type=arr.metadata.data_type, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, @@ -597,8 +593,7 @@ async def test_info_complete_async( result = await arr.info_complete() expected = ArrayInfo( _zarr_format=3, - _data_type=arr._zdtype, - _fill_value=arr.metadata.fill_value, + _data_type=arr.metadata.data_type, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 1a2483fc9b..c3c715c2c6 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -15,13 +15,14 @@ from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING from zarr.core.dtype.npy.time import DateTime64 from zarr.core.group import GroupMetadata, parse_node_type +from zarr.core.metadata.dtype import complex_from_json from zarr.core.metadata.v3 import ( ArrayV3Metadata, - default_fill_value, parse_dimension_names, parse_zarr_format, ) -from zarr.errors import MetadataValidationError, NodeTypeValidationError +from zarr.errors import MetadataValidationError +from zarr.registry import get_data_type_from_numpy if TYPE_CHECKING: from collections.abc import Sequence @@ -130,11 +131,24 @@ def test_jsonify_fill_value_complex(fill_value: Any, dtype_str: str) -> None: as length-2 sequences """ zarr_format = 3 - dtype = get_data_type_from_native_dtype(dtype_str) - expected = dtype.to_native_dtype().type(complex(*fill_value)) - observed = dtype.from_json_scalar(fill_value, zarr_format=zarr_format) + dtype = get_data_type_from_numpy(dtype_str) + expected = dtype.to_numpy().type(complex(*fill_value)) + observed = dtype.from_json_value(fill_value, zarr_format=zarr_format) assert observed == expected - assert dtype.to_json_scalar(observed, zarr_format=zarr_format) == 
tuple(fill_value) + assert dtype.to_json_value(observed, zarr_format=zarr_format) == tuple(fill_value) + + +@pytest.mark.parametrize("dtype_str", [*complex_dtypes]) +@pytest.mark.parametrize("data", [[1.0, 0.0, 3.0], [0, 1, 3], [1]]) +def test_complex_to_json_invalid(data: object, dtype_str: str) -> None: + """ + Test that parse_fill_value(fill_value, dtype) correctly rejects sequences with length not + equal to 2 + """ + dtype_instance = get_data_type_from_numpy(dtype_str) + match = f"Invalid type: {data}. Expected a sequence of two numbers." + with pytest.raises(TypeError, match=re.escape(match)): + complex_from_json(data=data, dtype=dtype_instance, zarr_format=3) @pytest.mark.parametrize("fill_value", [{"foo": 10}]) @@ -144,9 +158,9 @@ def test_parse_fill_value_invalid_type(fill_value: Any, dtype_str: str) -> None: Test that parse_fill_value(fill_value, dtype) raises TypeError for invalid non-sequential types. This test excludes bool because the bool constructor takes anything. """ - dtype_instance = get_data_type_from_native_dtype(dtype_str) + dtype_instance = get_data_type_from_numpy(dtype_str) with pytest.raises(TypeError, match=f"Invalid type: {fill_value}"): - dtype_instance.from_json_scalar(fill_value, zarr_format=3) + dtype_instance.from_json_value(fill_value, zarr_format=3) @pytest.mark.parametrize( @@ -165,9 +179,9 @@ def test_parse_fill_value_invalid_type_sequence(fill_value: Any, dtype_str: str) This test excludes bool because the bool constructor takes anything, and complex because complex values can be created from length-2 sequences. 
""" - dtype_instance = get_data_type_from_native_dtype(dtype_str) + dtype_instance = get_data_type_from_numpy(dtype_str) with pytest.raises(TypeError, match=re.escape(f"Invalid type: {fill_value}")): - dtype_instance.from_json_scalar(fill_value, zarr_format=3) + dtype_instance.from_json_value(fill_value, zarr_format=3) @pytest.mark.parametrize("chunk_grid", ["regular"]) @@ -257,21 +271,19 @@ def test_json_indent(indent: int): assert d == json.dumps(json.loads(d), indent=indent).encode() +@pytest.mark.xfail(reason="Data type not supported yet") @pytest.mark.parametrize("fill_value", [-1, 0, 1, 2932897]) @pytest.mark.parametrize("precision", ["ns", "D"]) async def test_datetime_metadata(fill_value: int, precision: str) -> None: - dtype = DateTime64(unit=precision) metadata_dict = { "zarr_format": 3, "node_type": "array", "shape": (1,), "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": (1,)}}, - "data_type": dtype.to_json(zarr_format=3), + "data_type": f" Date: Thu, 27 Feb 2025 18:11:18 +0100 Subject: [PATCH 007/129] tweak json type guards --- src/zarr/core/metadata/dtype.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index 8f940b0e0b..542cc85e5f 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -31,14 +31,15 @@ def endianness_to_numpy_str(endianness: Endianness | None) -> Literal[">", "<", ) -def check_str(data: JSON) -> TypeGuard[bool]: +def check_json_bool(data: JSON) -> TypeGuard[bool]: return bool(isinstance(data, bool)) +def check_json_str(data: JSON) -> TypeGuard[str]: + return bool(isinstance(data, str)) -def check_int(data: JSON) -> TypeGuard[int]: +def check_json_int(data: JSON) -> TypeGuard[int]: return bool(isinstance(data, int)) - def check_json_float(data: JSON) -> TypeGuard[float]: if data == "NaN" or data == "Infinity" or data == "-Infinity": return True @@ -56,10 +57,6 @@ def 
check_json_complex_float(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat ) -def check_str(data: JSON) -> TypeGuard[str]: - return bool(isinstance(data, str)) - - def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloat: if np.isnan(data): return "NaN" @@ -221,7 +218,7 @@ def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> bool: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.bool_: - if check_str(data): + if check_json_bool(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected a boolean.") @@ -246,7 +243,7 @@ def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.int8: - if check_int(data): + if check_json_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -271,7 +268,7 @@ def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.uint8: - if check_int(data): + if check_json_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -296,7 +293,7 @@ def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.int16: - if check_int(data): + if check_json_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. 
Expected an integer.") @@ -321,7 +318,7 @@ def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.uint16: - if check_int(data): + if check_json_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -346,7 +343,7 @@ def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.int32: - if check_int(data): + if check_json_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -371,7 +368,7 @@ def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.uint32: - if check_int(data): + if check_json_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -396,7 +393,7 @@ def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.int64: - if check_int(data): + if check_json_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -421,7 +418,7 @@ def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.uint64: - if check_int(data): + if check_json_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. 
Expected an integer.") @@ -592,7 +589,7 @@ def to_json_value( def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.bytes_: - if check_str(data): + if check_json_bool(data): return self.to_numpy(endianness=endianness).type(data.encode("ascii")) raise TypeError(f"Invalid type: {data}. Expected a string.") @@ -722,7 +719,7 @@ def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.str_: - if not check_str(data): + if not check_json_bool(data): raise TypeError(f"Invalid type: {data}. Expected a string.") return self.to_numpy(endianness=endianness).type(data) From 3fd0bf8da0d93b663cdfc239ca55e9c85970e3d2 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 27 Feb 2025 19:55:51 +0100 Subject: [PATCH 008/129] fix dtype sizes, adjust fill value parsing in from_dict, fix tests --- src/zarr/core/metadata/dtype.py | 15 ++++++++++----- src/zarr/core/metadata/v3.py | 9 +++++---- tests/test_metadata/test_v3.py | 15 ++++++++++++++- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index 542cc85e5f..008751adc5 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -34,12 +34,15 @@ def endianness_to_numpy_str(endianness: Endianness | None) -> Literal[">", "<", def check_json_bool(data: JSON) -> TypeGuard[bool]: return bool(isinstance(data, bool)) + def check_json_str(data: JSON) -> TypeGuard[str]: return bool(isinstance(data, str)) + def check_json_int(data: JSON) -> TypeGuard[int]: return bool(isinstance(data, int)) + def check_json_float(data: JSON) -> TypeGuard[float]: if data == "NaN" or data == "Infinity" or data == "-Infinity": return True @@ -254,7 +257,7 @@ def from_json_value( @dataclass(frozen=True, kw_only=True) class UInt8(DTypeBase): name = "uint8" - item_size = 
2 + item_size = 1 kind = "numeric" numpy_character_code = "B" default = 0 @@ -488,13 +491,15 @@ def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float64 return super().to_numpy(endianness=endianness) def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> float: - return float(data) + return float_to_json(data, zarr_format) def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.float64: if check_json_float(data): - return float_from_json(data, dtype=self.to_numpy(endianness=endianness)) + return float_from_json( + data, dtype=self.to_numpy(endianness=endianness), zarr_format=zarr_format + ) raise TypeError(f"Invalid type: {data}. Expected a float.") @@ -504,7 +509,7 @@ def from_json_value( @dataclass(frozen=True, kw_only=True) class Complex64(DTypeBase): name = "complex64" - item_size = 16 + item_size = 8 kind = "numeric" numpy_character_code = "F" default = 0.0 + 0.0j @@ -533,7 +538,7 @@ def from_json_value( @dataclass(frozen=True, kw_only=True) class Complex128(DTypeBase): name = "complex128" - item_size = 32 + item_size = 16 kind = "numeric" numpy_character_code = "D" default = 0.0 + 0.0j diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index c64817842e..091c6da817 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -174,19 +174,17 @@ def __init__( """ shape_parsed = parse_shapelike(shape) - data_type_parsed = data_type chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) - # we pass a string here rather than an enum to make mypy happy - fill_value_parsed = data_type_parsed.to_numpy().type(fill_value) + fill_value_parsed = data_type.to_numpy().type(fill_value) attributes_parsed = parse_attributes(attributes) codecs_parsed_partial = parse_codecs(codecs) storage_transformers_parsed = 
parse_storage_transformers(storage_transformers) array_spec = ArraySpec( shape=shape_parsed, - dtype=data_type, + dtype=data_type.to_numpy(), fill_value=fill_value_parsed, config=ArrayConfig.from_dict({}), # TODO: config is not needed here. prototype=default_buffer_prototype(), # TODO: prototype is not needed here. @@ -313,6 +311,9 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: else: data_type = get_data_type_from_dict(data_type_json) + # check that the fill value is consistent with the data type + fill_value_parsed = data_type.from_json_value(_data.pop("fill_value"), zarr_format=3) + # dimension_names key is optional, normalize missing to `None` _data["dimension_names"] = _data.pop("dimension_names", None) diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index c3c715c2c6..5670cd798a 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -15,7 +15,7 @@ from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING from zarr.core.dtype.npy.time import DateTime64 from zarr.core.group import GroupMetadata, parse_node_type -from zarr.core.metadata.dtype import complex_from_json +from zarr.core.metadata.dtype import Flexible, complex_from_json from zarr.core.metadata.v3 import ( ArrayV3Metadata, parse_dimension_names, @@ -336,3 +336,16 @@ async def test_special_float_fill_values(fill_value: str) -> None: elif fill_value == "-Infinity": assert np.isneginf(m.fill_value) assert d["fill_value"] == "-Infinity" + + +@pytest.mark.parametrize("dtype_str", dtypes) +def test_dtypes(dtype_str: str) -> None: + dt = get_data_type_from_numpy(dtype_str) + np_dtype = dt.to_numpy() + + if not isinstance(dt, Flexible): + assert dt.item_size == np_dtype.itemsize + else: + assert dt.length == np_dtype.itemsize + + assert dt.numpy_character_code == np_dtype.char From 404a71c1b6718c3e6cb656f2dc2fe335915c62b2 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 2 Mar 2025 12:54:57 +0100 Subject: [PATCH 009/129] 
mid-refactor commit --- src/zarr/core/_info.py | 4 +- src/zarr/core/array.py | 6 +- src/zarr/core/metadata/dtype.py | 345 ++++++-------------------------- src/zarr/core/metadata/v3.py | 8 +- src/zarr/registry.py | 18 +- 5 files changed, 80 insertions(+), 301 deletions(-) diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 2492728a27..22ef37eef8 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -9,7 +9,7 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.core.common import ZarrFormat -from zarr.core.metadata.dtype import DTypeBase +from zarr.core.metadata.dtype import DTypeWrapper # from zarr.core.metadata.v3 import DataType @@ -82,7 +82,7 @@ class ArrayInfo: _type: Literal["Array"] = "Array" _zarr_format: ZarrFormat - _data_type: np.dtype[Any] | DTypeBase + _data_type: np.dtype[Any] | DTypeWrapper _shape: tuple[int, ...] _shard_shape: tuple[int, ...] | None = None _chunk_shape: tuple[int, ...] | None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 8877e7ab02..221c403d66 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -109,7 +109,7 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) -from zarr.core.metadata.dtype import DTypeBase +from zarr.core.metadata.dtype import DTypeWrapper from zarr.core.metadata.v2 import ( CompressorLikev2, get_object_codec_id, @@ -746,7 +746,7 @@ def _create_metadata_v3( if fill_value is None: # v3 spec will not allow a null fill value - fill_value_parsed = dtype.type(zarr_data_type.default) + fill_value_parsed = dtype.type(zarr_data_type._default_value) else: fill_value_parsed = fill_value @@ -1766,7 +1766,7 @@ async def info_complete(self) -> Any: def _info( self, count_chunks_initialized: int | None = None, count_bytes_stored: int | None = None ) -> Any: - _data_type: np.dtype[Any] | DTypeBase + _data_type: np.dtype[Any] | DTypeWrapper if isinstance(self.metadata, ArrayV2Metadata): _data_type = self.metadata.dtype else: diff 
--git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index 008751adc5..106b3088d0 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod from collections.abc import Sequence from dataclasses import dataclass -from typing import Any, ClassVar, Literal, Self, TypeGuard, cast, get_args +from typing import Any, ClassVar, Generic, Literal, Self, TypeGuard, TypeVar, cast, get_args import numpy as np import numpy.typing as npt @@ -148,44 +148,28 @@ def complex_from_json( raise TypeError(f"Invalid type: {data}. Expected a sequence of two numbers.") raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") +TDType = TypeVar("TDType", bound=np.dtype[Any]) +TScalar = TypeVar("TScalar", bound=np.generic) @dataclass(frozen=True, kw_only=True) class Flexible: length: int - -class DTypeBase(ABC, Metadata): +class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): name: ClassVar[str] - numpy_character_code: ClassVar[str] - item_size: ClassVar[int | None] + dtype_cls: ClassVar[type[TDType]] # this class will create a numpy dtype kind: ClassVar[DataTypeFlavor] - default: object - - def __init_subclass__(cls, **kwargs: object) -> None: - required_attrs = ["name", "numpy_character_code", "item_size", "kind", "default"] - for attr in required_attrs: - if not hasattr(cls, attr): - raise ValueError(f"{attr} is a required attribute for a Zarr dtype.") - - return super().__init_subclass__(**kwargs) + _default_value: object def to_dict(self) -> dict[str, JSON]: return {"name": self.name} - @classmethod - def from_numpy(cls, dtype: npt.DTypeLike) -> Self: - if np.dtype(dtype).char != cls.numpy_character_code: - raise ValueError( - f"Invalid dtype {dtype}. Expected dtype with character code == {cls.numpy_character_code}." 
- ) - return cls() - - def default_value(self: Self, *, endianness: Endianness | None = None) -> np.generic: - return cast(np.generic, self.to_numpy(endianness=endianness).type(self.default)) + def default_value(self: Self, *, endianness: Endianness | None = None) -> TScalar: + return cast(np.generic, self.to_numpy(endianness=endianness).type(self._default_value)) - def to_numpy(self: Self, *, endianness: Endianness | None = None) -> np.dtype[Any]: + def to_numpy(self: Self, *, endianness: Endianness | None = None) -> TDType: endian_str = endianness_to_numpy_str(endianness) - return np.dtype(endian_str + self.numpy_character_code) + return self.dtype_cls().newbyteorder(endian_str) @abstractmethod def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> JSON: @@ -197,7 +181,7 @@ def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> JSON: @abstractmethod def from_json_value( self: Self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.generic: + ) -> TScalar: """ Read a JSON-serializable value as a numpy scalar """ @@ -205,16 +189,11 @@ def from_json_value( @dataclass(frozen=True, kw_only=True) -class Bool(DTypeBase): +class Bool(DTypeWrapper[np.dtypes.BoolDType, np.bool_]): name = "bool" - item_size = 1 kind = "boolean" - numpy_character_code = "?" 
default = False - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.BoolDType: - return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> bool: return bool(data) @@ -228,295 +207,128 @@ def from_json_value( register_data_type(Bool) - -@dataclass(frozen=True, kw_only=True) -class Int8(DTypeBase): - name = "int8" - item_size = 1 +class BaseInt(DTypeWrapper[TDType, TScalar]): kind = "numeric" - numpy_character_code = "b" default = 0 - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int8DType: - return super().to_numpy(endianness=endianness) - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.int8: + ) -> TScalar: if check_json_int(data): return self.to_numpy(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") +@dataclass(frozen=True, kw_only=True) +class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): + name = "int8" + + register_data_type(Int8) @dataclass(frozen=True, kw_only=True) -class UInt8(DTypeBase): +class UInt8(DTypeWrapper[np.dtypes.UInt8DType, np.uint8]): name = "uint8" - item_size = 1 - kind = "numeric" - numpy_character_code = "B" - default = 0 - - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt8DType: - return super().to_numpy(endianness=endianness) - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.uint8: - if check_json_int(data): - return self.to_numpy(endianness=endianness).type(data) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") register_data_type(UInt8) @dataclass(frozen=True, kw_only=True) -class Int16(DTypeBase): +class Int16(DTypeWrapper[np.dtypes.Int16DType, np.int16]): name = "int16" - item_size = 2 - kind = "numeric" - numpy_character_code = "h" - default = 0 - - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int16DType: - return super().to_numpy(endianness=endianness) - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.int16: - if check_json_int(data): - return self.to_numpy(endianness=endianness).type(data) - raise TypeError(f"Invalid type: {data}. Expected an integer.") register_data_type(Int16) @dataclass(frozen=True, kw_only=True) -class UInt16(DTypeBase): +class UInt16(DTypeWrapper[np.dtypes.UInt16DType, np.uint16]): name = "uint16" - item_size = 2 - kind = "numeric" - numpy_character_code = "H" - default = 0 - - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt16DType: - return super().to_numpy(endianness=endianness) - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.uint16: - if check_json_int(data): - return self.to_numpy(endianness=endianness).type(data) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") - register_data_type(UInt16) @dataclass(frozen=True, kw_only=True) -class Int32(DTypeBase): +class Int32(DTypeWrapper[np.dtypes.Int32DType, np.int32]): name = "int32" - item_size = 4 - kind = "numeric" - numpy_character_code = "i" - default = 0 - - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int32DType: - return super().to_numpy(endianness=endianness) - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.int32: - if check_json_int(data): - return self.to_numpy(endianness=endianness).type(data) - raise TypeError(f"Invalid type: {data}. Expected an integer.") register_data_type(Int32) @dataclass(frozen=True, kw_only=True) -class UInt32(DTypeBase): +class UInt32(DTypeWrapper[np.dtypes.UInt32DType, np.uint32]): name = "uint32" - item_size = 4 - kind = "numeric" - numpy_character_code = "I" - default = 0 - - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt32DType: - return super().to_numpy(endianness=endianness) - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.uint32: - if check_json_int(data): - return self.to_numpy(endianness=endianness).type(data) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") - register_data_type(UInt32) @dataclass(frozen=True, kw_only=True) -class Int64(DTypeBase): +class Int64(DTypeWrapper[np.dtypes.Int64DType, np.int64]): name = "int64" - item_size = 8 - kind = "numeric" - numpy_character_code = "l" - default = 0 - - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Int64DType: - return super().to_numpy(endianness=endianness) - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.int64: - if check_json_int(data): - return self.to_numpy(endianness=endianness).type(data) - raise TypeError(f"Invalid type: {data}. Expected an integer.") register_data_type(Int64) @dataclass(frozen=True, kw_only=True) -class UInt64(DTypeBase): +class UInt64(DTypeWrapper[np.dtypes.UInt64DType, np.uint64]): name = "uint64" - item_size = 8 - kind = "numeric" - numpy_character_code = "L" - default = 0 - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.UInt64DType: - return super().to_numpy(endianness=endianness) - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.uint64: - if check_json_int(data): - return self.to_numpy(endianness=endianness).type(data) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") register_data_type(UInt64) -@dataclass(frozen=True, kw_only=True) -class Float16(DTypeBase): - name = "float16" - item_size = 2 +class FloatBase(DTypeWrapper[TDType, TScalar]): kind = "numeric" - numpy_character_code = "e" default = 0.0 - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float16DType: - return super().to_numpy(endianness=endianness) - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> float: - return float(data) + def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> JSONFloat: + return float_to_json(data, zarr_format) def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.float16: + ) -> TScalar: if check_json_float(data): - return self.to_numpy(endianness=endianness).type(data) + return self.to_numpy(endianness=endianness).type(float_from_json)(data, zarr_format) raise TypeError(f"Invalid type: {data}. Expected a float.") +@dataclass(frozen=True, kw_only=True) +class Float16(DTypeWrapper[np.dtypes.Float16DType, np.float16]): + name = "float16" + register_data_type(Float16) @dataclass(frozen=True, kw_only=True) -class Float32(DTypeBase): +class Float32(DTypeWrapper[np.dtypes.Float32DType, np.float32]): name = "float32" - item_size = 4 - kind = "numeric" - numpy_character_code = "f" - default = 0.0 - - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float32DType: - return super().to_numpy(endianness=endianness) - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> float: - return float(data) - - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.float32: - if check_json_float(data): - return self.to_numpy(endianness=endianness).type(data) - raise TypeError(f"Invalid type: {data}. 
Expected a float.") - + register_data_type(Float32) @dataclass(frozen=True, kw_only=True) -class Float64(DTypeBase): +class Float64(DTypeWrapper[np.dtypes.Float64DType, np.float64]): name = "float64" - item_size = 8 - kind = "numeric" - numpy_character_code = "d" - default = 0.0 - - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Float64DType: - return super().to_numpy(endianness=endianness) - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> float: - return float_to_json(data, zarr_format) - - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.float64: - if check_json_float(data): - return float_from_json( - data, dtype=self.to_numpy(endianness=endianness), zarr_format=zarr_format - ) - raise TypeError(f"Invalid type: {data}. Expected a float.") register_data_type(Float64) @dataclass(frozen=True, kw_only=True) -class Complex64(DTypeBase): +class Complex64(DTypeWrapper[np.dtypes.Complex64DType, np.complex64]): name = "complex64" - item_size = 8 kind = "numeric" - numpy_character_code = "F" default = 0.0 + 0.0j - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex64DType: - return super().to_numpy(endianness=endianness) - def to_json_value( self, data: np.generic, zarr_format: ZarrFormat ) -> tuple[JSONFloat, JSONFloat]: @@ -536,16 +348,12 @@ def from_json_value( @dataclass(frozen=True, kw_only=True) -class Complex128(DTypeBase): +class Complex128(DTypeWrapper[np.dtypes.Complex128DType, np.complex128]): name = "complex128" - item_size = 16 kind = "numeric" - numpy_character_code = "D" + dtype_cls = np.dtypes.Complex128DType default = 0.0 + 0.0j - def to_numpy(self, *, endianness: Endianness | None = None) -> np.dtypes.Complex128DType: - return super().to_numpy(endianness=endianness) - def to_json_value( self, data: np.generic, zarr_format: ZarrFormat ) -> tuple[JSONFloat, JSONFloat]: @@ -565,26 +373,17 @@ def from_json_value( 
@dataclass(frozen=True, kw_only=True) -class StaticByteString(DTypeBase, Flexible): +class StaticByteString(DTypeWrapper[np.dtypes.BytesDType, np.bytes_], Flexible): name = "numpy/static_byte_string" kind = "string" - numpy_character_code = "S" - item_size = 1 default = b"" - @classmethod - def from_numpy(cls: type[Self], dtype: npt.DTypeLike) -> Self: - dtype = np.dtype(dtype) - if dtype.kind != cls.numpy_character_code: - raise ValueError(f"Invalid dtype {dtype}. Expected a string dtype.") - return cls(length=dtype.itemsize) - def to_dict(self) -> dict[str, JSON]: return {"name": self.name, "configuration": {"capacity": self.length}} - def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.bytes_]: + def to_numpy(self, endianness: Endianness | None = "native") -> np.dtypes.BytesDType: endianness_code = endianness_to_numpy_str(endianness) - return np.dtype(endianness_code + self.numpy_character_code + str(self.length)) + return self.dtype_cls(self.length).newbyteorder(endianness_code) def to_json_value( self, data: np.generic, *, zarr_format: ZarrFormat, endianness: Endianness | None = None @@ -603,26 +402,20 @@ def from_json_value( @dataclass(frozen=True, kw_only=True) -class StaticRawBytes(DTypeBase, Flexible): +class StaticRawBytes(DTypeWrapper[np.dtypes.VoidDType, np.void], Flexible): name = "r*" kind = "bytes" - numpy_character_code = "V" - item_size = 1 default = b"" - @classmethod - def from_numpy(cls: type[Self], dtype: npt.DTypeLike) -> Self: - dtype = np.dtype(dtype) - if dtype.kind != "V": - raise ValueError(f"Invalid dtype {dtype}. 
Expected a bytes dtype.") - return cls(length=dtype.itemsize) def to_dict(self) -> dict[str, JSON]: return {"name": f"r{self.length * 8}"} - def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.void]: + def to_numpy(self, endianness: Endianness | None = "native") -> np.dtypes.VoidDType: + # this needs to be overridden because numpy does not allow creating a void type + # by invoking np.dtypes.VoidDType directly endianness_code = endianness_to_numpy_str(endianness) - return np.dtype(endianness_code + self.numpy_character_code + str(self.length)) + return np.dtype(f'{endianness_code}V{self.length}') def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> tuple[int, ...]: return tuple(*data.tobytes()) @@ -639,13 +432,10 @@ def from_json_value( if _NUMPY_SUPPORTS_VLEN_STRING: @dataclass(frozen=True, kw_only=True) - class VlenString(DTypeBase): + class VlenString(DTypeWrapper[np.dtypes.StringDType, str]): name = "numpy/vlen_string" kind = "string" - numpy_character_code = "T" - # this uses UTF-8, so the encoding of a code point varies between - # 1 and 4 bytes - item_size = None + dtype_cls = np.dtypes.StringDType default = "" def to_dict(self) -> dict[str, JSON]: @@ -653,7 +443,7 @@ def to_dict(self) -> dict[str, JSON]: def to_numpy( self, endianness: Endianness | None = "native" - ) -> np.dtype[np.dtypes.StringDType]: + ) -> np.dtypes.StringDType: endianness_code = endianness_to_numpy_str(endianness) return np.dtype(endianness_code + self.numpy_character_code) @@ -662,34 +452,32 @@ def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> "np.dtypes.StringDType": + ) -> str: return self.to_numpy(endianness=endianness).type(data) else: @dataclass(frozen=True, kw_only=True) - class VlenString(DTypeBase): + class VlenString(DTypeWrapper[np.dtypes.ObjectDType, str]): name = "numpy/vlen_string" kind = 
"string" - numpy_character_code = "O" - item_size = None + dtype_cls = np.dtypes.ObjectDType default = "" def to_dict(self) -> dict[str, JSON]: return {"name": self.name} def to_numpy( - self, endianness: Endianness | None = "native" + self, endianness: Endianness | None = None ) -> np.dtype[np.dtypes.ObjectDType]: - endianness_code = endianness_to_numpy_str(endianness) - return np.dtype(endianness_code + self.numpy_character_code) + return super().to_numpy(endianness=endianness) def to_json_value(self, data, *, zarr_format: ZarrFormat) -> str: return str(data) def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.dtypes.ObjectDType: + ) -> str: return self.to_numpy(endianness=endianness).type(data) @@ -697,24 +485,15 @@ def from_json_value( @dataclass(frozen=True, kw_only=True) -class StaticUnicodeString(DTypeBase, Flexible): +class StaticUnicodeString(DTypeWrapper[np.dtypes.StrDType, np.str_], Flexible): name = "numpy/static_unicode_string" kind = "string" - numpy_character_code = "U" - item_size = 4 default = "" - @classmethod - def from_numpy(cls: type[Self], dtype: npt.DTypeLike) -> Self: - dtype = np.dtype(dtype) - if dtype.kind != "U": - raise ValueError(f"Invalid dtype {dtype}. 
Expected a string dtype.") - return cls(length=dtype.itemsize) - def to_dict(self) -> dict[str, JSON]: return {"name": self.name, "configuration": {"capacity": self.length}} - def to_numpy(self, endianness: Endianness | None = "native") -> np.dtype[np.str_]: + def to_numpy(self, endianness: Endianness | None = "native") -> np.dtypes.StrDType: endianness_code = endianness_to_numpy_str(endianness) return np.dtype(endianness_code + self.numpy_character_code + str(self.length)) @@ -732,10 +511,10 @@ def from_json_value( register_data_type(StaticUnicodeString) -def resolve_dtype(dtype: npt.DTypeLike | DTypeBase | dict[str, JSON]) -> DTypeBase: +def resolve_dtype(dtype: npt.DTypeLike | DTypeWrapper | dict[str, JSON]) -> DTypeWrapper: from zarr.registry import get_data_type_from_numpy - if isinstance(dtype, DTypeBase): + if isinstance(dtype, DTypeWrapper): return dtype elif isinstance(dtype, dict): return get_data_type_from_dict(dtype) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 091c6da817..7f180d2ed5 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -12,7 +12,7 @@ from zarr.core.chunk_grids import ChunkGrid from zarr.core.common import JSON, ChunkCoords from zarr.core.metadata.dtype import ( - DTypeBase, + DTypeWrapper, ) import json @@ -83,7 +83,7 @@ def validate_array_bytes_codec(codecs: tuple[Codec, ...]) -> ArrayBytesCodec: return abcs[0] -def validate_codecs(codecs: tuple[Codec, ...], dtype: DTypeBase) -> None: +def validate_codecs(codecs: tuple[Codec, ...], dtype: DTypeWrapper) -> None: """Check that the codecs are valid for the given dtype""" from zarr.codecs.sharding import ShardingCodec @@ -145,7 +145,7 @@ class ArrayV3MetadataDict(TypedDict): @dataclass(frozen=True, kw_only=True) class ArrayV3Metadata(Metadata): shape: ChunkCoords - data_type: DTypeBase + data_type: DTypeWrapper chunk_grid: ChunkGrid chunk_key_encoding: ChunkKeyEncoding fill_value: Any @@ -160,7 +160,7 @@ def __init__( self, *, 
shape: Iterable[int], - data_type: DTypeBase, + data_type: DTypeWrapper, chunk_grid: dict[str, JSON] | ChunkGrid, chunk_key_encoding: ChunkKeyEncodingLike, fill_value: object, diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 0f056a826e..2960a71d66 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -26,7 +26,7 @@ from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON from zarr.core.dtype import ZarrDType - from zarr.core.metadata.dtype import DTypeBase + from zarr.core.metadata.dtype import DTypeWrapper __all__ = [ "Registry", @@ -66,7 +66,7 @@ def register(self, cls: type[T], qualname: str | None = None) -> None: @dataclass(frozen=True, kw_only=True) class DataTypeRegistry: - contents: dict[str, type[DTypeBase]] = field(default_factory=dict, init=False) + contents: dict[str, type[DTypeWrapper]] = field(default_factory=dict, init=False) lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) def lazy_load(self) -> None: @@ -75,14 +75,14 @@ def lazy_load(self) -> None: self.lazy_load_list.clear() - def register(self: Self, cls: type[DTypeBase], clobber: bool = False) -> None: + def register(self: Self, cls: type[DTypeWrapper], clobber: bool = False) -> None: if cls.name in self.contents and not clobber: raise ValueError( f"Data type {cls.name} already registered. Use clobber=True to overwrite." 
) self.contents[cls.name] = cls - def get(self, key: str) -> type[DTypeBase]: + def get(self, key: str) -> type[DTypeWrapper]: return self.contents[key] @@ -181,7 +181,7 @@ def register_buffer(cls: type[Buffer], qualname: str | None = None) -> None: __buffer_registry.register(cls, qualname) -def register_data_type(cls: type[DTypeBase]) -> None: +def register_data_type(cls: type[DTypeWrapper]) -> None: __data_type_registry.register(cls) @@ -329,7 +329,7 @@ def get_ndbuffer_class(reload_config: bool = False) -> type[NDBuffer]: ) -def get_data_type_by_name(dtype: str, configuration: dict[str, JSON] | None = None) -> DTypeBase: +def get_data_type_by_name(dtype: str, configuration: dict[str, JSON] | None = None) -> DTypeWrapper: __data_type_registry.lazy_load() if configuration is None: _configuration = {} @@ -341,7 +341,7 @@ def get_data_type_by_name(dtype: str, configuration: dict[str, JSON] | None = No return maybe_dtype_cls.from_dict(_configuration) -def get_data_type_from_dict(dtype: dict[str, JSON]) -> DTypeBase: +def get_data_type_from_dict(dtype: dict[str, JSON]) -> DTypeWrapper: __data_type_registry.lazy_load() dtype_name = dtype["name"] dtype_cls = __data_type_registry.get(dtype_name) @@ -350,12 +350,12 @@ def get_data_type_from_dict(dtype: dict[str, JSON]) -> DTypeBase: return dtype_cls.from_dict(dtype.get("configuration", {})) -def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeBase: +def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper: np_dtype = np.dtype(dtype) __data_type_registry.lazy_load() for val in __data_type_registry.contents.values(): if val.numpy_character_code == np_dtype.char: - return val.from_numpy(np_dtype) + return val.from_str(np_dtype) raise ValueError( f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(__data_type_registry.contents)}." 
) From aaeeb9847d78e2756e53da12613d23cd20141118 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 2 Mar 2025 19:44:43 +0100 Subject: [PATCH 010/129] working form for dtype classes --- src/zarr/core/array.py | 2 +- src/zarr/core/metadata/dtype.py | 366 ++++++++++++++++++-------------- src/zarr/core/metadata/v3.py | 13 +- src/zarr/registry.py | 14 +- tests/test_codecs/test_vlen.py | 8 +- tests/test_metadata/test_v3.py | 32 +-- 6 files changed, 239 insertions(+), 196 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 221c403d66..52204d830e 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -746,7 +746,7 @@ def _create_metadata_v3( if fill_value is None: # v3 spec will not allow a null fill value - fill_value_parsed = dtype.type(zarr_data_type._default_value) + fill_value_parsed = zarr_data_type.default_value else: fill_value_parsed = fill_value diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index 106b3088d0..1b57831943 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -5,11 +5,12 @@ import numpy as np import numpy.typing as npt +from typing_extensions import get_original_bases from zarr.abc.metadata import Metadata from zarr.core.common import JSON, ZarrFormat from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING -from zarr.registry import get_data_type_from_dict, register_data_type +from zarr.registry import register_data_type Endianness = Literal["little", "big", "native"] DataTypeFlavor = Literal["boolean", "numeric", "string", "bytes"] @@ -32,34 +33,80 @@ def endianness_to_numpy_str(endianness: Endianness | None) -> Literal[">", "<", def check_json_bool(data: JSON) -> TypeGuard[bool]: + """ + Check if a JSON value represents a boolean. + """ return bool(isinstance(data, bool)) def check_json_str(data: JSON) -> TypeGuard[str]: + """ + Check if a JSON value represents a string. 
+ """ return bool(isinstance(data, str)) def check_json_int(data: JSON) -> TypeGuard[int]: + """ + Check if a JSON value represents an integer. + """ return bool(isinstance(data, int)) -def check_json_float(data: JSON) -> TypeGuard[float]: +def check_json_float_v2(data: JSON) -> TypeGuard[float]: if data == "NaN" or data == "Infinity" or data == "-Infinity": return True else: return bool(isinstance(data, float | int)) -def check_json_complex_float(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: +def check_json_float_v3(data: JSON) -> TypeGuard[float]: + # TODO: handle the special JSON serialization of different NaN values + return check_json_float_v2(data) + + +def check_json_float(data: JSON, zarr_format: ZarrFormat) -> TypeGuard[float]: + if zarr_format == 2: + return check_json_float_v2(data) + else: + return check_json_float_v3(data) + + +def check_json_complex_float_v3(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: + """ + Check if a JSON value represents a complex float, as per the zarr v3 spec + """ return ( not isinstance(data, str) and isinstance(data, Sequence) and len(data) == 2 - and check_json_float(data[0]) - and check_json_float(data[1]) + and check_json_float_v3(data[0]) + and check_json_float_v3(data[1]) ) +def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: + """ + Check if a JSON value represents a complex float, as per the behavior of zarr-python 2.x + """ + return ( + not isinstance(data, str) + and isinstance(data, Sequence) + and len(data) == 2 + and check_json_float_v2(data[0]) + and check_json_float_v2(data[1]) + ) + + +def check_json_complex_float( + data: JSON, zarr_format: ZarrFormat +) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: + if zarr_format == 2: + return check_json_complex_float_v2(data) + else: + return check_json_complex_float_v3(data) + + def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloat: if np.isnan(data): return "NaN" @@ -103,29 +150,28 @@ def complex_to_json( 
raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") -def float_from_json_v2(data: JSONFloat, dtype: np.floating[Any]) -> np.floating[Any]: - if data == "NaN": - _data = np.nan - elif data == "Infinity": - _data = np.inf - elif data == "-Infinity": - _data = -np.inf - else: - _data = data - return dtype.type(_data) +def float_from_json_v2(data: JSONFloat) -> float: + match data: + case "NaN": + return float("nan") + case "Infinity": + return float("inf") + case "-Infinity": + return float("-inf") + case _: + return float(data) -def float_from_json_v3(data: JSONFloat, dtype: Any) -> np.floating[Any]: +def float_from_json_v3(data: JSONFloat) -> float: # todo: support the v3-specific NaN handling - return float_from_json_v2(data, dtype) + return float_from_json_v2(data) -def float_from_json(data: JSONFloat, dtype: Any, zarr_format: ZarrFormat) -> np.floating[Any]: +def float_from_json(data: JSONFloat, zarr_format: ZarrFormat) -> float: if zarr_format == 2: - return float_from_json_v2(data, dtype) + return float_from_json_v2(data) else: - return float_from_json_v3(data, dtype) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") + return float_from_json_v3(data) def complex_from_json_v2(data: JSONFloat, dtype: Any) -> np.complexfloating: @@ -142,32 +188,42 @@ def complex_from_json( if zarr_format == 2: return complex_from_json_v2(data, dtype) else: - if check_json_complex_float(data): + if check_json_complex_float_v3(data): return complex_from_json_v3(data, dtype) else: raise TypeError(f"Invalid type: {data}. Expected a sequence of two numbers.") raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") + TDType = TypeVar("TDType", bound=np.dtype[Any]) TScalar = TypeVar("TScalar", bound=np.generic) -@dataclass(frozen=True, kw_only=True) -class Flexible: - length: int class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): name: ClassVar[str] - dtype_cls: ClassVar[type[TDType]] # this class will create a numpy dtype + dtype_cls: ClassVar[type[TDType]] # this class will create a numpy dtype kind: ClassVar[DataTypeFlavor] - _default_value: object + default_value: TScalar + + def __init_subclass__(cls) -> None: + # Subclasses will bind the first generic type parameter to an attribute of the class + # TODO: wrap this in some *very informative* error handling + generic_args = get_args(get_original_bases(cls)[0]) + cls.dtype_cls = generic_args[0] + return super().__init_subclass__() def to_dict(self) -> dict[str, JSON]: return {"name": self.name} - def default_value(self: Self, *, endianness: Endianness | None = None) -> TScalar: - return cast(np.generic, self.to_numpy(endianness=endianness).type(self._default_value)) + def cast_value(self: Self, value: object, *, endianness: Endianness | None = None) -> TScalar: + return cast(np.generic, self.to_dtype(endianness=endianness).type(value)) - def to_numpy(self: Self, *, endianness: Endianness | None = None) -> TDType: + @classmethod + @abstractmethod + def from_dtype(cls: type[Self], dtype: TDType) -> Self: + raise NotImplementedError + + def to_dtype(self: Self, *, endianness: Endianness | None = None) -> TDType: endian_str = endianness_to_numpy_str(endianness) return self.dtype_cls().newbyteorder(endian_str) @@ -192,7 +248,11 @@ def from_json_value( class Bool(DTypeWrapper[np.dtypes.BoolDType, np.bool_]): name = "bool" kind = "boolean" - default = False + default_value = np.False_ + + @classmethod + def from_dtype(cls, dtype: np.dtypes.BoolDType) -> Self: + return cls() def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> bool: return bool(data) @@ -201,15 +261,16 @@ def 
from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.bool_: if check_json_bool(data): - return self.to_numpy(endianness=endianness).type(data) + return self.to_dtype(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected a boolean.") -register_data_type(Bool) - -class BaseInt(DTypeWrapper[TDType, TScalar]): +class IntWrapperBase(DTypeWrapper[TDType, TScalar]): kind = "numeric" - default = 0 + + @classmethod + def from_dtype(cls, dtype: TDType) -> Self: + return cls() def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: return int(data) @@ -218,76 +279,64 @@ def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> TScalar: if check_json_int(data): - return self.to_numpy(endianness=endianness).type(data) + return self.to_dtype(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @dataclass(frozen=True, kw_only=True) -class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): +class Int8(IntWrapperBase[np.dtypes.Int8DType, np.int8]): name = "int8" - - -register_data_type(Int8) + default_value = np.int8(0) @dataclass(frozen=True, kw_only=True) -class UInt8(DTypeWrapper[np.dtypes.UInt8DType, np.uint8]): +class UInt8(IntWrapperBase[np.dtypes.UInt8DType, np.uint8]): name = "uint8" - - -register_data_type(UInt8) + default_value = np.uint8(0) @dataclass(frozen=True, kw_only=True) -class Int16(DTypeWrapper[np.dtypes.Int16DType, np.int16]): +class Int16(IntWrapperBase[np.dtypes.Int16DType, np.int16]): name = "int16" - - -register_data_type(Int16) + default_value = np.int16(0) @dataclass(frozen=True, kw_only=True) -class UInt16(DTypeWrapper[np.dtypes.UInt16DType, np.uint16]): +class UInt16(IntWrapperBase[np.dtypes.UInt16DType, np.uint16]): name = "uint16" - -register_data_type(UInt16) + default_value = np.uint16(0) @dataclass(frozen=True, kw_only=True) -class 
Int32(DTypeWrapper[np.dtypes.Int32DType, np.int32]): +class Int32(IntWrapperBase[np.dtypes.Int32DType, np.int32]): name = "int32" - - -register_data_type(Int32) + default_value = np.int32(0) @dataclass(frozen=True, kw_only=True) -class UInt32(DTypeWrapper[np.dtypes.UInt32DType, np.uint32]): +class UInt32(IntWrapperBase[np.dtypes.UInt32DType, np.uint32]): name = "uint32" - -register_data_type(UInt32) + default_value = np.uint32(0) @dataclass(frozen=True, kw_only=True) -class Int64(DTypeWrapper[np.dtypes.Int64DType, np.int64]): +class Int64(IntWrapperBase[np.dtypes.Int64DType, np.int64]): name = "int64" - - -register_data_type(Int64) + default_value = np.int64(0) @dataclass(frozen=True, kw_only=True) -class UInt64(DTypeWrapper[np.dtypes.UInt64DType, np.uint64]): +class UInt64(IntWrapperBase[np.dtypes.UInt64DType, np.uint64]): name = "uint64" + default_value = np.uint64(0) - -register_data_type(UInt64) - - -class FloatBase(DTypeWrapper[TDType, TScalar]): +class FloatWrapperBase(DTypeWrapper[TDType, TScalar]): kind = "numeric" - default = 0.0 + + @classmethod + def from_dtype(cls, dtype: TDType) -> Self: + return cls() def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> JSONFloat: return float_to_json(data, zarr_format) @@ -295,39 +344,38 @@ def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> JSONFloat: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> TScalar: - if check_json_float(data): - return self.to_numpy(endianness=endianness).type(float_from_json)(data, zarr_format) + if check_json_float_v2(data): + return self.to_dtype(endianness=endianness).type(float_from_json(data, zarr_format)) raise TypeError(f"Invalid type: {data}. 
Expected a float.") + @dataclass(frozen=True, kw_only=True) -class Float16(DTypeWrapper[np.dtypes.Float16DType, np.float16]): +class Float16(FloatWrapperBase[np.dtypes.Float16DType, np.float16]): name = "float16" - - -register_data_type(Float16) + default_value = np.float16(0) @dataclass(frozen=True, kw_only=True) -class Float32(DTypeWrapper[np.dtypes.Float32DType, np.float32]): +class Float32(FloatWrapperBase[np.dtypes.Float32DType, np.float32]): name = "float32" - - -register_data_type(Float32) + default_value = np.float32(0) @dataclass(frozen=True, kw_only=True) -class Float64(DTypeWrapper[np.dtypes.Float64DType, np.float64]): +class Float64(FloatWrapperBase[np.dtypes.Float64DType, np.float64]): name = "float64" - - -register_data_type(Float64) + default_value = np.float64(0) @dataclass(frozen=True, kw_only=True) class Complex64(DTypeWrapper[np.dtypes.Complex64DType, np.complex64]): name = "complex64" kind = "numeric" - default = 0.0 + 0.0j + default_value = np.complex64(0) + + @classmethod + def from_dtype(cls, dtype: np.dtypes.Complex64DType) -> Self: + return cls() def to_json_value( self, data: np.generic, zarr_format: ZarrFormat @@ -337,22 +385,22 @@ def to_json_value( def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.complex64: - if check_json_complex_float(data): + if check_json_complex_float_v3(data): return complex_from_json( - data, dtype=self.to_numpy(endianness=endianness), zarr_format=zarr_format + data, dtype=self.to_dtype(endianness=endianness), zarr_format=zarr_format ) raise TypeError(f"Invalid type: {data}. 
Expected a complex float.") -register_data_type(Complex64) - - @dataclass(frozen=True, kw_only=True) class Complex128(DTypeWrapper[np.dtypes.Complex128DType, np.complex128]): name = "complex128" kind = "numeric" - dtype_cls = np.dtypes.Complex128DType - default = 0.0 + 0.0j + default_value = np.complex128(0) + + @classmethod + def from_dtype(cls, dtype: np.dtypes.Complex128DType) -> Self: + return cls() def to_json_value( self, data: np.generic, zarr_format: ZarrFormat @@ -362,28 +410,36 @@ def to_json_value( def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.complex128: - if check_json_complex_float(data): + if check_json_complex_float_v3(data): return complex_from_json( - data, dtype=self.to_numpy(endianness=endianness), zarr_format=zarr_format + data, dtype=self.to_dtype(endianness=endianness), zarr_format=zarr_format ) raise TypeError(f"Invalid type: {data}. Expected a complex float.") -register_data_type(Complex128) +@dataclass(frozen=True, kw_only=True) +class FlexibleWrapperBase(DTypeWrapper[TDType, TScalar]): + item_size_bits: ClassVar[int] + length: int + + @classmethod + def from_dtype(cls, dtype: TDType) -> Self: + return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) + + def to_dtype(self, endianness: Endianness | None = None) -> TDType: + endianness_code = endianness_to_numpy_str(endianness) + return self.dtype_cls(self.length).newbyteorder(endianness_code) @dataclass(frozen=True, kw_only=True) -class StaticByteString(DTypeWrapper[np.dtypes.BytesDType, np.bytes_], Flexible): +class StaticByteString(FlexibleWrapperBase[np.dtypes.BytesDType, np.bytes_]): name = "numpy/static_byte_string" kind = "string" - default = b"" + default_value = b"" + item_size_bits = 8 def to_dict(self) -> dict[str, JSON]: - return {"name": self.name, "configuration": {"capacity": self.length}} - - def to_numpy(self, endianness: Endianness | None = "native") -> np.dtypes.BytesDType: - endianness_code = 
endianness_to_numpy_str(endianness) - return self.dtype_cls(self.length).newbyteorder(endianness_code) + return {"name": self.name, "configuration": {"length": self.length}} def to_json_value( self, data: np.generic, *, zarr_format: ZarrFormat, endianness: Endianness | None = None @@ -394,28 +450,25 @@ def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.bytes_: if check_json_bool(data): - return self.to_numpy(endianness=endianness).type(data.encode("ascii")) + return self.to_dtype(endianness=endianness).type(data.encode("ascii")) raise TypeError(f"Invalid type: {data}. Expected a string.") -register_data_type(StaticByteString) - - @dataclass(frozen=True, kw_only=True) -class StaticRawBytes(DTypeWrapper[np.dtypes.VoidDType, np.void], Flexible): +class StaticRawBytes(FlexibleWrapperBase[np.dtypes.VoidDType, np.void]): name = "r*" kind = "bytes" - default = b"" - + default_value = np.void(b"") + item_size_bits = 8 def to_dict(self) -> dict[str, JSON]: - return {"name": f"r{self.length * 8}"} + return {"name": f"r{self.length * self.item_size_bits}"} - def to_numpy(self, endianness: Endianness | None = "native") -> np.dtypes.VoidDType: + def to_dtype(self, endianness: Endianness | None = None) -> np.dtypes.VoidDType: # this needs to be overridden because numpy does not allow creating a void type # by invoking np.dtypes.VoidDType directly endianness_code = endianness_to_numpy_str(endianness) - return np.dtype(f'{endianness_code}V{self.length}') + return np.dtype(f"{endianness_code}V{self.length}") def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> tuple[int, ...]: return tuple(*data.tobytes()) @@ -424,10 +477,29 @@ def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.void: # todo: check that this is well-formed - return self.to_numpy(endianness=endianness).type(bytes(data)) + return 
self.to_dtype(endianness=endianness).type(bytes(data)) + + +@dataclass(frozen=True, kw_only=True) +class StaticUnicodeString(FlexibleWrapperBase[np.dtypes.StrDType, np.str_]): + name = "numpy/static_unicode_string" + kind = "string" + default_value = np.str_("") + item_size_bits = 32 # UCS4 is 32 bits per code point + def to_dict(self) -> dict[str, JSON]: + return {"name": self.name, "configuration": {"length": self.length}} + + def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: + return str(data) + + def from_json_value( + self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None + ) -> np.str_: + if not check_json_str(data): + raise TypeError(f"Invalid type: {data}. Expected a string.") + return self.to_dtype(endianness=endianness).type(data) -register_data_type(StaticRawBytes) if _NUMPY_SUPPORTS_VLEN_STRING: @@ -435,15 +507,16 @@ def from_json_value( class VlenString(DTypeWrapper[np.dtypes.StringDType, str]): name = "numpy/vlen_string" kind = "string" - dtype_cls = np.dtypes.StringDType - default = "" + default_value = "" + + @classmethod + def from_dtype(cls, dtype: np.dtypes.StringDType) -> Self: + return cls() def to_dict(self) -> dict[str, JSON]: return {"name": self.name} - def to_numpy( - self, endianness: Endianness | None = "native" - ) -> np.dtypes.StringDType: + def to_dtype(self, endianness: Endianness | None = None) -> np.dtypes.StringDType: endianness_code = endianness_to_numpy_str(endianness) return np.dtype(endianness_code + self.numpy_character_code) @@ -453,7 +526,7 @@ def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> str: - return self.to_numpy(endianness=endianness).type(data) + return self.to_dtype(endianness=endianness).type(data) else: @@ -461,58 +534,29 @@ def from_json_value( class VlenString(DTypeWrapper[np.dtypes.ObjectDType, str]): name = 
"numpy/vlen_string" kind = "string" - dtype_cls = np.dtypes.ObjectDType - default = "" + default_value = np.object_("") def to_dict(self) -> dict[str, JSON]: return {"name": self.name} - def to_numpy( - self, endianness: Endianness | None = None - ) -> np.dtype[np.dtypes.ObjectDType]: - return super().to_numpy(endianness=endianness) + @classmethod + def from_dtype(cls, dtype: np.dtypes.ObjectDType) -> Self: + return cls() + + def to_dtype(self, endianness: Endianness | None = None) -> np.dtype[np.dtypes.ObjectDType]: + return super().to_dtype(endianness=endianness) - def to_json_value(self, data, *, zarr_format: ZarrFormat) -> str: + def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return str(data) def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> str: - return self.to_numpy(endianness=endianness).type(data) - - -register_data_type(VlenString) - - -@dataclass(frozen=True, kw_only=True) -class StaticUnicodeString(DTypeWrapper[np.dtypes.StrDType, np.str_], Flexible): - name = "numpy/static_unicode_string" - kind = "string" - default = "" - - def to_dict(self) -> dict[str, JSON]: - return {"name": self.name, "configuration": {"capacity": self.length}} - - def to_numpy(self, endianness: Endianness | None = "native") -> np.dtypes.StrDType: - endianness_code = endianness_to_numpy_str(endianness) - return np.dtype(endianness_code + self.numpy_character_code + str(self.length)) - - def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: - return str(data) - - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.str_: - if not check_json_bool(data): - raise TypeError(f"Invalid type: {data}. 
Expected a string.") - return self.to_numpy(endianness=endianness).type(data) - - -register_data_type(StaticUnicodeString) + return self.to_dtype(endianness=endianness).type(data) def resolve_dtype(dtype: npt.DTypeLike | DTypeWrapper | dict[str, JSON]) -> DTypeWrapper: - from zarr.registry import get_data_type_from_numpy + from zarr.registry import get_data_type_from_dict, get_data_type_from_numpy if isinstance(dtype, DTypeWrapper): return dtype @@ -526,3 +570,7 @@ def resolve_dtype(dtype: npt.DTypeLike | DTypeWrapper | dict[str, JSON]) -> DTyp FLOAT_DTYPE = Float16 | Float32 | Float64 COMPLEX_DTYPE = Complex64 | Complex128 STRING_DTYPE = StaticUnicodeString | VlenString | StaticByteString +for dtype in get_args( + Bool | INTEGER_DTYPE | FLOAT_DTYPE | COMPLEX_DTYPE | STRING_DTYPE | StaticRawBytes +): + register_data_type(dtype) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 7f180d2ed5..d23d7a452c 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -177,14 +177,14 @@ def __init__( chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) - fill_value_parsed = data_type.to_numpy().type(fill_value) + fill_value_parsed = data_type.to_dtype().type(fill_value) attributes_parsed = parse_attributes(attributes) codecs_parsed_partial = parse_codecs(codecs) storage_transformers_parsed = parse_storage_transformers(storage_transformers) array_spec = ArraySpec( shape=shape_parsed, - dtype=data_type.to_numpy(), + dtype=data_type.to_dtype(), fill_value=fill_value_parsed, config=ArrayConfig.from_dict({}), # TODO: config is not needed here. prototype=default_buffer_prototype(), # TODO: prototype is not needed here. 
@@ -218,7 +218,14 @@ def _validate_metadata(self) -> None: if self.fill_value is None: raise ValueError("`fill_value` is required.") for codec in self.codecs: - codec.validate(shape=self.shape, dtype=self.data_type, chunk_grid=self.chunk_grid) + codec.validate( + shape=self.shape, dtype=self.data_type.to_dtype(), chunk_grid=self.chunk_grid + ) + + @property + def dtype(self) -> np.dtype[Any]: + """Interpret Zarr dtype as NumPy dtype""" + return self.data_type.to_dtype() @property def ndim(self) -> int: diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 2960a71d66..7614d29b48 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -75,12 +75,10 @@ def lazy_load(self) -> None: self.lazy_load_list.clear() - def register(self: Self, cls: type[DTypeWrapper], clobber: bool = False) -> None: - if cls.name in self.contents and not clobber: - raise ValueError( - f"Data type {cls.name} already registered. Use clobber=True to overwrite." - ) - self.contents[cls.name] = cls + def register(self: Self, cls: type[DTypeWrapper]) -> None: + # don't register the same dtype twice + if cls.name not in self.contents or self.contents[cls.name] != cls: + self.contents[cls.name] = cls def get(self, key: str) -> type[DTypeWrapper]: return self.contents[key] @@ -354,8 +352,8 @@ def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper: np_dtype = np.dtype(dtype) __data_type_registry.lazy_load() for val in __data_type_registry.contents.values(): - if val.numpy_character_code == np_dtype.char: - return val.from_str(np_dtype) + if val.dtype_cls is type(np_dtype): + return val.from_dtype(np_dtype) raise ValueError( f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(__data_type_registry.contents)}." 
) diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index b5e8b60a8c..4ccb7cc8c3 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -53,15 +53,15 @@ def test_vlen_string( else: a[:, :] = data assert np.array_equal(data, a[:, :]) - assert a.metadata.data_type == get_data_type_from_numpy(dtype) - assert a.dtype == expected_array_string_dtype + assert a.metadata.data_type == get_data_type_from_numpy(data.dtype) + assert a.dtype == data.dtype # test round trip b = Array.open(sp) assert isinstance(b.metadata, ArrayV3Metadata) # needed for mypy assert np.array_equal(data, b[:, :]) - assert b.metadata.data_type == get_data_type_from_numpy(dtype) - assert a.dtype == expected_array_string_dtype + assert b.metadata.data_type == get_data_type_from_numpy(data.dtype) + assert a.dtype == data.dtype @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 5670cd798a..19a2631341 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -15,12 +15,13 @@ from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING from zarr.core.dtype.npy.time import DateTime64 from zarr.core.group import GroupMetadata, parse_node_type -from zarr.core.metadata.dtype import Flexible, complex_from_json +from zarr.core.metadata.dtype import FlexibleWrapperBase, complex_from_json from zarr.core.metadata.v3 import ( ArrayV3Metadata, parse_dimension_names, parse_zarr_format, ) +from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING from zarr.errors import MetadataValidationError from zarr.registry import get_data_type_from_numpy @@ -56,20 +57,13 @@ ) complex_dtypes = ("complex64", "complex128") -flexible_dtypes = ("str", "bytes", "void") +flexible_dtypes = ("str", "bytes", 'void') if _NUMPY_SUPPORTS_VLEN_STRING: - vlen_string_dtypes = ("T",) + vlen_string_dtypes = ("T","O") else: - vlen_string_dtypes = ("O",) - 
-dtypes = ( - *bool_dtypes, - *int_dtypes, - *float_dtypes, - *complex_dtypes, - *flexible_dtypes, - *vlen_string_dtypes, -) + vlen_string_dtypes = ("O") + +dtypes = (*bool_dtypes, *int_dtypes, *float_dtypes, *complex_dtypes, *flexible_dtypes, *vlen_string_dtypes) @pytest.mark.parametrize("data", [None, 1, 2, 4, 5, "3"]) @@ -132,7 +126,7 @@ def test_jsonify_fill_value_complex(fill_value: Any, dtype_str: str) -> None: """ zarr_format = 3 dtype = get_data_type_from_numpy(dtype_str) - expected = dtype.to_numpy().type(complex(*fill_value)) + expected = dtype.to_dtype().type(complex(*fill_value)) observed = dtype.from_json_value(fill_value, zarr_format=zarr_format) assert observed == expected assert dtype.to_json_value(observed, zarr_format=zarr_format) == tuple(fill_value) @@ -341,11 +335,7 @@ async def test_special_float_fill_values(fill_value: str) -> None: @pytest.mark.parametrize("dtype_str", dtypes) def test_dtypes(dtype_str: str) -> None: dt = get_data_type_from_numpy(dtype_str) - np_dtype = dt.to_numpy() - - if not isinstance(dt, Flexible): - assert dt.item_size == np_dtype.itemsize - else: - assert dt.length == np_dtype.itemsize + np_dtype = dt.to_dtype() + assert isinstance(np_dtype, dt.dtype_cls) + assert np_dtype.type(0) == dt.cast_value(0) - assert dt.numpy_character_code == np_dtype.char From ec934b89a50ae87e55811629f9a11c22b017ef80 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 2 Mar 2025 21:55:27 +0100 Subject: [PATCH 011/129] remove unused code --- src/zarr/core/dtype/__init__.py | 3 - src/zarr/core/dtype/core.py | 196 -------------------------------- src/zarr/registry.py | 69 +---------- 3 files changed, 2 insertions(+), 266 deletions(-) delete mode 100644 src/zarr/core/dtype/__init__.py delete mode 100644 src/zarr/core/dtype/core.py diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py deleted file mode 100644 index 58b884ff23..0000000000 --- a/src/zarr/core/dtype/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from 
zarr.core.dtype.core import ZarrDType - -__all__ = ["ZarrDType"] diff --git a/src/zarr/core/dtype/core.py b/src/zarr/core/dtype/core.py deleted file mode 100644 index c6460706aa..0000000000 --- a/src/zarr/core/dtype/core.py +++ /dev/null @@ -1,196 +0,0 @@ -""" -# Overview - -This module provides a proof-of-concept standalone interface for managing dtypes in the zarr-python codebase. - -The `ZarrDType` class introduced in this module effectively acts as a replacement for `np.dtype` throughout the -zarr-python codebase. It attempts to encapsulate all relevant runtime information necessary for working with -dtypes in the context of the Zarr V3 specification (e.g. is this a core dtype or not, how many bytes and what -endianness is the dtype etc). By providing this abstraction, the module aims to: - -- Simplify dtype management within zarr-python -- Support runtime flexibility and custom extensions -- Remove unnecessary dependencies on the numpy API - -## Extensibility - -The module attempts to support user-driven extensions, allowing developers to introduce custom dtypes -without requiring immediate changes to zarr-python. Extensions can leverage the current entrypoint mechanism, -enabling integration of experimental features. Over time, widely adopted extensions may be formalized through -inclusion in zarr-python or standardized via a Zarr Enhancement Proposal (ZEP), but this is not essential. - -## Examples - -### Core `dtype` Registration - -The following example demonstrates how to register a built-in `dtype` in the core codebase: - -```python -from zarr.core.dtype import ZarrDType -from zarr.registry import register_v3dtype - -class Float16(ZarrDType): - zarr_spec_format = "3" - experimental = False - endianness = "little" - byte_count = 2 - to_numpy = np.dtype('float16') - -register_v3dtype(Float16) -``` - -### Entrypoint Extension - -The following example demonstrates how users can register a new `bfloat16` dtype for Zarr. 
-This approach adheres to the existing Zarr entrypoint pattern as much as possible, ensuring -consistency with other extensions. The code below would typically be part of a Python package -that specifies the entrypoints for the extension: - -```python -import ml_dtypes -from zarr.core.dtype import ZarrDType # User inherits from ZarrDType when creating their dtype - -class Bfloat16(ZarrDType): - zarr_spec_format = "3" - experimental = True - endianness = "little" - byte_count = 2 - to_numpy = np.dtype('bfloat16') # Enabled by importing ml_dtypes - configuration_v3 = { - "version": "example_value", - "author": "example_value", - "ml_dtypes_version": "example_value" - } -``` - -### dtype lookup - -The following examples demonstrate how to perform a lookup for the relevant ZarrDType, given -a string that matches the dtype Zarr specification ID, or a numpy dtype object: - -``` -from zarr.registry import get_v3dtype_class, get_v3dtype_class_from_numpy - -get_v3dtype_class('complex64') # returns little-endian Complex64 ZarrDType -get_v3dtype_class('not_registered_dtype') # ValueError - -get_v3dtype_class_from_numpy('>i2') # returns big-endian Int16 ZarrDType -get_v3dtype_class_from_numpy(np.dtype('float32')) # returns little-endian Float32 ZarrDType -get_v3dtype_class_from_numpy('i10') # ValueError -``` - -### String dtypes - -The following indicates one possibility for supporting variable-length strings. It is via the -entrypoint mechanism as in a previous example. 
The Apache Arrow specification does not currently -include a dtype for fixed-length strings (only for fixed-length bytes) and so I am using string -here to implicitly refer to a variable-length string data (there may be some subtleties with codecs -that means this needs to be refined further): - -```python -import numpy as np -from zarr.core.dtype import ZarrDType # User inherits from ZarrDType when creating their dtype - -try: - to_numpy = np.dtypes.StringDType() -except AttributeError: - to_numpy = np.dtypes.ObjectDType() - -class String(ZarrDType): - zarr_spec_format = "3" - experimental = True - endianness = 'little' - byte_count = None # None is defined to mean variable - to_numpy = to_numpy -``` - -### int4 dtype - -There is currently considerable interest in the AI community in 'quantising' models - storing -models at reduced precision, while minimising loss of information content. There are a number -of sub-byte dtypes that the community are using e.g. int4. Unfortunately numpy does not -currently have support for handling such sub-byte dtypes in an easy way. 
However, they can -still be held in a numpy array and then passed (in a zero-copy way) to something like pytorch -which can handle appropriately: - -```python -import numpy as np -from zarr.core.dtype import ZarrDType # User inherits from ZarrDType when creating their dtype - -class Int4(ZarrDType): - zarr_spec_format = "3" - experimental = True - endianness = 'little' - byte_count = 1 # this is ugly, but I could change this from byte_count to bit_count if there was consensus - to_numpy = np.dtype('B') # could also be np.dtype('V1'), but this would prevent bit-twiddling - configuration_v3 = { - "version": "example_value", - "author": "example_value", - } -``` -""" - -from __future__ import annotations - -from typing import Any, Literal - -import numpy as np - - -class FrozenClassVariables(type): - def __setattr__(cls, attr: str, value: object) -> None: - if hasattr(cls, attr): - raise ValueError(f"Attribute {attr} on ZarrDType class can not be changed once set.") - else: - raise AttributeError(f"'{cls}' object has no attribute '{attr}'") - - -class ZarrDType(metaclass=FrozenClassVariables): - zarr_spec_format: Literal["2", "3"] # the version of the zarr spec used - experimental: bool # is this in the core spec or not - endianness: Literal[ - "big", "little", None - ] # None indicates not defined i.e. single byte or byte strings - byte_count: int | None # None indicates variable count - to_numpy: np.dtype[Any] # may involve installing a a numpy extension e.g. 
ml_dtypes; - - configuration_v3: dict | None # TODO: understand better how this is recommended by the spec - - _zarr_spec_identifier: str # implementation detail used to map to core spec - - def __init_subclass__( # enforces all required fields are set and basic sanity checks - cls, - **kwargs, - ) -> None: - required_attrs = [ - "zarr_spec_format", - "experimental", - "endianness", - "byte_count", - "to_numpy", - ] - for attr in required_attrs: - if not hasattr(cls, attr): - raise ValueError(f"{attr} is a required attribute for a Zarr dtype.") - - if not hasattr(cls, "configuration_v3"): - cls.configuration_v3 = None - - cls._zarr_spec_identifier = ( - "big_" + cls.__qualname__.lower() - if cls.endianness == "big" - else cls.__qualname__.lower() - ) # how this dtype is identified in core spec; convention is prefix with big_ for big-endian - - cls._validate() # sanity check on basic requirements - - super().__init_subclass__(**kwargs) - - # TODO: add further checks - @classmethod - def _validate(cls): - if cls.byte_count is not None and cls.byte_count <= 0: - raise ValueError("byte_count must be a positive integer.") - - if cls.byte_count == 1 and cls.endianness is not None: - raise ValueError("Endianness must be None for single-byte types.") diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 7614d29b48..d45f722df8 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -25,7 +25,6 @@ ) from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON - from zarr.core.dtype import ZarrDType from zarr.core.metadata.dtype import DTypeWrapper __all__ = [ @@ -34,14 +33,10 @@ "get_codec_class", "get_ndbuffer_class", "get_pipeline_class", - "get_v2dtype_class", - "get_v3dtype_class", "register_buffer", "register_codec", "register_ndbuffer", "register_pipeline", - "register_v2dtype", - "register_v3dtype", ] T = TypeVar("T") @@ -89,8 +84,6 @@ def get(self, key: str) -> type[DTypeWrapper]: __buffer_registry: Registry[Buffer] = Registry() 
__ndbuffer_registry: Registry[NDBuffer] = Registry() __data_type_registry = DataTypeRegistry() -__v3_dtype_registry: Registry[ZarrDType] = Registry() -__v2_dtype_registry: Registry[ZarrDType] = Registry() """ The registry module is responsible for managing implementations of codecs, @@ -127,13 +120,9 @@ def _collect_entrypoints() -> list[Registry[Any]]: __ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr.ndbuffer")) __ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="ndbuffer")) - # __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr.data_type")) - # __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) + __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr.data_type")) + __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) - __v3_dtype_registry.lazy_load_list.extend(entry_points.select(group="zarr.v3dtype")) - __v3_dtype_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="v3dtype")) - __v2_dtype_registry.lazy_load_list.extend(entry_points.select(group="zarr.v2dtype")) - __v2_dtype_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="v2dtype")) __pipeline_registry.lazy_load_list.extend(entry_points.select(group="zarr.codec_pipeline")) __pipeline_registry.lazy_load_list.extend( entry_points.select(group="zarr", name="codec_pipeline") @@ -183,14 +172,6 @@ def register_data_type(cls: type[DTypeWrapper]) -> None: __data_type_registry.register(cls) -def register_v3dtype(cls: type[ZarrDType]) -> None: - __v3_dtype_registry.register(cls) - - -def register_v2dtype(cls: type[ZarrDType]) -> None: - __v2_dtype_registry.register(cls) - - def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: if reload_config: _reload_config() @@ -359,50 +340,4 @@ def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper: ) -# TODO: merge the 
get_vXdtype_class_ functions -# these can be used instead of the various parse_X functions (hopefully) -def get_v3dtype_class(dtype: str) -> type[ZarrDType]: - __v3_dtype_registry.lazy_load() - v3dtype_class = __v3_dtype_registry.get(dtype) - if v3dtype_class: - return v3dtype_class - raise ValueError( - f"ZarrDType class '{dtype}' not found in registered buffers: {list(__v3_dtype_registry)}." - ) - - -def get_v3dtype_class_from_numpy(dtype: npt.DTypeLike) -> type[ZarrDType]: - __v3_dtype_registry.lazy_load() - - dtype = np.dtype(dtype) - for val in __v3_dtype_registry.values(): - if dtype == val.to_numpy: - return val - raise ValueError( - f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(__v3_dtype_registry)}." - ) - - -def get_v2dtype_class(dtype: str) -> type[ZarrDType]: - __v2_dtype_registry.lazy_load() - v2dtype_class = __v2_dtype_registry.get(dtype) - if v2dtype_class: - return v2dtype_class - raise ValueError( - f"ZarrDType class '{dtype}' not found in registered buffers: {list(__v2_dtype_registry)}." - ) - - -def get_v2dtype_class_from_numpy(dtype: npt.DTypeLike) -> type[ZarrDType]: - __v2_dtype_registry.lazy_load() - - dtype = np.dtype(dtype) - for val in __v2_dtype_registry.values(): - if dtype == val.to_numpy: - return val - raise ValueError( - f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(__v2_dtype_registry)}." 
- ) - - _collect_entrypoints() From 8369ffc79ae4f01196b6c3d4d8d2a35125733159 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 2 Mar 2025 23:31:31 +0100 Subject: [PATCH 012/129] use wrap / unwrap instead of to_dtype / from_dtype; push into v2 codebase --- src/zarr/api/asynchronous.py | 16 ++-- src/zarr/codecs/_v2.py | 6 +- src/zarr/codecs/bytes.py | 2 +- src/zarr/core/array.py | 68 ++++++++------- src/zarr/core/array_spec.py | 13 ++- src/zarr/core/buffer/cpu.py | 9 +- src/zarr/core/chunk_grids.py | 5 +- src/zarr/core/common.py | 13 +-- src/zarr/core/metadata/dtype.py | 58 ++++++------- src/zarr/core/metadata/v2.py | 144 ++++++++++++++++++++++---------- src/zarr/core/metadata/v3.py | 8 +- src/zarr/registry.py | 2 +- tests/conftest.py | 2 +- tests/test_array.py | 8 +- tests/test_metadata/test_v3.py | 24 ++++-- 15 files changed, 224 insertions(+), 154 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 83b473ed67..9c82f6aa2b 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -39,8 +39,9 @@ create_hierarchy, ) from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata -from zarr.errors import GroupNotFoundError, NodeTypeValidationError -from zarr.storage import StorePath +from zarr.core.metadata.v2 import _default_compressor, _default_filters +from zarr.errors import NodeTypeValidationError +from zarr.registry import get_data_type_from_numpy from zarr.storage._common import make_store_path if TYPE_CHECKING: @@ -457,7 +458,7 @@ async def save_array( shape = arr.shape chunks = getattr(arr, "chunks", None) # for array-likes with chunks attribute overwrite = kwargs.pop("overwrite", None) or _infer_overwrite(mode) - zarr_dtype = get_data_type_from_native_dtype(arr.dtype) + zarr_dtype = get_data_type_from_numpy(arr.dtype) new = await AsyncArray._create( store_path, zarr_format=zarr_format, @@ -1009,15 +1010,14 @@ async def create( 
_handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format) or _default_zarr_format() ) - + dtype_wrapped = parse_dtype(dtype, zarr_format=zarr_format) if zarr_format == 2: if chunks is None: chunks = shape - dtype = parse_dtype(dtype, zarr_format=zarr_format) if not filters: - filters = _default_filters(dtype) + filters = _default_filters(dtype_wrapped) if not compressor: - compressor = _default_compressor(dtype) + compressor = _default_compressor(dtype_wrapped) elif zarr_format == 3 and chunk_shape is None: # type: ignore[redundant-expr] if chunks is not None: chunk_shape = chunks @@ -1064,7 +1064,7 @@ async def create( store_path, shape=shape, chunks=chunks, - dtype=dtype, + dtype=dtype_wrapped, compressor=compressor, fill_value=fill_value, overwrite=overwrite, diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 08853f27f1..f0b7bfc9c9 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -48,7 +48,7 @@ async def _decode_single( # segfaults and other bad things happening if chunk_spec.dtype.dtype_cls is not np.dtypes.ObjectDType: try: - chunk = chunk.view(chunk_spec.dtype.to_native_dtype()) + chunk = chunk.view(chunk_spec.dtype.unwrap()) except TypeError: # this will happen if the dtype of the chunk # does not match the dtype of the array spec i.g. if @@ -56,7 +56,7 @@ async def _decode_single( # is an object array. In this case, we need to convert the object # array to the correct dtype. - chunk = np.array(chunk).astype(chunk_spec.dtype.to_native_dtype()) + chunk = np.array(chunk).astype(chunk_spec.dtype.unwrap()) elif chunk.dtype != object: # If we end up here, someone must have hacked around with the filters. 
@@ -80,7 +80,7 @@ async def _encode_single( chunk = chunk_array.as_ndarray_like() # ensure contiguous and correct order - chunk = chunk.astype(chunk_spec.dtype.to_native_dtype(), order=chunk_spec.order, copy=False) + chunk = chunk.astype(chunk_spec.dtype.unwrap(), order=chunk_spec.order, copy=False) # apply filters if self.filters: diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index d663a3b2cc..d195d64eb8 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -57,7 +57,7 @@ def to_dict(self) -> dict[str, JSON]: return {"name": "bytes", "configuration": {"endian": self.endian.value}} def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: - if not isinstance(array_spec.dtype, HasEndianness): + if array_spec.dtype.unwrap().itemsize == 0: if self.endian is not None: return replace(self, endian=None) elif self.endian is None: diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 52204d830e..e5f94dc25b 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -109,7 +109,7 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) -from zarr.core.metadata.dtype import DTypeWrapper +from zarr.core.metadata.dtype import DTypeWrapper, VariableLengthString from zarr.core.metadata.v2 import ( CompressorLikev2, get_object_codec_id, @@ -589,7 +589,7 @@ async def _create( *, # v2 and v3 shape: ShapeLike, - dtype: ZDTypeLike | ZDType[TBaseDType, TBaseScalar], + dtype: npt.DTypeLike[Any], zarr_format: ZarrFormat = 3, fill_value: Any | None = DEFAULT_FILL_VALUE, attributes: dict[str, JSON] | None = None, @@ -618,11 +618,13 @@ async def _create( See :func:`AsyncArray.create` for more details. Deprecated in favor of :func:`zarr.api.asynchronous.create_array`. 
""" - - dtype_parsed = parse_data_type(dtype, zarr_format=zarr_format) + # TODO: delete this and be more strict about where parsing occurs + if not isinstance(dtype, DTypeWrapper): + dtype_parsed = get_data_type_from_numpy(np.dtype(dtype)) + else: + dtype_parsed = dtype store_path = await make_store_path(store) - dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) shape = parse_shapelike(shape) if chunks is not None and chunk_shape is not None: @@ -631,9 +633,9 @@ async def _create( if isinstance(dtype_parsed, HasItemSize): item_size = dtype_parsed.item_size if chunks: - _chunks = normalize_chunks(chunks, shape, item_size) + _chunks = normalize_chunks(chunks, shape, dtype_parsed.unwrap().itemsize) else: - _chunks = normalize_chunks(chunk_shape, shape, item_size) + _chunks = normalize_chunks(chunk_shape, shape, dtype_parsed.unwrap().itemsize) config_parsed = parse_array_config(config) result: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] @@ -711,7 +713,7 @@ async def _create( @staticmethod def _create_metadata_v3( shape: ShapeLike, - dtype: ZDType[TBaseDType, TBaseScalar], + dtype: DTypeWrapper[Any, Any], chunk_shape: ChunkCoords, fill_value: Any | None = DEFAULT_FILL_VALUE, chunk_key_encoding: ChunkKeyEncodingLike | None = None, @@ -741,19 +743,16 @@ def _create_metadata_v3( stacklevel=2, ) - # resolve the numpy dtype into zarr v3 datatype - zarr_data_type = get_data_type_from_numpy(dtype) - if fill_value is None: # v3 spec will not allow a null fill value - fill_value_parsed = zarr_data_type.default_value + fill_value_parsed = dtype.default_value else: fill_value_parsed = fill_value chunk_grid_parsed = RegularChunkGrid(chunk_shape=chunk_shape) return ArrayV3Metadata( shape=shape, - data_type=zarr_data_type, + data_type=dtype, chunk_grid=chunk_grid_parsed, chunk_key_encoding=chunk_key_encoding_parsed, fill_value=fill_value_parsed, @@ -816,7 +815,7 @@ async def _create_v3( @staticmethod def _create_metadata_v2( shape: ChunkCoords, - dtype: 
ZDType[TBaseDType, TBaseScalar], + dtype: DTypeWrapper[Any, Any], chunks: ChunkCoords, order: MemoryOrder, dimension_separator: Literal[".", "/"] | None = None, @@ -828,12 +827,13 @@ def _create_metadata_v2( if dimension_separator is None: dimension_separator = "." - # Handle DefaultFillValue sentinel - if isinstance(fill_value, DefaultFillValue): - fill_value_parsed: Any = dtype.default_scalar() - else: - # For v2, preserve None as-is (backward compatibility) - fill_value_parsed = fill_value + # inject VLenUTF8 for str dtype if not already present + if isinstance(dtype, VariableLengthString): + filters = filters or [] + from numcodecs.vlen import VLenUTF8 + + if not any(isinstance(x, VLenUTF8) or x["id"] == "vlen-utf8" for x in filters): + filters = list(filters) + [VLenUTF8()] return ArrayV2Metadata( shape=shape, @@ -2120,7 +2120,7 @@ def dtype(self) -> np.dtype[Any]: np.dtype The NumPy data type. """ - return self._async_array.dtype + return self._async_array.dtype.unwrap() @property def attrs(self) -> Attributes: @@ -4249,7 +4249,7 @@ async def init_array( from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation - dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) + dtype_wrapped = parse_dtype(dtype, zarr_format=zarr_format) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format @@ -4271,7 +4271,7 @@ async def init_array( array_shape=shape_parsed, shard_shape=shards, chunk_shape=chunks, - item_size=item_size, + item_size=dtype_wrapped.unwrap().itemsize, ) chunks_out: tuple[int, ...] 
meta: ArrayV2Metadata | ArrayV3Metadata @@ -4287,7 +4287,7 @@ async def init_array( raise ValueError("Zarr format 2 arrays do not support `serializer`.") filters_parsed, compressor_parsed = _parse_chunk_encoding_v2( - compressor=compressors, filters=filters, dtype=zdtype + compressor=compressors, filters=filters, dtype=dtype_wrapped ) if dimension_names is not None: raise ValueError("Zarr format 2 arrays do not support dimension names.") @@ -4298,7 +4298,7 @@ async def init_array( meta = AsyncArray._create_metadata_v2( shape=shape_parsed, - dtype=zdtype, + dtype=dtype_wrapped, chunks=chunk_shape_parsed, dimension_separator=chunk_key_encoding_parsed.separator, fill_value=fill_value, @@ -4312,7 +4312,7 @@ async def init_array( compressors=compressors, filters=filters, serializer=serializer, - dtype=zdtype, + dtype=dtype_wrapped, ) sub_codecs = cast("tuple[Codec, ...]", (*array_array, array_bytes, *bytes_bytes)) codecs_out: tuple[Codec, ...] @@ -4327,7 +4327,7 @@ async def init_array( ) sharding_codec.validate( shape=chunk_shape_parsed, - dtype=zdtype, + dtype=dtype_wrapped, chunk_grid=RegularChunkGrid(chunk_shape=shard_shape_parsed), ) codecs_out = (sharding_codec,) @@ -4341,7 +4341,7 @@ async def init_array( meta = AsyncArray._create_metadata_v3( shape=shape_parsed, - dtype=zdtype, + dtype=dtype_wrapped, fill_value=fill_value, chunk_shape=chunks_out, chunk_key_encoding=chunk_key_encoding_parsed, @@ -4659,12 +4659,11 @@ def _parse_chunk_key_encoding( def _get_default_chunk_encoding_v3( - dtype: ZDType[TBaseDType, TBaseScalar], + dtype: DTypeWrapper[Any, Any], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: """ Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype. 
""" - dtype = get_data_type_from_numpy(np_dtype) default_filters = zarr_config.get("array.v3_default_filters").get(dtype.kind) default_serializer = zarr_config.get("array.v3_default_serializer").get(dtype.kind) @@ -4681,7 +4680,9 @@ def _get_default_chunk_encoding_v3( ) -def default_filters_v3(dtype: ZDType[Any, Any]) -> tuple[ArrayArrayCodec, ...]: +def _get_default_chunk_encoding_v2( + dtype: DTypeWrapper[Any, Any], +) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: """ Given a data type, return the default filters for that data type. @@ -4689,6 +4690,8 @@ def default_filters_v3(dtype: ZDType[Any, Any]) -> tuple[ArrayArrayCodec, ...]: """ return () + compressor_dict = _default_compressor(dtype) + filter_dicts = _default_filters(dtype) def default_compressors_v3(dtype: ZDType[Any, Any]) -> tuple[BytesBytesCodec, ...]: """ @@ -4761,11 +4764,12 @@ def _parse_chunk_encoding_v2( *, compressor: CompressorsLike, filters: FiltersLike, - dtype: ZDType[TBaseDType, TBaseScalar], + dtype: DTypeWrapper[Any, Any], ) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: """ Generate chunk encoding classes for Zarr format 2 arrays with optional defaults. """ + default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype) _filters: tuple[numcodecs.abc.Codec, ...] 
| None _compressor: numcodecs.abc.Codec | None diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index 279bf6edf0..daa9259d7f 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -11,10 +11,14 @@ parse_shapelike, ) from zarr.core.config import config as zarr_config +from zarr.core.metadata.dtype import DTypeWrapper +from zarr.registry import get_data_type_from_numpy if TYPE_CHECKING: from typing import NotRequired + import numpy.typing as npt + from zarr.core.buffer import BufferPrototype from zarr.core.common import ChunkCoords from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType @@ -89,7 +93,7 @@ def parse_array_config(data: ArrayConfigLike | None) -> ArrayConfig: @dataclass(frozen=True) class ArraySpec: shape: ChunkCoords - dtype: ZDType[TBaseDType, TBaseScalar] + dtype: DTypeWrapper[Any, Any] fill_value: Any config: ArrayConfig prototype: BufferPrototype @@ -97,12 +101,17 @@ class ArraySpec: def __init__( self, shape: ChunkCoords, - dtype: ZDType[TBaseDType, TBaseScalar], + dtype: npt.DtypeLike | DTypeWrapper[Any, Any], fill_value: Any, config: ArrayConfig, prototype: BufferPrototype, ) -> None: shape_parsed = parse_shapelike(shape) + if not isinstance(dtype, DTypeWrapper): + dtype_parsed = get_data_type_from_numpy(dtype) + else: + dtype_parsed = dtype + fill_value_parsed = parse_fill_value(fill_value) object.__setattr__(self, "shape", shape_parsed) diff --git a/src/zarr/core/buffer/cpu.py b/src/zarr/core/buffer/cpu.py index 9da0059d0b..bacef5c83e 100644 --- a/src/zarr/core/buffer/cpu.py +++ b/src/zarr/core/buffer/cpu.py @@ -10,6 +10,7 @@ import numpy.typing as npt from zarr.core.buffer import core +from zarr.core.metadata.dtype import DTypeWrapper from zarr.registry import ( register_buffer, register_ndbuffer, @@ -150,7 +151,7 @@ def create( cls, *, shape: Iterable[int], - dtype: npt.DTypeLike, + dtype: DTypeWrapper[Any, Any], order: Literal["C", "F"] = "C", fill_value: Any | None = None, ) -> Self: 
@@ -158,7 +159,11 @@ def create( if fill_value is None or (isinstance(fill_value, int) and fill_value == 0): return cls(np.zeros(shape=tuple(shape), dtype=dtype, order=order)) else: - return cls(np.full(shape=tuple(shape), fill_value=fill_value, dtype=dtype, order=order)) + return cls( + np.full( + shape=tuple(shape), fill_value=fill_value, dtype=dtype.unwrap(), order=order + ) + ) @classmethod def empty( diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 4bf03c89de..74bf9b6ba8 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -63,10 +63,7 @@ def _guess_chunks( """ if isinstance(shape, int): shape = (shape,) - - if typesize == 0: - return shape - + typesize = max(typesize, 8) ndims = len(shape) # require chunks to have non-zero length for all dimensions chunks = np.maximum(np.array(shape, dtype="=f8"), 1) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 1ec7553802..da19e80a0d 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -22,7 +22,6 @@ from typing_extensions import ReadOnly from zarr.core.config import config as zarr_config -from zarr.core.strings import _VLEN_STRING_DTYPE if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Iterator @@ -191,14 +190,10 @@ def parse_bool(data: Any) -> bool: raise ValueError(f"Expected bool, got {data} instead.") -def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> np.dtype[Any]: - if dtype is str or dtype == "str": - if zarr_format == 2: - # special case as object - return np.dtype("object") - else: - return _VLEN_STRING_DTYPE - return np.dtype(dtype) +def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> DTypeWrapper[Any, Any]: + from zarr.registry import get_data_type_from_numpy + + return get_data_type_from_numpy(np.dtype(dtype)) def _warn_write_empty_chunks_kwarg() -> None: diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index 1b57831943..5d382076b4 100644 --- 
a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -216,14 +216,14 @@ def to_dict(self) -> dict[str, JSON]: return {"name": self.name} def cast_value(self: Self, value: object, *, endianness: Endianness | None = None) -> TScalar: - return cast(np.generic, self.to_dtype(endianness=endianness).type(value)) + return cast(np.generic, self.unwrap(endianness=endianness).type(value)) @classmethod @abstractmethod - def from_dtype(cls: type[Self], dtype: TDType) -> Self: + def wrap(cls: type[Self], dtype: TDType) -> Self: raise NotImplementedError - def to_dtype(self: Self, *, endianness: Endianness | None = None) -> TDType: + def unwrap(self: Self, *, endianness: Endianness | None = None) -> TDType: endian_str = endianness_to_numpy_str(endianness) return self.dtype_cls().newbyteorder(endian_str) @@ -251,7 +251,7 @@ class Bool(DTypeWrapper[np.dtypes.BoolDType, np.bool_]): default_value = np.False_ @classmethod - def from_dtype(cls, dtype: np.dtypes.BoolDType) -> Self: + def wrap(cls, dtype: np.dtypes.BoolDType) -> Self: return cls() def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> bool: @@ -261,7 +261,7 @@ def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.bool_: if check_json_bool(data): - return self.to_dtype(endianness=endianness).type(data) + return self.unwrap(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. 
Expected a boolean.") @@ -269,7 +269,7 @@ class IntWrapperBase(DTypeWrapper[TDType, TScalar]): kind = "numeric" @classmethod - def from_dtype(cls, dtype: TDType) -> Self: + def wrap(cls, dtype: TDType) -> Self: return cls() def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: @@ -279,7 +279,7 @@ def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> TScalar: if check_json_int(data): - return self.to_dtype(endianness=endianness).type(data) + return self.unwrap(endianness=endianness).type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -335,7 +335,7 @@ class FloatWrapperBase(DTypeWrapper[TDType, TScalar]): kind = "numeric" @classmethod - def from_dtype(cls, dtype: TDType) -> Self: + def wrap(cls, dtype: TDType) -> Self: return cls() def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> JSONFloat: @@ -345,7 +345,7 @@ def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> TScalar: if check_json_float_v2(data): - return self.to_dtype(endianness=endianness).type(float_from_json(data, zarr_format)) + return self.unwrap(endianness=endianness).type(float_from_json(data, zarr_format)) raise TypeError(f"Invalid type: {data}. Expected a float.") @@ -374,7 +374,7 @@ class Complex64(DTypeWrapper[np.dtypes.Complex64DType, np.complex64]): default_value = np.complex64(0) @classmethod - def from_dtype(cls, dtype: np.dtypes.Complex64DType) -> Self: + def wrap(cls, dtype: np.dtypes.Complex64DType) -> Self: return cls() def to_json_value( @@ -387,7 +387,7 @@ def from_json_value( ) -> np.complex64: if check_json_complex_float_v3(data): return complex_from_json( - data, dtype=self.to_dtype(endianness=endianness), zarr_format=zarr_format + data, dtype=self.unwrap(endianness=endianness), zarr_format=zarr_format ) raise TypeError(f"Invalid type: {data}. 
Expected a complex float.") @@ -399,7 +399,7 @@ class Complex128(DTypeWrapper[np.dtypes.Complex128DType, np.complex128]): default_value = np.complex128(0) @classmethod - def from_dtype(cls, dtype: np.dtypes.Complex128DType) -> Self: + def wrap(cls, dtype: np.dtypes.Complex128DType) -> Self: return cls() def to_json_value( @@ -412,7 +412,7 @@ def from_json_value( ) -> np.complex128: if check_json_complex_float_v3(data): return complex_from_json( - data, dtype=self.to_dtype(endianness=endianness), zarr_format=zarr_format + data, dtype=self.unwrap(endianness=endianness), zarr_format=zarr_format ) raise TypeError(f"Invalid type: {data}. Expected a complex float.") @@ -423,10 +423,10 @@ class FlexibleWrapperBase(DTypeWrapper[TDType, TScalar]): length: int @classmethod - def from_dtype(cls, dtype: TDType) -> Self: + def wrap(cls, dtype: TDType) -> Self: return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) - def to_dtype(self, endianness: Endianness | None = None) -> TDType: + def unwrap(self, endianness: Endianness | None = None) -> TDType: endianness_code = endianness_to_numpy_str(endianness) return self.dtype_cls(self.length).newbyteorder(endianness_code) @@ -450,7 +450,7 @@ def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.bytes_: if check_json_bool(data): - return self.to_dtype(endianness=endianness).type(data.encode("ascii")) + return self.unwrap(endianness=endianness).type(data.encode("ascii")) raise TypeError(f"Invalid type: {data}. 
Expected a string.") @@ -464,7 +464,7 @@ class StaticRawBytes(FlexibleWrapperBase[np.dtypes.VoidDType, np.void]): def to_dict(self) -> dict[str, JSON]: return {"name": f"r{self.length * self.item_size_bits}"} - def to_dtype(self, endianness: Endianness | None = None) -> np.dtypes.VoidDType: + def unwrap(self, endianness: Endianness | None = None) -> np.dtypes.VoidDType: # this needs to be overridden because numpy does not allow creating a void type # by invoking np.dtypes.VoidDType directly endianness_code = endianness_to_numpy_str(endianness) @@ -477,7 +477,7 @@ def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> np.void: # todo: check that this is well-formed - return self.to_dtype(endianness=endianness).type(bytes(data)) + return self.unwrap(endianness=endianness).type(bytes(data)) @dataclass(frozen=True, kw_only=True) @@ -498,25 +498,25 @@ def from_json_value( ) -> np.str_: if not check_json_str(data): raise TypeError(f"Invalid type: {data}. 
Expected a string.") - return self.to_dtype(endianness=endianness).type(data) + return self.unwrap(endianness=endianness).type(data) if _NUMPY_SUPPORTS_VLEN_STRING: @dataclass(frozen=True, kw_only=True) - class VlenString(DTypeWrapper[np.dtypes.StringDType, str]): + class VariableLengthString(DTypeWrapper[np.dtypes.StringDType, str]): name = "numpy/vlen_string" kind = "string" default_value = "" @classmethod - def from_dtype(cls, dtype: np.dtypes.StringDType) -> Self: + def wrap(cls, dtype: np.dtypes.StringDType) -> Self: return cls() def to_dict(self) -> dict[str, JSON]: return {"name": self.name} - def to_dtype(self, endianness: Endianness | None = None) -> np.dtypes.StringDType: + def unwrap(self, endianness: Endianness | None = None) -> np.dtypes.StringDType: endianness_code = endianness_to_numpy_str(endianness) return np.dtype(endianness_code + self.numpy_character_code) @@ -526,12 +526,12 @@ def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> str: - return self.to_dtype(endianness=endianness).type(data) + return self.unwrap(endianness=endianness).type(data) else: @dataclass(frozen=True, kw_only=True) - class VlenString(DTypeWrapper[np.dtypes.ObjectDType, str]): + class VariableLengthString(DTypeWrapper[np.dtypes.ObjectDType, str]): name = "numpy/vlen_string" kind = "string" default_value = np.object_("") @@ -540,11 +540,11 @@ def to_dict(self) -> dict[str, JSON]: return {"name": self.name} @classmethod - def from_dtype(cls, dtype: np.dtypes.ObjectDType) -> Self: + def wrap(cls, dtype: np.dtypes.ObjectDType) -> Self: return cls() - def to_dtype(self, endianness: Endianness | None = None) -> np.dtype[np.dtypes.ObjectDType]: - return super().to_dtype(endianness=endianness) + def unwrap(self, endianness: Endianness | None = None) -> np.dtype[np.dtypes.ObjectDType]: + return super().unwrap(endianness=endianness) def 
to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return str(data) @@ -552,7 +552,7 @@ def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: def from_json_value( self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None ) -> str: - return self.to_dtype(endianness=endianness).type(data) + return self.unwrap(endianness=endianness).type(data) def resolve_dtype(dtype: npt.DTypeLike | DTypeWrapper | dict[str, JSON]) -> DTypeWrapper: @@ -569,7 +569,7 @@ def resolve_dtype(dtype: npt.DTypeLike | DTypeWrapper | dict[str, JSON]) -> DTyp INTEGER_DTYPE = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 FLOAT_DTYPE = Float16 | Float32 | Float64 COMPLEX_DTYPE = Complex64 | Complex128 -STRING_DTYPE = StaticUnicodeString | VlenString | StaticByteString +STRING_DTYPE = StaticUnicodeString | VariableLengthString | StaticByteString for dtype in get_args( Bool | INTEGER_DTYPE | FLOAT_DTYPE | COMPLEX_DTYPE | STRING_DTYPE | StaticRawBytes ): diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 7bdad204b8..cb5f8ff2f8 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -8,9 +8,8 @@ import numcodecs.abc from zarr.abc.metadata import Metadata -from zarr.core.chunk_grids import RegularChunkGrid -from zarr.core.dtype import get_data_type_from_json -from zarr.core.dtype.common import OBJECT_CODEC_IDS, DTypeSpec_V2 +from zarr.core.metadata.dtype import DTypeWrapper +from zarr.registry import get_data_type_from_numpy if TYPE_CHECKING: from typing import Literal, Self @@ -63,8 +62,8 @@ class ArrayV2MetadataDict(TypedDict): class ArrayV2Metadata(Metadata): shape: ChunkCoords chunks: ChunkCoords - dtype: ZDType[TBaseDType, TBaseScalar] - fill_value: int | float | str | bytes | None = None + dtype: DTypeWrapper[Any, Any] + fill_value: int | float | str | bytes | None = 0 order: MemoryOrder = "C" filters: tuple[numcodecs.abc.Codec, ...] 
| None = None dimension_separator: Literal[".", "/"] = "." @@ -76,7 +75,7 @@ def __init__( self, *, shape: ChunkCoords, - dtype: ZDType[TDType_co, TScalar_co], + dtype: DTypeWrapper[Any, Any], chunks: ChunkCoords, fill_value: Any, order: MemoryOrder, @@ -94,11 +93,7 @@ def __init__( order_parsed = parse_indexing_order(order) dimension_separator_parsed = parse_separator(dimension_separator) filters_parsed = parse_filters(filters) - fill_value_parsed: TBaseScalar | None - if fill_value is not None: - fill_value_parsed = dtype.cast_scalar(fill_value) - else: - fill_value_parsed = fill_value + fill_value_parsed = parse_fill_value(fill_value, dtype=dtype.unwrap()) attributes_parsed = parse_attributes(attributes) object.__setattr__(self, "shape", shape_parsed) @@ -145,30 +140,13 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: _data = data.copy() # Check that the zarr_format attribute is correct. _ = parse_zarr_format(_data.pop("zarr_format")) - - # To resolve a numpy object dtype array, we need to search for an object codec, - # which could be in filters or as a compressor. - # we will reference a hard-coded collection of object codec ids for this search. 
- - _filters, _compressor = (data.get("filters"), data.get("compressor")) - if _filters is not None: - _filters = cast("tuple[dict[str, JSON], ...]", _filters) - object_codec_id = get_object_codec_id(tuple(_filters) + (_compressor,)) - else: - object_codec_id = get_object_codec_id((_compressor,)) - # we add a layer of indirection here around the dtype attribute of the array metadata - # because we also need to know the object codec id, if any, to resolve the data type - dtype_spec: DTypeSpec_V2 = { - "name": data["dtype"], - "object_codec_id": object_codec_id, - } - dtype = get_data_type_from_json(dtype_spec, zarr_format=2) - + dtype = get_data_type_from_numpy(parse_dtype(_data["dtype"])) _data["dtype"] = dtype - fill_value_encoded = _data.get("fill_value") - if fill_value_encoded is not None: - fill_value = dtype.from_json_scalar(fill_value_encoded, zarr_format=2) - _data["fill_value"] = fill_value + if dtype.unwrap().kind in "SV": + fill_value_encoded = _data.get("fill_value") + if fill_value_encoded is not None: + fill_value = base64.standard_b64decode(fill_value_encoded) + _data["fill_value"] = fill_value # zarr v2 allowed arbitrary keys here. 
# We don't want the ArrayV2Metadata constructor to fail just because someone put an @@ -223,8 +201,16 @@ def to_dict(self) -> dict[str, JSON]: fill_value = self.dtype.to_json_scalar(self.fill_value, zarr_format=2) zarray_dict["fill_value"] = fill_value - # pull the "name" attribute out of the dtype spec returned by self.dtype.to_json - zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2)["name"] + _ = zarray_dict.pop("dtype") + dtype_json: JSON + # TODO: Replace this with per-dtype method + # In the case of zarr v2, the simplest i.e., '|VXX' dtype is represented as a string + dtype_descr = self.dtype.unwrap().descr + if self.dtype.unwrap().kind == "V" and dtype_descr[0][0] != "" and len(dtype_descr) != 0: + dtype_json = tuple(self.dtype.unwrap().descr) + else: + dtype_json = self.dtype.unwrap().str + zarray_dict["dtype"] = dtype_json return zarray_dict @@ -321,11 +307,81 @@ def get_object_codec_id(maybe_object_codecs: Sequence[JSON]) -> str | None: has an id that matches one of the hard-coded object codec ids, that id is returned immediately. 
""" - object_codec_id = None - for maybe_object_codec in maybe_object_codecs: - if ( - isinstance(maybe_object_codec, dict) - and maybe_object_codec.get("id") in OBJECT_CODEC_IDS - ): - return cast("str", maybe_object_codec["id"]) - return object_codec_id + + if fill_value is None or dtype.hasobject: + # no fill value + pass + elif not isinstance(fill_value, np.void) and fill_value == 0: + # this should be compatible across numpy versions for any array type, including + # structured arrays + fill_value = np.zeros((), dtype=dtype)[()] + + elif dtype.kind == "U": + # special case unicode because of encoding issues on Windows if passed through numpy + # https://github.com/alimanfoo/zarr/pull/172#issuecomment-343782713 + + if not isinstance(fill_value, str): + raise ValueError( + f"fill_value {fill_value!r} is not valid for dtype {dtype}; must be a unicode string" + ) + else: + try: + if isinstance(fill_value, bytes) and dtype.kind == "V": + # special case for numpy 1.14 compatibility + fill_value = np.array(fill_value, dtype=dtype.str).view(dtype)[()] + else: + fill_value = np.array(fill_value, dtype=dtype)[()] + + except Exception as e: + msg = f"Fill_value {fill_value} is not valid for dtype {dtype}." + raise ValueError(msg) from e + + return fill_value + + +def _default_fill_value(dtype: np.dtype[Any]) -> Any: + """ + Get the default fill value for a type. + + Notes + ----- + This differs from :func:`parse_fill_value`, which parses a fill value + stored in the Array metadata into an in-memory value. This only gives + the default fill value for some type. + + This is useful for reading Zarr format 2 arrays, which allow the fill + value to be unspecified. 
+ """ + if dtype.kind == "S": + return b"" + elif dtype.kind in "UO": + return "" + elif dtype.kind in "Mm": + return dtype.type("nat") + elif dtype.kind == "V": + if dtype.fields is not None: + default = tuple(_default_fill_value(field[0]) for field in dtype.fields.values()) + return np.array([default], dtype=dtype) + else: + return np.zeros(1, dtype=dtype) + else: + return dtype.type(0) + + +def _default_compressor( + dtype: DTypeWrapper[Any, Any], +) -> dict[str, JSON] | None: + """Get the default filters and compressor for a dtype. + + https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html + """ + default_compressor = config.get("array.v2_default_compressor") + return cast(dict[str, JSON] | None, default_compressor.get(dtype.kind, None)) + + +def _default_filters( + dtype: DTypeWrapper, +) -> list[dict[str, JSON]] | None: + """Get the default filters and compressor for a dtype.""" + default_filters = config.get("array.v2_default_filters") + return cast(list[dict[str, JSON]] | None, default_filters.get(dtype.kind, None)) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index d23d7a452c..a017b0c1d1 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -177,14 +177,14 @@ def __init__( chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) - fill_value_parsed = data_type.to_dtype().type(fill_value) + fill_value_parsed = data_type.unwrap().type(fill_value) attributes_parsed = parse_attributes(attributes) codecs_parsed_partial = parse_codecs(codecs) storage_transformers_parsed = parse_storage_transformers(storage_transformers) array_spec = ArraySpec( shape=shape_parsed, - dtype=data_type.to_dtype(), + dtype=data_type.unwrap(), fill_value=fill_value_parsed, config=ArrayConfig.from_dict({}), # TODO: config is not needed here. 
prototype=default_buffer_prototype(), # TODO: prototype is not needed here. @@ -219,13 +219,13 @@ def _validate_metadata(self) -> None: raise ValueError("`fill_value` is required.") for codec in self.codecs: codec.validate( - shape=self.shape, dtype=self.data_type.to_dtype(), chunk_grid=self.chunk_grid + shape=self.shape, dtype=self.data_type.unwrap(), chunk_grid=self.chunk_grid ) @property def dtype(self) -> np.dtype[Any]: """Interpret Zarr dtype as NumPy dtype""" - return self.data_type.to_dtype() + return self.data_type.unwrap() @property def ndim(self) -> int: diff --git a/src/zarr/registry.py b/src/zarr/registry.py index d45f722df8..f2ec285cf3 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -334,7 +334,7 @@ def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper: __data_type_registry.lazy_load() for val in __data_type_registry.contents.values(): if val.dtype_cls is type(np_dtype): - return val.from_dtype(np_dtype) + return val.wrap(np_dtype) raise ValueError( f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(__data_type_registry.contents)}." 
) diff --git a/tests/conftest.py b/tests/conftest.py index 4d300a1fd4..858ef8dd0b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -292,7 +292,7 @@ def create_array_metadata( array_shape=shape_parsed, shard_shape=shards, chunk_shape=chunks, - item_size=item_size, + dtype=dtype_parsed.unwrap().itemsize, ) if order is None: diff --git a/tests/test_array.py b/tests/test_array.py index 0a10dff110..dd36ebd71f 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1103,7 +1103,7 @@ async def test_v3_chunk_encoding( filters=filters, compressors=compressors, serializer="auto", - dtype=arr._zdtype, + dtype=arr.metadata.data_type, ) assert arr.filters == filters_expected assert arr.compressors == compressors_expected @@ -1283,10 +1283,8 @@ async def test_default_filters_compressors( ) elif zarr_format == 2: - default_filters, default_compressors = _parse_chunk_encoding_v2( - compressor=sig.parameters["compressors"].default, - filters=sig.parameters["filters"].default, - dtype=dtype, # type: ignore[arg-type] + default_filters, default_compressors = _get_default_chunk_encoding_v2( + dtype=np.dtype(dtype) ) if default_filters is None: expected_filters = () diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 19a2631341..de86e88d0a 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -15,7 +15,7 @@ from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING from zarr.core.dtype.npy.time import DateTime64 from zarr.core.group import GroupMetadata, parse_node_type -from zarr.core.metadata.dtype import FlexibleWrapperBase, complex_from_json +from zarr.core.metadata.dtype import complex_from_json from zarr.core.metadata.v3 import ( ArrayV3Metadata, parse_dimension_names, @@ -57,13 +57,20 @@ ) complex_dtypes = ("complex64", "complex128") -flexible_dtypes = ("str", "bytes", 'void') +flexible_dtypes = ("str", "bytes", "void") if _NUMPY_SUPPORTS_VLEN_STRING: - vlen_string_dtypes = ("T","O") + 
vlen_string_dtypes = ("T", "O") else: - vlen_string_dtypes = ("O") - -dtypes = (*bool_dtypes, *int_dtypes, *float_dtypes, *complex_dtypes, *flexible_dtypes, *vlen_string_dtypes) + vlen_string_dtypes = "O" + +dtypes = ( + *bool_dtypes, + *int_dtypes, + *float_dtypes, + *complex_dtypes, + *flexible_dtypes, + *vlen_string_dtypes, +) @pytest.mark.parametrize("data", [None, 1, 2, 4, 5, "3"]) @@ -126,7 +133,7 @@ def test_jsonify_fill_value_complex(fill_value: Any, dtype_str: str) -> None: """ zarr_format = 3 dtype = get_data_type_from_numpy(dtype_str) - expected = dtype.to_dtype().type(complex(*fill_value)) + expected = dtype.unwrap().type(complex(*fill_value)) observed = dtype.from_json_value(fill_value, zarr_format=zarr_format) assert observed == expected assert dtype.to_json_value(observed, zarr_format=zarr_format) == tuple(fill_value) @@ -335,7 +342,6 @@ async def test_special_float_fill_values(fill_value: str) -> None: @pytest.mark.parametrize("dtype_str", dtypes) def test_dtypes(dtype_str: str) -> None: dt = get_data_type_from_numpy(dtype_str) - np_dtype = dt.to_dtype() + np_dtype = dt.unwrap() assert isinstance(np_dtype, dt.dtype_cls) assert np_dtype.type(0) == dt.cast_value(0) - From 0aa1e499ca0040180fd5a93bebe59a4a026d3854 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 3 Mar 2025 10:44:34 +0100 Subject: [PATCH 013/129] push into v2 --- src/zarr/api/asynchronous.py | 2 +- src/zarr/core/array.py | 15 +++------------ src/zarr/core/buffer/cpu.py | 4 ++-- src/zarr/core/common.py | 6 ------ src/zarr/core/metadata/v2.py | 3 +++ src/zarr/core/metadata/v3.py | 9 +++++---- tests/conftest.py | 5 +++-- 7 files changed, 17 insertions(+), 27 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 9c82f6aa2b..34c5f63572 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -1010,7 +1010,7 @@ async def create( _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format) or 
_default_zarr_format() ) - dtype_wrapped = parse_dtype(dtype, zarr_format=zarr_format) + dtype_wrapped = get_data_type_from_numpy(dtype) if zarr_format == 2: if chunks is None: chunks = shape diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index e5f94dc25b..80b3dc55b3 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -62,6 +62,7 @@ _default_zarr_format, _warn_order_kwarg, concurrent_map, + parse_order, parse_shapelike, product, ) @@ -1093,17 +1094,7 @@ def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec ) @property - def _zdtype(self) -> ZDType[TBaseDType, TBaseScalar]: - """ - The zarr-specific representation of the array data type - """ - if self.metadata.zarr_format == 2: - return self.metadata.dtype - else: - return self.metadata.data_type - - @property - def dtype(self) -> TBaseDType: + def dtype(self) -> DTypeWrapper[Any, Any]: """Returns the data type of the array. Returns @@ -4249,7 +4240,7 @@ async def init_array( from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation - dtype_wrapped = parse_dtype(dtype, zarr_format=zarr_format) + dtype_wrapped = get_data_type_from_numpy(dtype) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format diff --git a/src/zarr/core/buffer/cpu.py b/src/zarr/core/buffer/cpu.py index bacef5c83e..d60801ba38 100644 --- a/src/zarr/core/buffer/cpu.py +++ b/src/zarr/core/buffer/cpu.py @@ -151,7 +151,7 @@ def create( cls, *, shape: Iterable[int], - dtype: DTypeWrapper[Any, Any], + dtype: np.dtype[Any], order: Literal["C", "F"] = "C", fill_value: Any | None = None, ) -> Self: @@ -161,7 +161,7 @@ def create( else: return cls( np.full( - shape=tuple(shape), fill_value=fill_value, dtype=dtype.unwrap(), order=order + shape=tuple(shape), fill_value=fill_value, dtype=dtype, order=order ) ) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index da19e80a0d..e86347d808 100644 
--- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -190,12 +190,6 @@ def parse_bool(data: Any) -> bool: raise ValueError(f"Expected bool, got {data} instead.") -def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> DTypeWrapper[Any, Any]: - from zarr.registry import get_data_type_from_numpy - - return get_data_type_from_numpy(np.dtype(dtype)) - - def _warn_write_empty_chunks_kwarg() -> None: # TODO: link to docs page on array configuration in this message msg = ( diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index cb5f8ff2f8..9e193aceee 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -89,6 +89,9 @@ def __init__( """ shape_parsed = parse_shapelike(shape) chunks_parsed = parse_shapelike(chunks) + # TODO: remove this + if not isinstance(dtype, DTypeWrapper): + raise TypeError compressor_parsed = parse_compressor(compressor) order_parsed = parse_indexing_order(order) dimension_separator_parsed = parse_separator(dimension_separator) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index a017b0c1d1..d7222d3bdf 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -4,16 +4,14 @@ from zarr.abc.metadata import Metadata from zarr.core.buffer.core import default_buffer_prototype - +from zarr.core.metadata.dtype import DTypeWrapper if TYPE_CHECKING: from typing import Self from zarr.core.buffer import Buffer, BufferPrototype from zarr.core.chunk_grids import ChunkGrid from zarr.core.common import JSON, ChunkCoords - from zarr.core.metadata.dtype import ( - DTypeWrapper, - ) + import json from collections.abc import Iterable @@ -173,6 +171,9 @@ def __init__( Because the class is a frozen dataclass, we set attributes using object.__setattr__ """ + # TODO: remove this + if not isinstance(data_type, DTypeWrapper): + raise TypeError shape_parsed = parse_shapelike(shape) chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) chunk_key_encoding_parsed = 
ChunkKeyEncoding.from_dict(chunk_key_encoding) diff --git a/tests/conftest.py b/tests/conftest.py index 858ef8dd0b..33d523295b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -19,7 +19,7 @@ _parse_chunk_key_encoding, ) from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition -from zarr.core.common import JSON, DimensionNames, parse_shapelike +from zarr.core.common import JSON, parse_shapelike from zarr.core.config import config as zarr_config from zarr.core.dtype import ( get_data_type_from_native_dtype, @@ -28,6 +28,7 @@ from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync +from zarr.registry import get_data_type_from_numpy from zarr.storage import FsspecStore, LocalStore, MemoryStore, StorePath, ZipStore if TYPE_CHECKING: @@ -280,7 +281,7 @@ def create_array_metadata( """ Create array metadata """ - dtype_parsed = get_data_type_from_native_dtype(dtype) + dtype_parsed = get_data_type_from_numpy(dtype) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format From de24a141ad38b1f313e5a01608b5bb46e595ef69 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 3 Mar 2025 15:03:01 +0100 Subject: [PATCH 014/129] remove endianness kwarg to methods, make it an instance variable instead --- src/zarr/core/metadata/dtype.py | 107 +++++++++++++------------------- 1 file changed, 42 insertions(+), 65 deletions(-) diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index 5d382076b4..f88683e1e7 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -1,6 +1,6 @@ from abc import ABC, abstractmethod from collections.abc import Sequence -from dataclasses import dataclass +from dataclasses import dataclass, replace from typing import Any, ClassVar, Generic, Literal, Self, TypeGuard, TypeVar, cast, get_args import numpy as np @@ -199,11 +199,13 
@@ def complex_from_json( TScalar = TypeVar("TScalar", bound=np.generic) +@dataclass(frozen=True, kw_only=True) class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): name: ClassVar[str] dtype_cls: ClassVar[type[TDType]] # this class will create a numpy dtype kind: ClassVar[DataTypeFlavor] - default_value: TScalar + default_value: ClassVar[TScalar] + endianness: Endianness = "native" def __init_subclass__(cls) -> None: # Subclasses will bind the first generic type parameter to an attribute of the class @@ -215,18 +217,21 @@ def __init_subclass__(cls) -> None: def to_dict(self) -> dict[str, JSON]: return {"name": self.name} - def cast_value(self: Self, value: object, *, endianness: Endianness | None = None) -> TScalar: - return cast(np.generic, self.unwrap(endianness=endianness).type(value)) + def cast_value(self: Self, value: object) -> TScalar: + return cast(np.generic, self.unwrap().type(value)) @classmethod @abstractmethod def wrap(cls: type[Self], dtype: TDType) -> Self: raise NotImplementedError - def unwrap(self: Self, *, endianness: Endianness | None = None) -> TDType: - endian_str = endianness_to_numpy_str(endianness) + def unwrap(self: Self) -> TDType: + endian_str = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(endian_str) + def with_endianness(self: Self, endianness: Endianness) -> Self: + return replace(self, endianness=endianness) + @abstractmethod def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> JSON: """ @@ -235,9 +240,7 @@ def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> JSON: raise NotImplementedError @abstractmethod - def from_json_value( - self: Self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> TScalar: + def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: """ Read a JSON-serializable value as a numpy scalar """ @@ -257,11 +260,9 @@ def wrap(cls, dtype: np.dtypes.BoolDType) -> Self: def 
to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> bool: return bool(data) - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.bool_: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: if check_json_bool(data): - return self.unwrap(endianness=endianness).type(data) + return self.unwrap().type(data) raise TypeError(f"Invalid type: {data}. Expected a boolean.") @@ -275,11 +276,9 @@ def wrap(cls, dtype: TDType) -> Self: def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: return int(data) - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> TScalar: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: if check_json_int(data): - return self.unwrap(endianness=endianness).type(data) + return self.unwrap().type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @@ -341,11 +340,9 @@ def wrap(cls, dtype: TDType) -> Self: def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> JSONFloat: return float_to_json(data, zarr_format) - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> TScalar: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: if check_json_float_v2(data): - return self.unwrap(endianness=endianness).type(float_from_json(data, zarr_format)) + return self.unwrap().type(float_from_json(data, zarr_format)) raise TypeError(f"Invalid type: {data}. 
Expected a float.") @@ -382,13 +379,9 @@ def to_json_value( ) -> tuple[JSONFloat, JSONFloat]: return complex_to_json(data, zarr_format) - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.complex64: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex64: if check_json_complex_float_v3(data): - return complex_from_json( - data, dtype=self.unwrap(endianness=endianness), zarr_format=zarr_format - ) + return complex_from_json(data, dtype=self.unwrap(), zarr_format=zarr_format) raise TypeError(f"Invalid type: {data}. Expected a complex float.") @@ -407,13 +400,9 @@ def to_json_value( ) -> tuple[JSONFloat, JSONFloat]: return complex_to_json(data, zarr_format) - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.complex128: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex128: if check_json_complex_float_v3(data): - return complex_from_json( - data, dtype=self.unwrap(endianness=endianness), zarr_format=zarr_format - ) + return complex_from_json(data, dtype=self.unwrap(), zarr_format=zarr_format) raise TypeError(f"Invalid type: {data}. 
Expected a complex float.") @@ -426,8 +415,8 @@ class FlexibleWrapperBase(DTypeWrapper[TDType, TScalar]): def wrap(cls, dtype: TDType) -> Self: return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) - def unwrap(self, endianness: Endianness | None = None) -> TDType: - endianness_code = endianness_to_numpy_str(endianness) + def unwrap(self) -> TDType: + endianness_code = endianness_to_numpy_str(self.endianness) return self.dtype_cls(self.length).newbyteorder(endianness_code) @@ -435,22 +424,18 @@ def unwrap(self, endianness: Endianness | None = None) -> TDType: class StaticByteString(FlexibleWrapperBase[np.dtypes.BytesDType, np.bytes_]): name = "numpy/static_byte_string" kind = "string" - default_value = b"" + default_value = np.bytes_(0) item_size_bits = 8 def to_dict(self) -> dict[str, JSON]: return {"name": self.name, "configuration": {"length": self.length}} - def to_json_value( - self, data: np.generic, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> str: + def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return data.tobytes().decode("ascii") - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.bytes_: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: if check_json_bool(data): - return self.unwrap(endianness=endianness).type(data.encode("ascii")) + return self.unwrap().type(data.encode("ascii")) raise TypeError(f"Invalid type: {data}. 
Expected a string.") @@ -464,20 +449,18 @@ class StaticRawBytes(FlexibleWrapperBase[np.dtypes.VoidDType, np.void]): def to_dict(self) -> dict[str, JSON]: return {"name": f"r{self.length * self.item_size_bits}"} - def unwrap(self, endianness: Endianness | None = None) -> np.dtypes.VoidDType: + def unwrap(self) -> np.dtypes.VoidDType: # this needs to be overridden because numpy does not allow creating a void type # by invoking np.dtypes.VoidDType directly - endianness_code = endianness_to_numpy_str(endianness) + endianness_code = endianness_to_numpy_str(self.endianness) return np.dtype(f"{endianness_code}V{self.length}") def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> tuple[int, ...]: return tuple(*data.tobytes()) - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.void: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: # todo: check that this is well-formed - return self.unwrap(endianness=endianness).type(bytes(data)) + return self.unwrap().type(bytes(data)) @dataclass(frozen=True, kw_only=True) @@ -493,12 +476,10 @@ def to_dict(self) -> dict[str, JSON]: def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return str(data) - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> np.str_: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: if not check_json_str(data): raise TypeError(f"Invalid type: {data}. 
Expected a string.") - return self.unwrap(endianness=endianness).type(data) + return self.unwrap().type(data) if _NUMPY_SUPPORTS_VLEN_STRING: @@ -516,17 +497,15 @@ def wrap(cls, dtype: np.dtypes.StringDType) -> Self: def to_dict(self) -> dict[str, JSON]: return {"name": self.name} - def unwrap(self, endianness: Endianness | None = None) -> np.dtypes.StringDType: - endianness_code = endianness_to_numpy_str(endianness) + def unwrap(self) -> np.dtypes.StringDType: + endianness_code = endianness_to_numpy_str(self.endianness) return np.dtype(endianness_code + self.numpy_character_code) def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return str(data) - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> str: - return self.unwrap(endianness=endianness).type(data) + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + return self.unwrap().type(data) else: @@ -543,16 +522,14 @@ def to_dict(self) -> dict[str, JSON]: def wrap(cls, dtype: np.dtypes.ObjectDType) -> Self: return cls() - def unwrap(self, endianness: Endianness | None = None) -> np.dtype[np.dtypes.ObjectDType]: - return super().unwrap(endianness=endianness) + def unwrap(self) -> np.dtype[np.dtypes.ObjectDType]: + return super().unwrap() def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return str(data) - def from_json_value( - self, data: JSON, *, zarr_format: ZarrFormat, endianness: Endianness | None = None - ) -> str: - return self.unwrap(endianness=endianness).type(data) + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + return self.unwrap().type(data) def resolve_dtype(dtype: npt.DTypeLike | DTypeWrapper | dict[str, JSON]) -> DTypeWrapper: From 31a39d631b2cbf8684c5afe80d0222727b9c679c Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 4 Mar 2025 18:10:20 +0100 Subject: [PATCH 015/129] make wrapping safe by default --- 
src/zarr/api/asynchronous.py | 19 +- src/zarr/codecs/blosc.py | 8 +- src/zarr/codecs/bytes.py | 15 +- src/zarr/codecs/sharding.py | 9 +- src/zarr/core/array.py | 216 +++++++++++++---------- src/zarr/core/array_spec.py | 5 +- src/zarr/core/codec_pipeline.py | 4 +- src/zarr/core/metadata/dtype.py | 215 +++++++++++++++++++--- src/zarr/core/metadata/v2.py | 72 ++------ src/zarr/core/metadata/v3.py | 37 ++-- src/zarr/registry.py | 71 +------- tests/conftest.py | 8 +- tests/test_array.py | 79 ++++----- tests/test_codecs/test_vlen.py | 38 +--- tests/test_group.py | 2 +- tests/test_metadata/test_consolidated.py | 3 +- tests/test_metadata/test_v2.py | 2 +- tests/test_metadata/test_v3.py | 7 +- tests/test_v2.py | 74 ++++---- 19 files changed, 459 insertions(+), 425 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 34c5f63572..352630e6d1 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -9,17 +9,14 @@ import numpy.typing as npt from typing_extensions import deprecated -from zarr.abc.store import Store from zarr.core.array import ( - DEFAULT_FILL_VALUE, Array, AsyncArray, - CompressorLike, + _get_default_chunk_encoding_v2, create_array, - from_array, get_array_metadata, ) -from zarr.core.array_spec import ArrayConfigLike, parse_array_config +from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, ArrayConfigParams from zarr.core.buffer import NDArrayLike from zarr.core.common import ( JSON, @@ -39,9 +36,8 @@ create_hierarchy, ) from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata -from zarr.core.metadata.v2 import _default_compressor, _default_filters +from zarr.core.metadata.dtype import get_data_type_from_numpy from zarr.errors import NodeTypeValidationError -from zarr.registry import get_data_type_from_numpy from zarr.storage._common import make_store_path if TYPE_CHECKING: @@ -1014,10 +1010,11 @@ async def create( if zarr_format == 2: if chunks is None: chunks = 
shape - if not filters: - filters = _default_filters(dtype_wrapped) - if not compressor: - compressor = _default_compressor(dtype_wrapped) + default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype_wrapped) + if filters is None: + filters = default_filters + if compressor is None: + compressor = default_compressor elif zarr_format == 3 and chunk_shape is None: # type: ignore[redundant-expr] if chunks is not None: chunk_shape = chunks diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 1c5e52e9a4..6ef1540acf 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -143,11 +143,15 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: item_size = array_spec.dtype.item_size new_codec = self if new_codec.typesize is None: - new_codec = replace(new_codec, typesize=item_size) + new_codec = replace(new_codec, typesize=dtype.unwrap().itemsize) if new_codec.shuffle is None: new_codec = replace( new_codec, - shuffle=(BloscShuffle.bitshuffle if item_size == 1 else BloscShuffle.shuffle), + shuffle=( + BloscShuffle.bitshuffle + if dtype.unwrap().itemsize == 1 + else BloscShuffle.shuffle + ), ) return new_codec diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index d195d64eb8..694a52b455 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -57,7 +57,7 @@ def to_dict(self) -> dict[str, JSON]: return {"name": "bytes", "configuration": {"endian": self.endian.value}} def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: - if array_spec.dtype.unwrap().itemsize == 0: + if array_spec.dtype.unwrap().itemsize == 1: if self.endian is not None: return replace(self, endian=None) elif self.endian is None: @@ -72,12 +72,15 @@ async def _decode_single( chunk_spec: ArraySpec, ) -> NDBuffer: assert isinstance(chunk_bytes, Buffer) - # TODO: remove endianness enum in favor of literal union - endian_str = self.endian.value if self.endian is not None else None - if 
isinstance(chunk_spec.dtype, HasEndianness): - dtype = replace(chunk_spec.dtype, endianness=endian_str).to_native_dtype() # type: ignore[call-arg] + if chunk_spec.dtype.unwrap().itemsize > 0: + if self.endian == Endian.little: + prefix = "<" + else: + prefix = ">" + dtype = np.dtype(f"{prefix}{chunk_spec.dtype.unwrap().str[1:]}") else: - dtype = chunk_spec.dtype.to_native_dtype() + dtype = np.dtype(f"|{chunk_spec.dtype.unwrap().str[1:]}") + as_array_like = chunk_bytes.as_array_like() if isinstance(as_array_like, NDArrayLike): as_nd_array_like = as_array_like diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 80b12856d6..10abb13aa1 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -51,6 +51,7 @@ get_indexer, morton_order_iter, ) +from zarr.core.metadata.dtype import DTypeWrapper from zarr.core.metadata.v3 import parse_codecs from zarr.registry import get_ndbuffer_class, get_pipeline_class, register_codec @@ -406,11 +407,7 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return self def validate( - self, - *, - shape: ChunkCoords, - dtype: ZDType[TBaseDType, TBaseScalar], - chunk_grid: ChunkGrid, + self, *, shape: ChunkCoords, dtype: DTypeWrapper[Any, Any], chunk_grid: ChunkGrid ) -> None: if len(self.chunk_shape) != len(shape): raise ValueError( @@ -496,7 +493,7 @@ async def _decode_partial_single( # setup output array out = shard_spec.prototype.nd_buffer.create( shape=indexer.shape, - dtype=shard_spec.dtype.to_native_dtype(), + dtype=shard_spec.dtype.unwrap(), order=shard_spec.order, fill_value=0, ) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 80b3dc55b3..f7f09a8b24 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -5,6 +5,7 @@ from asyncio import gather from collections.abc import Iterable from dataclasses import dataclass, field, replace +from functools import cached_property from itertools import starmap from logging import getLogger from typing 
import ( @@ -32,7 +33,7 @@ from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec from zarr.codecs.zstd import ZstdCodec from zarr.core._info import ArrayInfo -from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, parse_array_config +from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, ArraySpec, parse_array_config from zarr.core.attributes import Attributes from zarr.core.buffer import ( BufferPrototype, @@ -110,10 +111,13 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) -from zarr.core.metadata.dtype import DTypeWrapper, VariableLengthString +from zarr.core.metadata.dtype import ( + DTypeWrapper, + StaticByteString, + VariableLengthString, + get_data_type_from_numpy, +) from zarr.core.metadata.v2 import ( - CompressorLikev2, - get_object_codec_id, parse_compressor, parse_filters, ) @@ -124,7 +128,6 @@ _parse_array_array_codec, _parse_array_bytes_codec, _parse_bytes_bytes_codec, - get_data_type_from_numpy, get_pipeline_class, ) from zarr.storage._common import StorePath, ensure_no_existing_node, make_store_path @@ -590,7 +593,7 @@ async def _create( *, # v2 and v3 shape: ShapeLike, - dtype: npt.DTypeLike[Any], + dtype: npt.DTypeLike[Any] | DTypeWrapper[Any, Any], zarr_format: ZarrFormat = 3, fill_value: Any | None = DEFAULT_FILL_VALUE, attributes: dict[str, JSON] | None = None, @@ -729,14 +732,19 @@ def _create_metadata_v3( compressors: tuple[BytesBytesCodec, ...] 
shape = parse_shapelike(shape) - codecs = list(codecs) if codecs is not None else _get_default_codecs(dtype) + if codecs is None: + filters, serializer, compressors = _get_default_chunk_encoding_v3(dtype) + codecs_parsed = (*filters, serializer, *compressors) + else: + codecs_parsed = tuple(codecs) + chunk_key_encoding_parsed: ChunkKeyEncodingLike if chunk_key_encoding is None: chunk_key_encoding_parsed = {"name": "default", "separator": "/"} else: chunk_key_encoding_parsed = chunk_key_encoding - if dtype.kind in "UTS": + if dtype.unwrap().kind in ("U", "T", "S"): warn( f"The dtype `{dtype}` is currently not part in the Zarr format 3 specification. It " "may not be supported by other zarr implementations and may change in the future.", @@ -757,7 +765,7 @@ def _create_metadata_v3( chunk_grid=chunk_grid_parsed, chunk_key_encoding=chunk_key_encoding_parsed, fill_value=fill_value_parsed, - codecs=codecs, + codecs=codecs_parsed, dimension_names=tuple(dimension_names) if dimension_names else None, attributes=attributes or {}, ) @@ -768,7 +776,7 @@ async def _create_v3( store_path: StorePath, *, shape: ShapeLike, - dtype: ZDType[TBaseDType, TBaseScalar], + dtype: DTypeWrapper[Any, Any], chunk_shape: ChunkCoords, config: ArrayConfig, fill_value: Any | None = DEFAULT_FILL_VALUE, @@ -828,14 +836,6 @@ def _create_metadata_v2( if dimension_separator is None: dimension_separator = "." 
- # inject VLenUTF8 for str dtype if not already present - if isinstance(dtype, VariableLengthString): - filters = filters or [] - from numcodecs.vlen import VLenUTF8 - - if not any(isinstance(x, VLenUTF8) or x["id"] == "vlen-utf8" for x in filters): - filters = list(filters) + [VLenUTF8()] - return ArrayV2Metadata( shape=shape, dtype=dtype, @@ -854,7 +854,7 @@ async def _create_v2( store_path: StorePath, *, shape: ChunkCoords, - dtype: ZDType[TBaseDType, TBaseScalar], + dtype: DTypeWrapper[Any, Any], chunks: ChunkCoords, order: MemoryOrder, config: ArrayConfig, @@ -1010,6 +1010,13 @@ def chunks(self) -> ChunkCoords: """ return self.metadata.chunks + @cached_property + def chunk_grid(self) -> RegularChunkGrid: + if self.metadata.zarr_format == 2: + return RegularChunkGrid(chunk_shape=self.chunks) + else: + return self.metadata.chunk_grid + @property def shards(self) -> ChunkCoords | None: """Returns the shard shape of the Array. @@ -1094,7 +1101,7 @@ def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec ) @property - def dtype(self) -> DTypeWrapper[Any, Any]: + def dtype(self) -> np.dtype[Any]: """Returns the data type of the array. 
Returns @@ -1102,7 +1109,10 @@ def dtype(self) -> DTypeWrapper[Any, Any]: np.dtype Data type of the array """ - return self._zdtype.to_native_dtype() + if self.metadata.zarr_format == 2: + return self.metadata.dtype.unwrap() + else: + return self.metadata.data_type.unwrap() @property def order(self) -> MemoryOrder: @@ -1323,6 +1333,20 @@ def nbytes(self) -> int: """ return self.size * self.dtype.itemsize + def get_chunk_spec( + self, _chunk_coords: ChunkCoords, array_config: ArrayConfig, prototype: BufferPrototype + ) -> ArraySpec: + assert isinstance(self.chunk_grid, RegularChunkGrid), ( + "Currently, only regular chunk grid is supported" + ) + return ArraySpec( + shape=self.chunk_grid.chunk_shape, + dtype=self.dtype, + fill_value=self.metadata.fill_value, + config=array_config, + prototype=prototype, + ) + async def _get_selection( self, indexer: Indexer, @@ -1362,7 +1386,7 @@ async def _get_selection( [ ( self.store_path / self.metadata.encode_chunk_key(chunk_coords), - self.metadata.get_chunk_spec(chunk_coords, _config, prototype=prototype), + self.get_chunk_spec(chunk_coords, _config, prototype=prototype), chunk_selection, out_selection, is_complete_chunk, @@ -1417,7 +1441,7 @@ async def getitem( indexer = BasicIndexer( selection, shape=self.metadata.shape, - chunk_grid=self.metadata.chunk_grid, + chunk_grid=self.chunk_grid, ) return await self._get_selection(indexer, prototype=prototype) @@ -1462,7 +1486,7 @@ async def _set_selection( if isinstance(array_like, np._typing._SupportsArrayFunc): # TODO: need to handle array types that don't support __array_function__ # like PyTorch and JAX - array_like_ = cast("np._typing._SupportsArrayFunc", array_like) + array_like_ = cast(np._typing._SupportsArrayFunc, array_like) value = np.asanyarray(value, dtype=self.dtype, like=array_like_) else: if not hasattr(value, "shape"): @@ -1476,8 +1500,7 @@ async def _set_selection( value = value.astype(dtype=self.dtype, order="A") else: value = np.array(value, dtype=self.dtype, 
order="A") - value = cast("NDArrayLike", value) - + value = cast(NDArrayLike, value) # We accept any ndarray like object from the user and convert it # to a NDBuffer (or subclass). From this point onwards, we only pass # Buffer and NDBuffer between components. @@ -1493,7 +1516,7 @@ async def _set_selection( [ ( self.store_path / self.metadata.encode_chunk_key(chunk_coords), - self.metadata.get_chunk_spec(chunk_coords, _config, prototype), + self.get_chunk_spec(chunk_coords, _config, prototype), chunk_selection, out_selection, is_complete_chunk, @@ -1548,7 +1571,7 @@ async def setitem( indexer = BasicIndexer( selection, shape=self.metadata.shape, - chunk_grid=self.metadata.chunk_grid, + chunk_grid=self.chunk_grid, ) return await self._set_selection(indexer, value, prototype=prototype) @@ -1585,8 +1608,8 @@ async def resize(self, new_shape: ShapeLike, delete_outside_chunks: bool = True) if delete_outside_chunks: # Remove all chunks outside of the new shape - old_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(self.metadata.shape)) - new_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(new_shape)) + old_chunk_coords = set(self.chunk_grid.all_chunk_coords(self.metadata.shape)) + new_chunk_coords = set(self.chunk_grid.all_chunk_coords(new_shape)) async def _delete_key(key: str) -> None: await (self.store_path / key).delete() @@ -1757,16 +1780,9 @@ async def info_complete(self) -> Any: def _info( self, count_chunks_initialized: int | None = None, count_bytes_stored: int | None = None ) -> Any: - _data_type: np.dtype[Any] | DTypeWrapper - if isinstance(self.metadata, ArrayV2Metadata): - _data_type = self.metadata.dtype - else: - _data_type = self.metadata.data_type - return ArrayInfo( _zarr_format=self.metadata.zarr_format, - _data_type=self._zdtype, - _fill_value=self.metadata.fill_value, + _data_type=self.dtype, _shape=self.shape, _order=self.order, _shard_shape=self.shards, @@ -2111,7 +2127,7 @@ def dtype(self) -> np.dtype[Any]: np.dtype The 
NumPy data type. """ - return self._async_array.dtype.unwrap() + return self._async_array.dtype @property def attrs(self) -> Attributes: @@ -2721,7 +2737,7 @@ def get_basic_selection( prototype = default_buffer_prototype() return sync( self._async_array._get_selection( - BasicIndexer(selection, self.shape, self.metadata.chunk_grid), + BasicIndexer(selection, self.shape, self._async_array.chunk_grid), out=out, fields=fields, prototype=prototype, @@ -2820,7 +2836,7 @@ def set_basic_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = BasicIndexer(selection, self.shape, self.metadata.chunk_grid) + indexer = BasicIndexer(selection, self.shape, self._async_array.chunk_grid) sync(self._async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) def get_orthogonal_selection( @@ -2940,7 +2956,7 @@ def get_orthogonal_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) + indexer = OrthogonalIndexer(selection, self.shape, self._async_array.chunk_grid) return sync( self._async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3052,7 +3068,7 @@ def set_orthogonal_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) + indexer = OrthogonalIndexer(selection, self.shape, self._async_array.chunk_grid) return sync( self._async_array._set_selection(indexer, value, fields=fields, prototype=prototype) ) @@ -3132,7 +3148,7 @@ def get_mask_selection( if prototype is None: prototype = default_buffer_prototype() - indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) + indexer = MaskIndexer(mask, self.shape, self._async_array.chunk_grid) return sync( self._async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3214,7 +3230,7 @@ def 
set_mask_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) + indexer = MaskIndexer(mask, self.shape, self._async_array.chunk_grid) sync(self._async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) def get_coordinate_selection( @@ -3294,7 +3310,7 @@ def get_coordinate_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) + indexer = CoordinateIndexer(selection, self.shape, self._async_array.chunk_grid) out_array = sync( self._async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3379,7 +3395,7 @@ def set_coordinate_selection( if prototype is None: prototype = default_buffer_prototype() # setup indexer - indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) + indexer = CoordinateIndexer(selection, self.shape, self._async_array.chunk_grid) # handle value - need ndarray-like flatten value if not is_scalar(value, self.dtype): @@ -3494,7 +3510,7 @@ def get_block_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = BlockIndexer(selection, self.shape, self.metadata.chunk_grid) + indexer = BlockIndexer(selection, self.shape, self._async_array.chunk_grid) return sync( self._async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3587,7 +3603,7 @@ def set_block_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = BlockIndexer(selection, self.shape, self.metadata.chunk_grid) + indexer = BlockIndexer(selection, self.shape, self._async_array.chunk_grid) sync(self._async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) @property @@ -4240,7 +4256,10 @@ async def init_array( from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation - dtype_wrapped = 
get_data_type_from_numpy(dtype) + if not isinstance(dtype, DTypeWrapper): + dtype_wrapped = get_data_type_from_numpy(dtype) + else: + dtype_wrapped = dtype shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format @@ -4484,26 +4503,30 @@ async def create_array( data_parsed, shape_parsed, dtype_parsed = _parse_data_params( data=data, shape=shape, dtype=dtype ) - if data_parsed is not None: - return await from_array( - store, - data=data_parsed, - write_data=write_data, - name=name, - chunks=chunks, - shards=shards, - filters=filters, - compressors=compressors, - serializer=serializer, - fill_value=fill_value, - order=order, - zarr_format=zarr_format, - attributes=attributes, - chunk_key_encoding=chunk_key_encoding, - dimension_names=dimension_names, - storage_options=storage_options, - overwrite=overwrite, - config=config, + result = await init_array( + store_path=store_path, + shape=shape_parsed, + dtype=dtype_parsed, + chunks=chunks, + shards=shards, + filters=filters, + compressors=compressors, + serializer=serializer, + fill_value=fill_value, + order=order, + zarr_format=zarr_format, + attributes=attributes, + chunk_key_encoding=chunk_key_encoding, + dimension_names=dimension_names, + overwrite=overwrite, + config=config, + ) + + if write_data is True and data_parsed is not None: + await result._set_selection( + BasicIndexer(..., shape=result.shape, chunk_grid=result.chunk_grid), + data_parsed, + prototype=default_buffer_prototype(), ) else: mode: Literal["a"] = "a" @@ -4655,20 +4678,21 @@ def _get_default_chunk_encoding_v3( """ Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype. 
""" - - default_filters = zarr_config.get("array.v3_default_filters").get(dtype.kind) - default_serializer = zarr_config.get("array.v3_default_serializer").get(dtype.kind) - default_compressors = zarr_config.get("array.v3_default_compressors").get(dtype.kind) - - filters = zarr_config.get("array.v3_default_filters").get(dtype_category) - compressors = zarr_config.get("array.v3_default_compressors").get(dtype_category) - serializer = zarr_config.get("array.v3_default_serializer").get(dtype_category) - - return ( - tuple(_parse_array_array_codec(f) for f in filters), - _parse_array_bytes_codec(serializer), - tuple(_parse_bytes_bytes_codec(c) for c in compressors), - ) + filters = () + compressors = (ZstdCodec(level=0, checksum=False),) + # TODO: find a registry-style solution for this that isn't bloated + # We need to associate specific dtypes with specific encoding schemes + + if isinstance(dtype, VariableLengthString): + serializer = VLenUTF8Codec() + elif isinstance(dtype, StaticByteString): + serializer = VLenBytesCodec() + else: + if dtype.unwrap().itemsize == 1: + serializer = BytesCodec(endian=None) + else: + serializer = BytesCodec() + return filters, serializer, compressors def _get_default_chunk_encoding_v2( @@ -4679,18 +4703,18 @@ def _get_default_chunk_encoding_v2( This is an empty tuple. No data types have default filters. """ - return () - - compressor_dict = _default_compressor(dtype) - filter_dicts = _default_filters(dtype) - -def default_compressors_v3(dtype: ZDType[Any, Any]) -> tuple[BytesBytesCodec, ...]: - """ - Given a data type, return the default compressors for that data type. 
+ from numcodecs import VLenBytes as numcodecs_VLenBytes + from numcodecs import VLenUTF8 as numcodecs_VLenUTF8 + from numcodecs import Zstd as numcodecs_zstd + + if isinstance(dtype, VariableLengthString): + filters = (numcodecs_VLenUTF8(),) + elif isinstance(dtype, StaticByteString): + filters = (numcodecs_VLenBytes(),) + else: + filters = None - This is just a tuple containing ``ZstdCodec`` - """ - return (ZstdCodec(),) + compressor = numcodecs_zstd(level=0, checksum=False) def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec: @@ -4825,7 +4849,7 @@ def _parse_chunk_encoding_v3( compressors: CompressorsLike, filters: FiltersLike, serializer: SerializerLike, - dtype: ZDType[TBaseDType, TBaseScalar], + dtype: DTypeWrapper[Any, Any], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: """ Generate chunk encoding classes for v3 arrays with optional defaults. diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index daa9259d7f..ef111ba20c 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -11,8 +11,7 @@ parse_shapelike, ) from zarr.core.config import config as zarr_config -from zarr.core.metadata.dtype import DTypeWrapper -from zarr.registry import get_data_type_from_numpy +from zarr.core.metadata.dtype import DTypeWrapper, get_data_type_from_numpy if TYPE_CHECKING: from typing import NotRequired @@ -101,7 +100,7 @@ class ArraySpec: def __init__( self, shape: ChunkCoords, - dtype: npt.DtypeLike | DTypeWrapper[Any, Any], + dtype: npt.DTypeLike | DTypeWrapper[Any, Any], fill_value: Any, config: ArrayConfig, prototype: BufferPrototype, diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 23c27e40c6..0f58060c91 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -62,7 +62,7 @@ def fill_value_or_default(chunk_spec: ArraySpec) -> Any: # validated when decoding the metadata, but we support reading # Zarr V2 data 
and need to support the case where fill_value # is None. - return chunk_spec.dtype.default_scalar() + return chunk_spec.dtype.default_value else: return fill_value @@ -319,7 +319,7 @@ def _merge_chunk_array( if existing_chunk_array is None: chunk_array = chunk_spec.prototype.nd_buffer.create( shape=chunk_spec.shape, - dtype=chunk_spec.dtype.to_native_dtype(), + dtype=chunk_spec.dtype.unwrap(), order=chunk_spec.order, fill_value=fill_value_or_default(chunk_spec), ) diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index f88683e1e7..a573794730 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -1,16 +1,32 @@ +from __future__ import annotations + +import base64 from abc import ABC, abstractmethod from collections.abc import Sequence -from dataclasses import dataclass, replace -from typing import Any, ClassVar, Generic, Literal, Self, TypeGuard, TypeVar, cast, get_args +from dataclasses import dataclass, field, replace +from importlib.metadata import EntryPoint +from typing import ( + TYPE_CHECKING, + Any, + ClassVar, + Generic, + Literal, + Self, + TypeGuard, + TypeVar, + cast, + get_args, +) import numpy as np import numpy.typing as npt from typing_extensions import get_original_bases from zarr.abc.metadata import Metadata -from zarr.core.common import JSON, ZarrFormat from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING -from zarr.registry import register_data_type + +if TYPE_CHECKING: + from zarr.core.common import JSON, ZarrFormat Endianness = Literal["little", "big", "native"] DataTypeFlavor = Literal["boolean", "numeric", "string", "bytes"] @@ -132,16 +148,16 @@ def float_to_json(data: float | np.floating[Any], zarr_format: ZarrFormat) -> JS raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") -def complex_to_json_v2(data: complex | np.complexfloating) -> JSONFloat: - return float_to_json_v2(data) +def complex_to_json_v2(data: complex | np.complexfloating[Any]) -> tuple[JSONFloat, JSONFloat]: + return float_to_json_v2(data.real), float_to_json_v2(data.imag) -def complex_to_json_v3(data: complex | np.complexfloating) -> tuple[JSONFloat, JSONFloat]: +def complex_to_json_v3(data: complex | np.complexfloating[Any]) -> tuple[JSONFloat, JSONFloat]: return float_to_json_v3(data.real), float_to_json_v3(data.imag) def complex_to_json( - data: complex | np.complexfloating, zarr_format: ZarrFormat + data: complex | np.complexfloating[Any], zarr_format: ZarrFormat ) -> tuple[JSONFloat, JSONFloat] | JSONFloat: if zarr_format == 2: return complex_to_json_v2(data) @@ -150,6 +166,18 @@ def complex_to_json( raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") +def structured_scalar_to_json(data: bytes, zarr_format: ZarrFormat) -> str: + if zarr_format == 2: + return base64.b64encode(data).decode("ascii") + raise NotImplementedError(f"Invalid zarr format: {zarr_format}. Expected 2.") + + +def structured_scalar_from_json(data: JSON, zarr_format: ZarrFormat) -> bytes: + if zarr_format == 2: + return base64.b64decode(data.encode("ascii")) + raise NotImplementedError(f"Invalid zarr format: {zarr_format}. 
Expected 2.") + + def float_from_json_v2(data: JSONFloat) -> float: match data: case "NaN": @@ -196,7 +224,7 @@ def complex_from_json( TDType = TypeVar("TDType", bound=np.dtype[Any]) -TScalar = TypeVar("TScalar", bound=np.generic) +TScalar = TypeVar("TScalar", bound=np.generic | str) @dataclass(frozen=True, kw_only=True) @@ -205,7 +233,7 @@ class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): dtype_cls: ClassVar[type[TDType]] # this class will create a numpy dtype kind: ClassVar[DataTypeFlavor] default_value: ClassVar[TScalar] - endianness: Endianness = "native" + endianness: Endianness | None = "native" def __init_subclass__(cls) -> None: # Subclasses will bind the first generic type parameter to an attribute of the class @@ -221,8 +249,21 @@ def cast_value(self: Self, value: object) -> TScalar: return cast(np.generic, self.unwrap().type(value)) @classmethod - @abstractmethod + def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[TDType]: + """ + Check that a dtype matches the dtype_cls class attribute + """ + return type(dtype) is cls.dtype_cls + + @classmethod def wrap(cls: type[Self], dtype: TDType) -> Self: + if cls.check_dtype(dtype): + return cls._wrap_unsafe(dtype) + raise TypeError(f"Invalid dtype: {dtype}. 
Expected an instance of {cls.dtype_cls}.") + + @classmethod + @abstractmethod + def _wrap_unsafe(cls: type[Self], dtype: TDType) -> Self: raise NotImplementedError def unwrap(self: Self) -> TDType: @@ -254,7 +295,7 @@ class Bool(DTypeWrapper[np.dtypes.BoolDType, np.bool_]): default_value = np.False_ @classmethod - def wrap(cls, dtype: np.dtypes.BoolDType) -> Self: + def _wrap_unsafe(cls, dtype: np.dtypes.BoolDType) -> Self: return cls() def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> bool: @@ -270,7 +311,7 @@ class IntWrapperBase(DTypeWrapper[TDType, TScalar]): kind = "numeric" @classmethod - def wrap(cls, dtype: TDType) -> Self: + def _wrap_unsafe(cls, dtype: TDType) -> Self: return cls() def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: @@ -334,7 +375,7 @@ class FloatWrapperBase(DTypeWrapper[TDType, TScalar]): kind = "numeric" @classmethod - def wrap(cls, dtype: TDType) -> Self: + def _wrap_unsafe(cls, dtype: TDType) -> Self: return cls() def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> JSONFloat: @@ -371,7 +412,7 @@ class Complex64(DTypeWrapper[np.dtypes.Complex64DType, np.complex64]): default_value = np.complex64(0) @classmethod - def wrap(cls, dtype: np.dtypes.Complex64DType) -> Self: + def _wrap_unsafe(cls, dtype: np.dtypes.Complex64DType) -> Self: return cls() def to_json_value( @@ -392,7 +433,7 @@ class Complex128(DTypeWrapper[np.dtypes.Complex128DType, np.complex128]): default_value = np.complex128(0) @classmethod - def wrap(cls, dtype: np.dtypes.Complex128DType) -> Self: + def _wrap_unsafe(cls, dtype: np.dtypes.Complex128DType) -> Self: return cls() def to_json_value( @@ -412,7 +453,7 @@ class FlexibleWrapperBase(DTypeWrapper[TDType, TScalar]): length: int @classmethod - def wrap(cls, dtype: TDType) -> Self: + def _wrap_unsafe(cls, dtype: TDType) -> Self: return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) def unwrap(self) -> TDType: @@ -431,10 +472,10 @@ def to_dict(self) -> 
dict[str, JSON]: return {"name": self.name, "configuration": {"length": self.length}} def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: - return data.tobytes().decode("ascii") + return base64.standard_b64encode(data).decode("ascii") def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: - if check_json_bool(data): + if check_json_str(data): return self.unwrap().type(data.encode("ascii")) raise TypeError(f"Invalid type: {data}. Expected a string.") @@ -456,7 +497,7 @@ def unwrap(self) -> np.dtypes.VoidDType: return np.dtype(f"{endianness_code}V{self.length}") def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> tuple[int, ...]: - return tuple(*data.tobytes()) + return base64.standard_b64encode(data).decode("ascii") def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: # todo: check that this is well-formed @@ -491,20 +532,22 @@ class VariableLengthString(DTypeWrapper[np.dtypes.StringDType, str]): default_value = "" @classmethod - def wrap(cls, dtype: np.dtypes.StringDType) -> Self: + def _wrap_unsafe(cls, dtype: np.dtypes.StringDType) -> Self: return cls() def to_dict(self) -> dict[str, JSON]: return {"name": self.name} def unwrap(self) -> np.dtypes.StringDType: - endianness_code = endianness_to_numpy_str(self.endianness) - return np.dtype(endianness_code + self.numpy_character_code) + # StringDType does not have endianness, so we ignore it here + return self.dtype_cls() def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return str(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + if not check_json_str(data): + raise TypeError(f"Invalid type: {data}. 
Expected a string.") return self.unwrap().type(data) else: @@ -514,27 +557,96 @@ class VariableLengthString(DTypeWrapper[np.dtypes.ObjectDType, str]): name = "numpy/vlen_string" kind = "string" default_value = np.object_("") + endianness: Endianness = field(default=None) + + def __post_init__(self) -> None: + if self.endianness is not None: + raise ValueError("VariableLengthString does not support endianness.") def to_dict(self) -> dict[str, JSON]: return {"name": self.name} @classmethod - def wrap(cls, dtype: np.dtypes.ObjectDType) -> Self: + def _wrap_unsafe(cls, dtype: np.dtypes.ObjectDType) -> Self: return cls() - def unwrap(self) -> np.dtype[np.dtypes.ObjectDType]: + def unwrap(self) -> np.dtypes.ObjectDType: return super().unwrap() def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return str(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + if not check_json_str(data): + raise TypeError(f"Invalid type: {data}. Expected a string.") return self.unwrap().type(data) -def resolve_dtype(dtype: npt.DTypeLike | DTypeWrapper | dict[str, JSON]) -> DTypeWrapper: - from zarr.registry import get_data_type_from_dict, get_data_type_from_numpy +@dataclass(frozen=True, kw_only=True) +class StructuredDtype(DTypeWrapper[np.dtypes.VoidDType, np.void]): + name = "numpy/struct" + kind = "struct" + fields: tuple[tuple[str, DTypeWrapper[Any, Any], int], ...] 
+ + @classmethod + def check_dtype(cls, dtype: np.dtypes.DTypeLike) -> TypeGuard[np.dtypes.VoidDType]: + """ + Check that this dtype is a numpy structured dtype + """ + return super().check_dtype(dtype) and dtype.fields is not None + + @classmethod + def _wrap_unsafe(cls, dtype: np.dtypes.VoidDType) -> Self: + fields: list[tuple[str, DTypeWrapper[Any, Any], int]] = [] + + if dtype.fields is None: + raise ValueError("numpy dtype has no fields") + + for key, (dtype_instance, offset) in dtype.fields.items(): + dtype_wrapped = data_type_registry.match_dtype(dtype_instance) + fields.append((key, dtype_wrapped, offset)) + + return cls(fields=tuple(fields)) + def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: + return structured_scalar_to_json(data.tobytes(), zarr_format) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: + if not check_json_str(data): + raise TypeError(f"Invalid type: {data}. Expected a string.") + as_bytes = structured_scalar_from_json(data, zarr_format=zarr_format) + dtype = self.unwrap() + return np.array([as_bytes], dtype=dtype.str).view(dtype)[0] + + +def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper: + if dtype in (str, "str"): + if _NUMPY_SUPPORTS_VLEN_STRING: + np_dtype = np.dtype("T") + else: + np_dtype = np.dtype("O") + else: + np_dtype = np.dtype(dtype) + data_type_registry.lazy_load() + for val in data_type_registry.contents.values(): + return val.wrap(np_dtype) + raise ValueError( + f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(data_type_registry.contents)}." 
+ ) + + +def get_data_type_from_dict(dtype: dict[str, JSON]) -> DTypeWrapper: + data_type_registry.lazy_load() + dtype_name = dtype["name"] + dtype_cls = data_type_registry.get(dtype_name) + if dtype_cls is None: + raise ValueError(f"No data type class matching name {dtype_name}") + return dtype_cls.from_dict(dtype.get("configuration", {})) + + +def resolve_dtype( + dtype: npt.DTypeLike | DTypeWrapper[Any, Any] | dict[str, JSON], +) -> DTypeWrapper[Any, Any]: if isinstance(dtype, DTypeWrapper): return dtype elif isinstance(dtype, dict): @@ -543,6 +655,55 @@ def resolve_dtype(dtype: npt.DTypeLike | DTypeWrapper | dict[str, JSON]) -> DTyp return get_data_type_from_numpy(dtype) +def get_data_type_by_name( + dtype: str, configuration: dict[str, JSON] | None = None +) -> DTypeWrapper[Any, Any]: + data_type_registry.lazy_load() + if configuration is None: + _configuration = {} + else: + _configuration = configuration + maybe_dtype_cls = data_type_registry.get(dtype) + if maybe_dtype_cls is None: + raise ValueError(f"No data type class matching name {dtype}") + return maybe_dtype_cls.from_dict(_configuration) + + +@dataclass(frozen=True, kw_only=True) +class DataTypeRegistry: + contents: dict[str, type[DTypeWrapper[Any, Any]]] = field(default_factory=dict, init=False) + lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) + + def lazy_load(self) -> None: + for e in self.lazy_load_list: + self.register(e.load()) + + self.lazy_load_list.clear() + + def register(self: Self, cls: type[DTypeWrapper[Any, Any]]) -> None: + # don't register the same dtype twice + if cls.name not in self.contents or self.contents[cls.name] != cls: + self.contents[cls.name] = cls + + def get(self, key: str) -> type[DTypeWrapper[Any, Any]]: + return self.contents[key] + + def match_dtype(self, dtype: npt.DTypeLike) -> DTypeWrapper[Any, Any]: + data_type_registry.lazy_load() + for val in data_type_registry.contents.values(): + try: + return val._wrap_unsafe(dtype) + except 
ValueError: + pass + raise ValueError(f"No data type wrapper found that matches {dtype}") + + +def register_data_type(cls: type[DTypeWrapper[Any, Any]]) -> None: + data_type_registry.register(cls) + + +data_type_registry = DataTypeRegistry() + INTEGER_DTYPE = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 FLOAT_DTYPE = Float16 | Float32 | Float64 COMPLEX_DTYPE = Complex64 | Complex128 diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 9e193aceee..20a4b276ca 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -1,15 +1,19 @@ from __future__ import annotations import warnings -from collections.abc import Iterable, Sequence -from functools import cached_property -from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict, cast +from collections.abc import Iterable +from enum import Enum +from typing import TYPE_CHECKING, TypedDict, cast import numcodecs.abc from zarr.abc.metadata import Metadata -from zarr.core.metadata.dtype import DTypeWrapper -from zarr.registry import get_data_type_from_numpy +from zarr.core.metadata.dtype import ( + DTypeWrapper, + StaticByteString, + StaticRawBytes, + get_data_type_from_numpy, +) if TYPE_CHECKING: from typing import Literal, Self @@ -116,10 +120,6 @@ def __init__( def ndim(self) -> int: return len(self.shape) - @cached_property - def chunk_grid(self) -> RegularChunkGrid: - return RegularChunkGrid(chunk_shape=self.chunks) - @property def shards(self) -> ChunkCoords | None: return None @@ -185,23 +185,14 @@ def to_dict(self) -> dict[str, JSON]: codec_config.pop("checksum") zarray_dict["compressor"] = codec_config - if zarray_dict["filters"] is not None: - raw_filters = zarray_dict["filters"] - # TODO: remove this when we can stratically type the output JSON data structure - # entirely - if not isinstance(raw_filters, list | tuple): - raise TypeError("Invalid type for filters. 
Expected a list or tuple.") - new_filters = [] - for f in raw_filters: - if isinstance(f, numcodecs.abc.Codec): - new_filters.append(f.get_config()) - else: - new_filters.append(f) - zarray_dict["filters"] = new_filters - - # serialize the fill value after dtype-specific JSON encoding - if self.fill_value is not None: - fill_value = self.dtype.to_json_scalar(self.fill_value, zarr_format=2) + if ( + isinstance(self.dtype, StaticByteString | StaticRawBytes) + and self.fill_value is not None + ): + # There's a relationship between self.dtype and self.fill_value + # that mypy isn't aware of. The fact that we have S or V dtype here + # means we should have a bytes-type fill_value. + fill_value = self.dtype.to_json_value(self.fill_value, zarr_format=2) zarray_dict["fill_value"] = fill_value _ = zarray_dict.pop("dtype") @@ -342,35 +333,6 @@ def get_object_codec_id(maybe_object_codecs: Sequence[JSON]) -> str | None: return fill_value -def _default_fill_value(dtype: np.dtype[Any]) -> Any: - """ - Get the default fill value for a type. - - Notes - ----- - This differs from :func:`parse_fill_value`, which parses a fill value - stored in the Array metadata into an in-memory value. This only gives - the default fill value for some type. - - This is useful for reading Zarr format 2 arrays, which allow the fill - value to be unspecified. 
- """ - if dtype.kind == "S": - return b"" - elif dtype.kind in "UO": - return "" - elif dtype.kind in "Mm": - return dtype.type("nat") - elif dtype.kind == "V": - if dtype.fields is not None: - default = tuple(_default_fill_value(field[0]) for field in dtype.fields.values()) - return np.array([default], dtype=dtype) - else: - return np.zeros(1, dtype=dtype) - else: - return dtype.type(0) - - def _default_compressor( dtype: DTypeWrapper[Any, Any], ) -> dict[str, JSON] | None: diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index d7222d3bdf..880adddac5 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -4,14 +4,20 @@ from zarr.abc.metadata import Metadata from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.metadata.dtype import DTypeWrapper +from zarr.core.metadata.dtype import ( + DTypeWrapper, + VariableLengthString, + get_data_type_by_name, + get_data_type_from_dict, +) + if TYPE_CHECKING: from typing import Self from zarr.core.buffer import Buffer, BufferPrototype from zarr.core.chunk_grids import ChunkGrid from zarr.core.common import JSON, ChunkCoords - + import json from collections.abc import Iterable @@ -37,7 +43,7 @@ from zarr.core.config import config from zarr.core.metadata.common import parse_attributes from zarr.errors import MetadataValidationError, NodeTypeValidationError -from zarr.registry import get_codec_class, get_data_type_by_name, get_data_type_from_dict +from zarr.registry import get_codec_class def parse_zarr_format(data: object) -> Literal[3]: @@ -94,14 +100,10 @@ def validate_codecs(codecs: tuple[Codec, ...], dtype: DTypeWrapper) -> None: # we need to have special codecs if we are decoding vlen strings or bytestrings # TODO: use codec ID instead of class name codec_class_name = abc.__class__.__name__ - if dtype.kind == "string" and not codec_class_name == "VLenUTF8Codec": + if isinstance(dtype, VariableLengthString) and not codec_class_name == "VLenUTF8Codec": 
raise ValueError( f"For string dtype, ArrayBytesCodec must be `VLenUTF8Codec`, got `{codec_class_name}`." ) - if dtype.kind == "bytes" and not codec_class_name == "VLenBytesCodec": - raise ValueError( - f"For bytes dtype, ArrayBytesCodec must be `VLenBytesCodec`, got `{codec_class_name}`." - ) def parse_dimension_names(data: object) -> tuple[str | None, ...] | None: @@ -223,11 +225,6 @@ def _validate_metadata(self) -> None: shape=self.shape, dtype=self.data_type.unwrap(), chunk_grid=self.chunk_grid ) - @property - def dtype(self) -> np.dtype[Any]: - """Interpret Zarr dtype as NumPy dtype""" - return self.data_type.unwrap() - @property def ndim(self) -> int: return len(self.shape) @@ -279,20 +276,6 @@ def inner_codecs(self) -> tuple[Codec, ...]: return self.codecs[0].codecs return self.codecs - def get_chunk_spec( - self, _chunk_coords: ChunkCoords, array_config: ArrayConfig, prototype: BufferPrototype - ) -> ArraySpec: - assert isinstance(self.chunk_grid, RegularChunkGrid), ( - "Currently, only regular chunk grid is supported" - ) - return ArraySpec( - shape=self.chunk_grid.chunk_shape, - dtype=self.dtype, - fill_value=self.fill_value, - config=array_config, - prototype=prototype, - ) - def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: return self.chunk_key_encoding.encode_chunk_key(chunk_coords) diff --git a/src/zarr/registry.py b/src/zarr/registry.py index f2ec285cf3..12281483ef 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -2,20 +2,15 @@ import warnings from collections import defaultdict -from dataclasses import dataclass, field from importlib.metadata import entry_points as get_entry_points -from typing import TYPE_CHECKING, Any, Generic, Self, TypeVar - -import numpy as np +from typing import TYPE_CHECKING, Any, Generic, TypeVar from zarr.core.config import BadConfigError, config -from zarr.core.dtype import data_type_registry +from zarr.core.metadata.dtype import data_type_registry if TYPE_CHECKING: from importlib.metadata 
import EntryPoint - import numpy.typing as npt - from zarr.abc.codec import ( ArrayArrayCodec, ArrayBytesCodec, @@ -25,7 +20,6 @@ ) from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON - from zarr.core.metadata.dtype import DTypeWrapper __all__ = [ "Registry", @@ -59,31 +53,10 @@ def register(self, cls: type[T], qualname: str | None = None) -> None: self[qualname] = cls -@dataclass(frozen=True, kw_only=True) -class DataTypeRegistry: - contents: dict[str, type[DTypeWrapper]] = field(default_factory=dict, init=False) - lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) - - def lazy_load(self) -> None: - for e in self.lazy_load_list: - self.register(e.load()) - - self.lazy_load_list.clear() - - def register(self: Self, cls: type[DTypeWrapper]) -> None: - # don't register the same dtype twice - if cls.name not in self.contents or self.contents[cls.name] != cls: - self.contents[cls.name] = cls - - def get(self, key: str) -> type[DTypeWrapper]: - return self.contents[key] - - __codec_registries: dict[str, Registry[Codec]] = defaultdict(Registry) __pipeline_registry: Registry[CodecPipeline] = Registry() __buffer_registry: Registry[Buffer] = Registry() __ndbuffer_registry: Registry[NDBuffer] = Registry() -__data_type_registry = DataTypeRegistry() """ The registry module is responsible for managing implementations of codecs, @@ -120,8 +93,8 @@ def _collect_entrypoints() -> list[Registry[Any]]: __ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr.ndbuffer")) __ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="ndbuffer")) - __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr.data_type")) - __data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) + data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr.data_type")) + data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr", 
name="data_type")) __pipeline_registry.lazy_load_list.extend(entry_points.select(group="zarr.codec_pipeline")) __pipeline_registry.lazy_load_list.extend( @@ -168,10 +141,6 @@ def register_buffer(cls: type[Buffer], qualname: str | None = None) -> None: __buffer_registry.register(cls, qualname) -def register_data_type(cls: type[DTypeWrapper]) -> None: - __data_type_registry.register(cls) - - def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: if reload_config: _reload_config() @@ -308,36 +277,4 @@ def get_ndbuffer_class(reload_config: bool = False) -> type[NDBuffer]: ) -def get_data_type_by_name(dtype: str, configuration: dict[str, JSON] | None = None) -> DTypeWrapper: - __data_type_registry.lazy_load() - if configuration is None: - _configuration = {} - else: - _configuration = configuration - maybe_dtype_cls = __data_type_registry.get(dtype) - if maybe_dtype_cls is None: - raise ValueError(f"No data type class matching name {dtype}") - return maybe_dtype_cls.from_dict(_configuration) - - -def get_data_type_from_dict(dtype: dict[str, JSON]) -> DTypeWrapper: - __data_type_registry.lazy_load() - dtype_name = dtype["name"] - dtype_cls = __data_type_registry.get(dtype_name) - if dtype_cls is None: - raise ValueError(f"No data type class matching name {dtype_name}") - return dtype_cls.from_dict(dtype.get("configuration", {})) - - -def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper: - np_dtype = np.dtype(dtype) - __data_type_registry.lazy_load() - for val in __data_type_registry.contents.values(): - if val.dtype_cls is type(np_dtype): - return val.wrap(np_dtype) - raise ValueError( - f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(__data_type_registry.contents)}." 
- ) - - _collect_entrypoints() diff --git a/tests/conftest.py b/tests/conftest.py index 33d523295b..0112a07055 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -21,14 +21,10 @@ from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition from zarr.core.common import JSON, parse_shapelike from zarr.core.config import config as zarr_config -from zarr.core.dtype import ( - get_data_type_from_native_dtype, -) -from zarr.core.dtype.common import HasItemSize +from zarr.core.metadata.dtype import get_data_type_from_numpy from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync -from zarr.registry import get_data_type_from_numpy from zarr.storage import FsspecStore, LocalStore, MemoryStore, StorePath, ZipStore if TYPE_CHECKING: @@ -293,7 +289,7 @@ def create_array_metadata( array_shape=shape_parsed, shard_shape=shards, chunk_shape=chunks, - dtype=dtype_parsed.unwrap().itemsize, + item_size=dtype_parsed.unwrap().itemsize, ) if order is None: diff --git a/tests/test_array.py b/tests/test_array.py index dd36ebd71f..a618395ee5 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -60,6 +60,7 @@ from zarr.core.dtype.npy.string import UTF8Base from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv +from zarr.core.metadata.dtype import get_data_type_from_numpy from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError @@ -441,6 +442,31 @@ async def test_nbytes_stored_async() -> None: assert result == 902 # the size with all chunks filled. 
+def test_default_fill_values() -> None: + a = zarr.Array.create(MemoryStore(), shape=5, chunk_shape=5, dtype=" None: + with pytest.raises(ValueError, match="At least one ArrayBytesCodec is required."): + Array.create(MemoryStore(), shape=5, chunks=5, dtype=" None: # regression test for https://github.com/zarr-developers/zarr-python/issues/2328 @@ -462,8 +488,7 @@ def test_info_v2(self, chunks: tuple[int, int], shards: tuple[int, int] | None) result = arr.info expected = ArrayInfo( _zarr_format=2, - _data_type=arr._async_array._zdtype, - _fill_value=arr.fill_value, + _data_type=arr.dtype, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=None, @@ -480,7 +505,7 @@ def test_info_v3(self, chunks: tuple[int, int], shards: tuple[int, int] | None) result = arr.info expected = ArrayInfo( _zarr_format=3, - _data_type=arr.metadata.data_type, + _data_type=arr.dtype, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, @@ -505,7 +530,7 @@ def test_info_complete(self, chunks: tuple[int, int], shards: tuple[int, int] | result = arr.info_complete() expected = ArrayInfo( _zarr_format=3, - _data_type=arr.metadata.data_type, + _data_type=arr.dtype, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, @@ -566,7 +591,7 @@ async def test_info_v3_async( result = arr.info expected = ArrayInfo( _zarr_format=3, - _data_type=arr.metadata.data_type, + _data_type=arr.dtype, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, @@ -593,7 +618,7 @@ async def test_info_complete_async( result = await arr.info_complete() expected = ArrayInfo( _zarr_format=3, - _data_type=arr.metadata.data_type, + _data_type=arr.dtype, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, @@ -1006,28 +1031,6 @@ def test_default_fill_value(dtype: ZDType[Any, Any], store: Store) -> None: else: assert a.fill_value == dtype.default_scalar() - @staticmethod - # @pytest.mark.parametrize("zarr_format", [2, 3]) - @pytest.mark.parametrize("dtype", zdtype_examples) - 
@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") - def test_default_fill_value_None( - dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFormat - ) -> None: - """ - Test that the fill value of an array is set to the default value for an explicit None argument for - Zarr Format 3, and to null for Zarr Format 2 - """ - a = zarr.create_array( - store, shape=(5,), chunks=(5,), dtype=dtype, fill_value=None, zarr_format=zarr_format - ) - if zarr_format == 3: - if isinstance(dtype, DateTime64 | TimeDelta64) and np.isnat(a.fill_value): - assert np.isnat(dtype.default_scalar()) - else: - assert a.fill_value == dtype.default_scalar() - elif zarr_format == 2: - assert a.fill_value is None - @staticmethod @pytest.mark.parametrize("dtype", ["uint8", "float32", "str", "U3", "S4", "V1"]) @pytest.mark.parametrize( @@ -1256,18 +1259,17 @@ async def test_v2_chunk_encoding( assert arr.filters == filters_expected @staticmethod - @pytest.mark.parametrize("dtype", [UInt8(), Float32(), VariableLengthUTF8()]) - @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") + @pytest.mark.parametrize("dtype_str", ["uint8", "float32", "str"]) async def test_default_filters_compressors( - store: MemoryStore, dtype: UInt8 | Float32 | VariableLengthUTF8, zarr_format: ZarrFormat + store: MemoryStore, dtype_str: str, zarr_format: ZarrFormat ) -> None: """ Test that the default ``filters`` and ``compressors`` are used when ``create_array`` is invoked with ``filters`` and ``compressors`` unspecified. 
""" - + zdtype = get_data_type_from_numpy(dtype_str) arr = await create_array( store=store, - dtype=dtype, # type: ignore[arg-type] + dtype=dtype_str, shape=(10,), zarr_format=zarr_format, ) @@ -1275,17 +1277,12 @@ async def test_default_filters_compressors( sig = inspect.signature(create_array) if zarr_format == 3: - expected_filters, expected_serializer, expected_compressors = _parse_chunk_encoding_v3( - compressors=sig.parameters["compressors"].default, - filters=sig.parameters["filters"].default, - serializer=sig.parameters["serializer"].default, - dtype=dtype, # type: ignore[arg-type] + expected_filters, expected_serializer, expected_compressors = ( + _get_default_chunk_encoding_v3(dtype=zdtype) ) elif zarr_format == 2: - default_filters, default_compressors = _get_default_chunk_encoding_v2( - dtype=np.dtype(dtype) - ) + default_filters, default_compressors = _get_default_chunk_encoding_v2(dtype=zdtype) if default_filters is None: expected_filters = () else: diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index 4ccb7cc8c3..4234eac3d0 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -8,9 +8,9 @@ from zarr.abc.codec import Codec from zarr.abc.store import Store from zarr.codecs import ZstdCodec +from zarr.core.metadata.dtype import get_data_type_from_numpy from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING -from zarr.registry import get_data_type_from_numpy from zarr.storage import StorePath numpy_str_dtypes: list[type | str | None] = [None, str, "str", np.dtypes.StrDType, "S", "U"] @@ -62,39 +62,3 @@ def test_vlen_string( assert np.array_equal(data, b[:, :]) assert b.metadata.data_type == get_data_type_from_numpy(data.dtype) assert a.dtype == data.dtype - - -@pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) -@pytest.mark.parametrize("as_object_array", [False, True]) -@pytest.mark.parametrize("compressor", [None, 
ZstdCodec()]) -def test_vlen_bytes(store: Store, as_object_array: bool, compressor: Codec | None) -> None: - bstrings = [b"hello", b"world", b"this", b"is", b"a", b"test"] - data = np.array(bstrings).reshape((2, 3)) - assert data.dtype == "|S5" - - sp = StorePath(store, path="string") - a = zarr.create_array( - sp, - shape=data.shape, - chunks=data.shape, - dtype=data.dtype, - fill_value=b"", - compressors=compressor, - ) - assert isinstance(a.metadata, ArrayV3Metadata) # needed for mypy - - # should also work if input array is an object array, provided we explicitly specified - # a bytesting-like dtype when creating the Array - if as_object_array: - data = data.astype("O") - a[:, :] = data - assert np.array_equal(data, a[:, :]) - assert a.metadata.data_type == DataType.bytes - assert a.dtype == "O" - - # test round trip - b = Array.open(sp) - assert isinstance(b.metadata, ArrayV3Metadata) # needed for mypy - assert np.array_equal(data, b[:, :]) - assert b.metadata.data_type == DataType.bytes - assert a.dtype == "O" diff --git a/tests/test_group.py b/tests/test_group.py index ac1afb539b..e7723e185a 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -1010,7 +1010,7 @@ async def test_asyncgroup_create_array( assert subnode.dtype == dtype # todo: fix the type annotation of array.metadata.chunk_grid so that we get some autocomplete # here. 
- assert subnode.metadata.chunk_grid.chunk_shape == chunk_shape + assert subnode.chunk_grid.chunk_shape == chunk_shape assert subnode.metadata.zarr_format == zarr_format diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index 395e036db2..c831bf9a9e 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -21,6 +21,7 @@ from zarr.core.dtype import parse_data_type from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV3Metadata +from zarr.core.metadata.dtype import get_data_type_from_numpy from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.storage import StorePath @@ -504,7 +505,7 @@ async def test_consolidated_metadata_backwards_compatibility( async def test_consolidated_metadata_v2(self): store = zarr.storage.MemoryStore() g = await AsyncGroup.from_store(store, attributes={"key": "root"}, zarr_format=2) - dtype = parse_data_type("uint8", zarr_format=2) + dtype = get_data_type_from_numpy("uint8") await g.create_array(name="a", shape=(1,), attributes={"key": "a"}, dtype=dtype) g1 = await g.create_group(name="g1", attributes={"key": "g1"}) await g1.create_group(name="g2", attributes={"key": "g2"}) diff --git a/tests/test_metadata/test_v2.py b/tests/test_metadata/test_v2.py index a2894529aa..540935013f 100644 --- a/tests/test_metadata/test_v2.py +++ b/tests/test_metadata/test_v2.py @@ -3,7 +3,6 @@ import json from typing import TYPE_CHECKING, Literal -import numpy as np import pytest import zarr.api.asynchronous @@ -14,6 +13,7 @@ from zarr.core.dtype.npy.int import Int16 from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV2Metadata +from zarr.core.metadata.dtype import Float32, Float64, Int16 from zarr.core.metadata.v2 import parse_zarr_format if TYPE_CHECKING: diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index de86e88d0a..5fa77a29b0 100644 
--- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -15,7 +15,7 @@ from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING from zarr.core.dtype.npy.time import DateTime64 from zarr.core.group import GroupMetadata, parse_node_type -from zarr.core.metadata.dtype import complex_from_json +from zarr.core.metadata.dtype import complex_from_json, get_data_type_from_numpy from zarr.core.metadata.v3 import ( ArrayV3Metadata, parse_dimension_names, @@ -23,7 +23,6 @@ ) from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING from zarr.errors import MetadataValidationError -from zarr.registry import get_data_type_from_numpy if TYPE_CHECKING: from collections.abc import Sequence @@ -59,9 +58,9 @@ complex_dtypes = ("complex64", "complex128") flexible_dtypes = ("str", "bytes", "void") if _NUMPY_SUPPORTS_VLEN_STRING: - vlen_string_dtypes = ("T", "O") + vlen_string_dtypes = ("T",) else: - vlen_string_dtypes = "O" + vlen_string_dtypes = ("O",) dtypes = ( *bool_dtypes, diff --git a/tests/test_v2.py b/tests/test_v2.py index 4d17305995..c14f7496ae 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -70,31 +70,35 @@ def test_codec_pipeline() -> None: ("|V10", "|V10", b"X", "WAAAAAAAAAAAAA=="), ], ) -async def test_v2_encode_decode( - dtype: str, expected_dtype: str, fill_value: bytes, fill_value_json: str -) -> None: - store = zarr.storage.MemoryStore() - g = zarr.group(store=store, zarr_format=2) - g.create_array( - name="foo", shape=(3,), chunks=(3,), dtype=dtype, fill_value=fill_value, compressor=None - ) +async def test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_json) -> None: + with config.set( + { + "array.v2_default_filters.bytes": [{"id": "vlen-bytes"}], + "array.v2_default_compressor.bytes": None, + } + ): + store = zarr.storage.MemoryStore() + g = zarr.group(store=store, zarr_format=2) + g.create_array( + name="foo", shape=(3,), chunks=(3,), dtype=dtype, fill_value=fill_value, compressor=None + ) result = await 
store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype()) assert result is not None - serialized = json.loads(result.to_bytes()) - expected = { - "chunks": [3], - "compressor": None, - "dtype": expected_dtype, - "fill_value": fill_value_json, - "filters": None, - "order": "C", - "shape": [3], - "zarr_format": 2, - "dimension_separator": ".", - } - assert serialized == expected + serialized = json.loads(result.to_bytes()) + expected = { + "chunks": [3], + "compressor": None, + "dtype": expected_dtype, + "fill_value": fill_value_json, + "filters": [{"id": "vlen-bytes"}] if dtype == "|S" else None, + "order": "C", + "shape": [3], + "zarr_format": 2, + "dimension_separator": ".", + } + assert serialized == expected data = zarr.open_array(store=store, path="foo")[:] np.testing.assert_equal(data, np.full((3,), b"X", dtype=dtype)) @@ -103,16 +107,10 @@ async def test_v2_encode_decode( np.testing.assert_equal(data, np.full((3,), b"X", dtype=dtype)) -@pytest.mark.parametrize( - ("dtype", "value"), - [ - (NullTerminatedBytes(length=1), b"Y"), - (FixedLengthUTF32(length=1), "Y"), - (VariableLengthUTF8(), "Y"), - ], -) -def test_v2_encode_decode_with_data(dtype: ZDType[Any, Any], value: str) -> None: - expected = np.full((3,), value, dtype=dtype.to_native_dtype()) +@pytest.mark.parametrize(("dtype", "value"), [("|S1", b"Y"), ("|U1", "Y"), ("O", "Y")]) +def test_v2_encode_decode_with_data(dtype, value): + dtype, value = dtype, value + expected = np.full((3,), value, dtype=dtype) a = zarr.create( shape=(3,), zarr_format=2, @@ -123,6 +121,18 @@ def test_v2_encode_decode_with_data(dtype: ZDType[Any, Any], value: str) -> None np.testing.assert_equal(data, expected) +@pytest.mark.parametrize("dtype", [str, "str"]) +async def test_create_dtype_str(dtype: Any) -> None: + data = ["a", "bb", "ccc"] + arr = zarr.create(shape=3, dtype=dtype, zarr_format=2) + assert arr.dtype.kind == "O" + assert arr.metadata.to_dict()["dtype"] == "|O" + assert arr.metadata.filters == 
(numcodecs.vlen.VLenUTF8(),) + arr[:] = data + result = arr[:] + np.testing.assert_array_equal(result, np.array(data, dtype="object")) + + @pytest.mark.parametrize("filters", [[], [numcodecs.Delta(dtype=" None: @@ -227,7 +237,7 @@ def test_v2_non_contiguous(numpy_order: Literal["C", "F"], zarr_order: Literal[" def test_default_compressor_deprecation_warning() -> None: with pytest.warns(DeprecationWarning, match="default_compressor is deprecated"): - zarr.storage.default_compressor = "zarr.codecs.zstd.ZstdCodec()" # type: ignore[attr-defined] + zarr.storage.default_compressor = "zarr.codecs.zstd.ZstdCodec()" @pytest.mark.parametrize("fill_value", [None, (b"", 0, 0.0)], ids=["no_fill", "fill"]) From 2079efeb95127e2cac0a4e1ee795752661d0463e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 4 Mar 2025 23:08:15 +0100 Subject: [PATCH 016/129] dtype-specific tests --- src/zarr/core/metadata/dtype.py | 161 ++++++++++++++++-------- tests/test_metadata/test_dtype.py | 203 ++++++++++++++++++++++++++++++ 2 files changed, 312 insertions(+), 52 deletions(-) create mode 100644 tests/test_metadata/test_dtype.py diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index a573794730..590ab7df67 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -172,7 +172,7 @@ def structured_scalar_to_json(data: bytes, zarr_format: ZarrFormat) -> str: raise NotImplementedError(f"Invalid zarr format: {zarr_format}. Expected 2.") -def structured_scalar_from_json(data: JSON, zarr_format: ZarrFormat) -> bytes: +def structured_scalar_from_json(data: str, zarr_format: ZarrFormat) -> bytes: if zarr_format == 2: return base64.b64decode(data.encode("ascii")) raise NotImplementedError(f"Invalid zarr format: {zarr_format}. 
Expected 2.") @@ -202,11 +202,13 @@ def float_from_json(data: JSONFloat, zarr_format: ZarrFormat) -> float: return float_from_json_v3(data) -def complex_from_json_v2(data: JSONFloat, dtype: Any) -> np.complexfloating: - return dtype.type(data) +def complex_from_json_v2(data: JSONFloat, dtype: Any) -> np.complexfloating[Any, Any]: + return dtype.type(complex(*data)) -def complex_from_json_v3(data: tuple[JSONFloat, JSONFloat], dtype: Any) -> np.complexfloating: +def complex_from_json_v3( + data: tuple[JSONFloat, JSONFloat], dtype: Any +) -> np.complexfloating[Any, Any]: return dtype.type(complex(*data)) @@ -223,6 +225,14 @@ def complex_from_json( raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") +def datetime_to_json(data: np.datetime64[Any]) -> int: + return data.view("int").item() + + +def datetime_from_json(data: int, unit: DateUnit | TimeUnit) -> np.datetime64[Any]: + return np.int64(data).view(f"datetime64[{unit}]") + + TDType = TypeVar("TDType", bound=np.dtype[Any]) TScalar = TypeVar("TScalar", bound=np.generic | str) @@ -231,8 +241,6 @@ def complex_from_json( class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): name: ClassVar[str] dtype_cls: ClassVar[type[TDType]] # this class will create a numpy dtype - kind: ClassVar[DataTypeFlavor] - default_value: ClassVar[TScalar] endianness: Endianness | None = "native" def __init_subclass__(cls) -> None: @@ -248,6 +256,9 @@ def to_dict(self) -> dict[str, JSON]: def cast_value(self: Self, value: object) -> TScalar: return cast(np.generic, self.unwrap().type(value)) + @abstractmethod + def default_value(self) -> TScalar: ... 
+ @classmethod def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[TDType]: """ @@ -291,8 +302,9 @@ def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScal @dataclass(frozen=True, kw_only=True) class Bool(DTypeWrapper[np.dtypes.BoolDType, np.bool_]): name = "bool" - kind = "boolean" - default_value = np.False_ + + def default_value(self) -> np.bool_: + return np.False_ @classmethod def _wrap_unsafe(cls, dtype: np.dtypes.BoolDType) -> Self: @@ -308,7 +320,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: class IntWrapperBase(DTypeWrapper[TDType, TScalar]): - kind = "numeric" + def default_value(self) -> TScalar: + return self.unwrap().type(0) @classmethod def _wrap_unsafe(cls, dtype: TDType) -> Self: @@ -326,53 +339,46 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: @dataclass(frozen=True, kw_only=True) class Int8(IntWrapperBase[np.dtypes.Int8DType, np.int8]): name = "int8" - default_value = np.int8(0) @dataclass(frozen=True, kw_only=True) class UInt8(IntWrapperBase[np.dtypes.UInt8DType, np.uint8]): name = "uint8" - default_value = np.uint8(0) @dataclass(frozen=True, kw_only=True) class Int16(IntWrapperBase[np.dtypes.Int16DType, np.int16]): name = "int16" - default_value = np.int16(0) @dataclass(frozen=True, kw_only=True) class UInt16(IntWrapperBase[np.dtypes.UInt16DType, np.uint16]): name = "uint16" - default_value = np.uint16(0) @dataclass(frozen=True, kw_only=True) class Int32(IntWrapperBase[np.dtypes.Int32DType, np.int32]): name = "int32" - default_value = np.int32(0) @dataclass(frozen=True, kw_only=True) class UInt32(IntWrapperBase[np.dtypes.UInt32DType, np.uint32]): name = "uint32" - default_value = np.uint32(0) @dataclass(frozen=True, kw_only=True) class Int64(IntWrapperBase[np.dtypes.Int64DType, np.int64]): name = "int64" - default_value = np.int64(0) @dataclass(frozen=True, kw_only=True) class UInt64(IntWrapperBase[np.dtypes.UInt64DType, np.uint64]): name = 
"uint64" - default_value = np.uint64(0) class FloatWrapperBase(DTypeWrapper[TDType, TScalar]): - kind = "numeric" + def default_value(self) -> TScalar: + return self.unwrap().type(0.0) @classmethod def _wrap_unsafe(cls, dtype: TDType) -> Self: @@ -390,26 +396,24 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: @dataclass(frozen=True, kw_only=True) class Float16(FloatWrapperBase[np.dtypes.Float16DType, np.float16]): name = "float16" - default_value = np.float16(0) @dataclass(frozen=True, kw_only=True) class Float32(FloatWrapperBase[np.dtypes.Float32DType, np.float32]): name = "float32" - default_value = np.float32(0) @dataclass(frozen=True, kw_only=True) class Float64(FloatWrapperBase[np.dtypes.Float64DType, np.float64]): name = "float64" - default_value = np.float64(0) @dataclass(frozen=True, kw_only=True) class Complex64(DTypeWrapper[np.dtypes.Complex64DType, np.complex64]): name = "complex64" - kind = "numeric" - default_value = np.complex64(0) + + def default_value(self) -> np.complex64: + return np.complex64(0.0) @classmethod def _wrap_unsafe(cls, dtype: np.dtypes.Complex64DType) -> Self: @@ -429,8 +433,9 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex6 @dataclass(frozen=True, kw_only=True) class Complex128(DTypeWrapper[np.dtypes.Complex128DType, np.complex128]): name = "complex128" - kind = "numeric" - default_value = np.complex128(0) + + def default_value(self) -> np.complex128: + return np.complex128(0.0) @classmethod def _wrap_unsafe(cls, dtype: np.dtypes.Complex128DType) -> Self: @@ -464,10 +469,11 @@ def unwrap(self) -> TDType: @dataclass(frozen=True, kw_only=True) class StaticByteString(FlexibleWrapperBase[np.dtypes.BytesDType, np.bytes_]): name = "numpy/static_byte_string" - kind = "string" - default_value = np.bytes_(0) item_size_bits = 8 + def default_value(self) -> np.bytes_: + return np.bytes_(b"") + def to_dict(self) -> dict[str, JSON]: return {"name": self.name, "configuration": 
{"length": self.length}} @@ -476,17 +482,18 @@ def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: if check_json_str(data): - return self.unwrap().type(data.encode("ascii")) + return self.unwrap().type(base64.standard_b64decode(data.encode("ascii"))) raise TypeError(f"Invalid type: {data}. Expected a string.") @dataclass(frozen=True, kw_only=True) class StaticRawBytes(FlexibleWrapperBase[np.dtypes.VoidDType, np.void]): name = "r*" - kind = "bytes" - default_value = np.void(b"") item_size_bits = 8 + def default_value(self) -> np.void: + return np.void(b"") + def to_dict(self) -> dict[str, JSON]: return {"name": f"r{self.length * self.item_size_bits}"} @@ -496,21 +503,22 @@ def unwrap(self) -> np.dtypes.VoidDType: endianness_code = endianness_to_numpy_str(self.endianness) return np.dtype(f"{endianness_code}V{self.length}") - def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> tuple[int, ...]: + def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return base64.standard_b64encode(data).decode("ascii") def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: # todo: check that this is well-formed - return self.unwrap().type(bytes(data)) + return self.unwrap().type(base64.standard_b64decode(data)) @dataclass(frozen=True, kw_only=True) class StaticUnicodeString(FlexibleWrapperBase[np.dtypes.StrDType, np.str_]): name = "numpy/static_unicode_string" - kind = "string" - default_value = np.str_("") item_size_bits = 32 # UCS4 is 32 bits per code point + def default_value(self) -> np.str_: + return np.str_("") + def to_dict(self) -> dict[str, JSON]: return {"name": self.name, "configuration": {"length": self.length}} @@ -528,8 +536,9 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: @dataclass(frozen=True, kw_only=True) class 
VariableLengthString(DTypeWrapper[np.dtypes.StringDType, str]): name = "numpy/vlen_string" - kind = "string" - default_value = "" + + def default_value(self) -> str: + return "" @classmethod def _wrap_unsafe(cls, dtype: np.dtypes.StringDType) -> Self: @@ -555,10 +564,11 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: @dataclass(frozen=True, kw_only=True) class VariableLengthString(DTypeWrapper[np.dtypes.ObjectDType, str]): name = "numpy/vlen_string" - kind = "string" - default_value = np.object_("") endianness: Endianness = field(default=None) + def default_value(self) -> str: + return "" + def __post_init__(self) -> None: if self.endianness is not None: raise ValueError("VariableLengthString does not support endianness.") @@ -570,24 +580,57 @@ def to_dict(self) -> dict[str, JSON]: def _wrap_unsafe(cls, dtype: np.dtypes.ObjectDType) -> Self: return cls() - def unwrap(self) -> np.dtypes.ObjectDType: - return super().unwrap() - def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return str(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + """ + String literals pass through + """ if not check_json_str(data): raise TypeError(f"Invalid type: {data}. 
Expected a string.") - return self.unwrap().type(data) + return data + + +DateUnit = Literal["Y", "M", "W", "D"] +TimeUnit = Literal["h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] @dataclass(frozen=True, kw_only=True) -class StructuredDtype(DTypeWrapper[np.dtypes.VoidDType, np.void]): +class DateTime64(DTypeWrapper[np.dtypes.DateTime64DType, np.datetime64]): + name = "numpy/datetime64" + unit: DateUnit | TimeUnit + + def default_value(self) -> np.datetime64: + return np.datetime64("NaT") + + @classmethod + def _wrap_unsafe(cls, dtype: np.dtypes.DateTime64DType) -> Self: + unit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] + return cls(unit=unit) + + def unwrap(self) -> np.dtypes.DateTime64DType: + return np.dtype(f"datetime64[{self.unit}]").newbyteorder( + endianness_to_numpy_str(self.endianness) + ) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: + if check_json_int(data): + return datetime_from_json(data, self.unit) + raise TypeError(f"Invalid type: {data}. Expected an integer.") + + def to_json_value(self, data: np.datetime64, *, zarr_format: ZarrFormat) -> int: + return datetime_to_json(data) + + +@dataclass(frozen=True, kw_only=True) +class Structured(DTypeWrapper[np.dtypes.VoidDType, np.void]): name = "numpy/struct" - kind = "struct" fields: tuple[tuple[str, DTypeWrapper[Any, Any], int], ...] 
+ def default_value(self) -> np.void: + return np.array([0], dtype=self.unwrap())[0] + @classmethod def check_dtype(cls, dtype: np.dtypes.DTypeLike) -> TypeGuard[np.dtypes.VoidDType]: """ @@ -608,6 +651,9 @@ def _wrap_unsafe(cls, dtype: np.dtypes.VoidDType) -> Self: return cls(fields=tuple(fields)) + def unwrap(self) -> np.dtypes.VoidDType: + return np.dtype([(key, dtype.unwrap()) for (key, dtype, _) in self.fields]) + def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return structured_scalar_to_json(data.tobytes(), zarr_format) @@ -629,7 +675,10 @@ def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper: np_dtype = np.dtype(dtype) data_type_registry.lazy_load() for val in data_type_registry.contents.values(): - return val.wrap(np_dtype) + try: + return val.wrap(np_dtype) + except TypeError: + pass raise ValueError( f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(data_type_registry.contents)}." ) @@ -689,11 +738,11 @@ def get(self, key: str) -> type[DTypeWrapper[Any, Any]]: return self.contents[key] def match_dtype(self, dtype: npt.DTypeLike) -> DTypeWrapper[Any, Any]: - data_type_registry.lazy_load() - for val in data_type_registry.contents.values(): + self.lazy_load() + for val in self.contents.values(): try: - return val._wrap_unsafe(dtype) - except ValueError: + return val.wrap(dtype) + except TypeError: pass raise ValueError(f"No data type wrapper found that matches {dtype}") @@ -708,7 +757,15 @@ def register_data_type(cls: type[DTypeWrapper[Any, Any]]) -> None: FLOAT_DTYPE = Float16 | Float32 | Float64 COMPLEX_DTYPE = Complex64 | Complex128 STRING_DTYPE = StaticUnicodeString | VariableLengthString | StaticByteString -for dtype in get_args( - Bool | INTEGER_DTYPE | FLOAT_DTYPE | COMPLEX_DTYPE | STRING_DTYPE | StaticRawBytes -): +DTYPE = ( + Bool + | INTEGER_DTYPE + | FLOAT_DTYPE + | COMPLEX_DTYPE + | STRING_DTYPE + | StaticRawBytes + | Structured + | DateTime64 +) +for dtype in get_args(DTYPE): 
register_data_type(dtype) diff --git a/tests/test_metadata/test_dtype.py b/tests/test_metadata/test_dtype.py new file mode 100644 index 0000000000..a3f29a34f5 --- /dev/null +++ b/tests/test_metadata/test_dtype.py @@ -0,0 +1,203 @@ +from __future__ import annotations + +from typing import Any + +import numpy as np +import pytest + +from zarr.core.metadata.dtype import ( + Bool, + Complex64, + Complex128, + DataTypeRegistry, + DateTime64, + DTypeWrapper, + Float16, + Float32, + Float64, + Int8, + Int16, + Int32, + Int64, + StaticByteString, + StaticRawBytes, + StaticUnicodeString, + Structured, + UInt8, + UInt16, + UInt32, + UInt64, + VariableLengthString, +) + +_NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") +if _NUMPY_SUPPORTS_VLEN_STRING: + VLEN_STRING_DTYPE = np.dtypes.StringDType() + VLEN_STRING_CODE = "T" +else: + VLEN_STRING_DTYPE = np.dtypes.ObjectDType() + VLEN_STRING_CODE = "O" + + +@pytest.mark.parametrize( + ("wrapper_cls", "np_dtype"), + [ + (Bool, "bool"), + (Int8, "int8"), + (Int16, "int16"), + (Int32, "int32"), + (Int64, "int64"), + (UInt8, "uint8"), + (UInt16, "uint16"), + (UInt32, "uint32"), + (UInt64, "uint64"), + (Float32, "float32"), + (Float64, "float64"), + (Complex64, "complex64"), + (Complex128, "complex128"), + (StaticUnicodeString, "U"), + (StaticByteString, "S"), + (StaticRawBytes, "V"), + (VariableLengthString, VLEN_STRING_CODE), + (Structured, np.dtype([("a", np.float64), ("b", np.int8)])), + (DateTime64, "datetime64[s]"), + ], +) +def test_wrap(wrapper_cls: type[DTypeWrapper[Any, Any]], np_dtype: np.dtype | str) -> None: + """ + Test that the wrapper class has the correct dtype class bound to the dtype_cls variable + Test that the ``wrap`` method produces an instance of the wrapper class + Test that the ``unwrap`` method returns the original dtype + """ + dt = np.dtype(np_dtype) + assert wrapper_cls.dtype_cls is type(dt) + wrapped = wrapper_cls.wrap(dt) + + with pytest.raises(TypeError, match="Invalid dtype"): + 
wrapper_cls.wrap("not a dtype") + + assert isinstance(wrapped, wrapper_cls) + assert wrapped.unwrap() == dt + + +def test_registry_match() -> None: + """ + Test that registering a dtype in a data type registry works + Test that match_dtype resolves a numpy dtype into the stored dtype + Test that match_dtype raises an error if the dtype is not registered + """ + local_registry = DataTypeRegistry() + local_registry.register(Bool) + assert isinstance(local_registry.match_dtype(np.dtype("bool")), Bool) + outside_dtype = "int8" + with pytest.raises( + ValueError, match=f"No data type wrapper found that matches {outside_dtype}" + ): + local_registry.match_dtype(np.dtype(outside_dtype)) + + +# start writing new tests here + + +@pytest.mark.parametrize( + ("wrapper", "expected_default"), + [ + (Bool(), np.False_), + (Int8(), np.int8(0)), + (UInt8(), np.uint8(0)), + (Int16(), np.int16(0)), + (UInt16(), np.uint16(0)), + (Int32(), np.int32(0)), + (UInt32(), np.uint32(0)), + (Int64(), np.int64(0)), + (UInt64(), np.uint64(0)), + (Float16(), np.float16(0)), + (Float32(), np.float32(0)), + (Float64(), np.float64(0)), + (Complex64(), np.complex64(0)), + (Complex128(), np.complex128(0)), + (StaticByteString(length=3), np.bytes_(b"")), + (StaticRawBytes(length=3), np.void(b"")), + (StaticUnicodeString(length=3), np.str_("")), + ( + Structured(fields=(("a", Float64(), 0), ("b", Int8(), 8))), + np.array([0], dtype=[("a", np.float64), ("b", np.int8)])[0], + ), + (VariableLengthString(), ""), + (DateTime64(unit="s"), np.datetime64("NaT")), + ], +) +def test_default_value(wrapper: type[DTypeWrapper[Any, Any]], expected_default: Any) -> None: + """ + Test that the default_value method is correctly set for each dtype wrapper. 
+ """ + if isinstance(wrapper, DateTime64): + assert np.isnan(wrapper.default_value()) + else: + assert wrapper.default_value() == expected_default + + +@pytest.mark.parametrize( + ("wrapper", "input_value", "expected_json"), + [ + (Bool(), np.bool_(True), True), + (Int8(), np.int8(42), 42), + (UInt8(), np.uint8(42), 42), + (Int16(), np.int16(42), 42), + (UInt16(), np.uint16(42), 42), + (Int32(), np.int32(42), 42), + (UInt32(), np.uint32(42), 42), + (Int64(), np.int64(42), 42), + (UInt64(), np.uint64(42), 42), + (Float16(), np.float16(42.0), 42.0), + (Float32(), np.float32(42.0), 42.0), + (Float64(), np.float64(42.0), 42.0), + (Complex64(), np.complex64(42.0 + 1.0j), (42.0, 1.0)), + (Complex128(), np.complex128(42.0 + 1.0j), (42.0, 1.0)), + (StaticByteString(length=4), np.bytes_(b"test"), "dGVzdA=="), + (StaticRawBytes(length=4), np.void(b"test"), "dGVzdA=="), + (StaticUnicodeString(length=4), np.str_("test"), "test"), + (VariableLengthString(), "test", "test"), + (DateTime64(unit="s"), np.datetime64("2021-01-01T00:00:00", "s"), 1609459200), + ], +) +def test_to_json_value_v2( + wrapper: type[DTypeWrapper[Any, Any]], input_value: Any, expected_json: Any +) -> None: + """ + Test the to_json_value method for each dtype wrapper for zarr v2 + """ + assert wrapper.to_json_value(input_value, zarr_format=2) == expected_json + + +@pytest.mark.parametrize( + ("wrapper", "json_value", "expected_value"), + [ + (Bool(), True, np.bool_(True)), + (Int8(), 42, np.int8(42)), + (UInt8(), 42, np.uint8(42)), + (Int16(), 42, np.int16(42)), + (UInt16(), 42, np.uint16(42)), + (Int32(), 42, np.int32(42)), + (UInt32(), 42, np.uint32(42)), + (Int64(), 42, np.int64(42)), + (UInt64(), 42, np.uint64(42)), + (Float16(), 42.0, np.float16(42.0)), + (Float32(), 42.0, np.float32(42.0)), + (Float64(), 42.0, np.float64(42.0)), + (Complex64(), (42.0, 1.0), np.complex64(42.0 + 1.0j)), + (Complex128(), (42.0, 1.0), np.complex128(42.0 + 1.0j)), + (StaticByteString(length=4), "dGVzdA==", 
np.bytes_(b"test")), + (StaticRawBytes(length=4), "dGVzdA==", np.void(b"test")), + (StaticUnicodeString(length=4), "test", np.str_("test")), + (VariableLengthString(), "test", "test"), + (DateTime64(unit="s"), 1609459200, np.datetime64("2021-01-01T00:00:00", "s")), + ], +) +def test_from_json_value( + wrapper: type[DTypeWrapper[Any, Any]], json_value: Any, expected_value: Any +) -> None: + """ + Test the from_json_value method for each dtype wrapper. + """ + assert wrapper.from_json_value(json_value, zarr_format=2) == expected_value From 46a761b80b71b25ac8a75149423490bf1e1b146f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 5 Mar 2025 16:57:23 +0100 Subject: [PATCH 017/129] more tests, fix void type default value logic --- src/zarr/core/array.py | 11 +-- src/zarr/core/codec_pipeline.py | 2 +- src/zarr/core/metadata/dtype.py | 87 +++++++++++++++------- src/zarr/core/metadata/v2.py | 33 ++++---- tests/test_array.py | 27 +------ tests/test_metadata/test_dtype.py | 120 +++++++++++++++++++++++------- tests/test_metadata/test_v3.py | 18 ++--- tests/test_v2.py | 14 +--- 8 files changed, 182 insertions(+), 130 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index f7f09a8b24..2a8bc8a630 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -113,7 +113,8 @@ ) from zarr.core.metadata.dtype import ( DTypeWrapper, - StaticByteString, + FixedLengthAsciiString, + FixedLengthUnicodeString, VariableLengthString, get_data_type_from_numpy, ) @@ -754,7 +755,7 @@ def _create_metadata_v3( if fill_value is None: # v3 spec will not allow a null fill value - fill_value_parsed = dtype.default_value + fill_value_parsed = dtype.default_value() else: fill_value_parsed = fill_value @@ -4685,7 +4686,7 @@ def _get_default_chunk_encoding_v3( if isinstance(dtype, VariableLengthString): serializer = VLenUTF8Codec() - elif isinstance(dtype, StaticByteString): + elif isinstance(dtype, FixedLengthAsciiString): serializer = VLenBytesCodec() else: if 
dtype.unwrap().itemsize == 1: @@ -4707,9 +4708,9 @@ def _get_default_chunk_encoding_v2( from numcodecs import VLenUTF8 as numcodecs_VLenUTF8 from numcodecs import Zstd as numcodecs_zstd - if isinstance(dtype, VariableLengthString): + if isinstance(dtype, VariableLengthString | FixedLengthUnicodeString): filters = (numcodecs_VLenUTF8(),) - elif isinstance(dtype, StaticByteString): + elif isinstance(dtype, FixedLengthAsciiString): filters = (numcodecs_VLenBytes(),) else: filters = None diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 0f58060c91..f5ad7b668f 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -62,7 +62,7 @@ def fill_value_or_default(chunk_spec: ArraySpec) -> Any: # validated when decoding the metadata, but we support reading # Zarr V2 data and need to support the case where fill_value # is None. - return chunk_spec.dtype.default_value + return chunk_spec.dtype.default_value() else: return fill_value diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index 590ab7df67..17e67fbb05 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -16,6 +16,7 @@ TypeVar, cast, get_args, + get_origin, ) import numpy as np @@ -133,7 +134,7 @@ def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloat: def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloat: # v3 can in principle handle distinct NaN values, but numpy does not represent these explicitly - # so we just re-use the v2 routine here + # so we just reuse the v2 routine here return float_to_json_v2(data) @@ -148,11 +149,11 @@ def float_to_json(data: float | np.floating[Any], zarr_format: ZarrFormat) -> JS raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") -def complex_to_json_v2(data: complex | np.complexfloating[Any]) -> tuple[JSONFloat, JSONFloat]: +def complex_to_json_v2(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: return float_to_json_v2(data.real), float_to_json_v2(data.imag) -def complex_to_json_v3(data: complex | np.complexfloating[Any]) -> tuple[JSONFloat, JSONFloat]: +def complex_to_json_v3(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: return float_to_json_v3(data.real), float_to_json_v3(data.imag) @@ -226,15 +227,16 @@ def complex_from_json( def datetime_to_json(data: np.datetime64[Any]) -> int: - return data.view("int").item() + return data.view(np.int64).item() def datetime_from_json(data: int, unit: DateUnit | TimeUnit) -> np.datetime64[Any]: return np.int64(data).view(f"datetime64[{unit}]") +TScalar = TypeVar("TScalar", bound=np.generic | str, covariant=True) +# TODO: figure out an interface or protocol that non-numpy dtypes can TDType = TypeVar("TDType", bound=np.dtype[Any]) -TScalar = TypeVar("TScalar", bound=np.generic | str) @dataclass(frozen=True, kw_only=True) @@ -244,17 +246,27 @@ class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): endianness: Endianness | None = "native" def __init_subclass__(cls) -> None: - # Subclasses will bind the first generic type parameter to an attribute of the class # TODO: wrap this in some *very informative* error handling generic_args = get_args(get_original_bases(cls)[0]) - cls.dtype_cls = generic_args[0] + # the logic here is that if a subclass was created with generic type parameters + # specified explicitly, then we bind that type parameter to the dtype_cls attribute + if len(generic_args) > 0: + cls.dtype_cls = generic_args[0] + else: + # but if the subclass was created without generic type parameters specified explicitly, + # then we check the parent DTypeWrapper classes and retrieve their generic type parameters + for base in cls.__orig_bases__: + if get_origin(base) 
is DTypeWrapper: + generic_args = get_args(base) + cls.dtype_cls = generic_args[0] + break return super().__init_subclass__() def to_dict(self) -> dict[str, JSON]: return {"name": self.name} def cast_value(self: Self, value: object) -> TScalar: - return cast(np.generic, self.unwrap().type(value)) + return cast(TScalar, self.unwrap().type(value)) @abstractmethod def default_value(self) -> TScalar: ... @@ -455,7 +467,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex1 @dataclass(frozen=True, kw_only=True) class FlexibleWrapperBase(DTypeWrapper[TDType, TScalar]): item_size_bits: ClassVar[int] - length: int + length: int = 0 @classmethod def _wrap_unsafe(cls, dtype: TDType) -> Self: @@ -467,7 +479,7 @@ def unwrap(self) -> TDType: @dataclass(frozen=True, kw_only=True) -class StaticByteString(FlexibleWrapperBase[np.dtypes.BytesDType, np.bytes_]): +class FixedLengthAsciiString(FlexibleWrapperBase[np.dtypes.BytesDType, np.bytes_]): name = "numpy/static_byte_string" item_size_bits = 8 @@ -492,11 +504,18 @@ class StaticRawBytes(FlexibleWrapperBase[np.dtypes.VoidDType, np.void]): item_size_bits = 8 def default_value(self) -> np.void: - return np.void(b"") + return self.cast_value(("\x00" * self.length).encode("ascii")) def to_dict(self) -> dict[str, JSON]: return {"name": f"r{self.length * self.item_size_bits}"} + @classmethod + def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[TDType]: + """ + Reject structured dtypes by ensuring that dtype.fields is None + """ + return type(dtype) is cls.dtype_cls and dtype.fields is None + def unwrap(self) -> np.dtypes.VoidDType: # this needs to be overridden because numpy does not allow creating a void type # by invoking np.dtypes.VoidDType directly @@ -512,7 +531,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: @dataclass(frozen=True, kw_only=True) -class StaticUnicodeString(FlexibleWrapperBase[np.dtypes.StrDType, np.str_]): +class 
FixedLengthUnicodeString(FlexibleWrapperBase[np.dtypes.StrDType, np.str_]): name = "numpy/static_unicode_string" item_size_bits = 32 # UCS4 is 32 bits per code point @@ -599,7 +618,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: @dataclass(frozen=True, kw_only=True) class DateTime64(DTypeWrapper[np.dtypes.DateTime64DType, np.datetime64]): name = "numpy/datetime64" - unit: DateUnit | TimeUnit + unit: DateUnit | TimeUnit = "s" def default_value(self) -> np.datetime64: return np.datetime64("NaT") @@ -609,6 +628,9 @@ def _wrap_unsafe(cls, dtype: np.dtypes.DateTime64DType) -> Self: unit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] return cls(unit=unit) + def cast_value(self, value: object) -> np.datetime64: + return self.unwrap().type(value, self.unit) + def unwrap(self) -> np.dtypes.DateTime64DType: return np.dtype(f"datetime64[{self.unit}]").newbyteorder( endianness_to_numpy_str(self.endianness) @@ -651,6 +673,26 @@ def _wrap_unsafe(cls, dtype: np.dtypes.VoidDType) -> Self: return cls(fields=tuple(fields)) + def to_dict(self) -> dict[str, JSON]: + base_dict = super().to_dict() + if base_dict.get("configuration", {}) != {}: + raise ValueError( + "This data type wrapper cannot inherit from a data type wrapper that defines a configuration for its dict serialization" + ) + field_configs = [ + (f_name, f_dtype.to_dict(), f_offset) for f_name, f_dtype, f_offset in self.fields + ] + base_dict["configuration"] = {"fields": field_configs} + return base_dict + + @classmethod + def from_dict(cls, data: dict[str, JSON]) -> Self: + fields = tuple( + (f_name, get_data_type_from_dict(f_dtype), f_offset) + for f_name, f_dtype, f_offset in data["fields"] + ) + return cls(fields=fields) + def unwrap(self) -> np.dtypes.VoidDType: return np.dtype([(key, dtype.unwrap()) for (key, dtype, _) in self.fields]) @@ -665,7 +707,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: return np.array([as_bytes], 
dtype=dtype.str).view(dtype)[0] -def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper: +def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper[Any, Any]: if dtype in (str, "str"): if _NUMPY_SUPPORTS_VLEN_STRING: np_dtype = np.dtype("T") @@ -674,17 +716,10 @@ def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper: else: np_dtype = np.dtype(dtype) data_type_registry.lazy_load() - for val in data_type_registry.contents.values(): - try: - return val.wrap(np_dtype) - except TypeError: - pass - raise ValueError( - f"numpy dtype '{dtype}' does not have a corresponding Zarr dtype in: {list(data_type_registry.contents)}." - ) + return data_type_registry.match_dtype(np_dtype) -def get_data_type_from_dict(dtype: dict[str, JSON]) -> DTypeWrapper: +def get_data_type_from_dict(dtype: dict[str, JSON]) -> DTypeWrapper[Any.Any]: data_type_registry.lazy_load() dtype_name = dtype["name"] dtype_cls = data_type_registry.get(dtype_name) @@ -737,14 +772,14 @@ def register(self: Self, cls: type[DTypeWrapper[Any, Any]]) -> None: def get(self, key: str) -> type[DTypeWrapper[Any, Any]]: return self.contents[key] - def match_dtype(self, dtype: npt.DTypeLike) -> DTypeWrapper[Any, Any]: + def match_dtype(self, dtype: TDType) -> DTypeWrapper[Any, Any]: self.lazy_load() for val in self.contents.values(): try: return val.wrap(dtype) except TypeError: pass - raise ValueError(f"No data type wrapper found that matches {dtype}") + raise ValueError(f"No data type wrapper found that matches dtype '{dtype}'") def register_data_type(cls: type[DTypeWrapper[Any, Any]]) -> None: @@ -756,7 +791,7 @@ def register_data_type(cls: type[DTypeWrapper[Any, Any]]) -> None: INTEGER_DTYPE = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 FLOAT_DTYPE = Float16 | Float32 | Float64 COMPLEX_DTYPE = Complex64 | Complex128 -STRING_DTYPE = StaticUnicodeString | VariableLengthString | StaticByteString +STRING_DTYPE = FixedLengthUnicodeString | VariableLengthString | 
FixedLengthAsciiString DTYPE = ( Bool | INTEGER_DTYPE diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 20a4b276ca..6aef469b71 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -2,7 +2,6 @@ import warnings from collections.abc import Iterable -from enum import Enum from typing import TYPE_CHECKING, TypedDict, cast import numcodecs.abc @@ -10,8 +9,7 @@ from zarr.abc.metadata import Metadata from zarr.core.metadata.dtype import ( DTypeWrapper, - StaticByteString, - StaticRawBytes, + Structured, get_data_type_from_numpy, ) @@ -130,7 +128,7 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: json_indent = config.get("json_indent") return { ZARRAY_JSON: prototype.buffer.from_bytes( - json.dumps(zarray_dict, indent=json_indent, allow_nan=False).encode() + json.dumps(zarray_dict, indent=json_indent).encode() ), ZATTRS_JSON: prototype.buffer.from_bytes( json.dumps(zattrs_dict, indent=json_indent, allow_nan=False).encode() @@ -179,16 +177,18 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: def to_dict(self) -> dict[str, JSON]: zarray_dict = super().to_dict() if isinstance(zarray_dict["compressor"], numcodecs.abc.Codec): - codec_config = zarray_dict["compressor"].get_config() - # Hotfix for https://github.com/zarr-developers/zarr-python/issues/2647 - if codec_config["id"] == "zstd" and not codec_config.get("checksum", False): - codec_config.pop("checksum") - zarray_dict["compressor"] = codec_config - - if ( - isinstance(self.dtype, StaticByteString | StaticRawBytes) - and self.fill_value is not None - ): + zarray_dict["compressor"] = zarray_dict["compressor"].get_config() + if zarray_dict["filters"] is not None: + raw_filters = zarray_dict["filters"] + new_filters = [] + for f in raw_filters: + if isinstance(f, numcodecs.abc.Codec): + new_filters.append(f.get_config()) + else: + new_filters.append(f) + zarray_dict["filters"] = new_filters + + if self.fill_value is not None: # 
There's a relationship between self.dtype and self.fill_value # that mypy isn't aware of. The fact that we have S or V dtype here # means we should have a bytes-type fill_value. @@ -197,10 +197,7 @@ def to_dict(self) -> dict[str, JSON]: _ = zarray_dict.pop("dtype") dtype_json: JSON - # TODO: Replace this with per-dtype method - # In the case of zarr v2, the simplest i.e., '|VXX' dtype is represented as a string - dtype_descr = self.dtype.unwrap().descr - if self.dtype.unwrap().kind == "V" and dtype_descr[0][0] != "" and len(dtype_descr) != 0: + if isinstance(self.dtype, Structured): dtype_json = tuple(self.dtype.unwrap().descr) else: dtype_json = self.dtype.unwrap().str diff --git a/tests/test_array.py b/tests/test_array.py index a618395ee5..97c3a5b572 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -442,31 +442,6 @@ async def test_nbytes_stored_async() -> None: assert result == 902 # the size with all chunks filled. -def test_default_fill_values() -> None: - a = zarr.Array.create(MemoryStore(), shape=5, chunk_shape=5, dtype=" None: - with pytest.raises(ValueError, match="At least one ArrayBytesCodec is required."): - Array.create(MemoryStore(), shape=5, chunks=5, dtype=" None: # regression test for https://github.com/zarr-developers/zarr-python/issues/2328 @@ -1245,7 +1220,7 @@ async def test_v2_chunk_encoding( filters=filters, ) filters_expected, compressor_expected = _parse_chunk_encoding_v2( - filters=filters, compressor=compressors, dtype=parse_data_type(dtype, zarr_format=2) + filters=filters, compressor=compressors, dtype=get_data_type_from_numpy(dtype) ) assert arr.metadata.zarr_format == 2 # guard for mypy assert arr.metadata.compressor == compressor_expected diff --git a/tests/test_metadata/test_dtype.py b/tests/test_metadata/test_dtype.py index a3f29a34f5..d0a0243a9f 100644 --- a/tests/test_metadata/test_dtype.py +++ b/tests/test_metadata/test_dtype.py @@ -1,17 +1,20 @@ from __future__ import annotations -from typing import Any +from typing 
import Any, get_args import numpy as np import pytest from zarr.core.metadata.dtype import ( + DTYPE, Bool, Complex64, Complex128, DataTypeRegistry, DateTime64, DTypeWrapper, + FixedLengthAsciiString, + FixedLengthUnicodeString, Float16, Float32, Float64, @@ -19,17 +22,22 @@ Int16, Int32, Int64, - StaticByteString, StaticRawBytes, - StaticUnicodeString, Structured, UInt8, UInt16, UInt32, UInt64, VariableLengthString, + data_type_registry, ) + +@pytest.fixture +def dtype_registry() -> DataTypeRegistry: + return DataTypeRegistry() + + _NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") if _NUMPY_SUPPORTS_VLEN_STRING: VLEN_STRING_DTYPE = np.dtypes.StringDType() @@ -55,8 +63,8 @@ (Float64, "float64"), (Complex64, "complex64"), (Complex128, "complex128"), - (StaticUnicodeString, "U"), - (StaticByteString, "S"), + (FixedLengthUnicodeString, "U"), + (FixedLengthAsciiString, "S"), (StaticRawBytes, "V"), (VariableLengthString, VLEN_STRING_CODE), (Structured, np.dtype([("a", np.float64), ("b", np.int8)])), @@ -80,23 +88,14 @@ def test_wrap(wrapper_cls: type[DTypeWrapper[Any, Any]], np_dtype: np.dtype | st assert wrapped.unwrap() == dt -def test_registry_match() -> None: - """ - Test that registering a dtype in a data type registry works - Test that match_dtype resolves a numpy dtype into the stored dtype - Test that match_dtype raises an error if the dtype is not registered - """ - local_registry = DataTypeRegistry() - local_registry.register(Bool) - assert isinstance(local_registry.match_dtype(np.dtype("bool")), Bool) - outside_dtype = "int8" - with pytest.raises( - ValueError, match=f"No data type wrapper found that matches {outside_dtype}" - ): - local_registry.match_dtype(np.dtype(outside_dtype)) - - -# start writing new tests here +@pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) +def test_dict_serialization(wrapper_cls: DTYPE) -> None: + if issubclass(wrapper_cls, Structured): + instance = wrapper_cls(fields=((("a", Bool(), 0),))) + else: + instance 
= wrapper_cls() + as_dict = instance.to_dict() + assert wrapper_cls.from_dict(data=as_dict.get("configuration", {})) == instance @pytest.mark.parametrize( @@ -116,9 +115,9 @@ def test_registry_match() -> None: (Float64(), np.float64(0)), (Complex64(), np.complex64(0)), (Complex128(), np.complex128(0)), - (StaticByteString(length=3), np.bytes_(b"")), + (FixedLengthAsciiString(length=3), np.bytes_(b"")), (StaticRawBytes(length=3), np.void(b"")), - (StaticUnicodeString(length=3), np.str_("")), + (FixedLengthUnicodeString(length=3), np.str_("")), ( Structured(fields=(("a", Float64(), 0), ("b", Int8(), 8))), np.array([0], dtype=[("a", np.float64), ("b", np.int8)])[0], @@ -154,9 +153,9 @@ def test_default_value(wrapper: type[DTypeWrapper[Any, Any]], expected_default: (Float64(), np.float64(42.0), 42.0), (Complex64(), np.complex64(42.0 + 1.0j), (42.0, 1.0)), (Complex128(), np.complex128(42.0 + 1.0j), (42.0, 1.0)), - (StaticByteString(length=4), np.bytes_(b"test"), "dGVzdA=="), + (FixedLengthAsciiString(length=4), np.bytes_(b"test"), "dGVzdA=="), (StaticRawBytes(length=4), np.void(b"test"), "dGVzdA=="), - (StaticUnicodeString(length=4), np.str_("test"), "test"), + (FixedLengthUnicodeString(length=4), np.str_("test"), "test"), (VariableLengthString(), "test", "test"), (DateTime64(unit="s"), np.datetime64("2021-01-01T00:00:00", "s"), 1609459200), ], @@ -187,9 +186,9 @@ def test_to_json_value_v2( (Float64(), 42.0, np.float64(42.0)), (Complex64(), (42.0, 1.0), np.complex64(42.0 + 1.0j)), (Complex128(), (42.0, 1.0), np.complex128(42.0 + 1.0j)), - (StaticByteString(length=4), "dGVzdA==", np.bytes_(b"test")), + (FixedLengthAsciiString(length=4), "dGVzdA==", np.bytes_(b"test")), (StaticRawBytes(length=4), "dGVzdA==", np.void(b"test")), - (StaticUnicodeString(length=4), "test", np.str_("test")), + (FixedLengthUnicodeString(length=4), "test", np.str_("test")), (VariableLengthString(), "test", "test"), (DateTime64(unit="s"), 1609459200, np.datetime64("2021-01-01T00:00:00", "s")), ], 
@@ -201,3 +200,68 @@ def test_from_json_value( Test the from_json_value method for each dtype wrapper. """ assert wrapper.from_json_value(json_value, zarr_format=2) == expected_value + + +class TestRegistry: + @staticmethod + def test_register(dtype_registry: DataTypeRegistry) -> None: + """ + Test that registering a dtype in a data type registry works. + """ + dtype_registry.register(Bool) + assert dtype_registry.get("bool") == Bool + assert isinstance(dtype_registry.match_dtype(np.dtype("bool")), Bool) + + @staticmethod + def test_override(dtype_registry: DataTypeRegistry) -> None: + """ + Test that registering a new dtype with the same name works (overriding the previous one). + """ + dtype_registry.register(Bool) + + class NewBool(Bool): + def default_value(self) -> np.bool_: + return np.True_ + + dtype_registry.register(NewBool) + assert isinstance(dtype_registry.match_dtype(np.dtype("bool")), NewBool) + + @staticmethod + @pytest.mark.parametrize( + ("wrapper_cls", "dtype_str"), [(Bool, "bool"), (FixedLengthUnicodeString, "|U4")] + ) + def test_match_dtype( + dtype_registry: DataTypeRegistry, wrapper_cls: type[DTypeWrapper[Any, Any]], dtype_str: str + ) -> None: + """ + Test that match_dtype resolves a numpy dtype into an instance of the correspond wrapper for that dtype. + """ + dtype_registry.register(wrapper_cls) + assert isinstance(dtype_registry.match_dtype(np.dtype(dtype_str)), wrapper_cls) + + @staticmethod + def test_unregistered_dtype(dtype_registry: DataTypeRegistry) -> None: + """ + Test that match_dtype raises an error if the dtype is not registered. 
+ """ + outside_dtype = "int8" + with pytest.raises( + ValueError, match=f"No data type wrapper found that matches dtype '{outside_dtype}'" + ): + dtype_registry.match_dtype(np.dtype(outside_dtype)) + + with pytest.raises(KeyError): + dtype_registry.get(outside_dtype) + + @staticmethod + @pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) + def test_registered_dtypes(wrapper_cls: DTypeWrapper[Any, Any]) -> None: + """ + Test that the registered dtypes can be retrieved from the registry. + """ + if issubclass(wrapper_cls, Structured): + instance = wrapper_cls(fields=((("a", Bool(), 0),))) + else: + instance = wrapper_cls() + + assert data_type_registry.match_dtype(instance.unwrap()) == instance diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 5fa77a29b0..44887433e7 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -15,7 +15,7 @@ from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING from zarr.core.dtype.npy.time import DateTime64 from zarr.core.group import GroupMetadata, parse_node_type -from zarr.core.metadata.dtype import complex_from_json, get_data_type_from_numpy +from zarr.core.metadata.dtype import DateTime64, complex_from_json, get_data_type_from_numpy from zarr.core.metadata.v3 import ( ArrayV3Metadata, parse_dimension_names, @@ -271,19 +271,19 @@ def test_json_indent(indent: int): assert d == json.dumps(json.loads(d), indent=indent).encode() -@pytest.mark.xfail(reason="Data type not supported yet") @pytest.mark.parametrize("fill_value", [-1, 0, 1, 2932897]) @pytest.mark.parametrize("precision", ["ns", "D"]) async def test_datetime_metadata(fill_value: int, precision: str) -> None: + dtype = DateTime64(unit=precision) metadata_dict = { "zarr_format": 3, "node_type": "array", "shape": (1,), "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": (1,)}}, - "data_type": f" None: elif fill_value == "-Infinity": assert np.isneginf(m.fill_value) assert d["fill_value"] 
== "-Infinity" - - -@pytest.mark.parametrize("dtype_str", dtypes) -def test_dtypes(dtype_str: str) -> None: - dt = get_data_type_from_numpy(dtype_str) - np_dtype = dt.unwrap() - assert isinstance(np_dtype, dt.dtype_cls) - assert np_dtype.type(0) == dt.cast_value(0) diff --git a/tests/test_v2.py b/tests/test_v2.py index c14f7496ae..dfd81f525a 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -107,7 +107,7 @@ async def test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_js np.testing.assert_equal(data, np.full((3,), b"X", dtype=dtype)) -@pytest.mark.parametrize(("dtype", "value"), [("|S1", b"Y"), ("|U1", "Y"), ("O", "Y")]) +@pytest.mark.parametrize(("dtype", "value"), [("|S1", b"Y"), ("|U1", "Y"), (str, "Y")]) def test_v2_encode_decode_with_data(dtype, value): dtype, value = dtype, value expected = np.full((3,), value, dtype=dtype) @@ -121,18 +121,6 @@ def test_v2_encode_decode_with_data(dtype, value): np.testing.assert_equal(data, expected) -@pytest.mark.parametrize("dtype", [str, "str"]) -async def test_create_dtype_str(dtype: Any) -> None: - data = ["a", "bb", "ccc"] - arr = zarr.create(shape=3, dtype=dtype, zarr_format=2) - assert arr.dtype.kind == "O" - assert arr.metadata.to_dict()["dtype"] == "|O" - assert arr.metadata.filters == (numcodecs.vlen.VLenUTF8(),) - arr[:] = data - result = arr[:] - np.testing.assert_array_equal(result, np.array(data, dtype="object")) - - @pytest.mark.parametrize("filters", [[], [numcodecs.Delta(dtype=" None: From 3507eff7b6013b4ef7d3ffcfe93bce497781883c Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 5 Mar 2025 19:50:54 +0100 Subject: [PATCH 018/129] fix dtype mechanics in bytescodec --- src/zarr/codecs/bytes.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 694a52b455..af47eb6037 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -72,14 +72,8 @@ async def _decode_single( chunk_spec: 
ArraySpec, ) -> NDBuffer: assert isinstance(chunk_bytes, Buffer) - if chunk_spec.dtype.unwrap().itemsize > 0: - if self.endian == Endian.little: - prefix = "<" - else: - prefix = ">" - dtype = np.dtype(f"{prefix}{chunk_spec.dtype.unwrap().str[1:]}") - else: - dtype = np.dtype(f"|{chunk_spec.dtype.unwrap().str[1:]}") + + dtype = chunk_spec.dtype.with_endianness(self.endian).unwrap() as_array_like = chunk_bytes.as_array_like() if isinstance(as_array_like, NDArrayLike): From 53205cab591b3e9470bf09b731645ea073c7cda5 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 7 Mar 2025 23:14:22 +0100 Subject: [PATCH 019/129] remove __post_init__ magic in favor of more explicit declaration --- src/zarr/codecs/bytes.py | 5 ++-- src/zarr/core/metadata/dtype.py | 48 +++++++++++++++++---------------- 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index af47eb6037..ab721cf651 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -72,8 +72,9 @@ async def _decode_single( chunk_spec: ArraySpec, ) -> NDBuffer: assert isinstance(chunk_bytes, Buffer) - - dtype = chunk_spec.dtype.with_endianness(self.endian).unwrap() + # TODO: remove endianness enum in favor of literal union + endian_str = self.endian.value if self.endian is not None else None + dtype = chunk_spec.dtype.with_endianness(endian_str).unwrap() as_array_like = chunk_bytes.as_array_like() if isinstance(as_array_like, NDArrayLike): diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index 17e67fbb05..33aa22b398 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -16,12 +16,10 @@ TypeVar, cast, get_args, - get_origin, ) import numpy as np import numpy.typing as npt -from typing_extensions import get_original_bases from zarr.abc.metadata import Metadata from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING @@ -245,23 +243,6 @@ class DTypeWrapper(Generic[TDType, TScalar], 
ABC, Metadata): dtype_cls: ClassVar[type[TDType]] # this class will create a numpy dtype endianness: Endianness | None = "native" - def __init_subclass__(cls) -> None: - # TODO: wrap this in some *very informative* error handling - generic_args = get_args(get_original_bases(cls)[0]) - # the logic here is that if a subclass was created with generic type parameters - # specified explicitly, then we bind that type parameter to the dtype_cls attribute - if len(generic_args) > 0: - cls.dtype_cls = generic_args[0] - else: - # but if the subclass was created without generic type parameters specified explicitly, - # then we check the parent DTypeWrapper classes and retrieve their generic type parameters - for base in cls.__orig_bases__: - if get_origin(base) is DTypeWrapper: - generic_args = get_args(base) - cls.dtype_cls = generic_args[0] - break - return super().__init_subclass__() - def to_dict(self) -> dict[str, JSON]: return {"name": self.name} @@ -314,6 +295,7 @@ def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScal @dataclass(frozen=True, kw_only=True) class Bool(DTypeWrapper[np.dtypes.BoolDType, np.bool_]): name = "bool" + dtype_cls: ClassVar[type[np.dtypes.BoolDType]] = np.dtypes.BoolDType def default_value(self) -> np.bool_: return np.False_ @@ -350,41 +332,49 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: @dataclass(frozen=True, kw_only=True) class Int8(IntWrapperBase[np.dtypes.Int8DType, np.int8]): + dtype_cls = np.dtypes.Int8DType name = "int8" @dataclass(frozen=True, kw_only=True) class UInt8(IntWrapperBase[np.dtypes.UInt8DType, np.uint8]): + dtype_cls = np.dtypes.UInt8DType name = "uint8" @dataclass(frozen=True, kw_only=True) class Int16(IntWrapperBase[np.dtypes.Int16DType, np.int16]): + dtype_cls = np.dtypes.Int16DType name = "int16" @dataclass(frozen=True, kw_only=True) class UInt16(IntWrapperBase[np.dtypes.UInt16DType, np.uint16]): + dtype_cls = np.dtypes.UInt16DType name = "uint16" 
@dataclass(frozen=True, kw_only=True) class Int32(IntWrapperBase[np.dtypes.Int32DType, np.int32]): + dtype_cls = np.dtypes.Int32DType name = "int32" @dataclass(frozen=True, kw_only=True) class UInt32(IntWrapperBase[np.dtypes.UInt32DType, np.uint32]): + dtype_cls = np.dtypes.UInt32DType name = "uint32" @dataclass(frozen=True, kw_only=True) class Int64(IntWrapperBase[np.dtypes.Int64DType, np.int64]): + dtype_cls = np.dtypes.Int64DType name = "int64" @dataclass(frozen=True, kw_only=True) class UInt64(IntWrapperBase[np.dtypes.UInt64DType, np.uint64]): + dtype_cls = np.dtypes.UInt64DType name = "uint64" @@ -407,21 +397,25 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: @dataclass(frozen=True, kw_only=True) class Float16(FloatWrapperBase[np.dtypes.Float16DType, np.float16]): + dtype_cls = np.dtypes.Float16DType name = "float16" @dataclass(frozen=True, kw_only=True) class Float32(FloatWrapperBase[np.dtypes.Float32DType, np.float32]): + dtype_cls = np.dtypes.Float32DType name = "float32" @dataclass(frozen=True, kw_only=True) class Float64(FloatWrapperBase[np.dtypes.Float64DType, np.float64]): + dtype_cls = np.dtypes.Float64DType name = "float64" @dataclass(frozen=True, kw_only=True) class Complex64(DTypeWrapper[np.dtypes.Complex64DType, np.complex64]): + dtype_cls = np.dtypes.Complex64DType name = "complex64" def default_value(self) -> np.complex64: @@ -444,6 +438,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex6 @dataclass(frozen=True, kw_only=True) class Complex128(DTypeWrapper[np.dtypes.Complex128DType, np.complex128]): + dtype_cls = np.dtypes.Complex128DType name = "complex128" def default_value(self) -> np.complex128: @@ -480,7 +475,8 @@ def unwrap(self) -> TDType: @dataclass(frozen=True, kw_only=True) class FixedLengthAsciiString(FlexibleWrapperBase[np.dtypes.BytesDType, np.bytes_]): - name = "numpy/static_byte_string" + dtype_cls = np.dtypes.BytesDType + name = "numpy.static_byte_string" 
item_size_bits = 8 def default_value(self) -> np.bytes_: @@ -500,6 +496,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: @dataclass(frozen=True, kw_only=True) class StaticRawBytes(FlexibleWrapperBase[np.dtypes.VoidDType, np.void]): + dtype_cls = np.dtypes.VoidDType name = "r*" item_size_bits = 8 @@ -532,7 +529,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: @dataclass(frozen=True, kw_only=True) class FixedLengthUnicodeString(FlexibleWrapperBase[np.dtypes.StrDType, np.str_]): - name = "numpy/static_unicode_string" + dtype_cls = np.dtypes.StrDType + name = "numpy.static_unicode_string" item_size_bits = 32 # UCS4 is 32 bits per code point def default_value(self) -> np.str_: @@ -554,7 +552,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: @dataclass(frozen=True, kw_only=True) class VariableLengthString(DTypeWrapper[np.dtypes.StringDType, str]): - name = "numpy/vlen_string" + dtype_cls = np.dtypes.StringDType + name = "numpy.vlen_string" def default_value(self) -> str: return "" @@ -582,7 +581,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: @dataclass(frozen=True, kw_only=True) class VariableLengthString(DTypeWrapper[np.dtypes.ObjectDType, str]): - name = "numpy/vlen_string" + dtype_cls = np.dtypes.ObjectDType + name = "numpy.vlen_string" endianness: Endianness = field(default=None) def default_value(self) -> str: @@ -617,6 +617,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: @dataclass(frozen=True, kw_only=True) class DateTime64(DTypeWrapper[np.dtypes.DateTime64DType, np.datetime64]): + dtype_cls = np.dtypes.DateTime64DType name = "numpy/datetime64" unit: DateUnit | TimeUnit = "s" @@ -647,6 +648,7 @@ def to_json_value(self, data: np.datetime64, *, zarr_format: ZarrFormat) -> int: @dataclass(frozen=True, kw_only=True) class Structured(DTypeWrapper[np.dtypes.VoidDType, np.void]): + dtype_cls = 
np.dtypes.VoidDType name = "numpy/struct" fields: tuple[tuple[str, DTypeWrapper[Any, Any], int], ...] From ba9c06e59afdcb0971d2017b9d05b4b955bc84d2 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 9 Mar 2025 12:53:32 +0100 Subject: [PATCH 020/129] fix tests --- src/zarr/core/metadata/v2.py | 7 ++++++- tests/test_metadata/test_dtype.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 6aef469b71..6b1f364a08 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -177,7 +177,12 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: def to_dict(self) -> dict[str, JSON]: zarray_dict = super().to_dict() if isinstance(zarray_dict["compressor"], numcodecs.abc.Codec): - zarray_dict["compressor"] = zarray_dict["compressor"].get_config() + codec_config = zarray_dict["compressor"].get_config() + # Hotfix for https://github.com/zarr-developers/zarr-python/issues/2647 + if codec_config["id"] == "zstd" and not codec_config.get("checksum", False): + codec_config.pop("checksum") + zarray_dict["compressor"] = codec_config + if zarray_dict["filters"] is not None: raw_filters = zarray_dict["filters"] new_filters = [] diff --git a/tests/test_metadata/test_dtype.py b/tests/test_metadata/test_dtype.py index d0a0243a9f..8a1bcdedd1 100644 --- a/tests/test_metadata/test_dtype.py +++ b/tests/test_metadata/test_dtype.py @@ -116,7 +116,7 @@ def test_dict_serialization(wrapper_cls: DTYPE) -> None: (Complex64(), np.complex64(0)), (Complex128(), np.complex128(0)), (FixedLengthAsciiString(length=3), np.bytes_(b"")), - (StaticRawBytes(length=3), np.void(b"")), + (StaticRawBytes(length=3), np.void(b"\x00\x00\x00")), (FixedLengthUnicodeString(length=3), np.str_("")), ( Structured(fields=(("a", Float64(), 0), ("b", Int8(), 8))), From 04f3b849d368d7634f4c3956dad8d2bf4e8c857d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 12 Mar 2025 10:46:28 +0100 Subject: 
[PATCH 021/129] refactor data types --- src/zarr/api/asynchronous.py | 3 +- src/zarr/codecs/_v2.py | 6 +- src/zarr/codecs/blosc.py | 4 +- src/zarr/codecs/bytes.py | 6 +- src/zarr/codecs/sharding.py | 5 +- src/zarr/core/_info.py | 2 +- src/zarr/core/array.py | 44 +- src/zarr/core/array_spec.py | 9 +- src/zarr/core/buffer/cpu.py | 7 +- src/zarr/core/codec_pipeline.py | 2 +- src/zarr/core/dtype/__init__.py | 115 ++++ src/zarr/core/dtype/_numpy.py | 821 +++++++++++++++++++++++ src/zarr/core/dtype/common.py | 679 ++++++++++++++----- src/zarr/core/dtype/registry.py | 206 +----- src/zarr/core/dtype/wrapper.py | 316 ++++----- src/zarr/core/metadata/dtype.py | 808 ---------------------- src/zarr/core/metadata/v2.py | 21 +- src/zarr/core/metadata/v3.py | 19 +- src/zarr/registry.py | 2 +- src/zarr/testing/strategies.py | 8 +- tests/conftest.py | 4 +- tests/test_array.py | 28 +- tests/test_codecs/test_vlen.py | 2 +- tests/test_metadata/test_consolidated.py | 3 +- tests/test_metadata/test_dtype.py | 51 +- tests/test_metadata/test_v2.py | 4 +- tests/test_metadata/test_v3.py | 9 +- 27 files changed, 1722 insertions(+), 1462 deletions(-) create mode 100644 src/zarr/core/dtype/__init__.py create mode 100644 src/zarr/core/dtype/_numpy.py diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 352630e6d1..ac4782007d 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -28,7 +28,7 @@ _default_zarr_format, _warn_write_empty_chunks_kwarg, ) -from zarr.core.dtype import ZDTypeLike, get_data_type_from_native_dtype +from zarr.core.dtype import get_data_type_from_numpy from zarr.core.group import ( AsyncGroup, ConsolidatedMetadata, @@ -36,7 +36,6 @@ create_hierarchy, ) from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata -from zarr.core.metadata.dtype import get_data_type_from_numpy from zarr.errors import NodeTypeValidationError from zarr.storage._common import make_store_path diff --git 
a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index f0b7bfc9c9..c03e3c55fb 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -48,7 +48,7 @@ async def _decode_single( # segfaults and other bad things happening if chunk_spec.dtype.dtype_cls is not np.dtypes.ObjectDType: try: - chunk = chunk.view(chunk_spec.dtype.unwrap()) + chunk = chunk.view(chunk_spec.dtype.to_dtype()) except TypeError: # this will happen if the dtype of the chunk # does not match the dtype of the array spec i.g. if @@ -56,7 +56,7 @@ async def _decode_single( # is an object array. In this case, we need to convert the object # array to the correct dtype. - chunk = np.array(chunk).astype(chunk_spec.dtype.unwrap()) + chunk = np.array(chunk).astype(chunk_spec.dtype.to_dtype()) elif chunk.dtype != object: # If we end up here, someone must have hacked around with the filters. @@ -80,7 +80,7 @@ async def _encode_single( chunk = chunk_array.as_ndarray_like() # ensure contiguous and correct order - chunk = chunk.astype(chunk_spec.dtype.unwrap(), order=chunk_spec.order, copy=False) + chunk = chunk.astype(chunk_spec.dtype.to_dtype(), order=chunk_spec.order, copy=False) # apply filters if self.filters: diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 6ef1540acf..37207d52c4 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -143,13 +143,13 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: item_size = array_spec.dtype.item_size new_codec = self if new_codec.typesize is None: - new_codec = replace(new_codec, typesize=dtype.unwrap().itemsize) + new_codec = replace(new_codec, typesize=dtype.to_dtype().itemsize) if new_codec.shuffle is None: new_codec = replace( new_codec, shuffle=( BloscShuffle.bitshuffle - if dtype.unwrap().itemsize == 1 + if dtype.to_dtype().itemsize == 1 else BloscShuffle.shuffle ), ) diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index ab721cf651..157b5443dc 100644 --- a/src/zarr/codecs/bytes.py 
+++ b/src/zarr/codecs/bytes.py @@ -10,7 +10,7 @@ from zarr.abc.codec import ArrayBytesCodec from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer from zarr.core.common import JSON, parse_enum, parse_named_configuration -from zarr.core.dtype.common import HasEndianness +from zarr.core.dtype.common import endianness_to_numpy_str from zarr.registry import register_codec if TYPE_CHECKING: @@ -57,7 +57,7 @@ def to_dict(self) -> dict[str, JSON]: return {"name": "bytes", "configuration": {"endian": self.endian.value}} def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: - if array_spec.dtype.unwrap().itemsize == 1: + if array_spec.dtype.to_dtype().itemsize == 1: if self.endian is not None: return replace(self, endian=None) elif self.endian is None: @@ -74,7 +74,7 @@ async def _decode_single( assert isinstance(chunk_bytes, Buffer) # TODO: remove endianness enum in favor of literal union endian_str = self.endian.value if self.endian is not None else None - dtype = chunk_spec.dtype.with_endianness(endian_str).unwrap() + dtype = chunk_spec.dtype.to_dtype().newbyteorder(endianness_to_numpy_str(endian_str)) as_array_like = chunk_bytes.as_array_like() if isinstance(as_array_like, NDArrayLike): diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 10abb13aa1..32559e7fb8 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -51,7 +51,6 @@ get_indexer, morton_order_iter, ) -from zarr.core.metadata.dtype import DTypeWrapper from zarr.core.metadata.v3 import parse_codecs from zarr.registry import get_ndbuffer_class, get_pipeline_class, register_codec @@ -60,7 +59,7 @@ from typing import Self from zarr.core.common import JSON - from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType + from zarr.core.dtype.wrapper import DTypeWrapper MAX_UINT_64 = 2**64 - 1 ShardMapping = Mapping[ChunkCoords, Buffer] @@ -493,7 +492,7 @@ async def _decode_partial_single( # setup output array out = 
shard_spec.prototype.nd_buffer.create( shape=indexer.shape, - dtype=shard_spec.dtype.unwrap(), + dtype=shard_spec.dtype.to_dtype(), order=shard_spec.order, fill_value=0, ) diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 22ef37eef8..880d27fff7 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -9,7 +9,7 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.core.common import ZarrFormat -from zarr.core.metadata.dtype import DTypeWrapper +from zarr.core.dtype.wrapper import DTypeWrapper # from zarr.core.metadata.v3 import DataType diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 2a8bc8a630..926a9bc472 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -70,13 +70,12 @@ from zarr.core.config import categorize_data_type from zarr.core.config import config as zarr_config from zarr.core.dtype import ( - VariableLengthBytes, - VariableLengthUTF8, - ZDType, - ZDTypeLike, + DTypeWrapper, + FixedLengthAsciiString, + FixedLengthUnicodeString, + VariableLengthString, parse_data_type, ) -from zarr.core.dtype.common import HasEndianness, HasItemSize, HasObjectCodec from zarr.core.indexing import ( BasicIndexer, BasicSelection, @@ -111,13 +110,6 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) -from zarr.core.metadata.dtype import ( - DTypeWrapper, - FixedLengthAsciiString, - FixedLengthUnicodeString, - VariableLengthString, - get_data_type_from_numpy, -) from zarr.core.metadata.v2 import ( parse_compressor, parse_filters, @@ -594,7 +586,7 @@ async def _create( *, # v2 and v3 shape: ShapeLike, - dtype: npt.DTypeLike[Any] | DTypeWrapper[Any, Any], + dtype: npt.DTypeLike | DTypeWrapper[Any, Any], zarr_format: ZarrFormat = 3, fill_value: Any | None = DEFAULT_FILL_VALUE, attributes: dict[str, JSON] | None = None, @@ -623,11 +615,8 @@ async def _create( See :func:`AsyncArray.create` for more details. Deprecated in favor of :func:`zarr.api.asynchronous.create_array`. 
""" - # TODO: delete this and be more strict about where parsing occurs - if not isinstance(dtype, DTypeWrapper): - dtype_parsed = get_data_type_from_numpy(np.dtype(dtype)) - else: - dtype_parsed = dtype + + dtype_parsed = parse_data_type(dtype) store_path = await make_store_path(store) shape = parse_shapelike(shape) @@ -638,9 +627,9 @@ async def _create( if isinstance(dtype_parsed, HasItemSize): item_size = dtype_parsed.item_size if chunks: - _chunks = normalize_chunks(chunks, shape, dtype_parsed.unwrap().itemsize) + _chunks = normalize_chunks(chunks, shape, dtype_parsed.to_dtype().itemsize) else: - _chunks = normalize_chunks(chunk_shape, shape, dtype_parsed.unwrap().itemsize) + _chunks = normalize_chunks(chunk_shape, shape, dtype_parsed.to_dtype().itemsize) config_parsed = parse_array_config(config) result: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] @@ -745,7 +734,7 @@ def _create_metadata_v3( else: chunk_key_encoding_parsed = chunk_key_encoding - if dtype.unwrap().kind in ("U", "T", "S"): + if dtype.to_dtype().kind in ("U", "T", "S"): warn( f"The dtype `{dtype}` is currently not part in the Zarr format 3 specification. 
It " "may not be supported by other zarr implementations and may change in the future.", @@ -1111,9 +1100,9 @@ def dtype(self) -> np.dtype[Any]: Data type of the array """ if self.metadata.zarr_format == 2: - return self.metadata.dtype.unwrap() + return self.metadata.dtype.to_dtype() else: - return self.metadata.data_type.unwrap() + return self.metadata.data_type.to_dtype() @property def order(self) -> MemoryOrder: @@ -4257,10 +4246,7 @@ async def init_array( from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation - if not isinstance(dtype, DTypeWrapper): - dtype_wrapped = get_data_type_from_numpy(dtype) - else: - dtype_wrapped = dtype + dtype_wrapped = parse_data_type(dtype) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format @@ -4282,7 +4268,7 @@ async def init_array( array_shape=shape_parsed, shard_shape=shards, chunk_shape=chunks, - item_size=dtype_wrapped.unwrap().itemsize, + item_size=dtype_wrapped.to_dtype().itemsize, ) chunks_out: tuple[int, ...] 
meta: ArrayV2Metadata | ArrayV3Metadata @@ -4689,7 +4675,7 @@ def _get_default_chunk_encoding_v3( elif isinstance(dtype, FixedLengthAsciiString): serializer = VLenBytesCodec() else: - if dtype.unwrap().itemsize == 1: + if dtype.to_dtype().itemsize == 1: serializer = BytesCodec(endian=None) else: serializer = BytesCodec() diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index ef111ba20c..d5f6b00862 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -11,7 +11,7 @@ parse_shapelike, ) from zarr.core.config import config as zarr_config -from zarr.core.metadata.dtype import DTypeWrapper, get_data_type_from_numpy +from zarr.core.dtype import parse_data_type if TYPE_CHECKING: from typing import NotRequired @@ -20,7 +20,7 @@ from zarr.core.buffer import BufferPrototype from zarr.core.common import ChunkCoords - from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType + from zarr.core.dtype.wrapper import DTypeWrapper class ArrayConfigParams(TypedDict): @@ -106,10 +106,7 @@ def __init__( prototype: BufferPrototype, ) -> None: shape_parsed = parse_shapelike(shape) - if not isinstance(dtype, DTypeWrapper): - dtype_parsed = get_data_type_from_numpy(dtype) - else: - dtype_parsed = dtype + dtype_parsed = parse_data_type(dtype) fill_value_parsed = parse_fill_value(fill_value) diff --git a/src/zarr/core/buffer/cpu.py b/src/zarr/core/buffer/cpu.py index d60801ba38..0205f16ab1 100644 --- a/src/zarr/core/buffer/cpu.py +++ b/src/zarr/core/buffer/cpu.py @@ -10,7 +10,6 @@ import numpy.typing as npt from zarr.core.buffer import core -from zarr.core.metadata.dtype import DTypeWrapper from zarr.registry import ( register_buffer, register_ndbuffer, @@ -159,11 +158,7 @@ def create( if fill_value is None or (isinstance(fill_value, int) and fill_value == 0): return cls(np.zeros(shape=tuple(shape), dtype=dtype, order=order)) else: - return cls( - np.full( - shape=tuple(shape), fill_value=fill_value, dtype=dtype, order=order - ) - ) + 
return cls(np.full(shape=tuple(shape), fill_value=fill_value, dtype=dtype, order=order)) @classmethod def empty( diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index f5ad7b668f..4e5f6603ff 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -319,7 +319,7 @@ def _merge_chunk_array( if existing_chunk_array is None: chunk_array = chunk_spec.prototype.nd_buffer.create( shape=chunk_spec.shape, - dtype=chunk_spec.dtype.unwrap(), + dtype=chunk_spec.dtype.to_dtype(), order=chunk_spec.order, fill_value=fill_value_or_default(chunk_spec), ) diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py new file mode 100644 index 0000000000..432eabf2ce --- /dev/null +++ b/src/zarr/core/dtype/__init__.py @@ -0,0 +1,115 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, get_args + +import numpy as np + +from zarr.core.dtype.common import _NUMPY_SUPPORTS_VLEN_STRING + +if TYPE_CHECKING: + import numpy.typing as npt + + from zarr.core.common import JSON + +from zarr.core.dtype._numpy import ( + Bool, + Complex64, + Complex128, + DateTime64, + FixedLengthAsciiString, + FixedLengthBytes, + FixedLengthUnicodeString, + Float16, + Float32, + Float64, + Int8, + Int16, + Int32, + Int64, + Structured, + UInt8, + UInt16, + UInt32, + UInt64, + VariableLengthString, +) +from zarr.core.dtype.registry import DataTypeRegistry +from zarr.core.dtype.wrapper import DTypeWrapper + +__all__ = [ + "Complex64", + "Complex128", + "DTypeWrapper", + "DateTime64", + "FixedLengthAsciiString", + "FixedLengthBytes", + "FixedLengthUnicodeString", + "Float16", + "Float32", + "Float64", + "Int8", + "Int16", + "Int32", + "Int64", + "Structured", + "UInt8", + "UInt16", + "UInt32", + "UInt64", + "VariableLengthString", + "data_type_registry", + "parse_data_type", +] + +data_type_registry = DataTypeRegistry() + +INTEGER_DTYPE = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 +FLOAT_DTYPE = 
Float16 | Float32 | Float64 +COMPLEX_DTYPE = Complex64 | Complex128 +STRING_DTYPE = FixedLengthUnicodeString | VariableLengthString | FixedLengthAsciiString +DTYPE = ( + Bool + | INTEGER_DTYPE + | FLOAT_DTYPE + | COMPLEX_DTYPE + | STRING_DTYPE + | FixedLengthBytes + | Structured + | DateTime64 +) + +for dtype in get_args(DTYPE): + data_type_registry.register(dtype._zarr_v3_name, dtype) + + +def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper[Any, Any]: + data_type_registry.lazy_load() + if not isinstance(dtype, np.dtype): + if dtype in (str, "str"): + if _NUMPY_SUPPORTS_VLEN_STRING: + np_dtype = np.dtype("T") + else: + np_dtype = np.dtype("O") + elif isinstance(dtype, list): + # this is a valid _VoidDTypeLike check + np_dtype = np.dtype([tuple(d) for d in dtype]) + else: + np_dtype = np.dtype(dtype) + else: + np_dtype = dtype + return data_type_registry.match_dtype(np_dtype) + + +def get_data_type_from_dict(dtype: dict[str, JSON]) -> DTypeWrapper[Any, Any]: + return data_type_registry.match_json(dtype) + + +def parse_data_type( + dtype: npt.DTypeLike | DTypeWrapper[Any, Any] | dict[str, JSON], +) -> DTypeWrapper[Any, Any]: + if isinstance(dtype, DTypeWrapper): + return dtype + elif isinstance(dtype, dict): + return get_data_type_from_dict(dtype) + else: + return get_data_type_from_numpy(dtype) diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py new file mode 100644 index 0000000000..b98cc100e3 --- /dev/null +++ b/src/zarr/core/dtype/_numpy.py @@ -0,0 +1,821 @@ +from __future__ import annotations + +import base64 +import re +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, ClassVar, Literal, Self, TypeGuard, cast, get_args + +import numpy as np + +from zarr.core.dtype.common import ( + _NUMPY_SUPPORTS_VLEN_STRING, + DataTypeValidationError, + Endianness, + JSONFloat, + bytes_from_json, + bytes_to_json, + check_json_bool, + check_json_complex_float, + check_json_complex_float_v3, + check_json_float_v2, + 
check_json_int, + check_json_str, + complex_from_json, + complex_to_json, + datetime_from_json, + datetime_to_json, + endianness_from_numpy_str, + endianness_to_numpy_str, + float_from_json, + float_to_json, +) +from zarr.core.dtype.wrapper import DTypeWrapper, TDType + +if TYPE_CHECKING: + from zarr.core.common import JSON, ZarrFormat + + +@dataclass(frozen=True, kw_only=True) +class Bool(DTypeWrapper[np.dtypes.BoolDType, np.bool_]): + """ + Wrapper for numpy boolean dtype. + + Attributes + ---------- + name : str + The name of the dtype. + dtype_cls : ClassVar[type[np.dtypes.BoolDType]] + The numpy dtype class. + """ + + _zarr_v3_name = "bool" + dtype_cls: ClassVar[type[np.dtypes.BoolDType]] = np.dtypes.BoolDType + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.bool_: + """ + Get the default value for the boolean dtype. + + Returns + ------- + np.bool_ + The default value. + """ + return np.False_ + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.BoolDType) -> Self: + """ + Wrap a numpy boolean dtype without checking. + + Parameters + ---------- + dtype : np.dtypes.BoolDType + The numpy dtype to wrap. + + Returns + ------- + Self + The wrapped dtype. + """ + return cls() + + def to_dtype(self) -> np.dtypes.BoolDType: + return self.dtype_cls() + + def to_json_value(self, data: np.bool_, zarr_format: ZarrFormat) -> bool: + """ + Convert a boolean value to JSON-serializable format. + + Parameters + ---------- + data : object + The value to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + bool + The JSON-serializable format. + """ + return bool(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: + """ + Read a JSON-serializable value as a numpy boolean scalar. + + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. 
+ + Returns + ------- + np.bool_ + The numpy boolean scalar. + """ + if check_json_bool(data): + return self.cast_value(data) + raise TypeError(f"Invalid type: {data}. Expected a boolean.") + + +@dataclass(frozen=True, kw_only=True) +class Int8(DTypeWrapper[np.dtypes.Int8DType, np.int8]): + dtype_cls = np.dtypes.Int8DType + _zarr_v3_name = "int8" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.Int8DType) -> Self: + return cls() + + def to_dtype(self) -> np.dtypes.Int8DType: + return self.dtype_cls() + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.int8: + return self.to_dtype().type(0) + + def to_json_value(self, data: np.int8, zarr_format: ZarrFormat) -> int: + return int(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int8: + if check_json_int(data): + return self.cast_value(data) + raise TypeError(f"Invalid type: {data}. Expected an integer.") + + +@dataclass(frozen=True, kw_only=True) +class UInt8(DTypeWrapper[np.dtypes.UInt8DType, np.uint8]): + dtype_cls = np.dtypes.UInt8DType + _zarr_v3_name = "uint8" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.UInt8DType) -> Self: + return cls() + + def to_dtype(self) -> np.dtypes.UInt8DType: + return self.dtype_cls() + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.uint8: + return self.to_dtype().type(0) + + def to_json_value(self, data: np.uint8, zarr_format: ZarrFormat) -> int: + return int(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint8: + if check_json_int(data): + return self.cast_value(data) + raise TypeError(f"Invalid type: {data}. 
Expected an integer.") + + +@dataclass(frozen=True, kw_only=True) +class Int16(DTypeWrapper[np.dtypes.Int16DType, np.int16]): + dtype_cls = np.dtypes.Int16DType + _zarr_v3_name = "int16" + endianness: Endianness | None = "native" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.Int16DType) -> Self: + return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + + def to_dtype(self) -> np.dtypes.Int16DType: + return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.int16: + return self.cast_value(0) + + def to_json_value(self, data: np.int16, zarr_format: ZarrFormat) -> int: + return int(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int16: + if check_json_int(data): + return self.cast_value(data) + raise TypeError(f"Invalid type: {data}. Expected an integer.") + + +@dataclass(frozen=True, kw_only=True) +class UInt16(DTypeWrapper[np.dtypes.UInt16DType, np.uint16]): + dtype_cls = np.dtypes.UInt16DType + _zarr_v3_name = "uint16" + endianness: Endianness | None = "native" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.UInt16DType) -> Self: + return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + + def to_dtype(self) -> np.dtypes.UInt16DType: + return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.uint16: + return self.cast_value(0) + + def to_json_value(self, data: np.uint16, zarr_format: ZarrFormat) -> int: + return int(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint16: + if check_json_int(data): + return self.cast_value(data) + raise TypeError(f"Invalid type: {data}. 
Expected an integer.") + + +@dataclass(frozen=True, kw_only=True) +class Int32(DTypeWrapper[np.dtypes.Int32DType, np.int32]): + dtype_cls = np.dtypes.Int32DType + _zarr_v3_name = "int32" + endianness: Endianness | None = "native" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.Int32DType) -> Self: + return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + + def to_dtype(self) -> np.dtypes.Int32DType: + return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.int32: + return self.cast_value(0) + + def to_json_value(self, data: np.int32, zarr_format: ZarrFormat) -> int: + return int(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int32: + if check_json_int(data): + return self.cast_value(data) + raise TypeError(f"Invalid type: {data}. Expected an integer.") + + +@dataclass(frozen=True, kw_only=True) +class UInt32(DTypeWrapper[np.dtypes.UInt32DType, np.uint32]): + dtype_cls = np.dtypes.UInt32DType + _zarr_v3_name = "uint32" + endianness: Endianness | None = "native" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.UInt32DType) -> Self: + return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + + def to_dtype(self) -> np.dtypes.UInt32DType: + return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.uint32: + return self.cast_value(0) + + def to_json_value(self, data: np.uint32, zarr_format: ZarrFormat) -> int: + return int(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint32: + if check_json_int(data): + return self.cast_value(data) + raise TypeError(f"Invalid type: {data}. 
Expected an integer.") + + +@dataclass(frozen=True, kw_only=True) +class Int64(DTypeWrapper[np.dtypes.Int64DType, np.int64]): + dtype_cls = np.dtypes.Int64DType + _zarr_v3_name = "int64" + endianness: Endianness | None = "native" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.Int64DType) -> Self: + return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + + def to_dtype(self) -> np.dtypes.Int64DType: + return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.int64: + return self.cast_value(0) + + def to_json_value(self, data: np.int64, zarr_format: ZarrFormat) -> int: + return int(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int64: + if check_json_int(data): + return self.cast_value(data) + raise TypeError(f"Invalid type: {data}. Expected an integer.") + + +@dataclass(frozen=True, kw_only=True) +class UInt64(DTypeWrapper[np.dtypes.UInt64DType, np.uint64]): + dtype_cls = np.dtypes.UInt64DType + _zarr_v3_name = "uint64" + endianness: Endianness | None = "native" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.UInt64DType) -> Self: + return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + + def to_dtype(self) -> np.dtypes.UInt64DType: + return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.uint64: + return self.cast_value(0) + + def to_json_value(self, data: np.uint64, zarr_format: ZarrFormat) -> int: + return int(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint64: + if check_json_int(data): + return self.cast_value(data) + raise TypeError(f"Invalid type: {data}. 
Expected an integer.") + + +@dataclass(frozen=True, kw_only=True) +class Float16(DTypeWrapper[np.dtypes.Float16DType, np.float16]): + dtype_cls = np.dtypes.Float16DType + _zarr_v3_name = "float16" + endianness: Endianness | None = "native" + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.Float16DType) -> Self: + return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + + def to_dtype(self) -> np.dtypes.Float16DType: + return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + + def to_dict(self) -> dict[str, JSON]: + return {"name": self.get_name(zarr_format=3)} + + def default_value(self) -> np.float16: + return self.to_dtype().type(0.0) + + def to_json_value(self, data: np.float16, zarr_format: ZarrFormat) -> JSONFloat: + return float_to_json(data, zarr_format) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float16: + if check_json_float_v2(data): + return self.to_dtype().type(float_from_json(data, zarr_format)) + raise TypeError(f"Invalid type: {data}. 
@dataclass(frozen=True, kw_only=True)
class Float32(DTypeWrapper[np.dtypes.Float32DType, np.float32]):
    """Wrapper around the numpy 32-bit floating point dtype."""

    dtype_cls = np.dtypes.Float32DType
    _zarr_v3_name = "float32"
    endianness: Endianness | None = "native"

    @classmethod
    def _from_dtype_unsafe(cls, dtype: np.dtypes.Float32DType) -> Self:
        return cls(endianness=endianness_from_numpy_str(dtype.byteorder))

    def to_dtype(self) -> np.dtypes.Float32DType:
        byte_order = endianness_to_numpy_str(self.endianness)
        return self.dtype_cls().newbyteorder(byte_order)

    def to_dict(self) -> dict[str, JSON]:
        return {"name": self.get_name(zarr_format=3)}

    def cast_value(self, value: object) -> np.float32:
        # Route arbitrary python objects through the concrete scalar type.
        return self.to_dtype().type(value)

    def default_value(self) -> np.float32:
        return self.to_dtype().type(0.0)

    def to_json_value(self, data: np.float32, zarr_format: ZarrFormat) -> JSONFloat:
        return float_to_json(data, zarr_format)

    def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float32:
        if not check_json_float_v2(data):
            raise TypeError(f"Invalid type: {data}. Expected a float.")
        return self.to_dtype().type(float_from_json(data, zarr_format))
@dataclass(frozen=True, kw_only=True)
class Float64(DTypeWrapper[np.dtypes.Float64DType, np.float64]):
    """Wrapper around the numpy 64-bit floating point dtype."""

    dtype_cls = np.dtypes.Float64DType
    _zarr_v3_name = "float64"
    endianness: Endianness | None = "native"

    @classmethod
    def _from_dtype_unsafe(cls, dtype: np.dtypes.Float64DType) -> Self:
        return cls(endianness=endianness_from_numpy_str(dtype.byteorder))

    def to_dtype(self) -> np.dtypes.Float64DType:
        byte_order = endianness_to_numpy_str(self.endianness)
        return self.dtype_cls().newbyteorder(byte_order)

    def to_dict(self) -> dict[str, JSON]:
        return {"name": self.get_name(zarr_format=3)}

    def default_value(self) -> np.float64:
        return self.to_dtype().type(0.0)

    def to_json_value(self, data: np.float64, zarr_format: ZarrFormat) -> JSONFloat:
        return float_to_json(data, zarr_format)

    def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float64:
        if not check_json_float_v2(data):
            raise TypeError(f"Invalid type: {data}. Expected a float.")
        return self.to_dtype().type(float_from_json(data, zarr_format))
@dataclass(frozen=True, kw_only=True)
class Complex64(DTypeWrapper[np.dtypes.Complex64DType, np.complex64]):
    """Wrapper around the numpy 64-bit complex dtype (two 32-bit floats)."""

    dtype_cls = np.dtypes.Complex64DType
    _zarr_v3_name = "complex64"
    endianness: Endianness | None = "native"

    @classmethod
    def _from_dtype_unsafe(cls, dtype: np.dtypes.Complex64DType) -> Self:
        # Bug fix: previously returned cls(), discarding the byte order of the
        # source dtype. Every other parametrized wrapper (including Complex128)
        # preserves it, and to_dtype() reads self.endianness, so a big-endian
        # complex64 dtype would silently round-trip as native-endian.
        return cls(endianness=endianness_from_numpy_str(dtype.byteorder))

    def to_dtype(self) -> np.dtypes.Complex64DType:
        return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness))

    def to_dict(self) -> dict[str, JSON]:
        return {"name": self.get_name(zarr_format=3)}

    def default_value(self) -> np.complex64:
        return np.complex64(0.0)

    def to_json_value(
        self, data: np.complex64, zarr_format: ZarrFormat
    ) -> tuple[JSONFloat, JSONFloat]:
        # Complex scalars are serialized as a (real, imag) pair.
        return complex_to_json(data, zarr_format)

    def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex64:
        if check_json_complex_float(data, zarr_format=zarr_format):
            return complex_from_json(data, dtype=self.to_dtype(), zarr_format=zarr_format)
        raise TypeError(f"Invalid type: {data}. Expected a complex float.")
@dataclass(frozen=True, kw_only=True)
class Complex128(DTypeWrapper[np.dtypes.Complex128DType, np.complex128]):
    """Wrapper around the numpy 128-bit complex dtype (two 64-bit floats)."""

    dtype_cls = np.dtypes.Complex128DType
    _zarr_v3_name = "complex128"
    endianness: Endianness | None = "native"

    @classmethod
    def _from_dtype_unsafe(cls, dtype: np.dtypes.Complex128DType) -> Self:
        return cls(endianness=endianness_from_numpy_str(dtype.byteorder))

    def to_dtype(self) -> np.dtypes.Complex128DType:
        return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness))

    def to_dict(self) -> dict[str, JSON]:
        return {"name": self.get_name(zarr_format=3)}

    def default_value(self) -> np.complex128:
        return np.complex128(0.0)

    def to_json_value(
        self, data: np.complex128, zarr_format: ZarrFormat
    ) -> tuple[JSONFloat, JSONFloat]:
        return complex_to_json(data, zarr_format)

    def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex128:
        # Bug fix: previously validated with check_json_complex_float_v3 even for
        # zarr v2 metadata, ignoring the zarr_format parameter. Dispatch on the
        # format, mirroring Complex64 (the two checkers are currently identical,
        # so this is a consistency fix with no behavior change today).
        if check_json_complex_float(data, zarr_format=zarr_format):
            return complex_from_json(data, dtype=self.to_dtype(), zarr_format=zarr_format)
        raise TypeError(f"Invalid type: {data}. Expected a complex float.")
@dataclass(frozen=True, kw_only=True)
class FixedLengthAsciiString(DTypeWrapper[np.dtypes.BytesDType[Any], np.bytes_]):
    """Wrapper around numpy's fixed-width byte-string dtype ("S<length>")."""

    dtype_cls = np.dtypes.BytesDType
    _zarr_v3_name = "numpy.static_byte_string"
    item_size_bits: ClassVar[int] = 8
    length: int = 1

    @classmethod
    def _from_dtype_unsafe(cls, dtype: np.dtypes.BytesDType) -> Self:
        bytes_per_char = cls.item_size_bits // 8
        return cls(length=dtype.itemsize // bytes_per_char)

    def to_dtype(self) -> np.dtypes.BytesDType:
        return self.dtype_cls(self.length)

    def default_value(self) -> np.bytes_:
        return np.bytes_(b"")

    def to_dict(self) -> dict[str, JSON]:
        return {"name": self.get_name(zarr_format=3), "configuration": {"length": self.length}}

    def to_json_value(self, data: np.bytes_, *, zarr_format: ZarrFormat) -> str:
        # Raw bytes are not JSON-safe; ship them as base64 text.
        return base64.standard_b64encode(data).decode("ascii")

    def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_:
        if not check_json_str(data):
            raise TypeError(f"Invalid type: {data}. Expected a string.")
        return self.to_dtype().type(base64.standard_b64decode(data.encode("ascii")))
@dataclass(frozen=True, kw_only=True)
class FixedLengthBytes(DTypeWrapper[np.dtypes.VoidDType[Any], np.void]):
    """Wrapper for unstructured numpy void dtypes ("V<length>"); named "r<bits>" in zarr v3."""

    dtype_cls = np.dtypes.VoidDType[Any]
    _zarr_v3_name = "r*"
    item_size_bits: ClassVar[int] = 8
    length: int = 1

    @classmethod
    def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType[Any]) -> Self:
        bytes_per_item = cls.item_size_bits // 8
        return cls(length=dtype.itemsize // bytes_per_item)

    @classmethod
    def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[np.dtypes.VoidDType[Any]]:
        """
        Accept only *unstructured* void dtypes: structured dtypes also use the
        void dtype class but carry a non-None ``fields`` attribute.
        """
        return super().check_dtype(dtype) and dtype.fields is None

    @classmethod
    def check_json(cls, data: dict[str, JSON]) -> TypeGuard[dict[str, JSON]]:
        # The v3 name is parametrized ("r8", "r16", ...), so array metadata is
        # matched with a pattern rather than the literal _zarr_v3_name.
        if not (isinstance(data, dict) and "name" in data):
            return False
        name = data["name"]
        return isinstance(name, str) and re.match(r"^r\d+$", name) is not None

    def get_name(self, zarr_format: ZarrFormat) -> str:
        if zarr_format == 2:
            return super().get_name(zarr_format=zarr_format)
        # The v3 name encodes the item size in bits, e.g. "r16" for two bytes.
        return f"r{self.length * self.item_size_bits}"

    def default_value(self) -> np.void:
        return self.cast_value(("\x00" * self.length).encode("ascii"))

    def to_dtype(self) -> np.dtypes.VoidDType[Any]:
        # np.dtypes.VoidDType cannot be instantiated directly; go through np.dtype.
        return np.dtype(f"V{self.length}")

    def to_dict(self) -> dict[str, JSON]:
        return {"name": self.get_name(zarr_format=3)}

    def to_json_value(self, data: np.void, *, zarr_format: ZarrFormat) -> str:
        return base64.standard_b64encode(data.tobytes()).decode("ascii")

    def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void:
        if check_json_str(data):
            return self.to_dtype().type(base64.standard_b64decode(data))
        # NOTE(review): sibling wrappers raise TypeError on this path; the
        # original raises DataTypeValidationError, preserved here for callers.
        raise DataTypeValidationError(f"Invalid type: {data}. Expected a string.")
@dataclass(frozen=True, kw_only=True)
class FixedLengthUnicodeString(DTypeWrapper[np.dtypes.StrDType[int], np.str_]):
    """Wrapper around numpy's fixed-width unicode dtype ("U<length>", UCS4)."""

    dtype_cls = np.dtypes.StrDType[int]
    _zarr_v3_name = "numpy.static_unicode_string"
    item_size_bits: ClassVar[int] = 32  # UCS4 is 32 bits per code point
    endianness: Endianness | None = "native"
    length: int = 1

    @classmethod
    def _from_dtype_unsafe(cls, dtype: np.dtypes.StrDType[int]) -> Self:
        code_point_bytes = cls.item_size_bits // 8
        return cls(
            length=dtype.itemsize // code_point_bytes,
            endianness=endianness_from_numpy_str(dtype.byteorder),
        )

    def to_dtype(self) -> np.dtypes.StrDType[int]:
        byte_order = endianness_to_numpy_str(self.endianness)
        return self.dtype_cls(self.length).newbyteorder(byte_order)

    def default_value(self) -> np.str_:
        return np.str_("")

    def to_dict(self) -> dict[str, JSON]:
        return {"name": self.get_name(zarr_format=3), "configuration": {"length": self.length}}

    def to_json_value(self, data: np.str_, *, zarr_format: ZarrFormat) -> str:
        # Unicode text is JSON-native; no base64 round-trip needed.
        return str(data)

    def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_:
        if not check_json_str(data):
            raise TypeError(f"Invalid type: {data}. Expected a string.")
        return self.cast_value(data)
if _NUMPY_SUPPORTS_VLEN_STRING:

    @dataclass(frozen=True, kw_only=True)
    class VariableLengthString(DTypeWrapper[np.dtypes.StringDType, str]):
        """Variable-length string wrapper backed by NumPy 2's native StringDType."""

        dtype_cls = np.dtypes.StringDType
        _zarr_v3_name = "numpy.vlen_string"

        @classmethod
        def _from_dtype_unsafe(cls, dtype: np.dtypes.StringDType) -> Self:
            # StringDType carries no parameters we track.
            return cls()

        def to_dtype(self) -> np.dtypes.StringDType:
            return self.dtype_cls()

        def default_value(self) -> str:
            return ""

        def cast_value(self, value: object) -> str:
            return str(value)

        def to_dict(self) -> dict[str, JSON]:
            return {"name": self.get_name(zarr_format=3)}

        def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str:
            return str(data)

        def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str:
            if not check_json_str(data):
                raise TypeError(f"Invalid type: {data}. Expected a string.")
            return self.cast_value(data)

else:

    @dataclass(frozen=True, kw_only=True)
    class VariableLengthString(DTypeWrapper[np.dtypes.ObjectDType, str]):
        """Variable-length string wrapper; falls back to object arrays on NumPy < 2."""

        dtype_cls = np.dtypes.ObjectDType
        _zarr_v3_name = "numpy.vlen_string"

        @classmethod
        def _from_dtype_unsafe(cls, dtype: np.dtypes.ObjectDType) -> Self:
            return cls()

        def to_dtype(self) -> np.dtypes.ObjectDType:
            return self.dtype_cls()

        def default_value(self) -> str:
            return ""

        def cast_value(self, value: object) -> str:
            return str(value)

        def to_dict(self) -> dict[str, JSON]:
            return {"name": self.get_name(zarr_format=3)}

        def to_json_value(self, data: str, *, zarr_format: ZarrFormat) -> str:
            return data

        def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str:
            """
            String literals pass through
            """
            if not check_json_str(data):
                raise TypeError(f"Invalid type: {data}. Expected a string.")
            return data
DateUnit = Literal["Y", "M", "W", "D"]
TimeUnit = Literal["h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"]


@dataclass(frozen=True, kw_only=True)
class DateTime64(DTypeWrapper[np.dtypes.DateTime64DType, np.datetime64]):
    """Wrapper around numpy's datetime64 dtype, parametrized by a time unit."""

    dtype_cls = np.dtypes.DateTime64DType
    _zarr_v3_name = "numpy.datetime64"
    unit: DateUnit | TimeUnit = "s"
    endianness: Endianness = "native"

    def default_value(self) -> np.datetime64:
        return np.datetime64("NaT")

    def to_dict(self) -> dict[str, JSON]:
        return {"name": self.get_name(zarr_format=3), "configuration": {"unit": self.unit}}

    @classmethod
    def _from_dtype_unsafe(cls, dtype: np.dtypes.DateTime64DType) -> Self:
        # Extract the unit from a name like "datetime64[ms]".
        unit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")]
        # Bug fix: get_args(DateUnit | TimeUnit) returns the two Literal alias
        # *objects*, not their string members, so the old membership test
        # rejected every valid unit. Concatenate the members of each alias.
        if unit not in get_args(DateUnit) + get_args(TimeUnit):
            raise DataTypeValidationError('Invalid unit for "numpy.datetime64"')
        return cls(unit=unit, endianness=endianness_from_numpy_str(dtype.byteorder))

    def cast_value(self, value: object) -> np.datetime64:
        return self.to_dtype().type(value, self.unit)

    def to_dtype(self) -> np.dtypes.DateTime64DType:
        # Numpy does not allow creating datetime64 via np.dtypes.DateTime64DType()
        return np.dtype(f"datetime64[{self.unit}]").newbyteorder(
            endianness_to_numpy_str(self.endianness)
        )

    def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64:
        if check_json_int(data):
            return datetime_from_json(data, self.unit)
        raise TypeError(f"Invalid type: {data}. Expected an integer.")

    def to_json_value(self, data: np.datetime64, *, zarr_format: ZarrFormat) -> int:
        return datetime_to_json(data)
@dataclass(frozen=True, kw_only=True)
class Structured(DTypeWrapper[np.dtypes.VoidDType, np.void]):
    """Wrapper around numpy structured (record) dtypes: a void dtype with named fields."""

    dtype_cls = np.dtypes.VoidDType
    _zarr_v3_name = "numpy.structured"
    # Ordered (field name, wrapped field dtype) pairs.
    fields: tuple[tuple[str, DTypeWrapper[Any, Any]], ...]

    def default_value(self) -> np.void:
        return self.cast_value(0)

    def cast_value(self, value: object) -> np.void:
        # Build a 1-element structured array and extract the scalar.
        return np.array([value], dtype=self.to_dtype())[0]

    @classmethod
    def check_dtype(cls, dtype: np.dtypes.DTypeLike) -> TypeGuard[np.dtypes.VoidDType]:
        """
        Check that this dtype is a numpy structured dtype
        (a void dtype with a non-None ``fields`` attribute).
        """
        return super().check_dtype(dtype) and dtype.fields is not None

    @classmethod
    def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType) -> Self:
        # Local import to avoid a circular dependency with the registry module.
        from zarr.core.dtype import get_data_type_from_numpy

        if dtype.fields is None:
            raise ValueError("numpy dtype has no fields")

        fields: list[tuple[str, DTypeWrapper[Any, Any]]] = []
        for key, (dtype_instance, _) in dtype.fields.items():
            fields.append((key, get_data_type_from_numpy(dtype_instance)))
        return cls(fields=tuple(fields))

    def get_name(self, zarr_format: ZarrFormat) -> str | list[tuple[str, str]]:
        if zarr_format == 2:
            # Fix: return tuples as declared in the annotation (previously
            # produced lists of lists; JSON serialization is unchanged).
            return [(k, d.get_name(zarr_format=2)) for k, d in self.fields]
        return self._zarr_v3_name

    def to_dict(self) -> dict[str, JSON]:
        field_configs = [(f_name, f_dtype.to_dict()) for f_name, f_dtype in self.fields]
        return {
            "name": self.get_name(zarr_format=3),
            "configuration": {"fields": field_configs},
        }

    @classmethod
    def check_json(cls, data: JSON) -> bool:
        # Fix: guard that "configuration" maps to a dict before probing it —
        # `"fields" in x` raises TypeError when x is not a container.
        return (
            isinstance(data, dict)
            and "name" in data
            and isinstance(data.get("configuration"), dict)
            and "fields" in data["configuration"]
        )

    @classmethod
    def from_dict(cls, data: dict[str, JSON]) -> Self:
        if cls.check_json(data):
            from zarr.core.dtype import get_data_type_from_dict

            fields = tuple(
                (f_name, get_data_type_from_dict(f_dtype))
                for f_name, f_dtype in data["configuration"]["fields"]
            )
            return cls(fields=fields)
        raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.")

    def to_dtype(self) -> np.dtypes.VoidDType:
        # Fix: cast target is the dtype class, not the np.void scalar type
        # (np.dtype(...) returns a dtype instance; the cast is typing-only).
        return cast(
            np.dtypes.VoidDType,
            np.dtype([(key, dtype.to_dtype()) for (key, dtype) in self.fields]),
        )

    def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str:
        # Structured scalars are serialized as base64 of their raw bytes.
        return bytes_to_json(data.tobytes(), zarr_format)

    def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void:
        if not check_json_str(data):
            raise TypeError(f"Invalid type: {data}. Expected a string.")
        as_bytes = bytes_from_json(data, zarr_format=zarr_format)
        dtype = self.to_dtype()
        return cast(np.void, np.array([as_bytes], dtype=dtype.str).view(dtype)[0])
+ raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") + + def to_dtype(self) -> np.dtypes.VoidDType: + return cast(np.void, np.dtype([(key, dtype.to_dtype()) for (key, dtype) in self.fields])) + + def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: + return bytes_to_json(data.tobytes(), zarr_format) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: + if not check_json_str(data): + raise TypeError(f"Invalid type: {data}. Expected a string.") + as_bytes = bytes_from_json(data, zarr_format=zarr_format) + dtype = self.to_dtype() + return cast(np.void, np.array([as_bytes], dtype=dtype.str).view(dtype)[0]) diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 3cc31df9e3..1dbf22c3c2 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -1,235 +1,602 @@ from __future__ import annotations -import warnings -from collections.abc import Mapping, Sequence -from dataclasses import dataclass -from typing import ( - ClassVar, - Final, - Generic, - Literal, - TypedDict, - TypeGuard, - TypeVar, -) +import base64 +from collections.abc import Sequence +from typing import TYPE_CHECKING, Any, Literal, TypeGuard, cast, get_args -from typing_extensions import ReadOnly +import numpy as np -from zarr.core.common import NamedConfig +if TYPE_CHECKING: + from zarr.core.common import JSON, ZarrFormat + from zarr.core.dtype._numpy import DateUnit, TimeUnit -EndiannessStr = Literal["little", "big"] -ENDIANNESS_STR: Final = "little", "big" +Endianness = Literal["little", "big", "native"] +EndiannessNumpy = Literal[">", "<", "=", "|"] +JSONFloat = float | Literal["NaN", "Infinity", "-Infinity"] -SpecialFloatStrings = Literal["NaN", "Infinity", "-Infinity"] -SPECIAL_FLOAT_STRINGS: Final = ("NaN", "Infinity", "-Infinity") +_NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") -JSONFloatV2 = float | SpecialFloatStrings -JSONFloatV3 = float | 
SpecialFloatStrings | str -ObjectCodecID = Literal["vlen-utf8", "vlen-bytes", "vlen-array", "pickle", "json2", "msgpack2"] -# These are the ids of the known object codecs for zarr v2. -OBJECT_CODEC_IDS: Final = ("vlen-utf8", "vlen-bytes", "vlen-array", "pickle", "json2", "msgpack2") +class DataTypeValidationError(ValueError): ... -# This is a wider type than our standard JSON type because we need -# to work with typeddict objects which are assignable to Mapping[str, object] -DTypeJSON = str | int | float | Sequence["DTypeJSON"] | None | Mapping[str, object] -# The DTypeJSON_V2 type exists because ZDType.from_json takes a single argument, which must contain -# all the information necessary to decode the data type. Zarr v2 supports multiple distinct -# data types that all used the "|O" data type identifier. These data types can only be -# discriminated on the basis of their "object codec", i.e. a special data type specific -# compressor or filter. So to figure out what data type a zarr v2 array has, we need the -# data type identifier from metadata, as well as an object codec id if the data type identifier -# is "|O". -# So we will pack the name of the dtype alongside the name of the object codec id, if applicable, -# in a single dict, and pass that to the data type inference logic. -# These type variables have a very wide bound because the individual zdtype -# classes can perform a very specific type check. +def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: + """ + Convert an endianness literal to its numpy string representation. -# This is the JSON representation of a structured dtype in zarr v2 -StructuredName_V2 = Sequence["str | StructuredName_V2"] + Parameters + ---------- + endianness : Endianness or None + The endianness to convert. + + Returns + ------- + Literal[">", "<", "=", "|"] + The numpy string representation of the endianness. + + Raises + ------ + ValueError + If the endianness is invalid. 
+ """ + match endianness: + case "little": + return "<" + case "big": + return ">" + case "native": + return "=" + case None: + return "|" + raise ValueError( + f"Invalid endianness: {endianness}. Expected one of {get_args(Endianness)} or None" + ) -# This models the type of the name a dtype might have in zarr v2 array metadata -DTypeName_V2 = StructuredName_V2 | str -TDTypeNameV2_co = TypeVar("TDTypeNameV2_co", bound=DTypeName_V2, covariant=True) -TObjectCodecID_co = TypeVar("TObjectCodecID_co", bound=None | str, covariant=True) +def endianness_from_numpy_str(endianness: EndiannessNumpy) -> Endianness | None: + """ + Convert a numpy endianness string literal to a human-readable literal value. + Parameters + ---------- + endianness : Literal[">", "<", "=", "|"] + The numpy string representation of the endianness. + + Returns + ------- + Endianness or None + The human-readable representation of the endianness. + + Raises + ------ + ValueError + If the endianness is invalid. + """ + match endianness: + case "<": + return "little" + case ">": + return "big" + case "=": + return "native" + case "|": + return None + raise ValueError( + f"Invalid endianness: {endianness}. Expected one of {get_args(EndiannessNumpy)}" + ) -class DTypeConfig_V2(TypedDict, Generic[TDTypeNameV2_co, TObjectCodecID_co]): - name: ReadOnly[TDTypeNameV2_co] - object_codec_id: ReadOnly[TObjectCodecID_co] +def check_json_bool(data: JSON) -> TypeGuard[bool]: + """ + Check if a JSON value is a boolean. -DTypeSpec_V2 = DTypeConfig_V2[DTypeName_V2, None | str] + Parameters + ---------- + data : JSON + The JSON value to check. + Returns + ------- + Bool + True if the data is a boolean, False otherwise. + """ + return bool(isinstance(data, bool)) -def check_structured_dtype_v2_inner(data: object) -> TypeGuard[StructuredName_V2]: + +def check_json_str(data: JSON) -> TypeGuard[str]: """ - A type guard for the inner elements of a structured dtype. 
This is a recursive check because - the type is itself recursive. + Check if a JSON value is a string. + + Parameters + ---------- + data : JSON + The JSON value to check. - This check ensures that all the elements are 2-element sequences beginning with a string - and ending with either another string or another 2-element sequence beginning with a string and - ending with another instance of that type. + Returns + ------- + Bool + True if the data is a string, False otherwise. """ - if isinstance(data, (str, Mapping)): - return False - if not isinstance(data, Sequence): - return False - if len(data) != 2: - return False - if not (isinstance(data[0], str)): - return False - if isinstance(data[-1], str): - return True - elif isinstance(data[-1], Sequence): - return check_structured_dtype_v2_inner(data[-1]) - return False + return bool(isinstance(data, str)) -def check_structured_dtype_name_v2(data: Sequence[object]) -> TypeGuard[StructuredName_V2]: +def check_json_int(data: JSON) -> TypeGuard[int]: """ - Check that all the elements of a sequence are valid zarr v2 structured dtype identifiers + Check if a JSON value is an integer. + + Parameters + ---------- + data : JSON + The JSON value to check. + + Returns + ------- + Bool + True if the data is an integer, False otherwise. """ - return all(check_structured_dtype_v2_inner(d) for d in data) + return bool(isinstance(data, int)) -def check_dtype_name_v2(data: object) -> TypeGuard[DTypeName_V2]: +def check_json_float_v2(data: JSON) -> TypeGuard[JSONFloat]: """ - Type guard for narrowing the type of a python object to an valid zarr v2 dtype name. + Check if a JSON value represents a float (v2). + + Parameters + ---------- + data : JSON + The JSON value to check. + + Returns + ------- + Bool + True if the data is a float, False otherwise. 
""" - if isinstance(data, str): + if data == "NaN" or data == "Infinity" or data == "-Infinity": return True - elif isinstance(data, Sequence): - return check_structured_dtype_name_v2(data) - return False + return isinstance(data, float | int) -def check_dtype_spec_v2(data: object) -> TypeGuard[DTypeSpec_V2]: +def check_json_float_v3(data: JSON) -> TypeGuard[JSONFloat]: """ - Type guard for narrowing a python object to an instance of DTypeSpec_V2 + Check if a JSON value represents a float (v3). + + Parameters + ---------- + data : JSON + The JSON value to check. + + Returns + ------- + Bool + True if the data is a float, False otherwise. """ - if not isinstance(data, Mapping): - return False - if set(data.keys()) != {"name", "object_codec_id"}: - return False - if not check_dtype_name_v2(data["name"]): - return False - return isinstance(data["object_codec_id"], str | None) + # TODO: handle the special JSON serialization of different NaN values + return check_json_float_v2(data) -# By comparison, The JSON representation of a dtype in zarr v3 is much simpler. -# It's either a string, or a structured dict -DTypeSpec_V3 = str | NamedConfig[str, Mapping[str, object]] +def check_json_float(data: JSON, zarr_format: ZarrFormat) -> TypeGuard[float]: + """ + Check if a JSON value represents a float based on zarr format. + + Parameters + ---------- + data : JSON + The JSON value to check. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + Bool + True if the data is a float, False otherwise. 
+ """ + if zarr_format == 2: + return check_json_float_v2(data) + else: + return check_json_float_v3(data) -def check_dtype_spec_v3(data: object) -> TypeGuard[DTypeSpec_V3]: +def check_json_complex_float_v3(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: """ - Type guard for narrowing the type of a python object to an instance of - DTypeSpec_V3, i.e either a string or a dict with a "name" field that's a string and a - "configuration" field that's a mapping with string keys. + Check if a JSON value represents a complex float, as per the zarr v3 spec + + Parameters + ---------- + data : JSON + The JSON value to check. + + Returns + ------- + Bool + True if the data is a complex float, False otherwise. """ - if isinstance(data, str) or ( # noqa: SIM103 - isinstance(data, Mapping) - and set(data.keys()) == {"name", "configuration"} - and isinstance(data["configuration"], Mapping) - and all(isinstance(k, str) for k in data["configuration"]) - ): - return True - return False + return ( + not isinstance(data, str) + and isinstance(data, Sequence) + and len(data) == 2 + and check_json_float_v3(data[0]) + and check_json_float_v3(data[1]) + ) -def unpack_dtype_json(data: DTypeSpec_V2 | DTypeSpec_V3) -> DTypeJSON: +def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: """ - Return the array metadata form of the dtype JSON representation. For the Zarr V3 form of dtype - metadata, this is a no-op. For the Zarr V2 form of dtype metadata, this unpacks the dtype name. + Check if a JSON value represents a complex float, as per the behavior of zarr-python 2.x + + Parameters + ---------- + data : JSON + The JSON value to check. + + Returns + ------- + Bool + True if the data is a complex float, False otherwise. 
""" - if isinstance(data, Mapping) and set(data.keys()) == {"name", "object_codec_id"}: - return data["name"] - return data + return ( + not isinstance(data, str) + and isinstance(data, Sequence) + and len(data) == 2 + and check_json_float_v2(data[0]) + and check_json_float_v2(data[1]) + ) -class DataTypeValidationError(ValueError): ... +def check_json_complex_float( + data: JSON, zarr_format: ZarrFormat +) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: + """ + Check if a JSON value represents a complex float based on zarr format. + + Parameters + ---------- + data : JSON + The JSON value to check. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + Bool + True if the data represents a complex float, False otherwise. + """ + if zarr_format == 2: + return check_json_complex_float_v2(data) + return check_json_complex_float_v3(data) -class ScalarTypeValidationError(ValueError): ... +def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloat: + """ + Convert a float to JSON (v2). + Parameters + ---------- + data : float or np.floating + The float value to convert. -@dataclass(frozen=True, kw_only=True) -class HasLength: + Returns + ------- + JSONFloat + The JSON representation of the float. """ - A mix-in class for data types with a length attribute, such as fixed-size collections - of unicode strings, or bytes. + if np.isnan(data): + return "NaN" + elif np.isinf(data): + return "Infinity" if data > 0 else "-Infinity" + return float(data) - Attributes + +def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloat: + """ + Convert a float to JSON (v3). + + Parameters ---------- - length : int - The length of the scalars belonging to this data type. Note that this class does not assign - a unit to the length. Child classes may assign units. + data : float or np.floating + The float value to convert. + + Returns + ------- + JSONFloat + The JSON representation of the float. 
""" + # v3 can in principle handle distinct NaN values, but numpy does not represent these explicitly + # so we just reuse the v2 routine here + return float_to_json_v2(data) - length: int +def float_to_json(data: float | np.floating[Any], zarr_format: ZarrFormat) -> JSONFloat: + """ + Convert a float to JSON, parametrized by the zarr format version. -@dataclass(frozen=True, kw_only=True) -class HasEndianness: + Parameters + ---------- + data : float or np.floating + The float value to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + JSONFloat + The JSON representation of the float. """ - A mix-in class for data types with an endianness attribute + if zarr_format == 2: + return float_to_json_v2(data) + else: + return float_to_json_v3(data) + raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") + + +def complex_to_json_v2(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: """ + Convert a complex number to JSON (v2). - endianness: EndiannessStr = "little" + Parameters + ---------- + data : complex or np.complexfloating + The complex value to convert. + + Returns + ------- + tuple[JSONFloat, JSONFloat] + The JSON representation of the complex number. + """ + return float_to_json_v2(data.real), float_to_json_v2(data.imag) -@dataclass(frozen=True, kw_only=True) -class HasItemSize: +def complex_to_json_v3(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: """ - A mix-in class for data types with an item size attribute. - This mix-in bears a property ``item_size``, which denotes the size of each element of the data - type, in bytes. + Convert a complex number to JSON (v3). + + Parameters + ---------- + data : complex or np.complexfloating + The complex value to convert. + + Returns + ------- + tuple[JSONFloat, JSONFloat] + The JSON representation of the complex number. 
""" + return float_to_json_v3(data.real), float_to_json_v3(data.imag) - @property - def item_size(self) -> int: - raise NotImplementedError +def complex_to_json( + data: complex | np.complexfloating[Any, Any], zarr_format: ZarrFormat +) -> tuple[JSONFloat, JSONFloat]: + """ + Convert a complex number to JSON, parametrized by the zarr format version. -@dataclass(frozen=True, kw_only=True) -class HasObjectCodec: + Parameters + ---------- + data : complex or np.complexfloating + The complex value to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + tuple[JSONFloat, JSONFloat] or JSONFloat + The JSON representation of the complex number. """ - A mix-in class for data types that require an object codec id. - This class bears the property ``object_codec_id``, which is the string name of an object - codec that is required to encode and decode the data type. + if zarr_format == 2: + return complex_to_json_v2(data) + else: + return complex_to_json_v3(data) + raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") - In zarr-python 2.x certain data types like variable-length strings or variable-length arrays - used the catch-all numpy "object" data type for their in-memory representation. But these data - types cannot be stored as numpy object data types, because the object data type does not define - a fixed memory layout. So these data types required a special codec, called an "object codec", - that effectively defined a compact representation for the data type, which was used to encode - and decode the data type. - Zarr-python 2.x would not allow the creation of arrays with the "object" data type if an object - codec was not specified, and thus the name of the object codec is effectively part of the data - type model. +def bytes_to_json(data: bytes, zarr_format: ZarrFormat) -> str: + """ + Convert bytes to JSON. + + Parameters + ---------- + data : bytes + The structured scalar value to convert. 
+ zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + str + The bytes encoded as ascii using the base64 alphabet. """ + if zarr_format == 2: + return base64.b64encode(data).decode("ascii") + raise NotImplementedError(f"Invalid zarr format: {zarr_format}. Expected 2.") - object_codec_id: ClassVar[str] +def bytes_from_json(data: str, zarr_format: ZarrFormat) -> bytes: + """ + Convert a JSON string to bytes -class UnstableSpecificationWarning(FutureWarning): ... + Parameters + ---------- + data : str + The JSON string to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + bytes + The bytes. + """ + if zarr_format == 2: + return base64.b64decode(data.encode("ascii")) + raise NotImplementedError(f"Invalid zarr format: {zarr_format}. Expected 2.") -def v3_unstable_dtype_warning(dtype: object) -> None: +def float_from_json_v2(data: JSONFloat) -> float: """ - Emit this warning when a data type does not have a stable zarr v3 spec + Convert a JSON float to a float (Zarr v2). + + Parameters + ---------- + data : JSONFloat + The JSON float to convert. + + Returns + ------- + float + The float value. """ - msg = ( - f"The data type ({dtype}) does not have a Zarr V3 specification. " - "That means that the representation of arrays saved with this data type may change without " - "warning in a future version of Zarr Python. " - "Arrays stored with this data type may be unreadable by other Zarr libraries. " - "Use this data type at your own risk! " - "Check https://github.com/zarr-developers/zarr-extensions/tree/main/data-types for the " - "status of data type specifications for Zarr V3." 
- ) - warnings.warn(msg, category=UnstableSpecificationWarning, stacklevel=2) + match data: + case "NaN": + return float("nan") + case "Infinity": + return float("inf") + case "-Infinity": + return float("-inf") + case _: + return float(data) + + +def float_from_json_v3(data: JSONFloat) -> float: + """ + Convert a JSON float to a float (v3). + + Parameters + ---------- + data : JSONFloat + The JSON float to convert. + + Returns + ------- + float + The float value. + """ + # todo: support the v3-specific NaN handling + return float_from_json_v2(data) + + +def float_from_json(data: JSONFloat, zarr_format: ZarrFormat) -> float: + """ + Convert a JSON float to a float based on zarr format. + + Parameters + ---------- + data : JSONFloat + The JSON float to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + float + The float value. + """ + if zarr_format == 2: + return float_from_json_v2(data) + else: + return float_from_json_v3(data) + + +def complex_from_json_v2( + data: tuple[JSONFloat, JSONFloat], dtype: np.dtypes.Complex64DType | np.dtypes.Complex128DType +) -> np.complexfloating[Any, Any]: + """ + Convert a JSON complex float to a complex number (v2). + + Parameters + ---------- + data : tuple[JSONFloat, JSONFloat] + The JSON complex float to convert. + dtype : Any + The numpy dtype. + + Returns + ------- + np.complexfloating + The complex number. + """ + return dtype.type(complex(float_from_json_v2(data[0]), float_from_json_v2(data[1]))) + + +def complex_from_json_v3( + data: tuple[JSONFloat, JSONFloat], dtype: np.dtypes.Complex64DType | np.dtypes.Complex128DType +) -> np.complexfloating[Any, Any]: + """ + Convert a JSON complex float to a complex number (v3). + + Parameters + ---------- + data : tuple[JSONFloat, JSONFloat] + The JSON complex float to convert. + dtype : Any + The numpy dtype. + + Returns + ------- + np.complexfloating + The complex number. 
+ """ + return dtype.type(complex(float_from_json_v3(data[0]), float_from_json_v3(data[1]))) + + +def complex_from_json( + data: tuple[JSONFloat, JSONFloat], dtype: Any, zarr_format: ZarrFormat +) -> np.complexfloating[Any, Any]: + """ + Convert a JSON complex float to a complex number based on zarr format. + + Parameters + ---------- + data : tuple[JSONFloat, JSONFloat] + The JSON complex float to convert. + dtype : Any + The numpy dtype. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + np.complexfloating + The complex number. + """ + if zarr_format == 2: + return complex_from_json_v2(data, dtype) + else: + if check_json_complex_float_v3(data): + return complex_from_json_v3(data, dtype) + else: + raise TypeError(f"Invalid type: {data}. Expected a sequence of two numbers.") + raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") + + +def datetime_to_json(data: np.datetime64) -> int: + """ + Convert a datetime64 to a JSON integer. + + Parameters + ---------- + data : np.datetime64 + The datetime64 value to convert. + + Returns + ------- + int + The JSON representation of the datetime64. + """ + return data.view(np.int64).item() + + +def datetime_from_json(data: int, unit: DateUnit | TimeUnit) -> np.datetime64: + """ + Convert a JSON integer to a datetime64. + + Parameters + ---------- + data : int + The JSON integer to convert. + unit : DateUnit or TimeUnit + The unit of the datetime64. + + Returns + ------- + np.datetime64 + The datetime64 value. 
+ """ + return cast(np.datetime64, np.int64(data).view(f"datetime64[{unit}]")) diff --git a/src/zarr/core/dtype/registry.py b/src/zarr/core/dtype/registry.py index cb9ab50044..d4f1f03258 100644 --- a/src/zarr/core/dtype/registry.py +++ b/src/zarr/core/dtype/registry.py @@ -1,208 +1,50 @@ from __future__ import annotations -import contextlib from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Self +from typing import TYPE_CHECKING, Any, Self -import numpy as np - -from zarr.core.dtype.common import ( - DataTypeValidationError, - DTypeJSON, -) +from zarr.core.dtype.common import DataTypeValidationError if TYPE_CHECKING: from importlib.metadata import EntryPoint - from zarr.core.common import ZarrFormat - from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType + from zarr.core.common import JSON + from zarr.core.dtype.wrapper import DTypeWrapper, TDType -# This class is different from the other registry classes, which inherit from -# dict. IMO it's simpler to just do a dataclass. But long-term we should -# have just 1 registry class in use. @dataclass(frozen=True, kw_only=True) class DataTypeRegistry: - """ - A registry for ZDType classes. - - This registry is a mapping from Zarr data type names to their - corresponding ZDType classes. - - Attributes - ---------- - contents : dict[str, type[ZDType[TBaseDType, TBaseScalar]]] - The mapping from Zarr data type names to their corresponding - ZDType classes. - """ - - contents: dict[str, type[ZDType[TBaseDType, TBaseScalar]]] = field( - default_factory=dict, init=False - ) - - _lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) - - def _lazy_load(self) -> None: - """ - Load all data types from the lazy load list and register them with - the registry. After loading, clear the lazy load list. 
- """ - for e in self._lazy_load_list: - self.register(e.load()._zarr_v3_name, e.load()) - - self._lazy_load_list.clear() + contents: dict[str, type[DTypeWrapper[Any, Any]]] = field(default_factory=dict, init=False) + lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) - def register(self: Self, key: str, cls: type[ZDType[TBaseDType, TBaseScalar]]) -> None: - """ - Register a data type with the registry. + def lazy_load(self) -> None: + for e in self.lazy_load_list: + self.register(e.name, e.load()) - Parameters - ---------- - key : str - The Zarr V3 name of the data type. - cls : type[ZDType[TBaseDType, TBaseScalar]] - The class of the data type to register. + self.lazy_load_list.clear() - Notes - ----- - This method is idempotent. If the data type is already registered, this - method does nothing. - """ + def register(self: Self, key: str, cls: type[DTypeWrapper[Any, Any]]) -> None: + # don't register the same dtype twice if key not in self.contents or self.contents[key] != cls: self.contents[key] = cls - def unregister(self, key: str) -> None: - """ - Unregister a data type from the registry. - - Parameters - ---------- - key : str - The key associated with the ZDType class to be unregistered. - - Returns - ------- - None - - Raises - ------ - KeyError - If the data type is not found in the registry. - """ - if key in self.contents: - del self.contents[key] - else: - raise KeyError(f"Data type '{key}' not found in registry.") - - def get(self, key: str) -> type[ZDType[TBaseDType, TBaseScalar]]: - """ - Retrieve a registered ZDType class by its key. - - Parameters - ---------- - key : str - The key associated with the desired ZDType class. - - Returns - ------- - type[ZDType[TBaseDType, TBaseScalar]] - The ZDType class registered under the given key. - - Raises - ------ - KeyError - If the key is not found in the registry. 
- """ - + def get(self, key: str) -> type[DTypeWrapper[Any, Any]]: return self.contents[key] - def match_dtype(self, dtype: TBaseDType) -> ZDType[TBaseDType, TBaseScalar]: - """ - Match a native data type, e.g. a NumPy data type, to a registered ZDType. - - Parameters - ---------- - dtype : TBaseDType - The native data type to match. - - Returns - ------- - ZDType[TBaseDType, TBaseScalar] - The matched ZDType corresponding to the provided NumPy data type. - - Raises - ------ - ValueError - If the data type is a NumPy "Object" type, which is ambiguous, or if multiple - or no Zarr data types are found that match the provided dtype. - - Notes - ----- - This function attempts to resolve a Zarr data type from a given native data type. - If the dtype is a NumPy "Object" data type, it raises a ValueError, as this type - can represent multiple Zarr data types. In such cases, a specific Zarr data type - should be explicitly constructed instead of relying on dynamic resolution. - - If multiple matches are found, it will also raise a ValueError. In this case - conflicting data types must be unregistered, or the Zarr data type should be explicitly - constructed. - """ - - if dtype == np.dtype("O"): - msg = ( - f"Zarr data type resolution from {dtype} failed. " - 'Attempted to resolve a zarr data type from a numpy "Object" data type, which is ' - 'ambiguous, as multiple zarr data types can be represented by the numpy "Object" ' - "data type. " - "In this case you should construct your array by providing a specific Zarr data " - 'type. 
For a list of Zarr data types that are compatible with the numpy "Object"' - "data type, see https://github.com/zarr-developers/zarr-python/issues/3117" - ) - raise ValueError(msg) - matched: list[ZDType[TBaseDType, TBaseScalar]] = [] + def match_dtype(self, dtype: TDType) -> DTypeWrapper[Any, Any]: + self.lazy_load() for val in self.contents.values(): - with contextlib.suppress(DataTypeValidationError): - matched.append(val.from_native_dtype(dtype)) - if len(matched) == 1: - return matched[0] - elif len(matched) > 1: - msg = ( - f"Zarr data type resolution from {dtype} failed. " - f"Multiple data type wrappers found that match dtype '{dtype}': {matched}. " - "You should unregister one of these data types, or avoid Zarr data type inference " - "entirely by providing a specific Zarr data type when creating your array." - "For more information, see https://github.com/zarr-developers/zarr-python/issues/3117" - ) - raise ValueError(msg) - raise ValueError(f"No Zarr data type found that matches dtype '{dtype!r}'") - - def match_json( - self, data: DTypeJSON, *, zarr_format: ZarrFormat - ) -> ZDType[TBaseDType, TBaseScalar]: - """ - Match a JSON representation of a data type to a registered ZDType. - - Parameters - ---------- - data : DTypeJSON - The JSON representation of a data type to match. - zarr_format : ZarrFormat - The Zarr format version to consider when matching data types. - - Returns - ------- - ZDType[TBaseDType, TBaseScalar] - The matched ZDType corresponding to the JSON representation. - - Raises - ------ - ValueError - If no matching Zarr data type is found for the given JSON data. 
- """ + try: + return val.from_dtype(dtype) + except DataTypeValidationError: + pass + raise ValueError(f"No data type wrapper found that matches dtype '{dtype}'") + def match_json(self, data: JSON) -> DTypeWrapper[Any, Any]: + self.lazy_load() for val in self.contents.values(): try: - return val.from_json(data, zarr_format=zarr_format) + return val.from_dict(data) except DataTypeValidationError: pass - raise ValueError(f"No Zarr data type found that matches {data!r}") + raise ValueError(f"No data type wrapper found that matches {data}") diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 776aea81d8..002bd100e9 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -1,303 +1,279 @@ -""" -Wrapper for native array data types. - -The ``ZDType`` class is an abstract base class for wrapping native array data types, e.g. NumPy dtypes. -``ZDType`` provides a common interface for working with data types in a way that is independent of the -underlying data type system. - -The wrapper class encapsulates a native data type. Instances of the class can be created from a -native data type instance, and a native data type instance can be created from an instance of the -wrapper class. - -The wrapper class is responsible for: -- Serializing and deserializing a native data type to Zarr V2 or Zarr V3 metadata. - This ensures that the data type can be properly stored and retrieved from array metadata. -- Serializing and deserializing scalar values to Zarr V2 or Zarr V3 metadata. This is important for - storing a fill value for an array in a manner that is valid for the data type. - -You can add support for a new data type in Zarr by subclassing ``ZDType`` wrapper class and adapt its methods -to support your native data type. The wrapper class must be added to a data type registry -(defined elsewhere) before array creation routines or array reading routines can use your new data -type. 
-""" - from __future__ import annotations from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import ( - TYPE_CHECKING, - ClassVar, - Generic, - Literal, - Self, - TypeGuard, - TypeVar, - overload, -) +from typing import TYPE_CHECKING, Any, ClassVar, Generic, Self, TypeGuard, TypeVar, cast import numpy as np +from zarr.abc.metadata import Metadata +from zarr.core.dtype.common import DataTypeValidationError + if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat - from zarr.core.dtype.common import DTypeJSON, DTypeSpec_V2, DTypeSpec_V3 -# This the upper bound for the scalar types we support. It's numpy scalars + str, -# because the new variable-length string dtype in numpy does not have a corresponding scalar type -TBaseScalar = np.generic | str | bytes -# This is the bound for the dtypes that we support. If we support non-numpy dtypes, -# then this bound will need to be widened. -TBaseDType = np.dtype[np.generic] +TScalar = TypeVar("TScalar", bound=np.generic | str) +# TODO: figure out an interface or protocol that non-numpy dtypes can use +TDType = TypeVar("TDType", bound=np.dtype[Any]) -# These two type parameters are covariant because we want -# x : ZDType[BaseDType, BaseScalar] = ZDType[SubDType, SubScalar] -# to type check -TScalar_co = TypeVar("TScalar_co", bound=TBaseScalar, covariant=True) -TDType_co = TypeVar("TDType_co", bound=TBaseDType, covariant=True) - -@dataclass(frozen=True, kw_only=True, slots=True) -class ZDType(ABC, Generic[TDType_co, TScalar_co]): +@dataclass(frozen=True, kw_only=True) +class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): """ - Abstract base class for wrapping native array data types, e.g. numpy dtypes + Abstract base class for wrapping numpy dtypes. Attributes ---------- dtype_cls : ClassVar[type[TDType]] - The wrapped dtype class. This is a class variable. + The numpy dtype class. This is a class variable. Instances of this class cannot set it. 
_zarr_v3_name : ClassVar[str] - The name given to the data type by a Zarr v3 data type specification. This is a - class variable, and it should generally be unique across different data types. + The name given to the wrapped data type by a zarr v3 data type specification. Note that this + is not necessarily the same name that will appear in metadata documents, as some data types + have names that depend on their configuration. """ - # this class will create a native data type + # this class will create a numpy dtype # mypy currently disallows class variables to contain type parameters - # but it seems OK for us to use it here: + # but it seems like it should be OK for us to use it here: # https://github.com/python/typing/discussions/1424#discussioncomment-7989934 - dtype_cls: ClassVar[type[TDType_co]] # type: ignore[misc] + dtype_cls: ClassVar[type[TDType]] # type: ignore[misc] _zarr_v3_name: ClassVar[str] @classmethod - def _check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[TDType_co]: + @abstractmethod + def _from_dtype_unsafe(cls: type[Self], dtype: TDType) -> Self: """ - Check that a native data type matches the dtype_cls class attribute. - - Used as a type guard. + Wrap a native dtype without checking. Parameters ---------- dtype : TDType - The dtype to check. + The native dtype to wrap. Returns ------- - Bool - True if the dtype matches, False otherwise. + Self + The wrapped dtype. """ - return type(dtype) is cls.dtype_cls + raise NotImplementedError @classmethod - @abstractmethod - def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self: + def from_dtype(cls: type[Self], dtype: TDType) -> Self: """ - Create a ZDType instance from a native data type. - - This method is used when taking a user-provided native data type, like a NumPy data type, - and creating the corresponding ZDType instance from them. + Wrap a dtype object. Parameters ---------- dtype : TDType - The native data type object to wrap. + The dtype object to wrap. 
Returns ------- Self - The ZDType that wraps the native data type. + The wrapped dtype. Raises ------ TypeError - If the native data type is not consistent with the wrapped data type. + If the dtype does not match the dtype_cls class attribute. """ - raise NotImplementedError # pragma: no cover + if cls.check_dtype(dtype): + return cls._from_dtype_unsafe(dtype) + raise DataTypeValidationError( + f"Invalid dtype: {dtype}. Expected an instance of {cls.dtype_cls}." + ) @abstractmethod - def to_native_dtype(self: Self) -> TDType_co: + def to_dtype(self: Self) -> TDType: """ - Return an instance of the wrapped data type. This operation inverts ``from_native_dtype``. + Return an instance of the wrapped dtype. Returns ------- TDType - The native data type wrapped by this ZDType. + The unwrapped dtype. """ - raise NotImplementedError # pragma: no cover + raise NotImplementedError - @classmethod @abstractmethod - def _from_json_v2(cls: type[Self], data: DTypeJSON) -> Self: - raise NotImplementedError # pragma: no cover + def to_dict(self) -> dict[str, JSON]: + """ + Convert the wrapped data type to a dictionary. - @classmethod - @abstractmethod - def _from_json_v3(cls: type[Self], data: DTypeJSON) -> Self: - raise NotImplementedError # pragma: no cover + Returns + ------- + dict[str, JSON] + The dictionary representation of the wrapped data type + """ + raise NotImplementedError - @classmethod - def from_json(cls: type[Self], data: DTypeJSON, *, zarr_format: ZarrFormat) -> Self: + def cast_value(self: Self, value: object) -> TScalar: """ - Create an instance of this ZDType from JSON data. + Cast a value to an instance of the scalar type. + This implementation assumes a numpy-style dtype class that has a + ``type`` method for casting scalars. Non-numpy dtypes will need to + override this method. Parameters ---------- - data : DTypeJSON - The JSON representation of the data type. - - zarr_format : ZarrFormat - The zarr format version. + value : object + The value to cast. 
Returns ------- - Self - An instance of this data type. + TScalar + The cast value. """ - if zarr_format == 2: - return cls._from_json_v2(data) - if zarr_format == 3: - return cls._from_json_v3(data) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + return cast(TScalar, self.to_dtype().type(value)) - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeSpec_V2: ... + @abstractmethod + def default_value(self) -> TScalar: + """ + Get the default value for the wrapped data type. This is a method, rather than an attribute, + because the default value for some data types may depend on parameters that are not known + until a concrete data type is wrapped. - @overload - def to_json(self, zarr_format: Literal[3]) -> DTypeSpec_V3: ... + Returns + ------- + TScalar + The default value for this data type. + """ + ... - @abstractmethod - def to_json(self, zarr_format: ZarrFormat) -> DTypeSpec_V2 | DTypeSpec_V3: + @classmethod + def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[TDType]: """ - Serialize this ZDType to JSON. + Check that a data type matches the dtype_cls class attribute. Used as a type guard. Parameters ---------- - zarr_format : ZarrFormat - The zarr format version. + dtype : TDType + The dtype to check. Returns ------- - DTypeJSON_V2 | DTypeJSON_V3 - The JSON-serializable representation of the wrapped data type + Bool + True if the dtype matches, False otherwise. """ - raise NotImplementedError # pragma: no cover + return type(dtype) is cls.dtype_cls - @abstractmethod - def _check_scalar(self, data: object) -> bool: + @classmethod + def check_json(cls: type[Self], data: dict[str, JSON]) -> TypeGuard[dict[str, JSON]]: """ - Check that an python object is a valid scalar value for the wrapped data type. + Check that a JSON representation of a data type matches the dtype_cls class attribute. Used + as a type guard. 
This base implementation checks that the input is a dictionary, + that the key "name" is in that dictionary, and that the value of "name" + matches the _zarr_v3_name class attribute. Parameters ---------- - data : object - A value to check. + data : JSON + The JSON representation of the data type. Returns ------- Bool - True if the object is valid, False otherwise. + True if the JSON representation matches, False otherwise. """ - raise NotImplementedError # pragma: no cover + return "name" in data and data["name"] == cls._zarr_v3_name - @abstractmethod - def cast_scalar(self, data: object) -> TScalar_co: + @classmethod + def from_dict(cls: type[Self], data: dict[str, JSON]) -> Self: """ - Cast a python object to the wrapped scalar type. - - The type of the provided scalar is first checked for compatibility. - If it's incompatible with the associated scalar type, a ``TypeError`` will be raised. + Wrap a JSON representation of a data type. Parameters ---------- - data : object - The python object to cast. + data : dict[str, JSON] + The JSON representation of the data type. Returns ------- - TScalar - The cast value. + Self + The wrapped data type. """ - raise NotImplementedError # pragma: no cover + if cls.check_json(data): + return cls._from_json_unsafe(data) + raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") - @abstractmethod - def default_scalar(self) -> TScalar_co: + @classmethod + def _from_json_unsafe(cls: type[Self], data: dict[str, JSON]) -> Self: """ - Get the default scalar value for the wrapped data type. + Wrap a JSON representation of a data type. - This is a method, rather than an attribute, because the default value for some data types depends on parameters that are - not known until a concrete data type is wrapped. For example, data types parametrized by a - length like fixed-length strings or bytes will generate scalars consistent with that length. 
+ Parameters + ---------- + data : dict[str, JSON] + The JSON representation of the data type. Returns ------- - TScalar - The default value for this data type. + Self + The wrapped data type. """ - raise NotImplementedError # pragma: no cover + config = data.get("configuration", {}) + return cls(**config) - @abstractmethod - def from_json_scalar(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar_co: + def get_name(self, zarr_format: ZarrFormat) -> str: """ - Read a JSON-serializable value as a scalar. + Return the name of the wrapped data type. Parameters ---------- - data : JSON - A JSON representation of a scalar value. zarr_format : ZarrFormat - The zarr format version. This is specified because the JSON serialization of scalars - differs between Zarr V2 and Zarr V3. + The zarr format version. Returns ------- - TScalar - The deserialized scalar value. + str + The name of the wrapped data type. + + Notes + ----- + This is a method, rather than an attribute, because the name of the data type may depend on + parameters that are not known until a concrete data type is wrapped. + + As the names of data types vary between zarr versions, this method takes a ``zarr_format`` + parameter """ - raise NotImplementedError # pragma: no cover + if zarr_format == 2: + return self.to_dtype().str + return self._zarr_v3_name @abstractmethod - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> JSON: + def to_json_value(self, data: TScalar, *, zarr_format: ZarrFormat) -> JSON: """ - Serialize a python object to the JSON representation of a scalar. - - The value will first be cast to the scalar type associated with this ZDType, then serialized - to JSON. + Convert a single value to JSON-serializable format. Parameters ---------- data : object The value to convert. zarr_format : ZarrFormat - The zarr format version. This is specified because the JSON serialization of scalars - differs between Zarr V2 and Zarr V3. + The zarr format version. 
Returns ------- JSON - The JSON-serialized scalar. + The JSON-serializable format. """ - raise NotImplementedError # pragma: no cover + raise NotImplementedError + @abstractmethod + def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: + """ + Read a JSON-serializable value as a scalar. -def scalar_failed_type_check_msg( - cls_instance: ZDType[TBaseDType, TBaseScalar], bad_scalar: object -) -> str: - """ - Generate an error message reporting that a particular value failed a type check when attempting - to cast that value to a scalar. - """ - return ( - f"The value {bad_scalar!r} failed a type check. " - f"It cannot be safely cast to a scalar compatible with {cls_instance}. " - f"Consult the documentation for {cls_instance} to determine the possible values that can " - "be cast to scalars of the wrapped data type." - ) + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + TScalar + The numpy scalar. 
+ """ + raise NotImplementedError diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py index 33aa22b398..e69de29bb2 100644 --- a/src/zarr/core/metadata/dtype.py +++ b/src/zarr/core/metadata/dtype.py @@ -1,808 +0,0 @@ -from __future__ import annotations - -import base64 -from abc import ABC, abstractmethod -from collections.abc import Sequence -from dataclasses import dataclass, field, replace -from importlib.metadata import EntryPoint -from typing import ( - TYPE_CHECKING, - Any, - ClassVar, - Generic, - Literal, - Self, - TypeGuard, - TypeVar, - cast, - get_args, -) - -import numpy as np -import numpy.typing as npt - -from zarr.abc.metadata import Metadata -from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING - -if TYPE_CHECKING: - from zarr.core.common import JSON, ZarrFormat - -Endianness = Literal["little", "big", "native"] -DataTypeFlavor = Literal["boolean", "numeric", "string", "bytes"] -JSONFloat = float | Literal["NaN", "Infinity", "-Infinity"] - - -def endianness_to_numpy_str(endianness: Endianness | None) -> Literal[">", "<", "=", "|"]: - match endianness: - case "little": - return "<" - case "big": - return ">" - case "native": - return "=" - case None: - return "|" - raise ValueError( - f"Invalid endianness: {endianness}. Expected one of {get_args(endianness)} or None" - ) - - -def check_json_bool(data: JSON) -> TypeGuard[bool]: - """ - Check if a JSON value represents a boolean. - """ - return bool(isinstance(data, bool)) - - -def check_json_str(data: JSON) -> TypeGuard[str]: - """ - Check if a JSON value represents a string. - """ - return bool(isinstance(data, str)) - - -def check_json_int(data: JSON) -> TypeGuard[int]: - """ - Check if a JSON value represents an integer. 
- """ - return bool(isinstance(data, int)) - - -def check_json_float_v2(data: JSON) -> TypeGuard[float]: - if data == "NaN" or data == "Infinity" or data == "-Infinity": - return True - else: - return bool(isinstance(data, float | int)) - - -def check_json_float_v3(data: JSON) -> TypeGuard[float]: - # TODO: handle the special JSON serialization of different NaN values - return check_json_float_v2(data) - - -def check_json_float(data: JSON, zarr_format: ZarrFormat) -> TypeGuard[float]: - if zarr_format == 2: - return check_json_float_v2(data) - else: - return check_json_float_v3(data) - - -def check_json_complex_float_v3(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: - """ - Check if a JSON value represents a complex float, as per the zarr v3 spec - """ - return ( - not isinstance(data, str) - and isinstance(data, Sequence) - and len(data) == 2 - and check_json_float_v3(data[0]) - and check_json_float_v3(data[1]) - ) - - -def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: - """ - Check if a JSON value represents a complex float, as per the behavior of zarr-python 2.x - """ - return ( - not isinstance(data, str) - and isinstance(data, Sequence) - and len(data) == 2 - and check_json_float_v2(data[0]) - and check_json_float_v2(data[1]) - ) - - -def check_json_complex_float( - data: JSON, zarr_format: ZarrFormat -) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: - if zarr_format == 2: - return check_json_complex_float_v2(data) - else: - return check_json_complex_float_v3(data) - - -def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloat: - if np.isnan(data): - return "NaN" - elif np.isinf(data): - return "Infinity" if data > 0 else "-Infinity" - return float(data) - - -def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloat: - # v3 can in principle handle distinct NaN values, but numpy does not represent these explicitly - # so we just reuse the v2 routine here - return float_to_json_v2(data) - - -def 
float_to_json(data: float | np.floating[Any], zarr_format: ZarrFormat) -> JSONFloat: - """ - convert a float to JSON as per the zarr v3 spec - """ - if zarr_format == 2: - return float_to_json_v2(data) - else: - return float_to_json_v3(data) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") - - -def complex_to_json_v2(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: - return float_to_json_v2(data.real), float_to_json_v2(data.imag) - - -def complex_to_json_v3(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: - return float_to_json_v3(data.real), float_to_json_v3(data.imag) - - -def complex_to_json( - data: complex | np.complexfloating[Any], zarr_format: ZarrFormat -) -> tuple[JSONFloat, JSONFloat] | JSONFloat: - if zarr_format == 2: - return complex_to_json_v2(data) - else: - return complex_to_json_v3(data) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") - - -def structured_scalar_to_json(data: bytes, zarr_format: ZarrFormat) -> str: - if zarr_format == 2: - return base64.b64encode(data).decode("ascii") - raise NotImplementedError(f"Invalid zarr format: {zarr_format}. Expected 2.") - - -def structured_scalar_from_json(data: str, zarr_format: ZarrFormat) -> bytes: - if zarr_format == 2: - return base64.b64decode(data.encode("ascii")) - raise NotImplementedError(f"Invalid zarr format: {zarr_format}. 
Expected 2.") - - -def float_from_json_v2(data: JSONFloat) -> float: - match data: - case "NaN": - return float("nan") - case "Infinity": - return float("inf") - case "-Infinity": - return float("-inf") - case _: - return float(data) - - -def float_from_json_v3(data: JSONFloat) -> float: - # todo: support the v3-specific NaN handling - return float_from_json_v2(data) - - -def float_from_json(data: JSONFloat, zarr_format: ZarrFormat) -> float: - if zarr_format == 2: - return float_from_json_v2(data) - else: - return float_from_json_v3(data) - - -def complex_from_json_v2(data: JSONFloat, dtype: Any) -> np.complexfloating[Any, Any]: - return dtype.type(complex(*data)) - - -def complex_from_json_v3( - data: tuple[JSONFloat, JSONFloat], dtype: Any -) -> np.complexfloating[Any, Any]: - return dtype.type(complex(*data)) - - -def complex_from_json( - data: tuple[JSONFloat, JSONFloat], dtype: Any, zarr_format: ZarrFormat -) -> np.complexfloating: - if zarr_format == 2: - return complex_from_json_v2(data, dtype) - else: - if check_json_complex_float_v3(data): - return complex_from_json_v3(data, dtype) - else: - raise TypeError(f"Invalid type: {data}. Expected a sequence of two numbers.") - raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") - - -def datetime_to_json(data: np.datetime64[Any]) -> int: - return data.view(np.int64).item() - - -def datetime_from_json(data: int, unit: DateUnit | TimeUnit) -> np.datetime64[Any]: - return np.int64(data).view(f"datetime64[{unit}]") - - -TScalar = TypeVar("TScalar", bound=np.generic | str, covariant=True) -# TODO: figure out an interface or protocol that non-numpy dtypes can -TDType = TypeVar("TDType", bound=np.dtype[Any]) - - -@dataclass(frozen=True, kw_only=True) -class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): - name: ClassVar[str] - dtype_cls: ClassVar[type[TDType]] # this class will create a numpy dtype - endianness: Endianness | None = "native" - - def to_dict(self) -> dict[str, JSON]: - return {"name": self.name} - - def cast_value(self: Self, value: object) -> TScalar: - return cast(TScalar, self.unwrap().type(value)) - - @abstractmethod - def default_value(self) -> TScalar: ... - - @classmethod - def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[TDType]: - """ - Check that a dtype matches the dtype_cls class attribute - """ - return type(dtype) is cls.dtype_cls - - @classmethod - def wrap(cls: type[Self], dtype: TDType) -> Self: - if cls.check_dtype(dtype): - return cls._wrap_unsafe(dtype) - raise TypeError(f"Invalid dtype: {dtype}. Expected an instance of {cls.dtype_cls}.") - - @classmethod - @abstractmethod - def _wrap_unsafe(cls: type[Self], dtype: TDType) -> Self: - raise NotImplementedError - - def unwrap(self: Self) -> TDType: - endian_str = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(endian_str) - - def with_endianness(self: Self, endianness: Endianness) -> Self: - return replace(self, endianness=endianness) - - @abstractmethod - def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> JSON: - """ - Convert a single value to JSON-serializable format. Depends on the zarr format. 
- """ - raise NotImplementedError - - @abstractmethod - def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: - """ - Read a JSON-serializable value as a numpy scalar - """ - raise NotImplementedError - - -@dataclass(frozen=True, kw_only=True) -class Bool(DTypeWrapper[np.dtypes.BoolDType, np.bool_]): - name = "bool" - dtype_cls: ClassVar[type[np.dtypes.BoolDType]] = np.dtypes.BoolDType - - def default_value(self) -> np.bool_: - return np.False_ - - @classmethod - def _wrap_unsafe(cls, dtype: np.dtypes.BoolDType) -> Self: - return cls() - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> bool: - return bool(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: - if check_json_bool(data): - return self.unwrap().type(data) - raise TypeError(f"Invalid type: {data}. Expected a boolean.") - - -class IntWrapperBase(DTypeWrapper[TDType, TScalar]): - def default_value(self) -> TScalar: - return self.unwrap().type(0) - - @classmethod - def _wrap_unsafe(cls, dtype: TDType) -> Self: - return cls() - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: - if check_json_int(data): - return self.unwrap().type(data) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") - - -@dataclass(frozen=True, kw_only=True) -class Int8(IntWrapperBase[np.dtypes.Int8DType, np.int8]): - dtype_cls = np.dtypes.Int8DType - name = "int8" - - -@dataclass(frozen=True, kw_only=True) -class UInt8(IntWrapperBase[np.dtypes.UInt8DType, np.uint8]): - dtype_cls = np.dtypes.UInt8DType - name = "uint8" - - -@dataclass(frozen=True, kw_only=True) -class Int16(IntWrapperBase[np.dtypes.Int16DType, np.int16]): - dtype_cls = np.dtypes.Int16DType - name = "int16" - - -@dataclass(frozen=True, kw_only=True) -class UInt16(IntWrapperBase[np.dtypes.UInt16DType, np.uint16]): - dtype_cls = np.dtypes.UInt16DType - name = "uint16" - - -@dataclass(frozen=True, kw_only=True) -class Int32(IntWrapperBase[np.dtypes.Int32DType, np.int32]): - dtype_cls = np.dtypes.Int32DType - name = "int32" - - -@dataclass(frozen=True, kw_only=True) -class UInt32(IntWrapperBase[np.dtypes.UInt32DType, np.uint32]): - dtype_cls = np.dtypes.UInt32DType - name = "uint32" - - -@dataclass(frozen=True, kw_only=True) -class Int64(IntWrapperBase[np.dtypes.Int64DType, np.int64]): - dtype_cls = np.dtypes.Int64DType - name = "int64" - - -@dataclass(frozen=True, kw_only=True) -class UInt64(IntWrapperBase[np.dtypes.UInt64DType, np.uint64]): - dtype_cls = np.dtypes.UInt64DType - name = "uint64" - - -class FloatWrapperBase(DTypeWrapper[TDType, TScalar]): - def default_value(self) -> TScalar: - return self.unwrap().type(0.0) - - @classmethod - def _wrap_unsafe(cls, dtype: TDType) -> Self: - return cls() - - def to_json_value(self, data: np.generic, zarr_format: ZarrFormat) -> JSONFloat: - return float_to_json(data, zarr_format) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: - if check_json_float_v2(data): - return self.unwrap().type(float_from_json(data, zarr_format)) - raise TypeError(f"Invalid type: {data}. 
Expected a float.") - - -@dataclass(frozen=True, kw_only=True) -class Float16(FloatWrapperBase[np.dtypes.Float16DType, np.float16]): - dtype_cls = np.dtypes.Float16DType - name = "float16" - - -@dataclass(frozen=True, kw_only=True) -class Float32(FloatWrapperBase[np.dtypes.Float32DType, np.float32]): - dtype_cls = np.dtypes.Float32DType - name = "float32" - - -@dataclass(frozen=True, kw_only=True) -class Float64(FloatWrapperBase[np.dtypes.Float64DType, np.float64]): - dtype_cls = np.dtypes.Float64DType - name = "float64" - - -@dataclass(frozen=True, kw_only=True) -class Complex64(DTypeWrapper[np.dtypes.Complex64DType, np.complex64]): - dtype_cls = np.dtypes.Complex64DType - name = "complex64" - - def default_value(self) -> np.complex64: - return np.complex64(0.0) - - @classmethod - def _wrap_unsafe(cls, dtype: np.dtypes.Complex64DType) -> Self: - return cls() - - def to_json_value( - self, data: np.generic, zarr_format: ZarrFormat - ) -> tuple[JSONFloat, JSONFloat]: - return complex_to_json(data, zarr_format) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex64: - if check_json_complex_float_v3(data): - return complex_from_json(data, dtype=self.unwrap(), zarr_format=zarr_format) - raise TypeError(f"Invalid type: {data}. 
Expected a complex float.") - - -@dataclass(frozen=True, kw_only=True) -class Complex128(DTypeWrapper[np.dtypes.Complex128DType, np.complex128]): - dtype_cls = np.dtypes.Complex128DType - name = "complex128" - - def default_value(self) -> np.complex128: - return np.complex128(0.0) - - @classmethod - def _wrap_unsafe(cls, dtype: np.dtypes.Complex128DType) -> Self: - return cls() - - def to_json_value( - self, data: np.generic, zarr_format: ZarrFormat - ) -> tuple[JSONFloat, JSONFloat]: - return complex_to_json(data, zarr_format) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex128: - if check_json_complex_float_v3(data): - return complex_from_json(data, dtype=self.unwrap(), zarr_format=zarr_format) - raise TypeError(f"Invalid type: {data}. Expected a complex float.") - - -@dataclass(frozen=True, kw_only=True) -class FlexibleWrapperBase(DTypeWrapper[TDType, TScalar]): - item_size_bits: ClassVar[int] - length: int = 0 - - @classmethod - def _wrap_unsafe(cls, dtype: TDType) -> Self: - return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) - - def unwrap(self) -> TDType: - endianness_code = endianness_to_numpy_str(self.endianness) - return self.dtype_cls(self.length).newbyteorder(endianness_code) - - -@dataclass(frozen=True, kw_only=True) -class FixedLengthAsciiString(FlexibleWrapperBase[np.dtypes.BytesDType, np.bytes_]): - dtype_cls = np.dtypes.BytesDType - name = "numpy.static_byte_string" - item_size_bits = 8 - - def default_value(self) -> np.bytes_: - return np.bytes_(b"") - - def to_dict(self) -> dict[str, JSON]: - return {"name": self.name, "configuration": {"length": self.length}} - - def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(data).decode("ascii") - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: - if check_json_str(data): - return self.unwrap().type(base64.standard_b64decode(data.encode("ascii"))) - raise 
TypeError(f"Invalid type: {data}. Expected a string.") - - -@dataclass(frozen=True, kw_only=True) -class StaticRawBytes(FlexibleWrapperBase[np.dtypes.VoidDType, np.void]): - dtype_cls = np.dtypes.VoidDType - name = "r*" - item_size_bits = 8 - - def default_value(self) -> np.void: - return self.cast_value(("\x00" * self.length).encode("ascii")) - - def to_dict(self) -> dict[str, JSON]: - return {"name": f"r{self.length * self.item_size_bits}"} - - @classmethod - def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[TDType]: - """ - Reject structured dtypes by ensuring that dtype.fields is None - """ - return type(dtype) is cls.dtype_cls and dtype.fields is None - - def unwrap(self) -> np.dtypes.VoidDType: - # this needs to be overridden because numpy does not allow creating a void type - # by invoking np.dtypes.VoidDType directly - endianness_code = endianness_to_numpy_str(self.endianness) - return np.dtype(f"{endianness_code}V{self.length}") - - def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(data).decode("ascii") - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: - # todo: check that this is well-formed - return self.unwrap().type(base64.standard_b64decode(data)) - - -@dataclass(frozen=True, kw_only=True) -class FixedLengthUnicodeString(FlexibleWrapperBase[np.dtypes.StrDType, np.str_]): - dtype_cls = np.dtypes.StrDType - name = "numpy.static_unicode_string" - item_size_bits = 32 # UCS4 is 32 bits per code point - - def default_value(self) -> np.str_: - return np.str_("") - - def to_dict(self) -> dict[str, JSON]: - return {"name": self.name, "configuration": {"length": self.length}} - - def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: - return str(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. 
Expected a string.") - return self.unwrap().type(data) - - -if _NUMPY_SUPPORTS_VLEN_STRING: - - @dataclass(frozen=True, kw_only=True) - class VariableLengthString(DTypeWrapper[np.dtypes.StringDType, str]): - dtype_cls = np.dtypes.StringDType - name = "numpy.vlen_string" - - def default_value(self) -> str: - return "" - - @classmethod - def _wrap_unsafe(cls, dtype: np.dtypes.StringDType) -> Self: - return cls() - - def to_dict(self) -> dict[str, JSON]: - return {"name": self.name} - - def unwrap(self) -> np.dtypes.StringDType: - # StringDType does not have endianness, so we ignore it here - return self.dtype_cls() - - def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: - return str(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. Expected a string.") - return self.unwrap().type(data) - -else: - - @dataclass(frozen=True, kw_only=True) - class VariableLengthString(DTypeWrapper[np.dtypes.ObjectDType, str]): - dtype_cls = np.dtypes.ObjectDType - name = "numpy.vlen_string" - endianness: Endianness = field(default=None) - - def default_value(self) -> str: - return "" - - def __post_init__(self) -> None: - if self.endianness is not None: - raise ValueError("VariableLengthString does not support endianness.") - - def to_dict(self) -> dict[str, JSON]: - return {"name": self.name} - - @classmethod - def _wrap_unsafe(cls, dtype: np.dtypes.ObjectDType) -> Self: - return cls() - - def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: - return str(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: - """ - String literals pass through - """ - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. 
Expected a string.") - return data - - -DateUnit = Literal["Y", "M", "W", "D"] -TimeUnit = Literal["h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] - - -@dataclass(frozen=True, kw_only=True) -class DateTime64(DTypeWrapper[np.dtypes.DateTime64DType, np.datetime64]): - dtype_cls = np.dtypes.DateTime64DType - name = "numpy/datetime64" - unit: DateUnit | TimeUnit = "s" - - def default_value(self) -> np.datetime64: - return np.datetime64("NaT") - - @classmethod - def _wrap_unsafe(cls, dtype: np.dtypes.DateTime64DType) -> Self: - unit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] - return cls(unit=unit) - - def cast_value(self, value: object) -> np.datetime64: - return self.unwrap().type(value, self.unit) - - def unwrap(self) -> np.dtypes.DateTime64DType: - return np.dtype(f"datetime64[{self.unit}]").newbyteorder( - endianness_to_numpy_str(self.endianness) - ) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: - if check_json_int(data): - return datetime_from_json(data, self.unit) - raise TypeError(f"Invalid type: {data}. Expected an integer.") - - def to_json_value(self, data: np.datetime64, *, zarr_format: ZarrFormat) -> int: - return datetime_to_json(data) - - -@dataclass(frozen=True, kw_only=True) -class Structured(DTypeWrapper[np.dtypes.VoidDType, np.void]): - dtype_cls = np.dtypes.VoidDType - name = "numpy/struct" - fields: tuple[tuple[str, DTypeWrapper[Any, Any], int], ...] 
- - def default_value(self) -> np.void: - return np.array([0], dtype=self.unwrap())[0] - - @classmethod - def check_dtype(cls, dtype: np.dtypes.DTypeLike) -> TypeGuard[np.dtypes.VoidDType]: - """ - Check that this dtype is a numpy structured dtype - """ - return super().check_dtype(dtype) and dtype.fields is not None - - @classmethod - def _wrap_unsafe(cls, dtype: np.dtypes.VoidDType) -> Self: - fields: list[tuple[str, DTypeWrapper[Any, Any], int]] = [] - - if dtype.fields is None: - raise ValueError("numpy dtype has no fields") - - for key, (dtype_instance, offset) in dtype.fields.items(): - dtype_wrapped = data_type_registry.match_dtype(dtype_instance) - fields.append((key, dtype_wrapped, offset)) - - return cls(fields=tuple(fields)) - - def to_dict(self) -> dict[str, JSON]: - base_dict = super().to_dict() - if base_dict.get("configuration", {}) != {}: - raise ValueError( - "This data type wrapper cannot inherit from a data type wrapper that defines a configuration for its dict serialization" - ) - field_configs = [ - (f_name, f_dtype.to_dict(), f_offset) for f_name, f_dtype, f_offset in self.fields - ] - base_dict["configuration"] = {"fields": field_configs} - return base_dict - - @classmethod - def from_dict(cls, data: dict[str, JSON]) -> Self: - fields = tuple( - (f_name, get_data_type_from_dict(f_dtype), f_offset) - for f_name, f_dtype, f_offset in data["fields"] - ) - return cls(fields=fields) - - def unwrap(self) -> np.dtypes.VoidDType: - return np.dtype([(key, dtype.unwrap()) for (key, dtype, _) in self.fields]) - - def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: - return structured_scalar_to_json(data.tobytes(), zarr_format) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. 
Expected a string.") - as_bytes = structured_scalar_from_json(data, zarr_format=zarr_format) - dtype = self.unwrap() - return np.array([as_bytes], dtype=dtype.str).view(dtype)[0] - - -def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper[Any, Any]: - if dtype in (str, "str"): - if _NUMPY_SUPPORTS_VLEN_STRING: - np_dtype = np.dtype("T") - else: - np_dtype = np.dtype("O") - else: - np_dtype = np.dtype(dtype) - data_type_registry.lazy_load() - return data_type_registry.match_dtype(np_dtype) - - -def get_data_type_from_dict(dtype: dict[str, JSON]) -> DTypeWrapper[Any.Any]: - data_type_registry.lazy_load() - dtype_name = dtype["name"] - dtype_cls = data_type_registry.get(dtype_name) - if dtype_cls is None: - raise ValueError(f"No data type class matching name {dtype_name}") - return dtype_cls.from_dict(dtype.get("configuration", {})) - - -def resolve_dtype( - dtype: npt.DTypeLike | DTypeWrapper[Any, Any] | dict[str, JSON], -) -> DTypeWrapper[Any, Any]: - if isinstance(dtype, DTypeWrapper): - return dtype - elif isinstance(dtype, dict): - return get_data_type_from_dict(dtype) - else: - return get_data_type_from_numpy(dtype) - - -def get_data_type_by_name( - dtype: str, configuration: dict[str, JSON] | None = None -) -> DTypeWrapper[Any, Any]: - data_type_registry.lazy_load() - if configuration is None: - _configuration = {} - else: - _configuration = configuration - maybe_dtype_cls = data_type_registry.get(dtype) - if maybe_dtype_cls is None: - raise ValueError(f"No data type class matching name {dtype}") - return maybe_dtype_cls.from_dict(_configuration) - - -@dataclass(frozen=True, kw_only=True) -class DataTypeRegistry: - contents: dict[str, type[DTypeWrapper[Any, Any]]] = field(default_factory=dict, init=False) - lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) - - def lazy_load(self) -> None: - for e in self.lazy_load_list: - self.register(e.load()) - - self.lazy_load_list.clear() - - def register(self: Self, cls: 
type[DTypeWrapper[Any, Any]]) -> None: - # don't register the same dtype twice - if cls.name not in self.contents or self.contents[cls.name] != cls: - self.contents[cls.name] = cls - - def get(self, key: str) -> type[DTypeWrapper[Any, Any]]: - return self.contents[key] - - def match_dtype(self, dtype: TDType) -> DTypeWrapper[Any, Any]: - self.lazy_load() - for val in self.contents.values(): - try: - return val.wrap(dtype) - except TypeError: - pass - raise ValueError(f"No data type wrapper found that matches dtype '{dtype}'") - - -def register_data_type(cls: type[DTypeWrapper[Any, Any]]) -> None: - data_type_registry.register(cls) - - -data_type_registry = DataTypeRegistry() - -INTEGER_DTYPE = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 -FLOAT_DTYPE = Float16 | Float32 | Float64 -COMPLEX_DTYPE = Complex64 | Complex128 -STRING_DTYPE = FixedLengthUnicodeString | VariableLengthString | FixedLengthAsciiString -DTYPE = ( - Bool - | INTEGER_DTYPE - | FLOAT_DTYPE - | COMPLEX_DTYPE - | STRING_DTYPE - | StaticRawBytes - | Structured - | DateTime64 -) -for dtype in get_args(DTYPE): - register_data_type(dtype) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 6b1f364a08..517758a5ee 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -7,11 +7,8 @@ import numcodecs.abc from zarr.abc.metadata import Metadata -from zarr.core.metadata.dtype import ( - DTypeWrapper, - Structured, - get_data_type_from_numpy, -) +from zarr.core.dtype import get_data_type_from_numpy +from zarr.core.dtype.wrapper import DTypeWrapper if TYPE_CHECKING: from typing import Literal, Self @@ -98,7 +95,7 @@ def __init__( order_parsed = parse_indexing_order(order) dimension_separator_parsed = parse_separator(dimension_separator) filters_parsed = parse_filters(filters) - fill_value_parsed = parse_fill_value(fill_value, dtype=dtype.unwrap()) + fill_value_parsed = parse_fill_value(fill_value, dtype=dtype.to_dtype()) attributes_parsed = 
parse_attributes(attributes) object.__setattr__(self, "shape", shape_parsed) @@ -141,9 +138,9 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: _data = data.copy() # Check that the zarr_format attribute is correct. _ = parse_zarr_format(_data.pop("zarr_format")) - dtype = get_data_type_from_numpy(parse_dtype(_data["dtype"])) + dtype = get_data_type_from_numpy(_data["dtype"]) _data["dtype"] = dtype - if dtype.unwrap().kind in "SV": + if dtype.to_dtype().kind in "SV": fill_value_encoded = _data.get("fill_value") if fill_value_encoded is not None: fill_value = base64.standard_b64decode(fill_value_encoded) @@ -200,13 +197,7 @@ def to_dict(self) -> dict[str, JSON]: fill_value = self.dtype.to_json_value(self.fill_value, zarr_format=2) zarray_dict["fill_value"] = fill_value - _ = zarray_dict.pop("dtype") - dtype_json: JSON - if isinstance(self.dtype, Structured): - dtype_json = tuple(self.dtype.unwrap().descr) - else: - dtype_json = self.dtype.unwrap().str - zarray_dict["dtype"] = dtype_json + zarray_dict["dtype"] = self.dtype.get_name(zarr_format=2) return zarray_dict diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 880adddac5..297c418214 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -4,10 +4,9 @@ from zarr.abc.metadata import Metadata from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.metadata.dtype import ( +from zarr.core.dtype import ( DTypeWrapper, VariableLengthString, - get_data_type_by_name, get_data_type_from_dict, ) @@ -87,7 +86,7 @@ def validate_array_bytes_codec(codecs: tuple[Codec, ...]) -> ArrayBytesCodec: return abcs[0] -def validate_codecs(codecs: tuple[Codec, ...], dtype: DTypeWrapper) -> None: +def validate_codecs(codecs: tuple[Codec, ...], dtype: DTypeWrapper[Any, Any]) -> None: """Check that the codecs are valid for the given dtype""" from zarr.codecs.sharding import ShardingCodec @@ -145,7 +144,7 @@ class ArrayV3MetadataDict(TypedDict): 
@dataclass(frozen=True, kw_only=True) class ArrayV3Metadata(Metadata): shape: ChunkCoords - data_type: DTypeWrapper + data_type: DTypeWrapper[Any, Any] chunk_grid: ChunkGrid chunk_key_encoding: ChunkKeyEncoding fill_value: Any @@ -160,7 +159,7 @@ def __init__( self, *, shape: Iterable[int], - data_type: DTypeWrapper, + data_type: DTypeWrapper[Any, Any], chunk_grid: dict[str, JSON] | ChunkGrid, chunk_key_encoding: ChunkKeyEncodingLike, fill_value: object, @@ -180,14 +179,14 @@ def __init__( chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) - fill_value_parsed = data_type.unwrap().type(fill_value) + fill_value_parsed = data_type.to_dtype().type(fill_value) attributes_parsed = parse_attributes(attributes) codecs_parsed_partial = parse_codecs(codecs) storage_transformers_parsed = parse_storage_transformers(storage_transformers) array_spec = ArraySpec( shape=shape_parsed, - dtype=data_type.unwrap(), + dtype=data_type.to_dtype(), fill_value=fill_value_parsed, config=ArrayConfig.from_dict({}), # TODO: config is not needed here. prototype=default_buffer_prototype(), # TODO: prototype is not needed here. 
@@ -222,7 +221,7 @@ def _validate_metadata(self) -> None: raise ValueError("`fill_value` is required.") for codec in self.codecs: codec.validate( - shape=self.shape, dtype=self.data_type.unwrap(), chunk_grid=self.chunk_grid + shape=self.shape, dtype=self.data_type.to_dtype(), chunk_grid=self.chunk_grid ) @property @@ -296,9 +295,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: data_type_json = _data.pop("data_type") if isinstance(data_type_json, str): - # check that the data_type attribute is valid - data_type = get_data_type_by_name(data_type_json) - + data_type = get_data_type_from_dict({"name": data_type_json}) else: data_type = get_data_type_from_dict(data_type_json) diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 12281483ef..eb345b24b1 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Any, Generic, TypeVar from zarr.core.config import BadConfigError, config -from zarr.core.metadata.dtype import data_type_registry +from zarr.core.dtype import data_type_registry if TYPE_CHECKING: from importlib.metadata import EntryPoint diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index d0726c3dd9..d0e54eeb51 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -16,8 +16,8 @@ from zarr.core.array import Array from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding -from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype import get_data_type_from_native_dtype +from zarr.core.common import ZarrFormat +from zarr.core.dtype import parse_data_type from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata from zarr.core.sync import sync from zarr.storage import MemoryStore, StoreLike @@ -138,8 +138,8 @@ def array_metadata( shape = draw(array_shapes()) ndim = len(shape) chunk_shape = draw(array_shapes(min_dims=ndim, max_dims=ndim)) - np_dtype = draw(dtypes()) - 
dtype = get_data_type_from_native_dtype(np_dtype) + np_dtype = draw(v3_dtypes()) + dtype = parse_data_type(np_dtype) fill_value = draw(npst.from_dtype(np_dtype)) if zarr_format == 2: return ArrayV2Metadata( diff --git a/tests/conftest.py b/tests/conftest.py index 0112a07055..c164168750 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -21,7 +21,7 @@ from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition from zarr.core.common import JSON, parse_shapelike from zarr.core.config import config as zarr_config -from zarr.core.metadata.dtype import get_data_type_from_numpy +from zarr.core.dtype import get_data_type_from_numpy from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync @@ -289,7 +289,7 @@ def create_array_metadata( array_shape=shape_parsed, shard_shape=shards, chunk_shape=chunks, - item_size=dtype_parsed.unwrap().itemsize, + item_size=dtype_parsed.to_dtype().itemsize, ) if order is None: diff --git a/tests/test_array.py b/tests/test_array.py index 97c3a5b572..54a2db160b 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -40,28 +40,10 @@ ) from zarr.core.buffer import NDArrayLike, NDArrayLikeOrScalar, default_buffer_prototype from zarr.core.chunk_grids import _auto_partition -from zarr.core.chunk_key_encodings import ChunkKeyEncodingParams -from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype import ( - DateTime64, - Float32, - Float64, - Int16, - Structured, - TimeDelta64, - UInt8, - VariableLengthBytes, - VariableLengthUTF8, - ZDType, - parse_data_type, -) -from zarr.core.dtype.common import ENDIANNESS_STR, EndiannessStr -from zarr.core.dtype.npy.common import NUMPY_ENDIANNESS_STR, endianness_from_numpy_str -from zarr.core.dtype.npy.string import UTF8Base +from zarr.core.common import JSON, MemoryOrder, ZarrFormat +from zarr.core.dtype import get_data_type_from_numpy from zarr.core.group import AsyncGroup from zarr.core.indexing import 
BasicIndexer, ceildiv -from zarr.core.metadata.dtype import get_data_type_from_numpy -from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError from zarr.storage import LocalStore, MemoryStore, StorePath @@ -69,6 +51,8 @@ from .test_dtype.conftest import zdtype_examples if TYPE_CHECKING: + from zarr.core.array_spec import ArrayConfigLike + from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata @@ -1081,7 +1065,7 @@ async def test_v3_chunk_encoding( filters=filters, compressors=compressors, serializer="auto", - dtype=arr.metadata.data_type, + dtype=arr.metadata.data_type, # type: ignore[union-attr] ) assert arr.filters == filters_expected assert arr.compressors == compressors_expected @@ -1306,7 +1290,7 @@ async def test_with_data(impl: Literal["sync", "async"], store: Store) -> None: elif impl == "async": arr = await create_array(store, name=name, data=data, zarr_format=3) stored = await arr._get_selection( - BasicIndexer(..., shape=arr.shape, chunk_grid=arr.metadata.chunk_grid), + BasicIndexer(..., shape=arr.shape, chunk_grid=arr.chunk_grid), prototype=default_buffer_prototype(), ) else: diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index 4234eac3d0..723450b680 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -8,7 +8,7 @@ from zarr.abc.codec import Codec from zarr.abc.store import Store from zarr.codecs import ZstdCodec -from zarr.core.metadata.dtype import get_data_type_from_numpy +from zarr.core.dtype import get_data_type_from_numpy from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING from zarr.storage import StorePath diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index c831bf9a9e..3a8cd5bb8b 100644 --- a/tests/test_metadata/test_consolidated.py +++ 
b/tests/test_metadata/test_consolidated.py @@ -21,7 +21,6 @@ from zarr.core.dtype import parse_data_type from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV3Metadata -from zarr.core.metadata.dtype import get_data_type_from_numpy from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.storage import StorePath @@ -505,7 +504,7 @@ async def test_consolidated_metadata_backwards_compatibility( async def test_consolidated_metadata_v2(self): store = zarr.storage.MemoryStore() g = await AsyncGroup.from_store(store, attributes={"key": "root"}, zarr_format=2) - dtype = get_data_type_from_numpy("uint8") + dtype = parse_data_type("uint8") await g.create_array(name="a", shape=(1,), attributes={"key": "a"}, dtype=dtype) g1 = await g.create_group(name="g1", attributes={"key": "g1"}) await g1.create_group(name="g2", attributes={"key": "g2"}) diff --git a/tests/test_metadata/test_dtype.py b/tests/test_metadata/test_dtype.py index 8a1bcdedd1..ee19cdf845 100644 --- a/tests/test_metadata/test_dtype.py +++ b/tests/test_metadata/test_dtype.py @@ -5,15 +5,19 @@ import numpy as np import pytest -from zarr.core.metadata.dtype import ( +from zarr.core.dtype import ( DTYPE, + DTypeWrapper, + VariableLengthString, + data_type_registry, +) +from zarr.core.dtype._numpy import ( Bool, Complex64, Complex128, - DataTypeRegistry, DateTime64, - DTypeWrapper, FixedLengthAsciiString, + FixedLengthBytes, FixedLengthUnicodeString, Float16, Float32, @@ -22,15 +26,14 @@ Int16, Int32, Int64, - StaticRawBytes, Structured, UInt8, UInt16, UInt32, UInt64, - VariableLengthString, - data_type_registry, ) +from zarr.core.dtype.common import DataTypeValidationError +from zarr.core.dtype.registry import DataTypeRegistry @pytest.fixture @@ -65,7 +68,7 @@ def dtype_registry() -> DataTypeRegistry: (Complex128, "complex128"), (FixedLengthUnicodeString, "U"), (FixedLengthAsciiString, "S"), - (StaticRawBytes, "V"), + (FixedLengthBytes, "V"), (VariableLengthString, 
VLEN_STRING_CODE), (Structured, np.dtype([("a", np.float64), ("b", np.int8)])), (DateTime64, "datetime64[s]"), @@ -79,23 +82,23 @@ def test_wrap(wrapper_cls: type[DTypeWrapper[Any, Any]], np_dtype: np.dtype | st """ dt = np.dtype(np_dtype) assert wrapper_cls.dtype_cls is type(dt) - wrapped = wrapper_cls.wrap(dt) + wrapped = wrapper_cls.from_dtype(dt) - with pytest.raises(TypeError, match="Invalid dtype"): - wrapper_cls.wrap("not a dtype") + with pytest.raises(DataTypeValidationError, match="Invalid dtype"): + wrapper_cls.from_dtype("not a dtype") assert isinstance(wrapped, wrapper_cls) - assert wrapped.unwrap() == dt + assert wrapped.to_dtype() == dt @pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) def test_dict_serialization(wrapper_cls: DTYPE) -> None: if issubclass(wrapper_cls, Structured): - instance = wrapper_cls(fields=((("a", Bool(), 0),))) + instance = wrapper_cls(fields=((("a", Bool()),))) else: instance = wrapper_cls() as_dict = instance.to_dict() - assert wrapper_cls.from_dict(data=as_dict.get("configuration", {})) == instance + assert wrapper_cls.from_dict(as_dict) == instance @pytest.mark.parametrize( @@ -116,10 +119,10 @@ def test_dict_serialization(wrapper_cls: DTYPE) -> None: (Complex64(), np.complex64(0)), (Complex128(), np.complex128(0)), (FixedLengthAsciiString(length=3), np.bytes_(b"")), - (StaticRawBytes(length=3), np.void(b"\x00\x00\x00")), + (FixedLengthBytes(length=3), np.void(b"\x00\x00\x00")), (FixedLengthUnicodeString(length=3), np.str_("")), ( - Structured(fields=(("a", Float64(), 0), ("b", Int8(), 8))), + Structured(fields=(("a", Float64()), ("b", Int8()))), np.array([0], dtype=[("a", np.float64), ("b", np.int8)])[0], ), (VariableLengthString(), ""), @@ -154,7 +157,7 @@ def test_default_value(wrapper: type[DTypeWrapper[Any, Any]], expected_default: (Complex64(), np.complex64(42.0 + 1.0j), (42.0, 1.0)), (Complex128(), np.complex128(42.0 + 1.0j), (42.0, 1.0)), (FixedLengthAsciiString(length=4), np.bytes_(b"test"), "dGVzdA=="), - 
(StaticRawBytes(length=4), np.void(b"test"), "dGVzdA=="), + (FixedLengthBytes(length=4), np.void(b"test"), "dGVzdA=="), (FixedLengthUnicodeString(length=4), np.str_("test"), "test"), (VariableLengthString(), "test", "test"), (DateTime64(unit="s"), np.datetime64("2021-01-01T00:00:00", "s"), 1609459200), @@ -187,7 +190,7 @@ def test_to_json_value_v2( (Complex64(), (42.0, 1.0), np.complex64(42.0 + 1.0j)), (Complex128(), (42.0, 1.0), np.complex128(42.0 + 1.0j)), (FixedLengthAsciiString(length=4), "dGVzdA==", np.bytes_(b"test")), - (StaticRawBytes(length=4), "dGVzdA==", np.void(b"test")), + (FixedLengthBytes(length=4), "dGVzdA==", np.void(b"test")), (FixedLengthUnicodeString(length=4), "test", np.str_("test")), (VariableLengthString(), "test", "test"), (DateTime64(unit="s"), 1609459200, np.datetime64("2021-01-01T00:00:00", "s")), @@ -208,8 +211,8 @@ def test_register(dtype_registry: DataTypeRegistry) -> None: """ Test that registering a dtype in a data type registry works. """ - dtype_registry.register(Bool) - assert dtype_registry.get("bool") == Bool + dtype_registry.register(Bool._zarr_v3_name, Bool) + assert dtype_registry.get(Bool._zarr_v3_name) == Bool assert isinstance(dtype_registry.match_dtype(np.dtype("bool")), Bool) @staticmethod @@ -217,13 +220,13 @@ def test_override(dtype_registry: DataTypeRegistry) -> None: """ Test that registering a new dtype with the same name works (overriding the previous one). """ - dtype_registry.register(Bool) + dtype_registry.register(Bool._zarr_v3_name, Bool) class NewBool(Bool): def default_value(self) -> np.bool_: return np.True_ - dtype_registry.register(NewBool) + dtype_registry.register(NewBool._zarr_v3_name, NewBool) assert isinstance(dtype_registry.match_dtype(np.dtype("bool")), NewBool) @staticmethod @@ -236,7 +239,7 @@ def test_match_dtype( """ Test that match_dtype resolves a numpy dtype into an instance of the correspond wrapper for that dtype. 
""" - dtype_registry.register(wrapper_cls) + dtype_registry.register(wrapper_cls._zarr_v3_name, wrapper_cls) assert isinstance(dtype_registry.match_dtype(np.dtype(dtype_str)), wrapper_cls) @staticmethod @@ -260,8 +263,8 @@ def test_registered_dtypes(wrapper_cls: DTypeWrapper[Any, Any]) -> None: Test that the registered dtypes can be retrieved from the registry. """ if issubclass(wrapper_cls, Structured): - instance = wrapper_cls(fields=((("a", Bool(), 0),))) + instance = wrapper_cls(fields=((("a", Bool()),))) else: instance = wrapper_cls() - assert data_type_registry.match_dtype(instance.unwrap()) == instance + assert data_type_registry.match_dtype(instance.to_dtype()) == instance diff --git a/tests/test_metadata/test_v2.py b/tests/test_metadata/test_v2.py index 540935013f..627d615e74 100644 --- a/tests/test_metadata/test_v2.py +++ b/tests/test_metadata/test_v2.py @@ -9,11 +9,9 @@ import zarr.storage from zarr.core.buffer import cpu from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.dtype.npy.float import Float32, Float64 -from zarr.core.dtype.npy.int import Int16 +from zarr.core.dtype._numpy import Float32, Float64, Int16 from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV2Metadata -from zarr.core.metadata.dtype import Float32, Float64, Int16 from zarr.core.metadata.v2 import parse_zarr_format if TYPE_CHECKING: diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 44887433e7..478a1405e2 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -11,11 +11,10 @@ from zarr.core.buffer import default_buffer_prototype from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.core.config import config -from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING -from zarr.core.dtype.npy.time import DateTime64 +from zarr.core.dtype import 
get_data_type_from_numpy +from zarr.core.dtype._numpy import DateTime64 +from zarr.core.dtype.common import complex_from_json from zarr.core.group import GroupMetadata, parse_node_type -from zarr.core.metadata.dtype import DateTime64, complex_from_json, get_data_type_from_numpy from zarr.core.metadata.v3 import ( ArrayV3Metadata, parse_dimension_names, @@ -132,7 +131,7 @@ def test_jsonify_fill_value_complex(fill_value: Any, dtype_str: str) -> None: """ zarr_format = 3 dtype = get_data_type_from_numpy(dtype_str) - expected = dtype.unwrap().type(complex(*fill_value)) + expected = dtype.to_dtype().type(complex(*fill_value)) observed = dtype.from_json_value(fill_value, zarr_format=zarr_format) assert observed == expected assert dtype.to_json_value(observed, zarr_format=zarr_format) == tuple(fill_value) From 925b9e20c85b68d231dc66034693352d0425a24f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 13 Mar 2025 14:10:22 +0100 Subject: [PATCH 022/129] start design doc --- docs/user-guide/data_types.rst | 545 ++++++++------------------------- docs/user-guide/index.rst | 1 + src/zarr/core/dtype/_numpy.py | 6 +- src/zarr/core/dtype/wrapper.py | 41 +-- 4 files changed, 148 insertions(+), 445 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index d4b49ca43f..19095e1851 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -1,455 +1,156 @@ -.. _user-guide-data-types: +Data types +========== -Array data types -================ - -Zarr's Data Type Model +Zarr's data type model ---------------------- -Zarr is designed for interoperability with NumPy, so if you are familiar with NumPy or any other -N-dimensional array library, Zarr's model for array data types should seem familiar. However, Zarr -data types have some unique features that are described in this document. 
- -Zarr arrays operate under an essential design constraint: unlike NumPy arrays, Zarr arrays -are designed to be stored and accessed by other Zarr implementations. This means that, among other things, -Zarr data types must be serializable to metadata documents in accordance with the Zarr specifications, -which adds some unique aspects to the Zarr data type model. - -The following sections explain Zarr's data type model in greater detail and demonstrate the -Zarr Python APIs for working with Zarr data types. - -Array Data Types -^^^^^^^^^^^^^^^^ - -Every Zarr array has a data type, which defines the meaning of the array's elements. An array's data -type is encoded in the JSON metadata for the array. This means that the data type of an array must be -JSON-serializable. - -In Zarr V2, the data type of an array is stored in the ``dtype`` field in array metadata. -Zarr V3 changed the name of this field to ``data_type`` and also defined new rules for the values -that can be assigned to the ``data_type`` field. - -For example, in Zarr V2, the boolean array data type was represented in array metadata as the -string ``"|b1"``. In Zarr V3, the same type is represented as the string ``"bool"``. - -Scalars -^^^^^^^ +Every Zarr array has a "data type", which defines the meaning and physical layout of the +array's elements. Zarr is heavily influenced by `NumPy `_, and +Zarr arrays can use many of the same data types as numpy arrays:: + >>> import zarr + >>> import numpy as np + >>> zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) + >>> z + -Zarr also specifies how array elements, i.e., scalars, are encoded in array metadata. This is necessary -because Zarr uses a field in array metadata to define a default value for chunks that are not stored. -This field, called ``fill_value`` in both Zarr V2 and Zarr V3 metadata documents, contains a -JSON value that can be decoded to a scalar value compatible with the array's data type. 
+But Zarr data types and Numpy data types are also very different in one key respect: +Zarr arrays are designed to be persisted to storage and later read, possibly by Zarr implementations in different programming languages. +So in addition to defining a memory layout for array elements, each Zarr data type defines a procedure for +reading and writing that data type to Zarr array metadata, and also reading and writing **instances** of that data type to +array metadata. -For the boolean data type, the scalar encoding is simple—booleans are natively supported by -JSON, so Zarr saves booleans as JSON booleans. Other scalars, like floats or raw bytes, have -more elaborate encoding schemes, and in some cases, this scheme depends on the Zarr format version. +Data types in Zarr version 2 +----------------------------- -Data Types in Zarr Version 2 ----------------------------- +Version 2 of the Zarr format defined its data types relative to `Numpy's data types `_, and added a few non-Numpy data types as well. +Thus the JSON identifer for a Numpy-compatible data type is just the Numpy ``str`` attribute of that dtype: -Version 2 of the Zarr format defined its data types relative to -`NumPy's data types `_, -and added a few non-NumPy data types as well. With one exception (`structured data types <#structured-data-type>`_), the Zarr -V2 JSON identifier for a data type is just the NumPy ``str`` attribute of that data type: - -.. code-block:: python - - >>> import zarr - >>> import numpy as np - >>> import json - >>> - >>> store = {} - >>> np_dtype = np.dtype('int64') - >>> np_dtype.str - '>> z = zarr.create_array(store=store, shape=(1,), dtype=np_dtype, zarr_format=2) - >>> dtype_meta = json.loads(store['.zarray'].to_bytes())["dtype"] - >>> dtype_meta - '`_, - or "byte order," of the data type. As per the NumPy model, - in Zarr version 2 each data type has an endianness where applicable. - However, Zarr version 3 data types do not store endianness information. 
- -There are two special cases to consider: `"structured" data types <#structured-data-type>`_, and -`"object" <#object-data-type>`_ data types. - -Structured Data Type -^^^^^^^^^^^^^^^^^^^^ - -NumPy allows the construction of a so-called "structured" data types comprised of ordered collections -of named fields, where each field is itself a distinct NumPy data type. See the NumPy documentation -`here `_. - -Crucially, NumPy does not use a special data type for structured data types—instead, NumPy -implements structured data types as an optional feature of the so-called "Void" data type, which models -arbitrary fixed-size byte strings. The ``str`` attribute of a regular NumPy void -data type is the same as the ``str`` of a NumPy structured data type. This means that the ``str`` -attribute does not convey information about the fields contained in a structured data type. -For these reasons, Zarr V2 uses a special data type encoding for structured data types. -They are stored in JSON as lists of pairs, where the first element is a string, and the second -element is a Zarr V2 data type specification. This representation supports recursion. - -For example: - -.. code-block:: python - - >>> store = {} - >>> np_dtype = np.dtype([('field_a', '>i2'), ('field_b', [('subfield_c', '>f4'), ('subfield_d', 'i2')])]) - >>> np_dtype.str - '|V8' - >>> z = zarr.create_array(store=store, shape=(1,), dtype=np_dtype, zarr_format=2) - >>> dtype_meta = json.loads(store['.zarray'].to_bytes())["dtype"] - >>> dtype_meta - [['field_a', '>i2'], ['field_b', [['subfield_c', '>f4'], ['subfield_d', '>> import zarr + >>> import numpy as np + >>> import json + >>> np_dtype = np.dtype('int64') + >>> z = zarr.create_array(shape=(1,), dtype=np_dtype, zarr_format=2) + >>> dtype_meta = json.loads(store['.zarray'].to_bytes())["dtype"] + >>> assert dtype_meta == np_dtype.str # True + >>> dtype_meta + , "configuration": {...}}`` -For more about data types in Zarr V3, see the -`V3 specification `_. 
- -Data Types in Zarr Python +Data types in Zarr-Python ------------------------- -The two Zarr formats that Zarr Python supports specify data types in different ways: data types in -Zarr version 2 are encoded as NumPy-compatible strings (or lists, in the case of structured data -types), while data types in Zarr V3 are encoded as either strings or JSON objects. Zarr V3 data -types do not have any associated endianness information, unlike Zarr V2 data types. - -Zarr Python needs to support both Zarr V2 and V3, which means we need to abstract over these differences. -We do this with an abstract Zarr data type class: `ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_, -which provides Zarr V2 and Zarr V3 compatibility routines for "native" data types. - -In this context, a "native" data type is a Python class, typically defined in another library, that -models an array's data type. For example, ``np.dtypes.UInt8DType`` is a native data type defined in NumPy. -Zarr Python wraps the NumPy ``uint8`` with a ``ZDType`` instance called -`UInt8 <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_. - -As of this writing, the only native data types Zarr Python supports are NumPy data types. We could -avoid the "native data type" jargon and just say "NumPy data type," but we do not want to rule out the -possibility of using non-NumPy array backends in the future. - -Each data type supported by Zarr Python is modeled by a ``ZDType`` subclass, which provides an -API for the following operations: - -- Encoding and decoding a native data type -- Encoding and decoding a data type to and from Zarr V2 and Zarr V3 array metadata -- Encoding and decoding a scalar value to and from Zarr V2 and Zarr V3 array metadata -- Casting a Python object to a scalar value consistent with the data type - -List of data types -^^^^^^^^^^^^^^^^^^ - -The following section lists the data types built in to Zarr Python. With a few exceptions, Zarr -Python supports nearly all of the data types in NumPy. 
If you need a data type that is not listed -here, it's possible to create it yourself: see :ref:`adding-new-data-types`. - -Boolean -""""""" -- `Boolean <../api/zarr/dtype/index.html#zarr.dtype.Bool>`_ - -Integral -"""""""" -- `Signed 8-bit integer <../api/zarr/dtype/index.html#zarr.dtype.Int8>`_ -- `Signed 16-bit integer <../api/zarr/dtype/index.html#zarr.dtype.Int16>`_ -- `Signed 32-bit integer <../api/zarr/dtype/index.html#zarr.dtype.Int32>`_ -- `Signed 64-bit integer <../api/zarr/dtype/index.html#zarr.dtype.Int64>`_ -- `Unsigned 8-bit integer <../api/zarr/dtype/index.html#zarr.dtype.UInt8>`_ -- `Unsigned 16-bit integer <../api/zarr/dtype/index.html#zarr.dtype.UInt16>`_ -- `Unsigned 32-bit integer <../api/zarr/dtype/index.html#zarr.dtype.UInt32>`_ -- `Unsigned 64-bit integer <../api/zarr/dtype/index.html#zarr.dtype.UInt64>`_ - -Floating-point -"""""""""""""" -- `16-bit floating-point <../api/zarr/dtype/index.html#zarr.dtype.Float16>`_ -- `32-bit floating-point <../api/zarr/dtype/index.html#zarr.dtype.Float32>`_ -- `64-bit floating-point <../api/zarr/dtype/index.html#zarr.dtype.Float64>`_ -- `64-bit complex floating-point <../api/zarr/dtype/index.html#zarr.dtype.Complex64>`_ -- `128-bit complex floating-point <../api/zarr/dtype/index.html#zarr.dtype.Complex128>`_ - -String -"""""" -- `Fixed-length UTF-32 string <../api/zarr/dtype/index.html#zarr.dtype.FixedLengthUTF32>`_ -- `Variable-length UTF-8 string <../api/zarr/dtype/index.html#zarr.dtype.VariableLengthUTF8>`_ - -Bytes -""""" -- `Fixed-length null-terminated bytes <../api/zarr/dtype/index.html#zarr.dtype.NullTerminatedBytes>`_ -- `Fixed-length raw bytes <../api/zarr/dtype/index.html#zarr.dtype.RawBytes>`_ -- `Variable-length bytes <../api/zarr/dtype/index.html#zarr.dtype.VariableLengthBytes>`_ - -Temporal -"""""""" -- `DateTime64 <../api/zarr/dtype/index.html#zarr.dtype.DateTime64>`_ -- `TimeDelta64 <../api/zarr/dtype/index.html#zarr.dtype.TimeDelta64>`_ - -Struct-like -""""""""""" -- `Structured 
<../api/zarr/dtype/index.html#zarr.dtype.Structured>`_ - -Example Usage -^^^^^^^^^^^^^ - -This section will demonstrates the basic usage of Zarr data types. - -Create a ``ZDType`` from a native data type: - -.. code-block:: python - - >>> from zarr.core.dtype import Int8 - >>> import numpy as np - >>> int8 = Int8.from_native_dtype(np.dtype('int8')) - -Convert back to a native data type: - -.. code-block:: python - - >>> native_dtype = int8.to_native_dtype() - >>> assert native_dtype == np.dtype('int8') - -Get the default scalar value for the data type: - -.. code-block:: python - - >>> default_value = int8.default_scalar() - >>> assert default_value == np.int8(0) - -Serialize to JSON for Zarr V2: - -.. code-block:: python - - >>> json_v2 = int8.to_json(zarr_format=2) - >>> json_v2 - {'name': '|i1', 'object_codec_id': None} - -.. note:: +Zarr-Python supports two different Zarr formats, and those two formats specify data types in rather different ways: +data types in Zarr version 2 are encoded as Numpy-compatible strings, while data types in Zarr version 3 are encoded as either strings or ``JSON`` objects, +and the Zarr V3 data types don't have any associated endianness information, unlike Zarr V2 data types. - The representation returned by ``to_json(zarr_format=2)`` is more abstract than the literal contents - of Zarr V2 array metadata, because the JSON representation used by the ``ZDType`` classes must be - distinct across different data types. As noted `earlier <#object-data-type>`_, Zarr V2 identifies - multiple distinct data types with the "object" data type identifier ``"|O"``. Extra information - is needed to disambiguate these data types from one another. That's the reason for the - ``object_codec_id`` field you see here. +If that wasn't enough, we want Zarr-Python to support data types beyond what's available in Numpy. 
So it's crucial that we have a +model of array data types that can adapt to the differences between Zarr V2 and V3 and doesn't over-fit to Numpy. -And for V3: +Here are the operations we need to perform on data types in Zarr-Python: -.. code-block:: python +* Round-trip native data types to fields in array metadata documents. + For example, the Numpy data type ``np.dtype('>i2')`` should be saved as ``{..., "dtype" : ">i2"}`` in Zarr V2 metadata. + + In Zarr V3 metadata, the same Numpy data type would be saved as ``{..., "data_type": "int16", "codecs": [..., {"name": "bytes", "configuration": {"endian": "big"}, ...]}`` - >>> json_v3 = int8.to_json(zarr_format=3) - >>> json_v3 - 'int8' +* Define a default fill value. This is not mandated by the Zarr specifications, but it's convenient for users + to have a useful default. For numeric types like integers and floats the default can be statically set to 0, but for + parametric data types like fixed-length strings the default can only be generated after the data type has been parametrized at runtime. -Serialize a scalar value to JSON: +* Round-trip scalars to the ``fill_value`` field in Zarr V2 and V3 array metadata documents. The Zarr V2 and V3 specifications + define how scalars of each data type should be stored as JSON in array metadata documents, and in principle each data type + can define this encoding separately. -.. code-block:: python +* Do all of the above for *user-defined data types*. Zarr-Python should support data types added as extensions,so we cannot + hard-code the list of data types. We need to ensure that users can easily (or easily enough) define a python object + that models their custom data type and register this object with Zarr-Python, so that the above operations all succeed for their + custom data type. 
- >>> json_value = int8.to_json_scalar(42, zarr_format=3) - >>> json_value - 42 +To achieve these goals, Zarr Python uses a class called :class:`zarr.core.dtype.DTypeWrapper` to wrap native data types. Each data type +supported by Zarr Python is modeled by a subclass of `DTypeWrapper`, which has the following structure: -Deserialize a scalar value from JSON: - -.. code-block:: python - - >>> scalar_value = int8.from_json_scalar(42, zarr_format=3) - >>> assert scalar_value == np.int8(42) - -.. _adding-new-data-types: - -Adding New Data Types -^^^^^^^^^^^^^^^^^^^^^ - -Each Zarr data type is a separate Python class that inherits from -`ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_. You can define a custom data type by -writing your own subclass of `ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_ and adding -your data type to the data type registry. A complete example of this process is included below. - -The source code for this example can be found in the ``examples/custom_dtype.py`` file in the Zarr -Python project directory. - -.. literalinclude:: ../../examples/custom_dtype.py - :language: python - -Data Type Resolution -^^^^^^^^^^^^^^^^^^^^ - -Although Zarr Python uses a different data type model from NumPy, you can still define a Zarr array -with a NumPy data type object: - -.. code-block:: python - - >>> from zarr import create_array - >>> import numpy as np - >>> a = create_array({}, shape=(10,), dtype=np.dtype('int')) - >>> a - - -Or a string representation of a NumPy data type: - -.. code-block:: python - - >>> a = create_array({}, shape=(10,), dtype='>> a - - -The ``Array`` object presents itself like a NumPy array, including exposing a NumPy -data type as its ``dtype`` attribute: - -.. code-block:: python - - >>> type(a.dtype) - - -But if we inspect the metadata for the array, we can see the Zarr data type object: - -.. 
code-block:: python - - >>> type(a.metadata.data_type) - - -This example illustrates a general problem Zarr Python has to solve: how can we allow users to -specify a data type as a string or a NumPy ``dtype`` object, and produce the right Zarr data type -from that input? We call this process "data type resolution." Zarr Python also performs data type -resolution when reading stored arrays, although in this case the input is a JSON value instead -of a NumPy data type. - -For simple data types like ``int``, the solution could be extremely simple: just -maintain a lookup table that maps a NumPy data type to the Zarr data type equivalent. But not all -data types are so simple. Consider this case: - -.. code-block:: python - - >>> from zarr import create_array - >>> import warnings - >>> import numpy as np - >>> warnings.simplefilter("ignore", category=FutureWarning) - >>> a = create_array({}, shape=(10,), dtype=[('a', 'f8'), ('b', 'i8')]) - >>> a.dtype # this is the NumPy data type - dtype([('a', '>> a.metadata.data_type # this is the Zarr data type - Structured(fields=(('a', Float64(endianness='little')), ('b', Int64(endianness='little')))) - -In this example, we created a -`NumPy structured data type `_. -This data type is a container that can hold any NumPy data type, which makes it recursive. It is -not possible to make a lookup table that relates all NumPy structured data types to their Zarr -equivalents, as there is a nearly unbounded number of different structured data types. So instead of -a static lookup table, Zarr Python relies on a dynamic approach to data type resolution. - -Zarr Python defines a collection of Zarr data types. This collection, called a "data type registry," -is essentially a dictionary where the keys are strings (a canonical name for each data type), and the -values are the data type classes themselves. 
Dynamic data type resolution entails iterating over -these data type classes, invoking that class' `from_native_dtype <#api/dtype/ZDType.from_native_dtype>`_ -method, and returning a concrete data type instance if and only if exactly one of those constructor -invocations is successful. - -In plain language, we take some user input, like a NumPy data type, offer it to all the -known data type classes, and return an instance of the one data type class that can accept that user input. - -We want to avoid a situation where the same native data type matches multiple Zarr data types; that is, -a NumPy data type should *uniquely* specify a single Zarr data type. But data type resolution is -dynamic, so it's not possible to statically guarantee this uniqueness constraint. Therefore, we -attempt data type resolution against *every* data type class, and if, for some reason, a native data -type matches multiple Zarr data types, we treat this as an error and raise an exception. - -If you have a NumPy data type and you want to get the corresponding ``ZDType`` instance, you can use -the ``parse_data_type`` function, which will use the dynamic resolution described above. ``parse_data_type`` -handles a range of input types: +(attribute) ``dtype_cls`` +^^^^^^^^^^^^^ +The ``dtype_cls`` attribute is a **class variable** that is bound to a class that can produce +an instance of a native data type. For example, on the ``DTypeWrapper`` used to model the boolean +data type, the ``dtype_cls`` attribute is bound to the numpy bool data type class: ``np.dtypes.BoolDType``. +This attribute is used when we need to create an instance of the native data type, for example when +defining a Numpy array that will contain Zarr data. -- NumPy data types: +It might seem odd that ``DTypeWrapper.dtype_cls`` binds to a *class* that produces a native data type instead of an instance of that native data type -- +why not have a ``DTypeWrapper.dtype`` attribute that binds to ``np.dtypes.BoolDType()``? 
The reason why ``DTypeWrapper`` +doesn't wrap a concrete data type instance is because data type instances may have endianness information, but Zarr V3 +data types do not. To model Zarr V3 data types, we need endianness to be an **instance variable** which is +defined when creating an instance of the ```DTypeWrapper``. Subclasses of ``DTypeWrapper`` that model data types with +byte order semantics thus have ``endianness`` as an instance variable, and this value can be set when creating an instance of the wrapper. - .. code-block:: python - >>> import numpy as np - >>> from zarr.dtype import parse_data_type - >>> my_dtype = np.dtype('>M8[10s]') - >>> parse_data_type(my_dtype, zarr_format=2) - DateTime64(endianness='big', scale_factor=10, unit='s') +(attribute) ``_zarr_v3_name`` +^^^^^^^^^^^^^ +The ``_zarr_v3_name`` attribute encodes the canonical name for a data type for Zarr V3. For many data types these names +are defined in the `Zarr V3 specification https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#data-types`_ For nearly all of the +data types defined in Zarr V3, this name can be used to uniquely specify a data type. The one exception is the ``r*`` data type, +which is parametrized by a number of bits, and so may take the form ``r8``, ``r16``, ... etc. + +(class method) ``from_dtype(cls, dtype) -> Self`` +^^^^^^^^^ +This method defines a procedure for safely converting a native dtype instance into an instance of ``DTypeWrapper``. It should perform +validation of its input to ensure that the native dtype is an instance of the ``dtype_cls`` class attribute, for example. For some +data types, additional checks are needed -- in Numpy "structured" data types and "void" data types use the same class, with different properties. +A ``DTypeWrapper`` that wraps Numpy structured data types must do additional checks to ensure that the input ``dtype`` is actually a structured data type. +If input validation succeeds, this method will call ``_from_dtype_unsafe``. 
+ +(class method) ``_from_dtype_unsafe(cls, dtype) -> Self`` +^^^^^^^^^^ +This method defines the procedure for converting a native data type instance, like ``np.dtype('uint8')``, +into a wrapper class instance. The ``unsafe`` prefix on the method name denotes that this method should not +perform any input validation. Input validation should be done by the routine that calls this method. + +For many data types, creating the wrapper class takes no arguments and so this method can just return ``cls()``. +But for data types with runtime attributes like endianness or length (for fixed-size strings), this ``_from_dtype_unsafe`` +ensures that those attributes of ``dtype`` are mapped on to the correct parameters in the ``DTypeWrapper`` class constructor. + +(method) ``to_dtype(self) -> dtype`` +^^^^^^^ +This method produces a native data type consistent with the properties of the ``DTypeWrapper``. Together +with ``from_dtype``, this method allows round-trip conversion of a native data type in to a wrapper class and then out again. +That is, for some ``DTypeWrapper`` class ``FooWrapper`` that wraps a native data type called ``foo``, ``FooWrapper.from_dtype(instance_of_foo).to_dtype() == instance_of_foo`` should be true. -- NumPy data type-compatible strings: +(method) ``to_dict(self) -> dict`` +^^^^^ +This method generates a JSON-serialiazable representation of the wrapped data type which can be stored in +Zarr metadata. - .. code-block:: python +(method) ``cast_value(self, value: object) -> scalar`` +^^^^^ +Cast a python object to an instance of the wrapped data type. This is used for generating the default +value associated with this data type. - >>> dtype_str = '>M8[10s]' - >>> parse_data_type(dtype_str, zarr_format=2) - DateTime64(endianness='big', scale_factor=10, unit='s') -- ``ZDType`` instances: +(method) ``default_value(self) -> scalar`` +^^^^ +Return the default value for the wrapped data type. 
Zarr-Python uses this method to generate a default fill value +for an array when a user has not requested one. - .. code-block:: python +Why is this a method and not a static attribute? Although some data types +can have a static default value, parametrized data types like fixed-length strings or structured data types cannot. For these data types, +a default value must be calculated based on the attributes of the wrapped data type. - >>> from zarr.dtype import DateTime64 - >>> zdt = DateTime64(endianness='big', scale_factor=10, unit='s') - >>> parse_data_type(zdt, zarr_format=2) # Use a ZDType (this is a no-op) - DateTime64(endianness='big', scale_factor=10, unit='s') +(method) `` -- Python dictionaries (requires ``zarr_format=3``). These dictionaries must be consistent with the - ``JSON`` form of the data type: - .. code-block:: python - >>> dt_dict = {"name": "numpy.datetime64", "configuration": {"unit": "s", "scale_factor": 10}} - >>> parse_data_type(dt_dict, zarr_format=3) - DateTime64(endianness='little', scale_factor=10, unit='s') - >>> parse_data_type(dt_dict, zarr_format=3).to_json(zarr_format=3) - {'name': 'numpy.datetime64', 'configuration': {'unit': 's', 'scale_factor': 10}} diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst index f92c576f32..1c0211cf15 100644 --- a/docs/user-guide/index.rst +++ b/docs/user-guide/index.rst @@ -8,6 +8,7 @@ User guide installation arrays + data_types groups attributes storage diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index b98cc100e3..362f7f361c 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -569,7 +569,7 @@ def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[np.dtypes.VoidDType return super().check_dtype(dtype) and dtype.fields is None @classmethod - def check_json(cls, data: dict[str, JSON]) -> TypeGuard[dict[str, JSON]]: + def check_dict(cls, data: dict[str, JSON]) -> TypeGuard[dict[str, JSON]]: # Overriding the base class 
implementation because the r* dtype # does not have a name that will can appear in array metadata # Instead, array metadata will contain names like "r8", "r16", etc @@ -787,7 +787,7 @@ def to_dict(self) -> dict[str, JSON]: return base_dict @classmethod - def check_json(cls, data: JSON) -> bool: + def check_dict(cls, data: JSON) -> bool: return ( isinstance(data, dict) and "name" in data @@ -797,7 +797,7 @@ def check_json(cls, data: JSON) -> bool: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - if cls.check_json(data): + if cls.check_dict(data): from zarr.core.dtype import get_data_type_from_dict fields = tuple( diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 002bd100e9..eecb1f2562 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -39,24 +39,6 @@ class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): dtype_cls: ClassVar[type[TDType]] # type: ignore[misc] _zarr_v3_name: ClassVar[str] - @classmethod - @abstractmethod - def _from_dtype_unsafe(cls: type[Self], dtype: TDType) -> Self: - """ - Wrap a native dtype without checking. - - Parameters - ---------- - dtype : TDType - The native dtype to wrap. - - Returns - ------- - Self - The wrapped dtype. - """ - raise NotImplementedError - @classmethod def from_dtype(cls: type[Self], dtype: TDType) -> Self: """ @@ -83,6 +65,25 @@ def from_dtype(cls: type[Self], dtype: TDType) -> Self: f"Invalid dtype: {dtype}. Expected an instance of {cls.dtype_cls}." ) + + @classmethod + @abstractmethod + def _from_dtype_unsafe(cls: type[Self], dtype: TDType) -> Self: + """ + Wrap a native dtype without checking. + + Parameters + ---------- + dtype : TDType + The native dtype to wrap. + + Returns + ------- + Self + The wrapped dtype. 
+ """ + raise NotImplementedError + @abstractmethod def to_dtype(self: Self) -> TDType: """ @@ -158,7 +159,7 @@ def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[TDType]: return type(dtype) is cls.dtype_cls @classmethod - def check_json(cls: type[Self], data: dict[str, JSON]) -> TypeGuard[dict[str, JSON]]: + def check_dict(cls: type[Self], data: dict[str, JSON]) -> TypeGuard[dict[str, JSON]]: """ Check that a JSON representation of a data type matches the dtype_cls class attribute. Used as a type guard. This base implementation checks that the input is a dictionary, @@ -192,7 +193,7 @@ def from_dict(cls: type[Self], data: dict[str, JSON]) -> Self: Self The wrapped data type. """ - if cls.check_json(data): + if cls.check_dict(data): return cls._from_json_unsafe(data) raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") From e2fce7f4098e4aadb272e5e3bd6f8b3e2f86a903 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 13 Mar 2025 14:11:03 +0100 Subject: [PATCH 023/129] more design doc --- docs/user-guide/data_types.rst | 90 +++++++++++++++++----------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 19095e1851..7a5825bf2f 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -4,19 +4,19 @@ Data types Zarr's data type model ---------------------- -Every Zarr array has a "data type", which defines the meaning and physical layout of the +Every Zarr array has a "data type", which defines the meaning and physical layout of the array's elements. 
Zarr is heavily influenced by `NumPy `_, and Zarr arrays can use many of the same data types as numpy arrays:: >>> import zarr >>> import numpy as np >>> zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) >>> z - + -But Zarr data types and Numpy data types are also very different in one key respect: -Zarr arrays are designed to be persisted to storage and later read, possibly by Zarr implementations in different programming languages. -So in addition to defining a memory layout for array elements, each Zarr data type defines a procedure for -reading and writing that data type to Zarr array metadata, and also reading and writing **instances** of that data type to +But Zarr data types and Numpy data types are also very different in one key respect: +Zarr arrays are designed to be persisted to storage and later read, possibly by Zarr implementations in different programming languages. +So in addition to defining a memory layout for array elements, each Zarr data type defines a procedure for +reading and writing that data type to Zarr array metadata, and also reading and writing **instances** of that data type to array metadata. Data types in Zarr version 2 @@ -35,11 +35,11 @@ Thus the JSON identifer for a Numpy-compatible data type is just the Numpy ``str >>> dtype_meta i2')`` should be saved as ``{..., "dtype" : ">i2"}`` in Zarr V2 metadata. - + For example, the Numpy data type ``np.dtype('>i2')`` should be saved as ``{..., "dtype" : ">i2"}`` in Zarr V2 metadata. + In Zarr V3 metadata, the same Numpy data type would be saved as ``{..., "data_type": "int16", "codecs": [..., {"name": "bytes", "configuration": {"endian": "big"}, ...]}`` -* Define a default fill value. This is not mandated by the Zarr specifications, but it's convenient for users - to have a useful default. For numeric types like integers and floats the default can be statically set to 0, but for +* Define a default fill value. 
This is not mandated by the Zarr specifications, but it's convenient for users + to have a useful default. For numeric types like integers and floats the default can be statically set to 0, but for parametric data types like fixed-length strings the default can only be generated after the data type has been parametrized at runtime. * Round-trip scalars to the ``fill_value`` field in Zarr V2 and V3 array metadata documents. The Zarr V2 and V3 specifications define how scalars of each data type should be stored as JSON in array metadata documents, and in principle each data type can define this encoding separately. -* Do all of the above for *user-defined data types*. Zarr-Python should support data types added as extensions,so we cannot - hard-code the list of data types. We need to ensure that users can easily (or easily enough) define a python object - that models their custom data type and register this object with Zarr-Python, so that the above operations all succeed for their +* Do all of the above for *user-defined data types*. Zarr-Python should support data types added as extensions,so we cannot + hard-code the list of data types. We need to ensure that users can easily (or easily enough) define a python object + that models their custom data type and register this object with Zarr-Python, so that the above operations all succeed for their custom data type. -To achieve these goals, Zarr Python uses a class called :class:`zarr.core.dtype.DTypeWrapper` to wrap native data types. Each data type -supported by Zarr Python is modeled by a subclass of `DTypeWrapper`, which has the following structure: +To achieve these goals, Zarr Python uses a class called :class:`zarr.core.dtype.DTypeWrapper` to wrap native data types. 
Each data type +supported by Zarr Python is modeled by a subclass of `DTypeWrapper`, which has the following structure: (attribute) ``dtype_cls`` ^^^^^^^^^^^^^ The ``dtype_cls`` attribute is a **class variable** that is bound to a class that can produce -an instance of a native data type. For example, on the ``DTypeWrapper`` used to model the boolean -data type, the ``dtype_cls`` attribute is bound to the numpy bool data type class: ``np.dtypes.BoolDType``. -This attribute is used when we need to create an instance of the native data type, for example when -defining a Numpy array that will contain Zarr data. +an instance of a native data type. For example, on the ``DTypeWrapper`` used to model the boolean +data type, the ``dtype_cls`` attribute is bound to the numpy bool data type class: ``np.dtypes.BoolDType``. +This attribute is used when we need to create an instance of the native data type, for example when +defining a Numpy array that will contain Zarr data. -It might seem odd that ``DTypeWrapper.dtype_cls`` binds to a *class* that produces a native data type instead of an instance of that native data type -- +It might seem odd that ``DTypeWrapper.dtype_cls`` binds to a *class* that produces a native data type instead of an instance of that native data type -- why not have a ``DTypeWrapper.dtype`` attribute that binds to ``np.dtypes.BoolDType()``? The reason why ``DTypeWrapper`` -doesn't wrap a concrete data type instance is because data type instances may have endianness information, but Zarr V3 -data types do not. To model Zarr V3 data types, we need endianness to be an **instance variable** which is -defined when creating an instance of the ```DTypeWrapper``. Subclasses of ``DTypeWrapper`` that model data types with +doesn't wrap a concrete data type instance is because data type instances may have endianness information, but Zarr V3 +data types do not. 
To model Zarr V3 data types, we need endianness to be an **instance variable** which is +defined when creating an instance of the ```DTypeWrapper``. Subclasses of ``DTypeWrapper`` that model data types with byte order semantics thus have ``endianness`` as an instance variable, and this value can be set when creating an instance of the wrapper. (attribute) ``_zarr_v3_name`` ^^^^^^^^^^^^^ -The ``_zarr_v3_name`` attribute encodes the canonical name for a data type for Zarr V3. For many data types these names +The ``_zarr_v3_name`` attribute encodes the canonical name for a data type for Zarr V3. For many data types these names are defined in the `Zarr V3 specification https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#data-types`_ For nearly all of the data types defined in Zarr V3, this name can be used to uniquely specify a data type. The one exception is the ``r*`` data type, -which is parametrized by a number of bits, and so may take the form ``r8``, ``r16``, ... etc. +which is parametrized by a number of bits, and so may take the form ``r8``, ``r16``, ... etc. (class method) ``from_dtype(cls, dtype) -> Self`` ^^^^^^^^^ This method defines a procedure for safely converting a native dtype instance into an instance of ``DTypeWrapper``. It should perform -validation of its input to ensure that the native dtype is an instance of the ``dtype_cls`` class attribute, for example. For some -data types, additional checks are needed -- in Numpy "structured" data types and "void" data types use the same class, with different properties. +validation of its input to ensure that the native dtype is an instance of the ``dtype_cls`` class attribute, for example. For some +data types, additional checks are needed -- in Numpy "structured" data types and "void" data types use the same class, with different properties. A ``DTypeWrapper`` that wraps Numpy structured data types must do additional checks to ensure that the input ``dtype`` is actually a structured data type. 
-If input validation succeeds, this method will call ``_from_dtype_unsafe``. +If input validation succeeds, this method will call ``_from_dtype_unsafe``. (class method) ``_from_dtype_unsafe(cls, dtype) -> Self`` ^^^^^^^^^^ This method defines the procedure for converting a native data type instance, like ``np.dtype('uint8')``, -into a wrapper class instance. The ``unsafe`` prefix on the method name denotes that this method should not -perform any input validation. Input validation should be done by the routine that calls this method. +into a wrapper class instance. The ``unsafe`` prefix on the method name denotes that this method should not +perform any input validation. Input validation should be done by the routine that calls this method. For many data types, creating the wrapper class takes no arguments and so this method can just return ``cls()``. -But for data types with runtime attributes like endianness or length (for fixed-size strings), this ``_from_dtype_unsafe`` +But for data types with runtime attributes like endianness or length (for fixed-size strings), this ``_from_dtype_unsafe`` ensures that those attributes of ``dtype`` are mapped on to the correct parameters in the ``DTypeWrapper`` class constructor. (method) ``to_dtype(self) -> dtype`` ^^^^^^^ -This method produces a native data type consistent with the properties of the ``DTypeWrapper``. Together +This method produces a native data type consistent with the properties of the ``DTypeWrapper``. Together with ``from_dtype``, this method allows round-trip conversion of a native data type in to a wrapper class and then out again. That is, for some ``DTypeWrapper`` class ``FooWrapper`` that wraps a native data type called ``foo``, ``FooWrapper.from_dtype(instance_of_foo).to_dtype() == instance_of_foo`` should be true. 
-(method) ``to_dict(self) -> dict`` +(method) ``to_dict(self) -> dict`` ^^^^^ -This method generates a JSON-serialiazable representation of the wrapped data type which can be stored in +This method generates a JSON-serialiazable representation of the wrapped data type which can be stored in Zarr metadata. (method) ``cast_value(self, value: object) -> scalar`` ^^^^^ -Cast a python object to an instance of the wrapped data type. This is used for generating the default +Cast a python object to an instance of the wrapped data type. This is used for generating the default value associated with this data type. (method) ``default_value(self) -> scalar`` ^^^^ -Return the default value for the wrapped data type. Zarr-Python uses this method to generate a default fill value -for an array when a user has not requested one. +Return the default value for the wrapped data type. Zarr-Python uses this method to generate a default fill value +for an array when a user has not requested one. -Why is this a method and not a static attribute? Although some data types +Why is this a method and not a static attribute? Although some data types can have a static default value, parametrized data types like fixed-length strings or structured data types cannot. For these data types, a default value must be calculated based on the attributes of the wrapped data type. 
-(method) `` +(method) ``check_dtype(cls, dtype)`` From a583cd33c5a49d299f0802324cef6f32d0ecaf05 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 13 Mar 2025 16:27:06 +0100 Subject: [PATCH 024/129] update docs --- docs/user-guide/data_types.rst | 64 ++++++++++++++++++++++++---------- src/zarr/core/dtype/wrapper.py | 29 ++++++++------- 2 files changed, 59 insertions(+), 34 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 7a5825bf2f..83b9870755 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -6,24 +6,24 @@ Zarr's data type model Every Zarr array has a "data type", which defines the meaning and physical layout of the array's elements. Zarr is heavily influenced by `NumPy `_, and -Zarr arrays can use many of the same data types as numpy arrays:: +Zarr-Python supports creating arrays with Numpy data types:: >>> import zarr >>> import numpy as np >>> zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) >>> z -But Zarr data types and Numpy data types are also very different in one key respect: -Zarr arrays are designed to be persisted to storage and later read, possibly by Zarr implementations in different programming languages. -So in addition to defining a memory layout for array elements, each Zarr data type defines a procedure for +But Zarr data types and Numpy data types are also very different: +Unlike Numpy arrays, Zarr arrays are designed to be persisted to storage and read by Zarr implementations in different programming languages. +To ensure that the data type can be interpreted correctly when reading an array, each Zarr data type defines a procedure for reading and writing that data type to Zarr array metadata, and also reading and writing **instances** of that data type to -array metadata. +array metadata, and these serialization procedures depend on the Zarr format. 
Data types in Zarr version 2 ----------------------------- Version 2 of the Zarr format defined its data types relative to `Numpy's data types `_, and added a few non-Numpy data types as well. -Thus the JSON identifer for a Numpy-compatible data type is just the Numpy ``str`` attribute of that dtype: +Thus the JSON identifier for a Numpy-compatible data type is just the Numpy ``str`` attribute of that dtype: >>> import zarr >>> import numpy as np @@ -113,16 +113,6 @@ data types, additional checks are needed -- in Numpy "structured" data types and A ``DTypeWrapper`` that wraps Numpy structured data types must do additional checks to ensure that the input ``dtype`` is actually a structured data type. If input validation succeeds, this method will call ``_from_dtype_unsafe``. -(class method) ``_from_dtype_unsafe(cls, dtype) -> Self`` -^^^^^^^^^^ -This method defines the procedure for converting a native data type instance, like ``np.dtype('uint8')``, -into a wrapper class instance. The ``unsafe`` prefix on the method name denotes that this method should not -perform any input validation. Input validation should be done by the routine that calls this method. - -For many data types, creating the wrapper class takes no arguments and so this method can just return ``cls()``. -But for data types with runtime attributes like endianness or length (for fixed-size strings), this ``_from_dtype_unsafe`` -ensures that those attributes of ``dtype`` are mapped on to the correct parameters in the ``DTypeWrapper`` class constructor. - (method) ``to_dtype(self) -> dtype`` ^^^^^^^ This method produces a native data type consistent with the properties of the ``DTypeWrapper``. Together @@ -137,20 +127,56 @@ Zarr metadata. (method) ``cast_value(self, value: object) -> scalar`` ^^^^^ -Cast a python object to an instance of the wrapped data type. This is used for generating the default +This method converts a python object to an instance of the wrapped data type. 
It is used for generating the default value associated with this data type. (method) ``default_value(self) -> scalar`` ^^^^ -Return the default value for the wrapped data type. Zarr-Python uses this method to generate a default fill value +This method returns the default value for the wrapped data type. Zarr-Python uses this method to generate a default fill value for an array when a user has not requested one. Why is this a method and not a static attribute? Although some data types can have a static default value, parametrized data types like fixed-length strings or structured data types cannot. For these data types, a default value must be calculated based on the attributes of the wrapped data type. -(method) ``check_dtype(cls, dtype)`` +(class method) ``check_dtype(cls, dtype) -> bool`` +^^^^^ +This class method checks if a native dtype is compatible with the ``DTypeWrapper`` class. It returns ``True`` +if ``dtype`` is compatible with the wrapper class, and ``False`` otherwise. For many data types, this check is as simple +as checking that ``cls.dtype_cls`` matches ``type(dtype)``, i.e. checking that the data type class wrapped +by the ``DTypeWrapper`` is the same as the class of ``dtype``. But there are some data types where this check alone is not sufficient, +in which case this method is overridden so that additional properties of ``dtype`` can be inspected and compared with +the expectations of ``cls``. + +(class method) ``from_dict(cls, dtype) -> Self`` +^^^^ +This class method creates a ``DTypeWrapper`` from an appropriately structured dictionary. The default +implementation first checks that the dictionary has the correct structure, and then uses its data +to instantiate the ``DTypeWrapper`` instance. + +(method) ``to_dict(self) -> dict[str, JSON]`` +^^^ +Returns a dictionary form of the wrapped data type. This is used prior to writing array metadata. 
+(class method) ``get_name(self, zarr_format: Literal[2, 3]) -> str`` +^^^^ +This method generates a name for the wrapped data type, depending on the Zarr format. If ``zarr_format`` is +2 and the wrapped data type is a Numpy data type, then the Numpy string representation of that data type is returned. +If ``zarr_format`` is 3, then the Zarr V3 name for the wrapped data type is returned. For most data types +the Zarr V3 name will be stored as the ``_zarr_v3_name`` class attribute, but for parametric data types the +name must be computed at runtime based on the parameters of the data type. + + +(method) ``to_json_value(self, data: scalar, zarr_format: Literal[2, 3]) -> JSON`` +^^^ +This method converts a scalar instance of the data type into a JSON-serialiazable value. +For some data types like bool and integers this conversion is simple -- just return a JSON boolean +or number -- but other data types define a JSON serialization for scalars that is a bit more involved. +And this JSON serialization depends on the Zarr format. + +(method) ``from_json_value(self, data: JSON, zarr_format: Literal[2, 3]) -> scalar`` +^^^ +Convert a JSON-serialiazed scalar to a native scalar. This inverts the operation of ``to_json_value``. diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index eecb1f2562..dc3a0cc5d2 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -65,7 +65,6 @@ def from_dtype(cls: type[Self], dtype: TDType) -> Self: f"Invalid dtype: {dtype}. Expected an instance of {cls.dtype_cls}." ) - @classmethod @abstractmethod def _from_dtype_unsafe(cls: type[Self], dtype: TDType) -> Self: @@ -96,18 +95,6 @@ def to_dtype(self: Self) -> TDType: """ raise NotImplementedError - @abstractmethod - def to_dict(self) -> dict[str, JSON]: - """ - Convert the wrapped data type to a dictionary. 
- - Returns - ------- - dict[str, JSON] - The dictionary representation of the wrapped data type - """ - raise NotImplementedError - def cast_value(self: Self, value: object) -> TScalar: """ Cast a value to an instance of the scalar type. @@ -178,6 +165,18 @@ def check_dict(cls: type[Self], data: dict[str, JSON]) -> TypeGuard[dict[str, JS """ return "name" in data and data["name"] == cls._zarr_v3_name + @abstractmethod + def to_dict(self) -> dict[str, JSON]: + """ + Convert the wrapped data type to a dictionary. + + Returns + ------- + dict[str, JSON] + The dictionary representation of the wrapped data type + """ + raise NotImplementedError + @classmethod def from_dict(cls: type[Self], data: dict[str, JSON]) -> Self: """ @@ -194,11 +193,11 @@ def from_dict(cls: type[Self], data: dict[str, JSON]) -> Self: The wrapped data type. """ if cls.check_dict(data): - return cls._from_json_unsafe(data) + return cls._from_dict_unsafe(data) raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") @classmethod - def _from_json_unsafe(cls: type[Self], data: dict[str, JSON]) -> Self: + def _from_dict_unsafe(cls: type[Self], data: dict[str, JSON]) -> Self: """ Wrap a JSON representation of a data type. From e0b662d231679abb04ab6aa74a910fac1349c206 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 13 Mar 2025 16:34:40 +0100 Subject: [PATCH 025/129] fix sphinx warnings --- docs/user-guide/data_types.rst | 42 +++++++++++++++++----------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 83b9870755..6132eb2376 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -7,11 +7,11 @@ Zarr's data type model Every Zarr array has a "data type", which defines the meaning and physical layout of the array's elements. 
Zarr is heavily influenced by `NumPy `_, and Zarr-Python supports creating arrays with Numpy data types:: - >>> import zarr - >>> import numpy as np - >>> zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) - >>> z - +>>> import zarr +>>> import numpy as np +>>> zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) +>>> z + But Zarr data types and Numpy data types are also very different: Unlike Numpy arrays, Zarr arrays are designed to be persisted to storage and read by Zarr implementations in different programming languages. @@ -36,8 +36,8 @@ Thus the JSON identifier for a Numpy-compatible data type is just the Numpy ``st `_, or "byte order", of the data type. Following Numpy's example, + Zarr version 2 data types associate each data type with an endianness where applicable. Zarr version 3 data types do not store endianness information. In addition to defining a representation of the data type itself (which in the example above was just a simple string ``"`_ For nearly all of the data types defined in Zarr V3, this name can be used to uniquely specify a data type. The one exception is the ``r*`` data type, which is parametrized by a number of bits, and so may take the form ``r8``, ``r16``, ... etc. (class method) ``from_dtype(cls, dtype) -> Self`` -^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This method defines a procedure for safely converting a native dtype instance into an instance of ``DTypeWrapper``. It should perform validation of its input to ensure that the native dtype is an instance of the ``dtype_cls`` class attribute, for example. For some data types, additional checks are needed -- in Numpy "structured" data types and "void" data types use the same class, with different properties. @@ -114,25 +114,25 @@ A ``DTypeWrapper`` that wraps Numpy structured data types must do additional che If input validation succeeds, this method will call ``_from_dtype_unsafe``. 
(method) ``to_dtype(self) -> dtype`` -^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This method produces a native data type consistent with the properties of the ``DTypeWrapper``. Together with ``from_dtype``, this method allows round-trip conversion of a native data type in to a wrapper class and then out again. That is, for some ``DTypeWrapper`` class ``FooWrapper`` that wraps a native data type called ``foo``, ``FooWrapper.from_dtype(instance_of_foo).to_dtype() == instance_of_foo`` should be true. (method) ``to_dict(self) -> dict`` -^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This method generates a JSON-serialiazable representation of the wrapped data type which can be stored in Zarr metadata. (method) ``cast_value(self, value: object) -> scalar`` -^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This method converts a python object to an instance of the wrapped data type. It is used for generating the default value associated with this data type. (method) ``default_value(self) -> scalar`` -^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This method returns the default value for the wrapped data type. Zarr-Python uses this method to generate a default fill value for an array when a user has not requested one. @@ -141,7 +141,7 @@ can have a static default value, parametrized data types like fixed-length strin a default value must be calculated based on the attributes of the wrapped data type. (class method) ``check_dtype(cls, dtype) -> bool`` -^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This class method checks if a native dtype is compatible with the ``DTypeWrapper`` class. It returns ``True`` if ``dtype`` is compatible with the wrapper class, and ``False`` otherwise. For many data types, this check is as simple as checking that ``cls.dtype_cls`` matches ``type(dtype)``, i.e. 
checking that the data type class wrapped @@ -150,17 +150,17 @@ in which case this method is overridden so that additional properties of ``dtype the expectations of ``cls``. (class method) ``from_dict(cls, dtype) -> Self`` -^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This class method creates a ``DTypeWrapper`` from an appropriately structured dictionary. The default implementation first checks that the dictionary has the correct structure, and then uses its data to instantiate the ``DTypeWrapper`` instance. (method) ``to_dict(self) -> dict[str, JSON]`` -^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Returns a dictionary form of the wrapped data type. This is used prior to writing array metadata. (class method) ``get_name(self, zarr_format: Literal[2, 3]) -> str`` -^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This method generates a name for the wrapped data type, depending on the Zarr format. If ``zarr_format`` is 2 and the wrapped data type is a Numpy data type, then the Numpy string representation of that data type is returned. If ``zarr_format`` is 3, then the Zarr V3 name for the wrapped data type is returned. For most data types @@ -169,14 +169,14 @@ name must be computed at runtime based on the parameters of the data type. (method) ``to_json_value(self, data: scalar, zarr_format: Literal[2, 3]) -> JSON`` -^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This method converts a scalar instance of the data type into a JSON-serialiazable value. For some data types like bool and integers this conversion is simple -- just return a JSON boolean or number -- but other data types define a JSON serialization for scalars that is a bit more involved. And this JSON serialization depends on the Zarr format. 
(method) ``from_json_value(self, data: JSON, zarr_format: Literal[2, 3]) -> scalar`` -^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Convert a JSON-serialiazed scalar to a native scalar. This inverts the operation of ``to_json_value``. From ed0c76b18121ffa020d360f35cf7823c0d410a98 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 13 Mar 2025 16:59:52 +0100 Subject: [PATCH 026/129] tweak docs --- docs/user-guide/data_types.rst | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 6132eb2376..94e05de62d 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -7,17 +7,17 @@ Zarr's data type model Every Zarr array has a "data type", which defines the meaning and physical layout of the array's elements. Zarr is heavily influenced by `NumPy `_, and Zarr-Python supports creating arrays with Numpy data types:: ->>> import zarr ->>> import numpy as np ->>> zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) ->>> z - -But Zarr data types and Numpy data types are also very different: + >>> import zarr + >>> import numpy as np + >>> zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) + >>> z + + Unlike Numpy arrays, Zarr arrays are designed to be persisted to storage and read by Zarr implementations in different programming languages. -To ensure that the data type can be interpreted correctly when reading an array, each Zarr data type defines a procedure for -reading and writing that data type to Zarr array metadata, and also reading and writing **instances** of that data type to -array metadata, and these serialization procedures depend on the Zarr format. +This means Zarr data types must be interpreted correctly when clients read an array. 
So each Zarr data type defines a procedure for +encoding / decoding that data type to / from Zarr array metadata, and also encoding / decoding **instances** of that data type to / from +array metadata. These serialization procedures depend on the Zarr format. Data types in Zarr version 2 ----------------------------- @@ -56,7 +56,7 @@ Zarr-Python supports two different Zarr formats, and those two formats specify d data types in Zarr version 2 are encoded as Numpy-compatible strings, while data types in Zarr version 3 are encoded as either strings or ``JSON`` objects, and the Zarr V3 data types don't have any associated endianness information, unlike Zarr V2 data types. -If that wasn't enough, we want Zarr-Python to support data types beyond what's available in Numpy. So it's crucial that we have a +We also want Zarr-Python to support data types beyond what's available in Numpy. So it's crucial that we have a model of array data types that can adapt to the differences between Zarr V2 and V3 and doesn't over-fit to Numpy. Here are the operations we need to perform on data types in Zarr-Python: From 79a8fd2a4f16b90272962928e7e83cec564ba04a Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 13 Mar 2025 17:05:49 +0100 Subject: [PATCH 027/129] info about v3 data types --- docs/user-guide/data_types.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 94e05de62d..2c6a98753c 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -46,8 +46,9 @@ as are floats, with the caveat that `NaN`, positive infinity, and negative infin Data types in Zarr version 3 ---------------------------- +* Data type names are different -- Zarr V2 represented the 16 bit unsigned integer data type as ``>i2``; Zarr V3 represents the same data type as ``int16``. 
* No endianness -* Data type can be encoded as a string or a ``JSON`` object with the structure ``{"name": , "configuration": {...}}`` +* A data type can be encoded in metadata as a string or a ``JSON`` object with the structure ``{"name": , "configuration": {...}}`` Data types in Zarr-Python ------------------------- From 5e1536992b3a173b41c7daf8fa0835f251d3b1ec Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 13 Mar 2025 17:12:45 +0100 Subject: [PATCH 028/129] adjust note --- docs/user-guide/data_types.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 2c6a98753c..8fcfaac794 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -37,7 +37,7 @@ Thus the JSON identifier for a Numpy-compatible data type is just the Numpy ``st .. note:: The ``<`` character in the data type metadata encodes the `endianness `_, or "byte order", of the data type. Following Numpy's example, - Zarr version 2 data types associate each data type with an endianness where applicable. Zarr version 3 data types do not store endianness information. + in Zarr version 2 each data type has an endianness where applicable. However, Zarr version 3 data types do not store endianness information. 
In addition to defining a representation of the data type itself (which in the example above was just a simple string ``" Date: Thu, 13 Mar 2025 17:41:56 +0100 Subject: [PATCH 029/129] fix: use unparametrized types in direct assignment --- src/zarr/core/dtype/_numpy.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index 362f7f361c..caf46bb216 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -524,7 +524,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: @dataclass(frozen=True, kw_only=True) class FixedLengthBytes(DTypeWrapper[np.dtypes.VoidDType[Any], np.void]): - dtype_cls = np.dtypes.VoidDType[Any] + dtype_cls = np.dtypes.VoidDType _zarr_v3_name = "r*" item_size_bits: ClassVar[int] = 8 length: int = 1 @@ -591,8 +591,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: @dataclass(frozen=True, kw_only=True) class FixedLengthUnicodeString(DTypeWrapper[np.dtypes.StrDType[int], np.str_]): - dtype_cls = np.dtypes.StrDType[int] - _zarr_v3_name = "numpy.static_unicode_string" + dtype_cls = np.dtypes.StrDType + _zarr_v3_name = "numpy.fixed_length_unicode_string" item_size_bits: ClassVar[int] = 32 # UCS4 is 32 bits per code point endianness: Endianness | None = "native" length: int = 1 From a050f3bfe8fd35b82a03e2a0d45873bc94095a96 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 17 Mar 2025 10:04:13 +0100 Subject: [PATCH 030/129] start fixing config --- src/zarr/core/array.py | 77 ++++------------------------------- src/zarr/core/config.py | 39 ++++++------------ src/zarr/core/dtype/_numpy.py | 4 +- 3 files changed, 23 insertions(+), 97 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 926a9bc472..11a08a7d65 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -72,7 +72,6 @@ from zarr.core.dtype import ( DTypeWrapper, 
FixedLengthAsciiString, - FixedLengthUnicodeString, VariableLengthString, parse_data_type, ) @@ -4690,76 +4689,16 @@ def _get_default_chunk_encoding_v2( This is an empty tuple. No data types have default filters. """ - from numcodecs import VLenBytes as numcodecs_VLenBytes - from numcodecs import VLenUTF8 as numcodecs_VLenUTF8 - from numcodecs import Zstd as numcodecs_zstd - - if isinstance(dtype, VariableLengthString | FixedLengthUnicodeString): - filters = (numcodecs_VLenUTF8(),) - elif isinstance(dtype, FixedLengthAsciiString): - filters = (numcodecs_VLenBytes(),) + if dtype._zarr_v3_name in zarr_config.get("array.v2_default_filters"): + filters = zarr_config.get(f"array.v2_default_filters.{dtype._zarr_v3_name}") else: - filters = None - - compressor = numcodecs_zstd(level=0, checksum=False) - - -def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec: - """ - Given a data type, return the default serializer for that data type. - - The default serializer for most data types is the ``BytesCodec``, which may or may not be - parameterized with an endianness, depending on whether the data type has endianness. Variable - length strings and variable length bytes have hard-coded serializers -- ``VLenUTF8Codec`` and - ``VLenBytesCodec``, respectively. - - """ - serializer: ArrayBytesCodec = BytesCodec(endian=None) - - if isinstance(dtype, HasEndianness): - serializer = BytesCodec(endian="little") - elif isinstance(dtype, HasObjectCodec): - if dtype.object_codec_id == "vlen-bytes": - serializer = VLenBytesCodec() - elif dtype.object_codec_id == "vlen-utf8": - serializer = VLenUTF8Codec() - else: - msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}." - raise ValueError(msg) - return serializer - - -def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[numcodecs.abc.Codec] | None: - """ - Given a data type, return the default filters for that data type. 
- - For data types that require an object codec, namely variable length data types, - this is a tuple containing the object codec. Otherwise it's ``None``. - """ - if isinstance(dtype, HasObjectCodec): - if dtype.object_codec_id == "vlen-bytes": - from numcodecs import VLenBytes - - return (VLenBytes(),) - elif dtype.object_codec_id == "vlen-utf8": - from numcodecs import VLenUTF8 - - return (VLenUTF8(),) - else: - msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}." - raise ValueError(msg) - return None - - -def default_compressor_v2(dtype: ZDType[Any, Any]) -> numcodecs.abc.Codec: - """ - Given a data type, return the default compressors for that data type. + filters = zarr_config.get("array.v2_default_filters.default") - This is just the numcodecs ``Zstd`` codec. - """ - from numcodecs import Zstd - - return Zstd(level=0, checksum=False) + if dtype._zarr_v3_name in zarr_config.get("array.v2_default_compressor"): + compressor = zarr_config.get(f"array.v2_default_compressor.{dtype._zarr_v3_name}") + else: + compressor = zarr_config.get("array.v2_default_compressor.default") + return filters, compressor def _parse_chunk_encoding_v2( diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 9a15bf17d2..cb00a1f9ee 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -36,7 +36,7 @@ if TYPE_CHECKING: from donfig.config_obj import ConfigSet - from zarr.core.dtype.wrapper import ZDType +from collections import defaultdict class BadConfigError(ValueError): @@ -106,37 +106,24 @@ def enable_gpu(self) -> ConfigSet: "array": { "order": "C", "write_empty_chunks": False, - "v2_default_compressor": { - "numeric": {"id": "zstd", "level": 0, "checksum": False}, - "string": {"id": "zstd", "level": 0, "checksum": False}, - "bytes": {"id": "zstd", "level": 0, "checksum": False}, - }, + "v2_default_compressor": {"default": {"id": "zstd", "level": 0, "checksum": False}}, "v2_default_filters": { - "numeric": None, - 
"string": [{"id": "vlen-utf8"}], - "bytes": [{"id": "vlen-bytes"}], - "raw": None, + "default": None, + "numpy.variable_length_unicode_string": [{"id": "vlen-utf8"}], + "numpy.fixed_length_unicode_string": [{"id": "vlen-utf8"}], + "r*": [{"id": "vlen-bytes"}], }, - "v3_default_filters": {"boolean": [], "numeric": [], "string": [], "bytes": []}, + "v3_default_filters": defaultdict(list), "v3_default_serializer": { - "boolean": {"name": "bytes", "configuration": {"endian": "little"}}, - "numeric": {"name": "bytes", "configuration": {"endian": "little"}}, - "string": {"name": "vlen-utf8"}, - "bytes": {"name": "vlen-bytes"}, + "default": {"name": "bytes", "configuration": {"endian": "little"}}, + "numpy.variable_length_unicode_string": [{"name": "vlen-utf8"}], + "numpy.fixed_length_unicode_string": [{"name": "vlen-utf8"}], + "r*": {"name": "vlen-bytes"}, }, "v3_default_compressors": { - "boolean": [ - {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, - ], - "numeric": [ - {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, - ], - "string": [ - {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, - ], - "bytes": [ + "default": [ {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, - ], + ] }, }, "async": {"concurrency": 10, "timeout": None}, diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index caf46bb216..d61fedd4ab 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -627,7 +627,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: @dataclass(frozen=True, kw_only=True) class VariableLengthString(DTypeWrapper[np.dtypes.StringDType, str]): dtype_cls = np.dtypes.StringDType - _zarr_v3_name = "numpy.vlen_string" + _zarr_v3_name = "numpy.variable_length_string" @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.StringDType) -> Self: @@ -658,7 +658,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> 
str: @dataclass(frozen=True, kw_only=True) class VariableLengthString(DTypeWrapper[np.dtypes.ObjectDType, str]): dtype_cls = np.dtypes.ObjectDType - _zarr_v3_name = "numpy.vlen_string" + _zarr_v3_name = "numpy.variable_length_string" @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.ObjectDType) -> Self: From 48f883b6a6ead897d08d8d66ac7c1e5d28ccf9af Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Mon, 17 Mar 2025 10:12:38 +0100 Subject: [PATCH 031/129] Update src/zarr/core/_info.py Co-authored-by: Joe Hamman --- src/zarr/core/_info.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 880d27fff7..0fb6b2d1eb 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -11,8 +11,6 @@ from zarr.core.common import ZarrFormat from zarr.core.dtype.wrapper import DTypeWrapper -# from zarr.core.metadata.v3 import DataType - @dataclasses.dataclass(kw_only=True) class GroupInfo: From 6c70eacf80b8a7d40219656be9436741890c8914 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 17 Mar 2025 12:20:39 +0100 Subject: [PATCH 032/129] add placeholder disclaimer to v3 data types summary --- docs/user-guide/data_types.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 8fcfaac794..91cbeb1d7f 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -45,7 +45,7 @@ as are floats, with the caveat that `NaN`, positive infinity, and negative infin Data types in Zarr version 3 ---------------------------- - +(note: placeholder text) * Data type names are different -- Zarr V2 represented the 16 bit unsigned integer data type as ``>i2``; Zarr V3 represents the same data type as ``int16``. 
* No endianness * A data type can be encoded in metadata as a string or a ``JSON`` object with the structure ``{"name": , "configuration": {...}}`` From fe2754acc5742e79b837cba219c952594f83a3f4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 17 Mar 2025 12:22:53 +0100 Subject: [PATCH 033/129] make example runnable --- docs/user-guide/data_types.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 91cbeb1d7f..7039d1850a 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -28,8 +28,9 @@ Thus the JSON identifier for a Numpy-compatible data type is just the Numpy ``st >>> import zarr >>> import numpy as np >>> import json + >>> store = {} >>> np_dtype = np.dtype('int64') - >>> z = zarr.create_array(shape=(1,), dtype=np_dtype, zarr_format=2) + >>> z = zarr.create_array(store=store, shape=(1,), dtype=np_dtype, zarr_format=2) >>> dtype_meta = json.loads(store['.zarray'].to_bytes())["dtype"] >>> assert dtype_meta == np_dtype.str # True >>> dtype_meta From eb48a3c0519b18e8d23ea6b30efdb829a569979a Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 17 Mar 2025 12:25:31 +0100 Subject: [PATCH 034/129] placeholder section for adding a custom dtype --- docs/user-guide/data_types.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 7039d1850a..352e967c87 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -181,4 +181,7 @@ And this JSON serialization depends on the Zarr format. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Convert a JSON-serialiazed scalar to a native scalar. This inverts the operation of ``to_json_value``. 
+Using a custom data type +------------------------ +TODO \ No newline at end of file From d5e376f2aec2cf732fd406b0ea5465c6073023e2 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 17 Mar 2025 12:38:46 +0100 Subject: [PATCH 035/129] define native data type and native scalar --- docs/user-guide/data_types.rst | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 352e967c87..fffd622209 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -58,21 +58,23 @@ Zarr-Python supports two different Zarr formats, and those two formats specify d data types in Zarr version 2 are encoded as Numpy-compatible strings, while data types in Zarr version 3 are encoded as either strings or ``JSON`` objects, and the Zarr V3 data types don't have any associated endianness information, unlike Zarr V2 data types. -We also want Zarr-Python to support data types beyond what's available in Numpy. So it's crucial that we have a -model of array data types that can adapt to the differences between Zarr V2 and V3 and doesn't over-fit to Numpy. +We aspire for Zarr-Python to eventually be array-library-agnostic. +In the context of data types, this means that we should not design an API that overfits to Numpy's data types. +We will use the term "native data type" to refer to a data type used by any external array library (including Numpy), e.g. ``np.dtypes.Float64DType()``. +We will also use the term "native scalar" or "native scalar type" to refer to a scalar value of a native data type. For example, ``np.float64(0)`` generates a scalar with the data dtype ``np.dtypes.Float64DType`` -Here are the operations we need to perform on data types in Zarr-Python: +Zarr-Python needs to support the following operations on native data types: * Round-trip native data types to fields in array metadata documents. 
For example, the Numpy data type ``np.dtype('>i2')`` should be saved as ``{..., "dtype" : ">i2"}`` in Zarr V2 metadata. In Zarr V3 metadata, the same Numpy data type would be saved as ``{..., "data_type": "int16", "codecs": [..., {"name": "bytes", "configuration": {"endian": "big"}, ...]}`` -* Define a default fill value. This is not mandated by the Zarr specifications, but it's convenient for users +* Associate a default fill value with a native data type. This is not mandated by the Zarr specifications, but it's convenient for users to have a useful default. For numeric types like integers and floats the default can be statically set to 0, but for parametric data types like fixed-length strings the default can only be generated after the data type has been parametrized at runtime. -* Round-trip scalars to the ``fill_value`` field in Zarr V2 and V3 array metadata documents. The Zarr V2 and V3 specifications +* Round-trip native scalars to the ``fill_value`` field in Zarr V2 and V3 array metadata documents. The Zarr V2 and V3 specifications define how scalars of each data type should be stored as JSON in array metadata documents, and in principle each data type can define this encoding separately. 
From fc3297a0509d2c5eadd0e8572c80f0f9859214d6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 17 Mar 2025 14:32:12 +0100 Subject: [PATCH 036/129] update data type names --- src/zarr/core/array.py | 41 ++++++++++++++++++++--------------- src/zarr/core/config.py | 14 +++++------- src/zarr/core/dtype/_numpy.py | 33 +++++++++++++++------------- 3 files changed, 48 insertions(+), 40 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 11a08a7d65..aa4d9a3851 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -29,9 +29,6 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.store import Store, set_or_delete from zarr.codecs._v2 import V2Codec -from zarr.codecs.bytes import BytesCodec -from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec -from zarr.codecs.zstd import ZstdCodec from zarr.core._info import ArrayInfo from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, ArraySpec, parse_array_config from zarr.core.attributes import Attributes @@ -71,8 +68,6 @@ from zarr.core.config import config as zarr_config from zarr.core.dtype import ( DTypeWrapper, - FixedLengthAsciiString, - VariableLengthString, parse_data_type, ) from zarr.core.indexing import ( @@ -4664,21 +4659,29 @@ def _get_default_chunk_encoding_v3( """ Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype. 
""" - filters = () - compressors = (ZstdCodec(level=0, checksum=False),) # TODO: find a registry-style solution for this that isn't bloated # We need to associate specific dtypes with specific encoding schemes - if isinstance(dtype, VariableLengthString): - serializer = VLenUTF8Codec() - elif isinstance(dtype, FixedLengthAsciiString): - serializer = VLenBytesCodec() + if dtype._zarr_v3_name in zarr_config.get("array.v3_default_filters"): + filters = zarr_config.get(f"array.v3_default_filters.{dtype._zarr_v3_name}") else: - if dtype.to_dtype().itemsize == 1: - serializer = BytesCodec(endian=None) - else: - serializer = BytesCodec() - return filters, serializer, compressors + filters = zarr_config.get("array.v3_default_filters.default") + + if dtype._zarr_v3_name in zarr_config.get("array.v3_default_compressors"): + compressors = zarr_config.get(f"array.v3_default_compressors.{dtype._zarr_v3_name}") + else: + compressors = zarr_config.get("array.v3_default_compressors.default") + + if dtype._zarr_v3_name in zarr_config.get("array.v3_default_serializer"): + serializer = zarr_config.get(f"array.v3_default_serializer.{dtype._zarr_v3_name}") + else: + serializer = zarr_config.get("array.v3_default_serializer.default") + + return ( + tuple(_parse_array_array_codec(f) for f in filters), + _parse_array_bytes_codec(serializer), + tuple(_parse_bytes_bytes_codec(c) for c in compressors), + ) def _get_default_chunk_encoding_v2( @@ -4698,7 +4701,11 @@ def _get_default_chunk_encoding_v2( compressor = zarr_config.get(f"array.v2_default_compressor.{dtype._zarr_v3_name}") else: compressor = zarr_config.get("array.v2_default_compressor.default") - return filters, compressor + + if filters is not None: + filters = tuple(numcodecs.get_codec(f) for f in filters) + + return filters, numcodecs.get_codec(compressor) def _parse_chunk_encoding_v2( diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index cb00a1f9ee..3b95df4396 100644 --- a/src/zarr/core/config.py +++ 
b/src/zarr/core/config.py @@ -36,8 +36,6 @@ if TYPE_CHECKING: from donfig.config_obj import ConfigSet -from collections import defaultdict - class BadConfigError(ValueError): _msg = "bad Config: %r" @@ -109,15 +107,15 @@ def enable_gpu(self) -> ConfigSet: "v2_default_compressor": {"default": {"id": "zstd", "level": 0, "checksum": False}}, "v2_default_filters": { "default": None, - "numpy.variable_length_unicode_string": [{"id": "vlen-utf8"}], - "numpy.fixed_length_unicode_string": [{"id": "vlen-utf8"}], - "r*": [{"id": "vlen-bytes"}], + "variable_length_utf8": [{"id": "vlen-utf8"}], + "fixed_length_ucs4": [{"id": "vlen-utf8"}], + "fixed_length_ascii": [{"id": "vlen-bytes"}], }, - "v3_default_filters": defaultdict(list), + "v3_default_filters": {"default": ()}, "v3_default_serializer": { "default": {"name": "bytes", "configuration": {"endian": "little"}}, - "numpy.variable_length_unicode_string": [{"name": "vlen-utf8"}], - "numpy.fixed_length_unicode_string": [{"name": "vlen-utf8"}], + "variable_length_utf8": {"name": "vlen-utf8"}, + "fixed_length_ucs4": {"name": "vlen-utf8"}, "r*": {"name": "vlen-bytes"}, }, "v3_default_compressors": { diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index d61fedd4ab..fa97503795 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -496,7 +496,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex1 @dataclass(frozen=True, kw_only=True) class FixedLengthAsciiString(DTypeWrapper[np.dtypes.BytesDType[Any], np.bytes_]): dtype_cls = np.dtypes.BytesDType - _zarr_v3_name = "numpy.static_byte_string" + _zarr_v3_name = "fixed_length_ascii" item_size_bits: ClassVar[int] = 8 length: int = 1 @@ -523,20 +523,20 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: @dataclass(frozen=True, kw_only=True) -class FixedLengthBytes(DTypeWrapper[np.dtypes.VoidDType[Any], np.void]): +class FixedLengthBytes(DTypeWrapper[np.dtypes.VoidDType, 
np.void]): dtype_cls = np.dtypes.VoidDType _zarr_v3_name = "r*" item_size_bits: ClassVar[int] = 8 length: int = 1 @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType[Any]) -> Self: + def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType) -> Self: return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) def default_value(self) -> np.void: return self.cast_value(("\x00" * self.length).encode("ascii")) - def to_dtype(self) -> np.dtypes.VoidDType[Any]: + def to_dtype(self) -> np.dtypes.VoidDType: # Numpy does not allow creating a void type # by invoking np.dtypes.VoidDType directly return np.dtype(f"V{self.length}") @@ -577,7 +577,7 @@ def check_dict(cls, data: dict[str, JSON]) -> TypeGuard[dict[str, JSON]]: isinstance(data, dict) and "name" in data and isinstance(data["name"], str) - and re.match(r"^r\d+$", data["name"]) + and (re.match(r"^r\d+$", data["name"]) is not None) ) def to_json_value(self, data: np.void, *, zarr_format: ZarrFormat) -> str: @@ -592,7 +592,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: @dataclass(frozen=True, kw_only=True) class FixedLengthUnicodeString(DTypeWrapper[np.dtypes.StrDType[int], np.str_]): dtype_cls = np.dtypes.StrDType - _zarr_v3_name = "numpy.fixed_length_unicode_string" + _zarr_v3_name = "fixed_length_ucs4" item_size_bits: ClassVar[int] = 32 # UCS4 is 32 bits per code point endianness: Endianness | None = "native" length: int = 1 @@ -605,7 +605,10 @@ def _from_dtype_unsafe(cls, dtype: np.dtypes.StrDType[int]) -> Self: ) def to_dtype(self) -> np.dtypes.StrDType[int]: - return self.dtype_cls(self.length).newbyteorder(endianness_to_numpy_str(self.endianness)) + return cast( + np.dtypes.StrDType[int], + self.dtype_cls(self.length).newbyteorder(endianness_to_numpy_str(self.endianness)), + ) def default_value(self) -> np.str_: return np.str_("") @@ -627,7 +630,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: @dataclass(frozen=True, 
kw_only=True) class VariableLengthString(DTypeWrapper[np.dtypes.StringDType, str]): dtype_cls = np.dtypes.StringDType - _zarr_v3_name = "numpy.variable_length_string" + _zarr_v3_name = "variable_length_utf8" @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.StringDType) -> Self: @@ -658,14 +661,14 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: @dataclass(frozen=True, kw_only=True) class VariableLengthString(DTypeWrapper[np.dtypes.ObjectDType, str]): dtype_cls = np.dtypes.ObjectDType - _zarr_v3_name = "numpy.variable_length_string" + _zarr_v3_name = "variable_length_utf8" @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.ObjectDType) -> Self: return cls() def to_dtype(self) -> np.dtypes.ObjectDType: - return self.dtype_cls() + return cast(np.dtypes.ObjectDType, self.dtype_cls()) def cast_value(self, value: object) -> str: return str(value) @@ -695,7 +698,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: @dataclass(frozen=True, kw_only=True) class DateTime64(DTypeWrapper[np.dtypes.DateTime64DType, np.datetime64]): dtype_cls = np.dtypes.DateTime64DType - _zarr_v3_name = "numpy.datetime64" + _zarr_v3_name = "datetime64" unit: DateUnit | TimeUnit = "s" endianness: Endianness = "native" @@ -713,7 +716,7 @@ def _from_dtype_unsafe(cls, dtype: np.dtypes.DateTime64DType) -> Self: return cls(unit=unit, endianness=endianness_from_numpy_str(dtype.byteorder)) def cast_value(self, value: object) -> np.datetime64: - return self.to_dtype().type(value, self.unit) + return cast(np.datetime64, self.to_dtype().type(value, self.unit)) def to_dtype(self) -> np.dtypes.DateTime64DType: # Numpy does not allow creating datetime64 via @@ -734,14 +737,14 @@ def to_json_value(self, data: np.datetime64, *, zarr_format: ZarrFormat) -> int: @dataclass(frozen=True, kw_only=True) class Structured(DTypeWrapper[np.dtypes.VoidDType, np.void]): dtype_cls = np.dtypes.VoidDType - _zarr_v3_name = "numpy.structured" + _zarr_v3_name = 
"structured" fields: tuple[tuple[str, DTypeWrapper[Any, Any]], ...] def default_value(self) -> np.void: return self.cast_value(0) def cast_value(self, value: object) -> np.void: - return np.array([value], dtype=self.to_dtype())[0] + return cast(np.void, np.array([value], dtype=self.to_dtype())[0]) @classmethod def check_dtype(cls, dtype: np.dtypes.DTypeLike) -> TypeGuard[np.dtypes.VoidDType]: @@ -787,7 +790,7 @@ def to_dict(self) -> dict[str, JSON]: return base_dict @classmethod - def check_dict(cls, data: JSON) -> bool: + def check_dict(cls, data: JSON) -> TypeGuard[JSON]: return ( isinstance(data, dict) and "name" in data From 9c13c852eea7db79c4349e11fcba16ff80229416 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 17 Mar 2025 16:11:26 +0100 Subject: [PATCH 037/129] fix config test failures --- src/zarr/core/array.py | 13 ++-- src/zarr/core/config.py | 2 +- src/zarr/core/dtype/_numpy.py | 2 +- tests/test_array.py | 13 +++- tests/test_config.py | 123 ++++++++++++++++++---------------- 5 files changed, 87 insertions(+), 66 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index aa4d9a3851..0d83b0472c 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -29,6 +29,7 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.store import Store, set_or_delete from zarr.codecs._v2 import V2Codec +from zarr.codecs.bytes import BytesCodec from zarr.core._info import ArrayInfo from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, ArraySpec, parse_array_config from zarr.core.attributes import Attributes @@ -4671,7 +4672,6 @@ def _get_default_chunk_encoding_v3( compressors = zarr_config.get(f"array.v3_default_compressors.{dtype._zarr_v3_name}") else: compressors = zarr_config.get("array.v3_default_compressors.default") - if dtype._zarr_v3_name in zarr_config.get("array.v3_default_serializer"): serializer = zarr_config.get(f"array.v3_default_serializer.{dtype._zarr_v3_name}") 
else: @@ -4821,11 +4821,14 @@ def _parse_chunk_encoding_v3( out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes) - # TODO: ensure that the serializer is compatible with the ndarray produced by the - # array-array codecs. For example, if a sequence of array-array codecs produces an - # array with a single-byte data type, then the serializer should not specify endiannesss. + # specialize codecs as needed given the dtype + + # TODO: refactor so that the config only contains the name of the codec, and we use the dtype + # to create the codec instance, instead of storing a dict representation of a full codec. - # TODO: add checks to ensure that the right serializer is used for vlen data types + if isinstance(out_array_bytes, BytesCodec) and dtype.to_dtype().itemsize == 1: + # The default endianness in the bytescodec might not be None, so we need to replace it + out_array_bytes = replace(out_array_bytes, endian=None) return out_array_array, out_array_bytes, out_bytes_bytes diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 3b95df4396..e51f1a1de1 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -111,7 +111,7 @@ def enable_gpu(self) -> ConfigSet: "fixed_length_ucs4": [{"id": "vlen-utf8"}], "fixed_length_ascii": [{"id": "vlen-bytes"}], }, - "v3_default_filters": {"default": ()}, + "v3_default_filters": {"default": []}, "v3_default_serializer": { "default": {"name": "bytes", "configuration": {"endian": "little"}}, "variable_length_utf8": {"name": "vlen-utf8"}, diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index fa97503795..c562f0a593 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -711,7 +711,7 @@ def to_dict(self) -> dict[str, JSON]: @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.DateTime64DType) -> Self: unit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] - if unit not in get_args(DateUnit | TimeUnit): + if unit not in 
get_args(DateUnit) and unit not in get_args(TimeUnit): raise DataTypeValidationError('Invalid unit for "numpy.datetime64"') return cls(unit=unit, endianness=endianness_from_numpy_str(dtype.byteorder)) diff --git a/tests/test_array.py b/tests/test_array.py index 54a2db160b..bebf6b8f0a 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1236,12 +1236,19 @@ async def test_default_filters_compressors( sig = inspect.signature(create_array) if zarr_format == 3: - expected_filters, expected_serializer, expected_compressors = ( - _get_default_chunk_encoding_v3(dtype=zdtype) + expected_filters, expected_serializer, expected_compressors = _parse_chunk_encoding_v3( + compressors=sig.parameters["compressors"].default, + filters=sig.parameters["filters"].default, + serializer=sig.parameters["serializer"].default, + dtype=zdtype, ) elif zarr_format == 2: - default_filters, default_compressors = _get_default_chunk_encoding_v2(dtype=zdtype) + default_filters, default_compressors = _parse_chunk_encoding_v2( + compressor=sig.parameters["compressors"].default, + filters=sig.parameters["filters"].default, + dtype=zdtype, + ) if default_filters is None: expected_filters = () else: diff --git a/tests/test_config.py b/tests/test_config.py index da5b2cc488..b1b6ae0ecd 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -18,11 +18,13 @@ Crc32cCodec, ShardingCodec, ) +from zarr.core.array import create_array from zarr.core.array_spec import ArraySpec from zarr.core.buffer import NDBuffer from zarr.core.buffer.core import Buffer from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.config import BadConfigError, config +from zarr.core.dtype import get_data_type_from_numpy from zarr.core.indexing import SelectorTuple from zarr.registry import ( fully_qualified_name, @@ -45,39 +47,55 @@ def test_config_defaults_set() -> None: # regression test for available defaults - assert ( - config.defaults - == [ - { - "default_zarr_format": 3, - "array": { - 
"order": "C", - "write_empty_chunks": False, + assert config.defaults == [ + { + "default_zarr_format": 3, + "array": { + "order": "C", + "write_empty_chunks": False, + "v2_default_compressor": {"default": {"id": "zstd", "level": 0, "checksum": False}}, + "v2_default_filters": { + "default": None, + "variable_length_utf8": [{"id": "vlen-utf8"}], + "fixed_length_ucs4": [{"id": "vlen-utf8"}], + "fixed_length_ascii": [{"id": "vlen-bytes"}], }, - "async": {"concurrency": 10, "timeout": None}, - "threading": {"max_workers": None}, - "json_indent": 2, - "codec_pipeline": { - "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", - "batch_size": 1, + "v3_default_filters": {"default": []}, + "v3_default_serializer": { + "default": {"name": "bytes", "configuration": {"endian": "little"}}, + "variable_length_utf8": {"name": "vlen-utf8"}, + "fixed_length_ucs4": {"name": "vlen-utf8"}, + "r*": {"name": "vlen-bytes"}, }, - "codecs": { - "blosc": "zarr.codecs.blosc.BloscCodec", - "gzip": "zarr.codecs.gzip.GzipCodec", - "zstd": "zarr.codecs.zstd.ZstdCodec", - "bytes": "zarr.codecs.bytes.BytesCodec", - "endian": "zarr.codecs.bytes.BytesCodec", # compatibility with earlier versions of ZEP1 - "crc32c": "zarr.codecs.crc32c_.Crc32cCodec", - "sharding_indexed": "zarr.codecs.sharding.ShardingCodec", - "transpose": "zarr.codecs.transpose.TransposeCodec", - "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", - "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", + "v3_default_compressors": { + "default": [ + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, + ] }, - "buffer": "zarr.buffer.cpu.Buffer", - "ndbuffer": "zarr.buffer.cpu.NDBuffer", - } - ] - ) + }, + "async": {"concurrency": 10, "timeout": None}, + "threading": {"max_workers": None}, + "json_indent": 2, + "codec_pipeline": { + "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", + "batch_size": 1, + }, + "buffer": "zarr.core.buffer.cpu.Buffer", + "ndbuffer": "zarr.core.buffer.cpu.NDBuffer", + "codecs": { 
+ "blosc": "zarr.codecs.blosc.BloscCodec", + "gzip": "zarr.codecs.gzip.GzipCodec", + "zstd": "zarr.codecs.zstd.ZstdCodec", + "bytes": "zarr.codecs.bytes.BytesCodec", + "endian": "zarr.codecs.bytes.BytesCodec", + "crc32c": "zarr.codecs.crc32c_.Crc32cCodec", + "sharding_indexed": "zarr.codecs.sharding.ShardingCodec", + "transpose": "zarr.codecs.transpose.TransposeCodec", + "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", + "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", + }, + } + ] assert config.get("array.order") == "C" assert config.get("async.concurrency") == 10 assert config.get("async.timeout") is None @@ -295,31 +313,24 @@ class NewCodec2(BytesCodec): get_codec_class("new_codec") -@pytest.mark.parametrize( - "key", - [ - "array.v2_default_compressor.numeric", - "array.v2_default_compressor.string", - "array.v2_default_compressor.bytes", - "array.v2_default_filters.string", - "array.v2_default_filters.bytes", - "array.v3_default_filters.numeric", - "array.v3_default_filters.raw", - "array.v3_default_filters.bytes", - "array.v3_default_serializer.numeric", - "array.v3_default_serializer.string", - "array.v3_default_serializer.bytes", - "array.v3_default_compressors.string", - "array.v3_default_compressors.bytes", - "array.v3_default_compressors", - ], -) -def test_deprecated_config(key: str) -> None: +@pytest.mark.parametrize("dtype", ["int", "bytes", "str"]) +async def test_default_codecs(dtype: str) -> None: """ - Test that a valuerror is raised when setting the default chunk encoding for a given - data type category + Test that the default compressors are sensitive to the current setting of the config. 
""" - - with pytest.raises(ValueError): - with zarr.config.set({key: "foo"}): - pass + zdtype = get_data_type_from_numpy(dtype) + expected_compressors = (GzipCodec(),) + new_conf = { + f"array.v3_default_compressors.{zdtype._zarr_v3_name}": [ + c.to_dict() for c in expected_compressors + ] + } + with config.set(new_conf): + arr = await create_array( + shape=(100,), + chunks=(100,), + dtype=dtype, + zarr_format=3, + store=MemoryStore(), + ) + assert arr.compressors == expected_compressors From 3e0e61bf36947d285e0489684624ded78b6629c1 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 17 Mar 2025 16:27:10 +0100 Subject: [PATCH 038/129] call to_dtype once in blosc evolve_from_array_spec --- src/zarr/codecs/blosc.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 37207d52c4..66e2dbbc34 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -138,20 +138,14 @@ def to_dict(self) -> dict[str, JSON]: } def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: - item_size = 1 - if isinstance(array_spec.dtype, HasItemSize): - item_size = array_spec.dtype.item_size + dtype = array_spec.dtype.to_dtype() new_codec = self if new_codec.typesize is None: - new_codec = replace(new_codec, typesize=dtype.to_dtype().itemsize) + new_codec = replace(new_codec, typesize=dtype.itemsize) if new_codec.shuffle is None: new_codec = replace( new_codec, - shuffle=( - BloscShuffle.bitshuffle - if dtype.to_dtype().itemsize == 1 - else BloscShuffle.shuffle - ), + shuffle=(BloscShuffle.bitshuffle if dtype.itemsize == 1 else BloscShuffle.shuffle), ) return new_codec From a8d815ac44aa1e17e8e8192dcbd0cc3b56c069b0 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 19 Mar 2025 21:18:47 +0100 Subject: [PATCH 039/129] refactor dtypewrapper -> zdtype --- src/zarr/abc/codec.py | 8 +- src/zarr/api/asynchronous.py | 6 +- src/zarr/codecs/bytes.py | 9 +- src/zarr/codecs/sharding.py 
| 12 +- src/zarr/codecs/transpose.py | 4 +- src/zarr/core/_info.py | 11 +- src/zarr/core/array.py | 86 +- src/zarr/core/array_spec.py | 11 +- src/zarr/core/buffer/cpu.py | 2 +- src/zarr/core/codec_pipeline.py | 8 +- src/zarr/core/common.py | 2 - src/zarr/core/config.py | 10 +- src/zarr/core/dtype/__init__.py | 53 +- src/zarr/core/dtype/_numpy.py | 1190 +++++++++++++++++----- src/zarr/core/dtype/common.py | 71 +- src/zarr/core/dtype/registry.py | 20 +- src/zarr/core/dtype/wrapper.py | 157 ++- src/zarr/core/metadata/v2.py | 44 +- src/zarr/core/metadata/v3.py | 36 +- src/zarr/testing/strategies.py | 4 +- tests/conftest.py | 4 +- tests/test_array.py | 6 +- tests/test_codecs/test_vlen.py | 6 +- tests/test_config.py | 16 +- tests/test_metadata/test_consolidated.py | 2 +- tests/test_metadata/test_dtype.py | 120 ++- tests/test_metadata/test_v3.py | 16 +- 27 files changed, 1286 insertions(+), 628 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index f8a5447a70..f064fad02e 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -12,10 +12,10 @@ from collections.abc import Awaitable, Callable, Iterable from typing import Self - from zarr.abc.store import ByteGetter, ByteSetter, Store + from zarr.abc.store import ByteGetter, ByteSetter from zarr.core.array_spec import ArraySpec from zarr.core.chunk_grids import ChunkGrid - from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType + from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar from zarr.core.indexing import SelectorTuple from zarr.core.metadata import ArrayMetadata @@ -97,7 +97,7 @@ def validate( self, *, shape: ChunkCoords, - dtype: ZDType[TBaseDType, TBaseScalar], + dtype: ZDType[_BaseDType, _BaseScalar], chunk_grid: ChunkGrid, ) -> None: """Validates that the codec configuration is compatible with the array metadata. @@ -311,7 +311,7 @@ def supports_partial_encode(self) -> bool: ... 
@abstractmethod def validate( - self, *, shape: ChunkCoords, dtype: ZDType[TBaseDType, TBaseScalar], chunk_grid: ChunkGrid + self, *, shape: ChunkCoords, dtype: ZDType[_BaseDType, _BaseScalar], chunk_grid: ChunkGrid ) -> None: """Validates that all codec configurations are compatible with the array metadata. Raises errors when a codec configuration is not compatible. diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index ac4782007d..a65a469e8d 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -28,7 +28,7 @@ _default_zarr_format, _warn_write_empty_chunks_kwarg, ) -from zarr.core.dtype import get_data_type_from_numpy +from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.group import ( AsyncGroup, ConsolidatedMetadata, @@ -453,7 +453,7 @@ async def save_array( shape = arr.shape chunks = getattr(arr, "chunks", None) # for array-likes with chunks attribute overwrite = kwargs.pop("overwrite", None) or _infer_overwrite(mode) - zarr_dtype = get_data_type_from_numpy(arr.dtype) + zarr_dtype = get_data_type_from_native_dtype(arr.dtype) new = await AsyncArray._create( store_path, zarr_format=zarr_format, @@ -1005,7 +1005,7 @@ async def create( _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format) or _default_zarr_format() ) - dtype_wrapped = get_data_type_from_numpy(dtype) + dtype_wrapped = get_data_type_from_native_dtype(dtype) if zarr_format == 2: if chunks is None: chunks = shape diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 157b5443dc..80972096c2 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -3,20 +3,21 @@ import sys from dataclasses import dataclass, replace from enum import Enum -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import numpy as np from zarr.abc.codec import ArrayBytesCodec from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer from zarr.core.common import JSON, parse_enum, 
parse_named_configuration -from zarr.core.dtype.common import endianness_to_numpy_str +from zarr.core.dtype._numpy import endianness_to_numpy_str from zarr.registry import register_codec if TYPE_CHECKING: from typing import Self from zarr.core.array_spec import ArraySpec + from zarr.core.dtype.common import Endianness class Endian(Enum): @@ -73,7 +74,9 @@ async def _decode_single( ) -> NDBuffer: assert isinstance(chunk_bytes, Buffer) # TODO: remove endianness enum in favor of literal union - endian_str = self.endian.value if self.endian is not None else None + endian_str = cast( + "Endianness | None", self.endian.value if self.endian is not None else None + ) dtype = chunk_spec.dtype.to_dtype().newbyteorder(endianness_to_numpy_str(endian_str)) as_array_like = chunk_bytes.as_array_like() diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 32559e7fb8..3a90fdfcca 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -43,7 +43,7 @@ parse_shapelike, product, ) -from zarr.core.dtype.npy.int import UInt64 +from zarr.core.dtype._numpy import UInt64 from zarr.core.indexing import ( BasicIndexer, SelectorTuple, @@ -59,7 +59,7 @@ from typing import Self from zarr.core.common import JSON - from zarr.core.dtype.wrapper import DTypeWrapper + from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar MAX_UINT_64 = 2**64 - 1 ShardMapping = Mapping[ChunkCoords, Buffer] @@ -406,7 +406,11 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return self def validate( - self, *, shape: ChunkCoords, dtype: DTypeWrapper[Any, Any], chunk_grid: ChunkGrid + self, + *, + shape: ChunkCoords, + dtype: ZDType[_BaseDType, _BaseScalar], + chunk_grid: ChunkGrid, ) -> None: if len(self.chunk_shape) != len(shape): raise ValueError( @@ -445,7 +449,7 @@ async def _decode_single( # setup output array out = chunk_spec.prototype.nd_buffer.create( shape=shard_shape, - dtype=shard_spec.dtype.to_native_dtype(), + 
dtype=shard_spec.dtype.to_dtype(), order=shard_spec.order, fill_value=0, ) diff --git a/src/zarr/codecs/transpose.py b/src/zarr/codecs/transpose.py index be89690441..7715d06265 100644 --- a/src/zarr/codecs/transpose.py +++ b/src/zarr/codecs/transpose.py @@ -16,7 +16,7 @@ from zarr.core.buffer import NDBuffer from zarr.core.chunk_grids import ChunkGrid - from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType + from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar def parse_transpose_order(data: JSON | Iterable[int]) -> tuple[int, ...]: @@ -49,7 +49,7 @@ def to_dict(self) -> dict[str, JSON]: def validate( self, shape: tuple[int, ...], - dtype: ZDType[TBaseDType, TBaseScalar], + dtype: ZDType[_BaseDType, _BaseScalar], chunk_grid: ChunkGrid, ) -> None: if len(self.order) != len(shape): diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 0fb6b2d1eb..f6b51fdb3c 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -2,14 +2,15 @@ import dataclasses import textwrap -from typing import TYPE_CHECKING, Literal +from typing import TYPE_CHECKING, Any, Literal if TYPE_CHECKING: import numcodecs.abc + import numpy as np -from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec -from zarr.core.common import ZarrFormat -from zarr.core.dtype.wrapper import DTypeWrapper + from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec + from zarr.core.common import ZarrFormat + from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar @dataclasses.dataclass(kw_only=True) @@ -80,7 +81,7 @@ class ArrayInfo: _type: Literal["Array"] = "Array" _zarr_format: ZarrFormat - _data_type: np.dtype[Any] | DTypeWrapper + _data_type: np.dtype[Any] | ZDType[_BaseDType, _BaseScalar] _shape: tuple[int, ...] _shard_shape: tuple[int, ...] | None = None _chunk_shape: tuple[int, ...] 
| None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 0d83b0472c..34292aa045 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -41,7 +41,7 @@ default_buffer_prototype, ) from zarr.core.buffer.cpu import buffer_prototype as cpu_buffer_prototype -from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition, normalize_chunks +from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid, _auto_partition, normalize_chunks from zarr.core.chunk_key_encodings import ( ChunkKeyEncoding, ChunkKeyEncodingLike, @@ -68,7 +68,7 @@ from zarr.core.config import categorize_data_type from zarr.core.config import config as zarr_config from zarr.core.dtype import ( - DTypeWrapper, + ZDType, parse_data_type, ) from zarr.core.indexing import ( @@ -129,7 +129,7 @@ from zarr.abc.codec import CodecPipeline from zarr.codecs.sharding import ShardingCodecIndexLocation - from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar + from zarr.core.dtype.wrapper import _BaseDType, _BaseScalar from zarr.core.group import AsyncGroup from zarr.storage import StoreLike @@ -581,7 +581,7 @@ async def _create( *, # v2 and v3 shape: ShapeLike, - dtype: npt.DTypeLike | DTypeWrapper[Any, Any], + dtype: npt.DTypeLike | ZDType[_BaseDType, _BaseScalar], zarr_format: ZarrFormat = 3, fill_value: Any | None = DEFAULT_FILL_VALUE, attributes: dict[str, JSON] | None = None, @@ -611,7 +611,7 @@ async def _create( Deprecated in favor of :func:`zarr.api.asynchronous.create_array`. 
""" - dtype_parsed = parse_data_type(dtype) + dtype_parsed = parse_data_type(dtype, zarr_format=zarr_format) store_path = await make_store_path(store) shape = parse_shapelike(shape) @@ -702,7 +702,7 @@ async def _create( @staticmethod def _create_metadata_v3( shape: ShapeLike, - dtype: DTypeWrapper[Any, Any], + dtype: ZDType[_BaseDType, _BaseScalar], chunk_shape: ChunkCoords, fill_value: Any | None = DEFAULT_FILL_VALUE, chunk_key_encoding: ChunkKeyEncodingLike | None = None, @@ -729,14 +729,6 @@ def _create_metadata_v3( else: chunk_key_encoding_parsed = chunk_key_encoding - if dtype.to_dtype().kind in ("U", "T", "S"): - warn( - f"The dtype `{dtype}` is currently not part in the Zarr format 3 specification. It " - "may not be supported by other zarr implementations and may change in the future.", - category=UserWarning, - stacklevel=2, - ) - if fill_value is None: # v3 spec will not allow a null fill value fill_value_parsed = dtype.default_value() @@ -761,7 +753,7 @@ async def _create_v3( store_path: StorePath, *, shape: ShapeLike, - dtype: DTypeWrapper[Any, Any], + dtype: ZDType[_BaseDType, _BaseScalar], chunk_shape: ChunkCoords, config: ArrayConfig, fill_value: Any | None = DEFAULT_FILL_VALUE, @@ -809,7 +801,7 @@ async def _create_v3( @staticmethod def _create_metadata_v2( shape: ChunkCoords, - dtype: DTypeWrapper[Any, Any], + dtype: ZDType[_BaseDType, _BaseScalar], chunks: ChunkCoords, order: MemoryOrder, dimension_separator: Literal[".", "/"] | None = None, @@ -839,7 +831,7 @@ async def _create_v2( store_path: StorePath, *, shape: ChunkCoords, - dtype: DTypeWrapper[Any, Any], + dtype: ZDType[_BaseDType, _BaseScalar], chunks: ChunkCoords, order: MemoryOrder, config: ArrayConfig, @@ -996,7 +988,7 @@ def chunks(self) -> ChunkCoords: return self.metadata.chunks @cached_property - def chunk_grid(self) -> RegularChunkGrid: + def chunk_grid(self) -> ChunkGrid: if self.metadata.zarr_format == 2: return RegularChunkGrid(chunk_shape=self.chunks) else: @@ -1086,7 +1078,17 
@@ def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec ) @property - def dtype(self) -> np.dtype[Any]: + def _zdtype(self) -> ZDType[_BaseDType, _BaseScalar]: + """ + The zarr-specific representation of the array data type + """ + if self.metadata.zarr_format == 2: + return self.metadata.dtype + else: + return self.metadata.data_type + + @property + def dtype(self) -> _BaseDType: """Returns the data type of the array. Returns @@ -1094,10 +1096,7 @@ def dtype(self) -> np.dtype[Any]: np.dtype Data type of the array """ - if self.metadata.zarr_format == 2: - return self.metadata.dtype.to_dtype() - else: - return self.metadata.data_type.to_dtype() + return self._zdtype.to_dtype() @property def order(self) -> MemoryOrder: @@ -1326,7 +1325,7 @@ def get_chunk_spec( ) return ArraySpec( shape=self.chunk_grid.chunk_shape, - dtype=self.dtype, + dtype=self._zdtype, fill_value=self.metadata.fill_value, config=array_config, prototype=prototype, @@ -4241,7 +4240,7 @@ async def init_array( from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation - dtype_wrapped = parse_data_type(dtype) + dtype_wrapped = parse_data_type(dtype, zarr_format=zarr_format) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format @@ -4655,25 +4654,30 @@ def _parse_chunk_key_encoding( def _get_default_chunk_encoding_v3( - dtype: DTypeWrapper[Any, Any], + dtype: ZDType[_BaseDType, _BaseScalar], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: """ Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype. """ + # the config will not allow keys to have "." characters in them + # so we will access the config by transforming "." 
to "__" + + dtype_name_conf = dtype._zarr_v3_name.replace(".", "__") + # TODO: find a registry-style solution for this that isn't bloated # We need to associate specific dtypes with specific encoding schemes - if dtype._zarr_v3_name in zarr_config.get("array.v3_default_filters"): - filters = zarr_config.get(f"array.v3_default_filters.{dtype._zarr_v3_name}") + if dtype_name_conf in zarr_config.get("array.v3_default_filters"): + filters = zarr_config.get(f"array.v3_default_filters.{dtype_name_conf}") else: filters = zarr_config.get("array.v3_default_filters.default") - if dtype._zarr_v3_name in zarr_config.get("array.v3_default_compressors"): - compressors = zarr_config.get(f"array.v3_default_compressors.{dtype._zarr_v3_name}") + if dtype_name_conf in zarr_config.get("array.v3_default_compressors"): + compressors = zarr_config.get(f"array.v3_default_compressors.{dtype_name_conf}") else: compressors = zarr_config.get("array.v3_default_compressors.default") - if dtype._zarr_v3_name in zarr_config.get("array.v3_default_serializer"): - serializer = zarr_config.get(f"array.v3_default_serializer.{dtype._zarr_v3_name}") + if dtype_name_conf in zarr_config.get("array.v3_default_serializer"): + serializer = zarr_config.get(f"array.v3_default_serializer.{dtype_name_conf}") else: serializer = zarr_config.get("array.v3_default_serializer.default") @@ -4685,20 +4689,24 @@ def _get_default_chunk_encoding_v3( def _get_default_chunk_encoding_v2( - dtype: DTypeWrapper[Any, Any], + dtype: ZDType[_BaseDType, _BaseScalar], ) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: """ Given a data type, return the default filters for that data type. This is an empty tuple. No data types have default filters. """ - if dtype._zarr_v3_name in zarr_config.get("array.v2_default_filters"): - filters = zarr_config.get(f"array.v2_default_filters.{dtype._zarr_v3_name}") + # the config will not allow keys to have "." 
characters in them + # so we will access the config by transforming "." to "__" + dtype_name_conf = dtype._zarr_v3_name.replace(".", "__") + + if dtype_name_conf in zarr_config.get("array.v2_default_filters"): + filters = zarr_config.get(f"array.v2_default_filters.{dtype_name_conf}") else: filters = zarr_config.get("array.v2_default_filters.default") - if dtype._zarr_v3_name in zarr_config.get("array.v2_default_compressor"): - compressor = zarr_config.get(f"array.v2_default_compressor.{dtype._zarr_v3_name}") + if dtype_name_conf in zarr_config.get("array.v2_default_compressor"): + compressor = zarr_config.get(f"array.v2_default_compressor.{dtype_name_conf}") else: compressor = zarr_config.get("array.v2_default_compressor.default") @@ -4712,7 +4720,7 @@ def _parse_chunk_encoding_v2( *, compressor: CompressorsLike, filters: FiltersLike, - dtype: DTypeWrapper[Any, Any], + dtype: ZDType[_BaseDType, _BaseScalar], ) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: """ Generate chunk encoding classes for Zarr format 2 arrays with optional defaults. @@ -4782,7 +4790,7 @@ def _parse_chunk_encoding_v3( compressors: CompressorsLike, filters: FiltersLike, serializer: SerializerLike, - dtype: DTypeWrapper[Any, Any], + dtype: ZDType[_BaseDType, _BaseScalar], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: """ Generate chunk encoding classes for v3 arrays with optional defaults. 
diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index d5f6b00862..e8e451944f 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -11,16 +11,13 @@ parse_shapelike, ) from zarr.core.config import config as zarr_config -from zarr.core.dtype import parse_data_type if TYPE_CHECKING: from typing import NotRequired - import numpy.typing as npt - from zarr.core.buffer import BufferPrototype from zarr.core.common import ChunkCoords - from zarr.core.dtype.wrapper import DTypeWrapper + from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar class ArrayConfigParams(TypedDict): @@ -92,7 +89,7 @@ def parse_array_config(data: ArrayConfigLike | None) -> ArrayConfig: @dataclass(frozen=True) class ArraySpec: shape: ChunkCoords - dtype: DTypeWrapper[Any, Any] + dtype: ZDType[_BaseDType, _BaseScalar] fill_value: Any config: ArrayConfig prototype: BufferPrototype @@ -100,14 +97,12 @@ class ArraySpec: def __init__( self, shape: ChunkCoords, - dtype: npt.DTypeLike | DTypeWrapper[Any, Any], + dtype: ZDType[_BaseDType, _BaseScalar], fill_value: Any, config: ArrayConfig, prototype: BufferPrototype, ) -> None: shape_parsed = parse_shapelike(shape) - dtype_parsed = parse_data_type(dtype) - fill_value_parsed = parse_fill_value(fill_value) object.__setattr__(self, "shape", shape_parsed) diff --git a/src/zarr/core/buffer/cpu.py b/src/zarr/core/buffer/cpu.py index 0205f16ab1..9da0059d0b 100644 --- a/src/zarr/core/buffer/cpu.py +++ b/src/zarr/core/buffer/cpu.py @@ -150,7 +150,7 @@ def create( cls, *, shape: Iterable[int], - dtype: np.dtype[Any], + dtype: npt.DTypeLike, order: Literal["C", "F"] = "C", fill_value: Any | None = None, ) -> Self: diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 4e5f6603ff..71600fee90 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -27,7 +27,7 @@ from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, 
BufferPrototype, NDBuffer from zarr.core.chunk_grids import ChunkGrid - from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType + from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar T = TypeVar("T") U = TypeVar("U") @@ -133,7 +133,7 @@ def __iter__(self) -> Iterator[Codec]: yield from self.bytes_bytes_codecs def validate( - self, *, shape: ChunkCoords, dtype: ZDType[TBaseDType, TBaseScalar], chunk_grid: ChunkGrid + self, *, shape: ChunkCoords, dtype: ZDType[_BaseDType, _BaseScalar], chunk_grid: ChunkGrid ) -> None: for codec in self: codec.validate(shape=shape, dtype=dtype, chunk_grid=chunk_grid) @@ -296,9 +296,7 @@ def _merge_chunk_array( is_complete_chunk: bool, drop_axes: tuple[int, ...], ) -> NDBuffer: - if chunk_selection == () or is_scalar( - value.as_ndarray_like(), chunk_spec.dtype.to_native_dtype() - ): + if chunk_selection == () or is_scalar(value.as_ndarray_like(), chunk_spec.dtype.to_dtype()): chunk_value = value else: chunk_value = value[out_selection] diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index e86347d808..6fc46f6b06 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -19,8 +19,6 @@ overload, ) -from typing_extensions import ReadOnly - from zarr.core.config import config as zarr_config if TYPE_CHECKING: diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index e51f1a1de1..8e0a55b8d0 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -107,15 +107,15 @@ def enable_gpu(self) -> ConfigSet: "v2_default_compressor": {"default": {"id": "zstd", "level": 0, "checksum": False}}, "v2_default_filters": { "default": None, - "variable_length_utf8": [{"id": "vlen-utf8"}], - "fixed_length_ucs4": [{"id": "vlen-utf8"}], - "fixed_length_ascii": [{"id": "vlen-bytes"}], + "numpy__variable_length_utf8": [{"id": "vlen-utf8"}], + "numpy__fixed_length_ucs4": [{"id": "vlen-utf8"}], + "numpy__fixed_length_ascii": [{"id": "vlen-bytes"}], }, "v3_default_filters": {"default": []}, 
"v3_default_serializer": { "default": {"name": "bytes", "configuration": {"endian": "little"}}, - "variable_length_utf8": {"name": "vlen-utf8"}, - "fixed_length_ucs4": {"name": "vlen-utf8"}, + "numpy__variable_length_utf8": {"name": "vlen-utf8"}, + "numpy__fixed_length_ucs4": {"name": "vlen-utf8"}, "r*": {"name": "vlen-bytes"}, }, "v3_default_compressors": { diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 432eabf2ce..4e594f8796 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -4,21 +4,23 @@ import numpy as np -from zarr.core.dtype.common import _NUMPY_SUPPORTS_VLEN_STRING +from zarr.core.dtype._numpy import _NUMPY_SUPPORTS_VLEN_STRING +from zarr.core.dtype.wrapper import _BaseDType, _BaseScalar if TYPE_CHECKING: import numpy.typing as npt - from zarr.core.common import JSON + from zarr.core.common import JSON, ZarrFormat + from zarr.core.dtype._numpy import ( Bool, Complex64, Complex128, DateTime64, - FixedLengthAsciiString, + FixedLengthAscii, FixedLengthBytes, - FixedLengthUnicodeString, + FixedLengthUnicode, Float16, Float32, Float64, @@ -34,16 +36,15 @@ VariableLengthString, ) from zarr.core.dtype.registry import DataTypeRegistry -from zarr.core.dtype.wrapper import DTypeWrapper +from zarr.core.dtype.wrapper import ZDType __all__ = [ "Complex64", "Complex128", - "DTypeWrapper", "DateTime64", - "FixedLengthAsciiString", + "FixedLengthAscii", "FixedLengthBytes", - "FixedLengthUnicodeString", + "FixedLengthUnicode", "Float16", "Float32", "Float64", @@ -57,6 +58,7 @@ "UInt32", "UInt64", "VariableLengthString", + "ZDType", "data_type_registry", "parse_data_type", ] @@ -66,7 +68,7 @@ INTEGER_DTYPE = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 FLOAT_DTYPE = Float16 | Float32 | Float64 COMPLEX_DTYPE = Complex64 | Complex128 -STRING_DTYPE = FixedLengthUnicodeString | VariableLengthString | FixedLengthAsciiString +STRING_DTYPE = FixedLengthUnicode | VariableLengthString | 
FixedLengthAscii DTYPE = ( Bool | INTEGER_DTYPE @@ -82,34 +84,39 @@ data_type_registry.register(dtype._zarr_v3_name, dtype) -def get_data_type_from_numpy(dtype: npt.DTypeLike) -> DTypeWrapper[Any, Any]: +def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[_BaseDType, _BaseScalar]: data_type_registry.lazy_load() if not isinstance(dtype, np.dtype): if dtype in (str, "str"): if _NUMPY_SUPPORTS_VLEN_STRING: - np_dtype = np.dtype("T") + na_dtype = np.dtype("T") else: - np_dtype = np.dtype("O") + na_dtype = np.dtype("O") elif isinstance(dtype, list): # this is a valid _VoidDTypeLike check - np_dtype = np.dtype([tuple(d) for d in dtype]) + na_dtype = np.dtype([tuple(d) for d in dtype]) else: - np_dtype = np.dtype(dtype) + na_dtype = np.dtype(dtype) else: - np_dtype = dtype - return data_type_registry.match_dtype(np_dtype) + na_dtype = dtype + return data_type_registry.match_dtype(na_dtype) -def get_data_type_from_dict(dtype: dict[str, JSON]) -> DTypeWrapper[Any, Any]: - return data_type_registry.match_json(dtype) +def get_data_type_from_json( + dtype: JSON, zarr_format: ZarrFormat +) -> ZDType[_BaseDType, _BaseScalar]: + return data_type_registry.match_json(dtype, zarr_format=zarr_format) def parse_data_type( - dtype: npt.DTypeLike | DTypeWrapper[Any, Any] | dict[str, JSON], -) -> DTypeWrapper[Any, Any]: - if isinstance(dtype, DTypeWrapper): + dtype: npt.DTypeLike | ZDType[Any, Any] | dict[str, JSON], zarr_format: ZarrFormat +) -> ZDType[Any, Any]: + if isinstance(dtype, ZDType): return dtype elif isinstance(dtype, dict): - return get_data_type_from_dict(dtype) + # This branch assumes that the data type has been specified in the JSON form + # but it's also possible for numpy data types to be specified as dictionaries, which will + # cause an error in the `get_data_type_from_json`, but that's ok for now + return get_data_type_from_json(dtype, zarr_format=zarr_format) # type: ignore[arg-type] else: - return get_data_type_from_numpy(dtype) + return 
get_data_type_from_native_dtype(dtype) diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index c562f0a593..a8bd2b5951 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -2,13 +2,22 @@ import base64 import re +from collections.abc import Sequence from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, ClassVar, Literal, Self, TypeGuard, cast, get_args +from typing import ( + TYPE_CHECKING, + Any, + ClassVar, + Literal, + Self, + TypeGuard, + cast, + get_args, +) import numpy as np from zarr.core.dtype.common import ( - _NUMPY_SUPPORTS_VLEN_STRING, DataTypeValidationError, Endianness, JSONFloat, @@ -16,27 +25,26 @@ bytes_to_json, check_json_bool, check_json_complex_float, - check_json_complex_float_v3, - check_json_float_v2, + check_json_float, check_json_int, check_json_str, complex_from_json, complex_to_json, datetime_from_json, datetime_to_json, - endianness_from_numpy_str, - endianness_to_numpy_str, float_from_json, float_to_json, ) -from zarr.core.dtype.wrapper import DTypeWrapper, TDType +from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat +EndiannessNumpy = Literal[">", "<", "=", "|"] + @dataclass(frozen=True, kw_only=True) -class Bool(DTypeWrapper[np.dtypes.BoolDType, np.bool_]): +class Bool(ZDType[np.dtypes.BoolDType, np.bool_]): """ Wrapper for numpy boolean dtype. 
@@ -49,10 +57,37 @@ class Bool(DTypeWrapper[np.dtypes.BoolDType, np.bool_]): """ _zarr_v3_name = "bool" - dtype_cls: ClassVar[type[np.dtypes.BoolDType]] = np.dtypes.BoolDType + _zarr_v2_names: ClassVar[tuple[str,...]] = ("|b1",) + dtype_cls = np.dtypes.BoolDType + + @classmethod + def _from_dtype_unsafe(cls, dtype: np.dtypes.BoolDType) -> Self: + return cls() + + def to_dtype(self: Self) -> np.dtypes.BoolDType: + return self.dtype_cls() + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[Literal["bool", "|b1"]]: + """ + Check that the input is a valid JSON representation of a bool. + """ + if zarr_format == 2: + return data in cls._zarr_v2_names + elif zarr_format == 3: + return data == cls._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + def to_json(self, zarr_format: ZarrFormat) -> str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + return cls() def default_value(self) -> np.bool_: """ @@ -65,26 +100,6 @@ def default_value(self) -> np.bool_: """ return np.False_ - @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.BoolDType) -> Self: - """ - Wrap a numpy boolean dtype without checking. - - Parameters - ---------- - dtype : np.dtypes.BoolDType - The numpy dtype to wrap. - - Returns - ------- - Self - The wrapped dtype. - """ - return cls() - - def to_dtype(self) -> np.dtypes.BoolDType: - return self.dtype_cls() - def to_json_value(self, data: np.bool_, zarr_format: ZarrFormat) -> bool: """ Convert a boolean value to JSON-serializable format. @@ -120,337 +135,730 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: The numpy boolean scalar. 
""" if check_json_bool(data): - return self.cast_value(data) + return np.bool_(data) raise TypeError(f"Invalid type: {data}. Expected a boolean.") @dataclass(frozen=True, kw_only=True) -class Int8(DTypeWrapper[np.dtypes.Int8DType, np.int8]): +class Int8(ZDType[np.dtypes.Int8DType, np.int8]): dtype_cls = np.dtypes.Int8DType _zarr_v3_name = "int8" + _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|i1",) @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.Int8DType) -> Self: return cls() - def to_dtype(self) -> np.dtypes.Int8DType: + def to_dtype(self: Self) -> np.dtypes.Int8DType: return self.dtype_cls() - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[Literal["int8", "|i1"]]: + """ + Check that the input is a valid JSON representation of a 8-bit integer. + """ + if zarr_format == 2: + return data in cls._zarr_v2_names + elif zarr_format == 3: + return data == cls._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + def to_json(self, zarr_format: ZarrFormat) -> str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + return cls() def default_value(self) -> np.int8: - return self.to_dtype().type(0) + """ + Get the default value. + + Returns + ------- + np.int8 + The default value. + """ + return np.int8(0) def to_json_value(self, data: np.int8, zarr_format: ZarrFormat) -> int: + """ + Convert a numpy 8-bit int to JSON-serializable format. + + Parameters + ---------- + data : np.int8 + The value to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + int + The JSON-serializable form of the scalar. 
+ """ return int(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int8: + """ + Read a JSON-serializable value as a numpy int8 scalar. + + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + np.bool_ + The numpy boolean scalar. + """ if check_json_int(data): - return self.cast_value(data) + return np.int8(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @dataclass(frozen=True, kw_only=True) -class UInt8(DTypeWrapper[np.dtypes.UInt8DType, np.uint8]): +class UInt8(ZDType[np.dtypes.UInt8DType, np.uint8]): dtype_cls = np.dtypes.UInt8DType _zarr_v3_name = "uint8" + _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|u1",) @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.UInt8DType) -> Self: return cls() - def to_dtype(self) -> np.dtypes.UInt8DType: + def to_dtype(self: Self) -> np.dtypes.UInt8DType: return self.dtype_cls() - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[Literal["uint8", "|u1"]]: + """ + Check that the input is a valid JSON representation of an unsigned 8-bit integer. + """ + if zarr_format == 2: + return data in cls._zarr_v2_names + elif zarr_format == 3: + return data == cls._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + def to_json(self, zarr_format: ZarrFormat) -> str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + return cls() def default_value(self) -> np.uint8: - return self.to_dtype().type(0) + """ + Get the default value for this data type. + + Returns + ------- + np.uint8 + The default value. 
+ """ + return np.uint8(0) def to_json_value(self, data: np.uint8, zarr_format: ZarrFormat) -> int: + """ + Convert a numpy unsigned 8-bit integer to JSON-serializable format. + + Parameters + ---------- + data : np.uint8 + The value to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + int + The JSON-serializable form of the scalar. + """ return int(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint8: + """ + Read a JSON-serializable value as a numpy boolean scalar. + + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + np.bool_ + The numpy boolean scalar. + """ if check_json_int(data): - return self.cast_value(data) + return np.uint8(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @dataclass(frozen=True, kw_only=True) -class Int16(DTypeWrapper[np.dtypes.Int16DType, np.int16]): +class Int16(ZDType[np.dtypes.Int16DType, np.int16]): dtype_cls = np.dtypes.Int16DType _zarr_v3_name = "int16" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i2", " Self: - return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) def to_dtype(self) -> np.dtypes.Int16DType: - return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[Literal["int16", ">i2", " str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, 
data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.int16: - return self.cast_value(0) + return self.to_dtype().type(0) def to_json_value(self, data: np.int16, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int16: if check_json_int(data): - return self.cast_value(data) + return self.to_dtype().type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @dataclass(frozen=True, kw_only=True) -class UInt16(DTypeWrapper[np.dtypes.UInt16DType, np.uint16]): +class UInt16(ZDType[np.dtypes.UInt16DType, np.uint16]): dtype_cls = np.dtypes.UInt16DType _zarr_v3_name = "uint16" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u2", " Self: - return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) def to_dtype(self) -> np.dtypes.UInt16DType: - return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) + + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[Literal["uint16", ">u2", " dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + def to_json(self, zarr_format: ZarrFormat) -> str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + 
raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.uint16: - return self.cast_value(0) + return self.to_dtype().type(0) def to_json_value(self, data: np.uint16, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint16: if check_json_int(data): - return self.cast_value(data) + return self.to_dtype().type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @dataclass(frozen=True, kw_only=True) -class Int32(DTypeWrapper[np.dtypes.Int32DType, np.int32]): +class Int32(ZDType[np.dtypes.Int32DType, np.int32]): dtype_cls = np.dtypes.Int32DType _zarr_v3_name = "int32" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i4", " Self: - return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) def to_dtype(self) -> np.dtypes.Int32DType: - return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) + + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[Literal["int32", ">i4", " dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + def to_json(self, zarr_format: ZarrFormat) -> str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.int32: - return self.cast_value(0) + return self.to_dtype().type(0) def 
to_json_value(self, data: np.int32, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int32: if check_json_int(data): - return self.cast_value(data) + return self.to_dtype().type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @dataclass(frozen=True, kw_only=True) -class UInt32(DTypeWrapper[np.dtypes.UInt32DType, np.uint32]): +class UInt32(ZDType[np.dtypes.UInt32DType, np.uint32]): dtype_cls = np.dtypes.UInt32DType _zarr_v3_name = "uint32" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u4", " Self: - return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) def to_dtype(self) -> np.dtypes.UInt32DType: - return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[Literal["uint32", ">u4", " str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.uint32: - return self.cast_value(0) + return self.to_dtype().type(0) def to_json_value(self, data: np.uint32, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint32: if check_json_int(data): - return 
self.cast_value(data) + return self.to_dtype().type(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") @dataclass(frozen=True, kw_only=True) -class Int64(DTypeWrapper[np.dtypes.Int64DType, np.int64]): +class Int64(ZDType[np.dtypes.Int64DType, np.int64]): dtype_cls = np.dtypes.Int64DType _zarr_v3_name = "int64" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i8", " Self: - return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) def to_dtype(self) -> np.dtypes.Int64DType: - return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[Literal["int64", ">i8", " str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.int64: - return self.cast_value(0) + return self.to_dtype().type(0) def to_json_value(self, data: np.int64, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int64: if check_json_int(data): - return self.cast_value(data) + return self.to_dtype().type(data) raise TypeError(f"Invalid type: {data}. 
Expected an integer.") @dataclass(frozen=True, kw_only=True) -class UInt64(DTypeWrapper[np.dtypes.UInt64DType, np.uint64]): +class UInt64(ZDType[np.dtypes.UInt64DType, np.uint64]): dtype_cls = np.dtypes.UInt64DType _zarr_v3_name = "uint64" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u8", " Self: - return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) def to_dtype(self) -> np.dtypes.UInt64DType: - return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[Literal["uint64", ">u8", " str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.uint64: - return self.cast_value(0) + return self.to_dtype().type(0) def to_json_value(self, data: np.uint64, zarr_format: ZarrFormat) -> int: return int(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint64: if check_json_int(data): - return self.cast_value(data) + return self.to_dtype().type(data) raise TypeError(f"Invalid type: {data}. 
Expected an integer.") @dataclass(frozen=True, kw_only=True) -class Float16(DTypeWrapper[np.dtypes.Float16DType, np.float16]): +class Float16(ZDType[np.dtypes.Float16DType, np.float16]): dtype_cls = np.dtypes.Float16DType _zarr_v3_name = "float16" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f2", " Self: - return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) def to_dtype(self) -> np.dtypes.Float16DType: - return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[Literal["float", ">f2", " str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.float16: - return self.to_dtype().type(0.0) + return self.to_dtype().type(0) def to_json_value(self, data: np.float16, zarr_format: ZarrFormat) -> JSONFloat: return float_to_json(data, zarr_format) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float16: - if check_json_float_v2(data): + if check_json_float(data, zarr_format=zarr_format): return self.to_dtype().type(float_from_json(data, zarr_format)) raise TypeError(f"Invalid type: {data}. 
Expected a float.") @dataclass(frozen=True, kw_only=True) -class Float32(DTypeWrapper[np.dtypes.Float32DType, np.float32]): +class Float32(ZDType[np.dtypes.Float32DType, np.float32]): dtype_cls = np.dtypes.Float32DType _zarr_v3_name = "float32" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f4", " Self: - return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) def to_dtype(self) -> np.dtypes.Float32DType: - return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[Literal["float32", ">f4", " np.float32: - return self.to_dtype().type(value) + def to_json(self, zarr_format: ZarrFormat) -> str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.float32: - return self.to_dtype().type(0.0) + return self.to_dtype().type(0) def to_json_value(self, data: np.float32, zarr_format: ZarrFormat) -> JSONFloat: return float_to_json(data, zarr_format) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float32: - if check_json_float_v2(data): + if check_json_float(data, zarr_format=zarr_format): return self.to_dtype().type(float_from_json(data, zarr_format)) raise TypeError(f"Invalid type: 
{data}. Expected a float.") @dataclass(frozen=True, kw_only=True) -class Float64(DTypeWrapper[np.dtypes.Float64DType, np.float64]): +class Float64(ZDType[np.dtypes.Float64DType, np.float64]): dtype_cls = np.dtypes.Float64DType _zarr_v3_name = "float64" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f8", " Self: - return cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) def to_dtype(self) -> np.dtypes.Float64DType: - return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) + + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[Literal["float64", ">f8", " dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + def to_json(self, zarr_format: ZarrFormat) -> str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.float64: - return self.to_dtype().type(0.0) + return self.to_dtype().type(0) def to_json_value(self, data: np.float64, zarr_format: ZarrFormat) -> JSONFloat: return float_to_json(data, zarr_format) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float64: - if check_json_float_v2(data): + if check_json_float(data, zarr_format=zarr_format): return self.to_dtype().type(float_from_json(data, zarr_format)) raise TypeError(f"Invalid type: {data}. 
Expected a float.") @dataclass(frozen=True, kw_only=True) -class Complex64(DTypeWrapper[np.dtypes.Complex64DType, np.complex64]): +class Complex64(ZDType[np.dtypes.Complex64DType, np.complex64]): dtype_cls = np.dtypes.Complex64DType _zarr_v3_name = "complex64" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c8", " Self: - return cls() + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) def to_dtype(self) -> np.dtypes.Complex64DType: - return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) + + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[Literal["complex64", ">c8", " str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.complex64: - return np.complex64(0.0) + return self.to_dtype().type(0) def to_json_value( self, data: np.complex64, zarr_format: ZarrFormat @@ -464,23 +872,51 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex6 @dataclass(frozen=True, kw_only=True) -class Complex128(DTypeWrapper[np.dtypes.Complex128DType, np.complex128]): +class Complex128(ZDType[np.dtypes.Complex128DType, np.complex128]): dtype_cls = np.dtypes.Complex128DType _zarr_v3_name = "complex128" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c16", " Self: - return 
cls(endianness=endianness_from_numpy_str(dtype.byteorder)) + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) def to_dtype(self) -> np.dtypes.Complex128DType: - return self.dtype_cls().newbyteorder(endianness_to_numpy_str(self.endianness)) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[Literal["complex128", ">c16", " str: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.complex128: - return np.complex128(0.0) + return self.to_dtype().type(0) def to_json_value( self, data: np.complex128, zarr_format: ZarrFormat @@ -488,31 +924,66 @@ def to_json_value( return complex_to_json(data, zarr_format) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex128: - if check_json_complex_float_v3(data): + if check_json_complex_float(data, zarr_format=zarr_format): return complex_from_json(data, dtype=self.to_dtype(), zarr_format=zarr_format) raise TypeError(f"Invalid type: {data}. 
Expected a complex float.") @dataclass(frozen=True, kw_only=True) -class FixedLengthAsciiString(DTypeWrapper[np.dtypes.BytesDType[Any], np.bytes_]): +class FixedLengthAscii(ZDType[np.dtypes.BytesDType[int], np.bytes_]): dtype_cls = np.dtypes.BytesDType - _zarr_v3_name = "fixed_length_ascii" + _zarr_v3_name = "numpy.fixed_length_ascii" item_size_bits: ClassVar[int] = 8 length: int = 1 @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.BytesDType) -> Self: + def _from_dtype_unsafe(cls, dtype: np.dtypes.BytesDType[int]) -> Self: return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) - def to_dtype(self) -> np.dtypes.BytesDType: + def to_dtype(self) -> np.dtypes.BytesDType[int]: return self.dtype_cls(self.length) + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of a numpy S dtype. + """ + if zarr_format == 2: + # match |S1, |S2, etc + return isinstance(data, str) and re.match(r"^\|S\d+$", data) is not None + elif zarr_format == 3: + return ( + isinstance(data, dict) + and "name" in data + and data["name"] == cls._zarr_v3_name + and "configuration" in data + and isinstance(data["configuration"], dict) + and "length_bits" in data["configuration"] + and isinstance(data["configuration"]["length_bits"], int) + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return { + "name": self._zarr_v3_name, + "configuration": {"length_bits": self.length * self.item_size_bits}, + } + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(length=data["configuration"]["length_bits"] // 
cls.item_size_bits) # type: ignore[arg-type, index, call-overload, operator] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + def default_value(self) -> np.bytes_: return np.bytes_(b"") - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3), "configuration": {"length": self.length}} - def to_json_value(self, data: np.bytes_, *, zarr_format: ZarrFormat) -> str: return base64.standard_b64encode(data).decode("ascii") @@ -523,38 +994,61 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: @dataclass(frozen=True, kw_only=True) -class FixedLengthBytes(DTypeWrapper[np.dtypes.VoidDType, np.void]): - dtype_cls = np.dtypes.VoidDType - _zarr_v3_name = "r*" +class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void]): + # np.dtypes.VoidDType is specified in an odd way in numpy + # it cannot be used to create instances of the dtype + # so we have to tell mypy to ignore this here + dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] + _zarr_v3_name = "numpy.void" item_size_bits: ClassVar[int] = 8 length: int = 1 @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType) -> Self: + def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType[int]) -> Self: return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) - def default_value(self) -> np.void: - return self.cast_value(("\x00" * self.length).encode("ascii")) - - def to_dtype(self) -> np.dtypes.VoidDType: + def to_dtype(self) -> np.dtypes.VoidDType[int]: # Numpy does not allow creating a void type # by invoking np.dtypes.VoidDType directly - return np.dtype(f"V{self.length}") + return cast("np.dtypes.VoidDType[int]", np.dtype(f"V{self.length}")) - def get_name(self, zarr_format: ZarrFormat) -> str: + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: if zarr_format == 2: - return super().get_name(zarr_format=zarr_format) - # note that we don't return self._zarr_v3_name - # 
because the name is parametrized by the length - return f"r{self.length * self.item_size_bits}" + # Check that the dtype is |V1, |V2, ... + return isinstance(data, str) and re.match(r"^\|V\d+$", data) is not None + elif zarr_format == 3: + return ( + isinstance(data, dict) + and "name" in data + and isinstance(data["name"], str) + and (re.match(r"^r\d+$", data["name"]) is not None) + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return {"name": f"r{self.length * self.item_size_bits}"} + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(length=int(data["name"][1:]) // cls.item_size_bits) # type: ignore[arg-type, index, call-overload] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") @classmethod - def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[np.dtypes.VoidDType[Any]]: + def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[Any]]: """ - Reject structured dtypes by ensuring that dtype.fields is None + Numpy void dtype comes in two forms: + * If the ``fields`` attribute is ``None``, then the dtype represents N raw bytes. + * If the ``fields`` attribute is not ``None``, then the dtype represents a structured dtype, + + In this check we ensure that ``fields`` is ``None``. Parameters ---------- @@ -566,19 +1060,10 @@ def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[np.dtypes.VoidDType Bool True if the dtype matches, False otherwise. 
""" - return super().check_dtype(dtype) and dtype.fields is None + return cls.dtype_cls is type(dtype) and dtype.fields is None # type: ignore[has-type] - @classmethod - def check_dict(cls, data: dict[str, JSON]) -> TypeGuard[dict[str, JSON]]: - # Overriding the base class implementation because the r* dtype - # does not have a name that will can appear in array metadata - # Instead, array metadata will contain names like "r8", "r16", etc - return ( - isinstance(data, dict) - and "name" in data - and isinstance(data["name"], str) - and (re.match(r"^r\d+$", data["name"]) is not None) - ) + def default_value(self) -> np.void: + return self.to_dtype().type(("\x00" * self.length).encode("ascii")) def to_json_value(self, data: np.void, *, zarr_format: ZarrFormat) -> str: return base64.standard_b64encode(data.tobytes()).decode("ascii") @@ -590,63 +1075,123 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: @dataclass(frozen=True, kw_only=True) -class FixedLengthUnicodeString(DTypeWrapper[np.dtypes.StrDType[int], np.str_]): +class FixedLengthUnicode(ZDType[np.dtypes.StrDType[int], np.str_]): dtype_cls = np.dtypes.StrDType - _zarr_v3_name = "fixed_length_ucs4" + _zarr_v3_name = "numpy.fixed_length_ucs4" item_size_bits: ClassVar[int] = 32 # UCS4 is 32 bits per code point endianness: Endianness | None = "native" length: int = 1 @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.StrDType[int]) -> Self: + byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls( length=dtype.itemsize // (cls.item_size_bits // 8), - endianness=endianness_from_numpy_str(dtype.byteorder), + endianness=endianness_from_numpy_str(byte_order), ) def to_dtype(self) -> np.dtypes.StrDType[int]: - return cast( - np.dtypes.StrDType[int], - self.dtype_cls(self.length).newbyteorder(endianness_to_numpy_str(self.endianness)), - ) + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls(self.length).newbyteorder(byte_order) + + @classmethod + 
def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of a numpy S dtype. + """ + if zarr_format == 2: + # match >U1, <]U\d+$", data) is not None + elif zarr_format == 3: + return ( + isinstance(data, dict) + and "name" in data + and data["name"] == cls._zarr_v3_name + and "configuration" in data + and isinstance(data["configuration"], dict) + and "length_bits" in data["configuration"] + and isinstance(data["configuration"]["length_bits"], int) + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return { + "name": self._zarr_v3_name, + "configuration": {"length_bits": self.length * self.item_size_bits}, + } + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(length=data["configuration"]["length_bits"] // cls.item_size_bits) # type: ignore[arg-type, index, call-overload, operator] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") def default_value(self) -> np.str_: return np.str_("") - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3), "configuration": {"length": self.length}} - def to_json_value(self, data: np.str_, *, zarr_format: ZarrFormat) -> str: return str(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: if not check_json_str(data): raise TypeError(f"Invalid type: {data}. 
Expected a string.") - return self.cast_value(data) + return self.to_dtype().type(data) + + +_NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") if _NUMPY_SUPPORTS_VLEN_STRING: @dataclass(frozen=True, kw_only=True) - class VariableLengthString(DTypeWrapper[np.dtypes.StringDType, str]): + class VariableLengthString(ZDType[np.dtypes.StringDType, str]): # type: ignore[type-var] dtype_cls = np.dtypes.StringDType - _zarr_v3_name = "variable_length_utf8" + _zarr_v3_name = "numpy.variable_length_utf8" @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.StringDType) -> Self: return cls() - def default_value(self) -> str: - return "" + def to_dtype(self) -> np.dtypes.StringDType: + return self.dtype_cls() - def cast_value(self, value: object) -> str: - return str(value) + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of a numpy string dtype. + """ + if zarr_format == 2: + # TODO: take the entire metadata document in here, and + # check the compressors / filters for vlen-utf8 + # Note that we are checking for the object dtype name. + return data == "|O" + elif zarr_format == 3: + return data == cls._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + # Note: unlike many other numpy data types, we don't serialize the .str attribute + # of the data type to JSON. 
This is because Zarr was using `|O` for strings before the + # numpy variable length string data type existed, and we want to be consistent with + # that practice + return "|O" + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + return cls() - def to_dtype(self) -> np.dtypes.StringDType: - return self.dtype_cls() + def default_value(self) -> str: + return "" def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: return str(data) @@ -654,37 +1199,55 @@ def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: if not check_json_str(data): raise TypeError(f"Invalid type: {data}. Expected a string.") - return self.cast_value(data) + return data else: @dataclass(frozen=True, kw_only=True) - class VariableLengthString(DTypeWrapper[np.dtypes.ObjectDType, str]): + class VariableLengthString(ZDType[np.dtypes.ObjectDType, str]): # type: ignore[no-redef] dtype_cls = np.dtypes.ObjectDType - _zarr_v3_name = "variable_length_utf8" + _zarr_v3_name = "numpy.variable_length_utf8" @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.ObjectDType) -> Self: return cls() def to_dtype(self) -> np.dtypes.ObjectDType: - return cast(np.dtypes.ObjectDType, self.dtype_cls()) + return self.dtype_cls() - def cast_value(self, value: object) -> str: - return str(value) + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of a numpy O dtype. 
+ """ + if zarr_format == 2: + # TODO: take the entire metadata document in here, and + # check the compressors / filters for vlen-utf8 + return data == "|O" + elif zarr_format == 3: + return data == cls._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + return cls() def default_value(self) -> str: return "" - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3)} - def to_json_value(self, data: str, *, zarr_format: ZarrFormat) -> str: return data def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: """ - String literals pass through + Strings pass through """ if not check_json_str(data): raise TypeError(f"Invalid type: {data}. 
Expected a string.") @@ -696,35 +1259,72 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: @dataclass(frozen=True, kw_only=True) -class DateTime64(DTypeWrapper[np.dtypes.DateTime64DType, np.datetime64]): - dtype_cls = np.dtypes.DateTime64DType - _zarr_v3_name = "datetime64" +class DateTime64(ZDType[np.dtypes.DateTime64DType, np.datetime64]): + dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] + _zarr_v3_name = "numpy.datetime64" unit: DateUnit | TimeUnit = "s" - endianness: Endianness = "native" - - def default_value(self) -> np.datetime64: - return np.datetime64("NaT") - - def to_dict(self) -> dict[str, JSON]: - return {"name": self.get_name(zarr_format=3), "configuration": {"unit": self.unit}} + endianness: Endianness | None = "native" @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.DateTime64DType) -> Self: - unit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] + unit: DateUnit | TimeUnit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] # type: ignore[assignment] if unit not in get_args(DateUnit) and unit not in get_args(TimeUnit): raise DataTypeValidationError('Invalid unit for "numpy.datetime64"') - return cls(unit=unit, endianness=endianness_from_numpy_str(dtype.byteorder)) - - def cast_value(self, value: object) -> np.datetime64: - return cast(np.datetime64, self.to_dtype().type(value, self.unit)) + byteorder = cast("EndiannessNumpy", dtype.byteorder) + return cls(unit=unit, endianness=endianness_from_numpy_str(byteorder)) def to_dtype(self) -> np.dtypes.DateTime64DType: # Numpy does not allow creating datetime64 via # np.dtypes.DateTime64Dtype() - return np.dtype(f"datetime64[{self.unit}]").newbyteorder( - endianness_to_numpy_str(self.endianness) + return cast( + "np.dtypes.DateTime64DType", + np.dtype(f"datetime64[{self.unit}]").newbyteorder( + endianness_to_numpy_str(self.endianness) + ), ) + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> 
TypeGuard[JSON]: + if zarr_format == 2: + # match M[M], etc + # consider making this a standalone function + return ( + isinstance(data, str) + and len(data) in (6, 7) + and data[0] in (">", "<") + and data[1:4] == "M8[" + and data[4:-1] in get_args(TimeUnit) + get_args(DateUnit) + and data[-1] == "]" + ) + elif zarr_format == 3: + return ( + isinstance(data, dict) + and "name" in data + and data["name"] == cls._zarr_v3_name + and "configuration" in data + and "unit" in data["configuration"] + and data["configuration"]["unit"] in get_args(DateUnit) + get_args(TimeUnit) + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + def default_value(self) -> np.datetime64: + return np.datetime64("NaT") + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return {"name": self._zarr_v3_name, "configuration": {"unit": self.unit}} + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(unit=data["configuration"]["unit"]) # type: ignore[arg-type, index, call-overload] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: if check_json_int(data): return datetime_from_json(data, self.unit) @@ -735,19 +1335,19 @@ def to_json_value(self, data: np.datetime64, *, zarr_format: ZarrFormat) -> int: @dataclass(frozen=True, kw_only=True) -class Structured(DTypeWrapper[np.dtypes.VoidDType, np.void]): - dtype_cls = np.dtypes.VoidDType +class Structured(ZDType[np.dtypes.VoidDType[int], np.void]): + dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] _zarr_v3_name = "structured" - fields: tuple[tuple[str, DTypeWrapper[Any, Any]], ...] 
+ fields: tuple[tuple[str, ZDType[_BaseDType, _BaseScalar]], ...] def default_value(self) -> np.void: return self.cast_value(0) def cast_value(self, value: object) -> np.void: - return cast(np.void, np.array([value], dtype=self.to_dtype())[0]) + return cast("np.void", np.array([value], dtype=self.to_dtype())[0]) @classmethod - def check_dtype(cls, dtype: np.dtypes.DTypeLike) -> TypeGuard[np.dtypes.VoidDType]: + def check_dtype(cls, dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: """ Check that this dtype is a numpy structured dtype @@ -764,54 +1364,90 @@ def check_dtype(cls, dtype: np.dtypes.DTypeLike) -> TypeGuard[np.dtypes.VoidDTyp return super().check_dtype(dtype) and dtype.fields is not None @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType) -> Self: - from zarr.core.dtype import get_data_type_from_numpy + def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType[int]) -> Self: + from zarr.core.dtype import get_data_type_from_native_dtype - fields: list[tuple[str, DTypeWrapper[Any, Any]]] = [] + fields: list[tuple[str, ZDType[Any, Any]]] = [] if dtype.fields is None: raise ValueError("numpy dtype has no fields") - for key, (dtype_instance, _) in dtype.fields.items(): - dtype_wrapped = get_data_type_from_numpy(dtype_instance) + # fields of a structured numpy dtype are either 2-tuples or 3-tuples. we only + # care about the first element in either case. 
+ for key, (dtype_instance, *_) in dtype.fields.items(): + dtype_wrapped = get_data_type_from_native_dtype(dtype_instance) fields.append((key, dtype_wrapped)) return cls(fields=tuple(fields)) - def get_name(self, zarr_format: ZarrFormat) -> str | list[tuple[str, str]]: + def to_json(self, zarr_format: ZarrFormat) -> JSON: + fields = [ + (f_name, f_dtype.to_json(zarr_format=zarr_format)) for f_name, f_dtype in self.fields + ] if zarr_format == 2: - return [[k, d.get_name(zarr_format=2)] for k, d in self.fields] - return self._zarr_v3_name - - def to_dict(self) -> dict[str, JSON]: - base_dict = {"name": self.get_name(zarr_format=3)} - field_configs = [(f_name, f_dtype.to_dict()) for f_name, f_dtype in self.fields] - base_dict["configuration"] = {"fields": field_configs} - return base_dict + return fields + elif zarr_format == 3: + base_dict = {"name": self._zarr_v3_name} + base_dict["configuration"] = {"fields": fields} # type: ignore[assignment] + return cast("JSON", base_dict) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") @classmethod - def check_dict(cls, data: JSON) -> TypeGuard[JSON]: - return ( - isinstance(data, dict) - and "name" in data - and "configuration" in data - and "fields" in data["configuration"] - ) + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[dict[str, JSON] | list[Any]]: + # the actual JSON form is recursive and hard to annotate, so we give up and do + # list[Any] for now + if zarr_format == 2: + return ( + not isinstance(data, str) + and isinstance(data, Sequence) + and all( + not isinstance(field, str) and isinstance(field, Sequence) and len(field) == 2 + for field in data + ) + ) + elif zarr_format == 3: + return ( + isinstance(data, dict) + and "name" in data + and "configuration" in data + and isinstance(data["configuration"], dict) + and "fields" in data["configuration"] + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") @classmethod - def from_dict(cls, data: 
dict[str, JSON]) -> Self: - if cls.check_dict(data): - from zarr.core.dtype import get_data_type_from_dict - - fields = tuple( - (f_name, get_data_type_from_dict(f_dtype)) - for f_name, f_dtype in data["configuration"]["fields"] - ) - return cls(fields=fields) + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + from zarr.core.dtype import get_data_type_from_json + + if cls.check_json(data, zarr_format=zarr_format): + if zarr_format == 2: + # structured dtypes are constructed directly from a list of lists + return cls( + fields=tuple( # type: ignore[misc] + (f_name, get_data_type_from_json(f_dtype, zarr_format=zarr_format)) + for f_name, f_dtype in data + ) + ) + elif zarr_format == 3: # noqa: SIM102 + if isinstance(data, dict) and "configuration" in data: + config = data["configuration"] + if isinstance(config, dict) and "fields" in config: + meta_fields = config["fields"] + fields = tuple( + (f_name, get_data_type_from_json(f_dtype, zarr_format=zarr_format)) + for f_name, f_dtype in meta_fields + ) + return cls(fields=fields) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") - def to_dtype(self) -> np.dtypes.VoidDType: - return cast(np.void, np.dtype([(key, dtype.to_dtype()) for (key, dtype) in self.fields])) + def to_dtype(self) -> np.dtypes.VoidDType[int]: + return cast( + "np.dtypes.VoidDType[int]", + np.dtype([(key, dtype.to_dtype()) for (key, dtype) in self.fields]), + ) def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: return bytes_to_json(data.tobytes(), zarr_format) @@ -822,3 +1458,69 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: as_bytes = bytes_from_json(data, zarr_format=zarr_format) dtype = self.to_dtype() return cast(np.void, np.array([as_bytes], dtype=dtype.str).view(dtype)[0]) + + +def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: + 
""" + Convert an endianness literal to its numpy string representation. + + Parameters + ---------- + endianness : Endianness or None + The endianness to convert. + + Returns + ------- + Literal[">", "<", "=", "|"] + The numpy string representation of the endianness. + + Raises + ------ + ValueError + If the endianness is invalid. + """ + match endianness: + case "little": + return "<" + case "big": + return ">" + case "native": + return "=" + case None: + return "|" + raise ValueError( + f"Invalid endianness: {endianness}. Expected one of {get_args(Endianness)} or None" + ) + + +def endianness_from_numpy_str(endianness: EndiannessNumpy) -> Endianness | None: + """ + Convert a numpy endianness string literal to a human-readable literal value. + + Parameters + ---------- + endianness : Literal[">", "<", "=", "|"] + The numpy string representation of the endianness. + + Returns + ------- + Endianness or None + The human-readable representation of the endianness. + + Raises + ------ + ValueError + If the endianness is invalid. + """ + match endianness: + case "<": + return "little" + case ">": + return "big" + case "=": + return "native" + case "|": + return None + raise ValueError( + f"Invalid endianness: {endianness}. 
Expected one of {get_args(EndiannessNumpy)}" + ) diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 1dbf22c3c2..2c4910338e 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -2,7 +2,7 @@ import base64 from collections.abc import Sequence -from typing import TYPE_CHECKING, Any, Literal, TypeGuard, cast, get_args +from typing import TYPE_CHECKING, Any, Literal, TypeGuard, cast import numpy as np @@ -11,81 +11,12 @@ from zarr.core.dtype._numpy import DateUnit, TimeUnit Endianness = Literal["little", "big", "native"] -EndiannessNumpy = Literal[">", "<", "=", "|"] JSONFloat = float | Literal["NaN", "Infinity", "-Infinity"] -_NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") - class DataTypeValidationError(ValueError): ... -def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: - """ - Convert an endianness literal to its numpy string representation. - - Parameters - ---------- - endianness : Endianness or None - The endianness to convert. - - Returns - ------- - Literal[">", "<", "=", "|"] - The numpy string representation of the endianness. - - Raises - ------ - ValueError - If the endianness is invalid. - """ - match endianness: - case "little": - return "<" - case "big": - return ">" - case "native": - return "=" - case None: - return "|" - raise ValueError( - f"Invalid endianness: {endianness}. Expected one of {get_args(Endianness)} or None" - ) - - -def endianness_from_numpy_str(endianness: EndiannessNumpy) -> Endianness | None: - """ - Convert a numpy endianness string literal to a human-readable literal value. - - Parameters - ---------- - endianness : Literal[">", "<", "=", "|"] - The numpy string representation of the endianness. - - Returns - ------- - Endianness or None - The human-readable representation of the endianness. - - Raises - ------ - ValueError - If the endianness is invalid. 
- """ - match endianness: - case "<": - return "little" - case ">": - return "big" - case "=": - return "native" - case "|": - return None - raise ValueError( - f"Invalid endianness: {endianness}. Expected one of {get_args(EndiannessNumpy)}" - ) - - def check_json_bool(data: JSON) -> TypeGuard[bool]: """ Check if a JSON value is a boolean. diff --git a/src/zarr/core/dtype/registry.py b/src/zarr/core/dtype/registry.py index d4f1f03258..0d07ab2b9d 100644 --- a/src/zarr/core/dtype/registry.py +++ b/src/zarr/core/dtype/registry.py @@ -1,20 +1,22 @@ from __future__ import annotations from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Self +from typing import TYPE_CHECKING, Self from zarr.core.dtype.common import DataTypeValidationError if TYPE_CHECKING: from importlib.metadata import EntryPoint - from zarr.core.common import JSON - from zarr.core.dtype.wrapper import DTypeWrapper, TDType + from zarr.core.common import JSON, ZarrFormat + from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar @dataclass(frozen=True, kw_only=True) class DataTypeRegistry: - contents: dict[str, type[DTypeWrapper[Any, Any]]] = field(default_factory=dict, init=False) + contents: dict[str, type[ZDType[_BaseDType, _BaseScalar]]] = field( + default_factory=dict, init=False + ) lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) def lazy_load(self) -> None: @@ -23,15 +25,15 @@ def lazy_load(self) -> None: self.lazy_load_list.clear() - def register(self: Self, key: str, cls: type[DTypeWrapper[Any, Any]]) -> None: + def register(self: Self, key: str, cls: type[ZDType[_BaseDType, _BaseScalar]]) -> None: # don't register the same dtype twice if key not in self.contents or self.contents[key] != cls: self.contents[key] = cls - def get(self, key: str) -> type[DTypeWrapper[Any, Any]]: + def get(self, key: str) -> type[ZDType[_BaseDType, _BaseScalar]]: return self.contents[key] - def match_dtype(self, dtype: TDType) -> DTypeWrapper[Any, 
Any]: + def match_dtype(self, dtype: _BaseDType) -> ZDType[_BaseDType, _BaseScalar]: self.lazy_load() for val in self.contents.values(): try: @@ -40,11 +42,11 @@ def match_dtype(self, dtype: TDType) -> DTypeWrapper[Any, Any]: pass raise ValueError(f"No data type wrapper found that matches dtype '{dtype}'") - def match_json(self, data: JSON) -> DTypeWrapper[Any, Any]: + def match_json(self, data: JSON, zarr_format: ZarrFormat) -> ZDType[_BaseDType, _BaseScalar]: self.lazy_load() for val in self.contents.values(): try: - return val.from_dict(data) + return val.from_json(data, zarr_format=zarr_format) except DataTypeValidationError: pass raise ValueError(f"No data type wrapper found that matches {data}") diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index dc3a0cc5d2..8707c3cda0 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -2,25 +2,30 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, ClassVar, Generic, Self, TypeGuard, TypeVar, cast +from typing import TYPE_CHECKING, ClassVar, Generic, Self, TypeGuard, TypeVar import numpy as np -from zarr.abc.metadata import Metadata from zarr.core.dtype.common import DataTypeValidationError if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat -TScalar = TypeVar("TScalar", bound=np.generic | str) +# This the upper bound for the scalar types we support. It's numpy scalars + str, +# because the new variable-length string dtype in numpy does not have a corresponding scalar type +_BaseScalar = np.generic | str +# This is the bound for the dtypes that we support. If we support non-numpy dtypes, +# then this bound will need to be widened. 
+_BaseDType = np.dtype[np.generic] +TScalar = TypeVar("TScalar", bound=_BaseScalar) # TODO: figure out an interface or protocol that non-numpy dtypes can use -TDType = TypeVar("TDType", bound=np.dtype[Any]) +TDType = TypeVar("TDType", bound=_BaseDType) @dataclass(frozen=True, kw_only=True) -class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): +class ZDType(Generic[TDType, TScalar], ABC): """ - Abstract base class for wrapping numpy dtypes. + Abstract base class for wrapping native array data types, e.g. numpy dtypes Attributes ---------- @@ -32,13 +37,30 @@ class DTypeWrapper(Generic[TDType, TScalar], ABC, Metadata): have names that depend on their configuration. """ - # this class will create a numpy dtype + # this class will create a native data type # mypy currently disallows class variables to contain type parameters - # but it seems like it should be OK for us to use it here: + # but it seems OK for us to use it here: # https://github.com/python/typing/discussions/1424#discussioncomment-7989934 dtype_cls: ClassVar[type[TDType]] # type: ignore[misc] _zarr_v3_name: ClassVar[str] + @classmethod + def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[TDType]: + """ + Check that a data type matches the dtype_cls class attribute. Used as a type guard. + + Parameters + ---------- + dtype : TDType + The dtype to check. + + Returns + ------- + Bool + True if the dtype matches, False otherwise. + """ + return type(dtype) is cls.dtype_cls + @classmethod def from_dtype(cls: type[Self], dtype: TDType) -> Self: """ @@ -81,7 +103,7 @@ def _from_dtype_unsafe(cls: type[Self], dtype: TDType) -> Self: Self The wrapped dtype. """ - raise NotImplementedError + ... @abstractmethod def to_dtype(self: Self) -> TDType: @@ -93,26 +115,7 @@ def to_dtype(self: Self) -> TDType: TDType The unwrapped dtype. """ - raise NotImplementedError - - def cast_value(self: Self, value: object) -> TScalar: - """ - Cast a value to an instance of the scalar type. 
- This implementation assumes a numpy-style dtype class that has a - ``type`` method for casting scalars. Non-numpy dtypes will need to - override this method. - - Parameters - ---------- - value : object - The value to cast. - - Returns - ------- - TScalar - The cast value. - """ - return cast(TScalar, self.to_dtype().type(value)) + ... @abstractmethod def default_value(self) -> TScalar: @@ -129,24 +132,8 @@ def default_value(self) -> TScalar: ... @classmethod - def check_dtype(cls: type[Self], dtype: TDType) -> TypeGuard[TDType]: - """ - Check that a data type matches the dtype_cls class attribute. Used as a type guard. - - Parameters - ---------- - dtype : TDType - The dtype to check. - - Returns - ------- - Bool - True if the dtype matches, False otherwise. - """ - return type(dtype) is cls.dtype_cls - - @classmethod - def check_dict(cls: type[Self], data: dict[str, JSON]) -> TypeGuard[dict[str, JSON]]: + @abstractmethod + def check_json(cls: type[Self], data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: """ Check that a JSON representation of a data type matches the dtype_cls class attribute. Used as a type guard. This base implementation checks that the input is a dictionary, @@ -158,87 +145,75 @@ def check_dict(cls: type[Self], data: dict[str, JSON]) -> TypeGuard[dict[str, JS data : JSON The JSON representation of the data type. + zarr_format : ZarrFormat + The zarr format version. + Returns ------- Bool True if the JSON representation matches, False otherwise. """ - return "name" in data and data["name"] == cls._zarr_v3_name + ... @abstractmethod - def to_dict(self) -> dict[str, JSON]: + def to_json(self, zarr_format: ZarrFormat) -> JSON: """ - Convert the wrapped data type to a dictionary. + Convert the wrapped data type to a JSON-serializable form. + + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. 
Returns ------- - dict[str, JSON] - The dictionary representation of the wrapped data type + JSON + The JSON-serializable representation of the wrapped data type """ - raise NotImplementedError + ... @classmethod - def from_dict(cls: type[Self], data: dict[str, JSON]) -> Self: + def from_json(cls: type[Self], data: JSON, zarr_format: ZarrFormat) -> Self: """ Wrap a JSON representation of a data type. Parameters ---------- - data : dict[str, JSON] + data : JSON The JSON representation of the data type. + zarr_format : ZarrFormat + The zarr format version. + Returns ------- Self The wrapped data type. """ - if cls.check_dict(data): - return cls._from_dict_unsafe(data) - raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") + if cls.check_json(data, zarr_format=zarr_format): + return cls._from_json_unsafe(data, zarr_format=zarr_format) + raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}: {data}") @classmethod - def _from_dict_unsafe(cls: type[Self], data: dict[str, JSON]) -> Self: + @abstractmethod + def _from_json_unsafe(cls: type[Self], data: JSON, zarr_format: ZarrFormat) -> Self: """ Wrap a JSON representation of a data type. Parameters ---------- - data : dict[str, JSON] + data : JSON The JSON representation of the data type. - Returns - ------- - Self - The wrapped data type. - """ - config = data.get("configuration", {}) - return cls(**config) - - def get_name(self, zarr_format: ZarrFormat) -> str: - """ - Return the name of the wrapped data type. - - Parameters - ---------- zarr_format : ZarrFormat The zarr format version. Returns ------- - str - The name of the wrapped data type. - - Notes - ----- - This is a method, rather than an attribute, because the name of the data type may depend on - parameters that are not known until a concrete data type is wrapped. - - As the names of data types vary between zarr versions, this method takes a ``zarr_format`` - parameter + Self + The wrapped data type. 
""" - if zarr_format == 2: - return self.to_dtype().str - return self._zarr_v3_name + ... @abstractmethod def to_json_value(self, data: TScalar, *, zarr_format: ZarrFormat) -> JSON: @@ -255,9 +230,9 @@ def to_json_value(self, data: TScalar, *, zarr_format: ZarrFormat) -> JSON: Returns ------- JSON - The JSON-serializable format. + The JSON-serializable form of the scalar. """ - raise NotImplementedError + ... @abstractmethod def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: @@ -274,6 +249,6 @@ def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScal Returns ------- TScalar - The numpy scalar. + The native scalar value. """ - raise NotImplementedError + ... diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 517758a5ee..02574440ff 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -2,13 +2,13 @@ import warnings from collections.abc import Iterable -from typing import TYPE_CHECKING, TypedDict, cast +from typing import TYPE_CHECKING, TypedDict import numcodecs.abc from zarr.abc.metadata import Metadata -from zarr.core.dtype import get_data_type_from_numpy -from zarr.core.dtype.wrapper import DTypeWrapper +from zarr.core.dtype import get_data_type_from_native_dtype +from zarr.core.dtype.wrapper import TDType, TScalar, ZDType, _BaseDType, _BaseScalar if TYPE_CHECKING: from typing import Literal, Self @@ -61,7 +61,7 @@ class ArrayV2MetadataDict(TypedDict): class ArrayV2Metadata(Metadata): shape: ChunkCoords chunks: ChunkCoords - dtype: DTypeWrapper[Any, Any] + dtype: ZDType[_BaseDType, _BaseScalar] fill_value: int | float | str | bytes | None = 0 order: MemoryOrder = "C" filters: tuple[numcodecs.abc.Codec, ...] 
| None = None @@ -74,7 +74,7 @@ def __init__( self, *, shape: ChunkCoords, - dtype: DTypeWrapper[Any, Any], + dtype: ZDType[TDType, TScalar], chunks: ChunkCoords, fill_value: Any, order: MemoryOrder, @@ -89,7 +89,7 @@ def __init__( shape_parsed = parse_shapelike(shape) chunks_parsed = parse_shapelike(chunks) # TODO: remove this - if not isinstance(dtype, DTypeWrapper): + if not isinstance(dtype, ZDType): raise TypeError compressor_parsed = parse_compressor(compressor) order_parsed = parse_indexing_order(order) @@ -138,7 +138,7 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: _data = data.copy() # Check that the zarr_format attribute is correct. _ = parse_zarr_format(_data.pop("zarr_format")) - dtype = get_data_type_from_numpy(_data["dtype"]) + dtype = get_data_type_from_native_dtype(_data["dtype"]) _data["dtype"] = dtype if dtype.to_dtype().kind in "SV": fill_value_encoded = _data.get("fill_value") @@ -182,6 +182,10 @@ def to_dict(self) -> dict[str, JSON]: if zarray_dict["filters"] is not None: raw_filters = zarray_dict["filters"] + # TODO: remove this when we can stratically type the output JSON data structure + # entirely + if not isinstance(raw_filters, list | tuple): + raise TypeError("Invalid type for filters. Expected a list or tuple.") new_filters = [] for f in raw_filters: if isinstance(f, numcodecs.abc.Codec): @@ -191,13 +195,10 @@ def to_dict(self) -> dict[str, JSON]: zarray_dict["filters"] = new_filters if self.fill_value is not None: - # There's a relationship between self.dtype and self.fill_value - # that mypy isn't aware of. The fact that we have S or V dtype here - # means we should have a bytes-type fill_value. 
- fill_value = self.dtype.to_json_value(self.fill_value, zarr_format=2) + fill_value = self.dtype.to_json_value(self.fill_value, zarr_format=2) # type: ignore[arg-type] zarray_dict["fill_value"] = fill_value - zarray_dict["dtype"] = self.dtype.get_name(zarr_format=2) + zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2) return zarray_dict @@ -324,22 +325,3 @@ def get_object_codec_id(maybe_object_codecs: Sequence[JSON]) -> str | None: raise ValueError(msg) from e return fill_value - - -def _default_compressor( - dtype: DTypeWrapper[Any, Any], -) -> dict[str, JSON] | None: - """Get the default filters and compressor for a dtype. - - https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html - """ - default_compressor = config.get("array.v2_default_compressor") - return cast(dict[str, JSON] | None, default_compressor.get(dtype.kind, None)) - - -def _default_filters( - dtype: DTypeWrapper, -) -> list[dict[str, JSON]] | None: - """Get the default filters and compressor for a dtype.""" - default_filters = config.get("array.v2_default_filters") - return cast(list[dict[str, JSON]] | None, default_filters.get(dtype.kind, None)) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 297c418214..81d255969c 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -5,9 +5,9 @@ from zarr.abc.metadata import Metadata from zarr.core.buffer.core import default_buffer_prototype from zarr.core.dtype import ( - DTypeWrapper, VariableLengthString, - get_data_type_from_dict, + ZDType, + get_data_type_from_json, ) if TYPE_CHECKING: @@ -16,6 +16,7 @@ from zarr.core.buffer import Buffer, BufferPrototype from zarr.core.chunk_grids import ChunkGrid from zarr.core.common import JSON, ChunkCoords + from zarr.core.dtype.wrapper import _BaseDType, _BaseScalar import json @@ -86,7 +87,7 @@ def validate_array_bytes_codec(codecs: tuple[Codec, ...]) -> ArrayBytesCodec: return abcs[0] -def validate_codecs(codecs: tuple[Codec, ...], dtype: 
DTypeWrapper[Any, Any]) -> None: +def validate_codecs(codecs: tuple[Codec, ...], dtype: ZDType[_BaseDType, _BaseScalar]) -> None: """Check that the codecs are valid for the given dtype""" from zarr.codecs.sharding import ShardingCodec @@ -144,7 +145,7 @@ class ArrayV3MetadataDict(TypedDict): @dataclass(frozen=True, kw_only=True) class ArrayV3Metadata(Metadata): shape: ChunkCoords - data_type: DTypeWrapper[Any, Any] + data_type: ZDType[_BaseDType, _BaseScalar] chunk_grid: ChunkGrid chunk_key_encoding: ChunkKeyEncoding fill_value: Any @@ -159,7 +160,7 @@ def __init__( self, *, shape: Iterable[int], - data_type: DTypeWrapper[Any, Any], + data_type: ZDType[_BaseDType, _BaseScalar], chunk_grid: dict[str, JSON] | ChunkGrid, chunk_key_encoding: ChunkKeyEncodingLike, fill_value: object, @@ -173,7 +174,7 @@ def __init__( """ # TODO: remove this - if not isinstance(data_type, DTypeWrapper): + if not isinstance(data_type, ZDType): raise TypeError shape_parsed = parse_shapelike(shape) chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) @@ -186,7 +187,7 @@ def __init__( array_spec = ArraySpec( shape=shape_parsed, - dtype=data_type.to_dtype(), + dtype=data_type, fill_value=fill_value_parsed, config=ArrayConfig.from_dict({}), # TODO: config is not needed here. prototype=default_buffer_prototype(), # TODO: prototype is not needed here. 
@@ -220,9 +221,7 @@ def _validate_metadata(self) -> None: if self.fill_value is None: raise ValueError("`fill_value` is required.") for codec in self.codecs: - codec.validate( - shape=self.shape, dtype=self.data_type.to_dtype(), chunk_grid=self.chunk_grid - ) + codec.validate(shape=self.shape, dtype=self.data_type, chunk_grid=self.chunk_grid) @property def ndim(self) -> int: @@ -294,10 +293,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: _ = parse_node_type_array(_data.pop("node_type")) data_type_json = _data.pop("data_type") - if isinstance(data_type_json, str): - data_type = get_data_type_from_dict({"name": data_type_json}) - else: - data_type = get_data_type_from_dict(data_type_json) + data_type = get_data_type_from_json(data_type_json, zarr_format=3) # check that the fill value is consistent with the data type fill_value_parsed = data_type.from_json_value(_data.pop("fill_value"), zarr_format=3) @@ -322,9 +318,15 @@ def to_dict(self) -> dict[str, JSON]: # the metadata document if out_dict["dimension_names"] is None: out_dict.pop("dimension_names") - # if data_type has no configuration, we just serialize the name - if "configuration" not in out_dict["data_type"]: - out_dict["data_type"] = out_dict["data_type"]["name"] + + # TODO: replace the `to_dict` / `from_dict` on the `Metadata`` class with + # to_json, from_json, and have ZDType inherit from `Metadata` + # until then, we have this hack here + dtype_meta = out_dict["data_type"] + + if isinstance(dtype_meta, ZDType): + out_dict["data_type"] = dtype_meta.to_json(zarr_format=3) + return out_dict def update_shape(self, shape: ChunkCoords) -> Self: diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index d0e54eeb51..96c4ec749d 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -17,7 +17,7 @@ from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding from zarr.core.common import ZarrFormat 
-from zarr.core.dtype import parse_data_type +from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata from zarr.core.sync import sync from zarr.storage import MemoryStore, StoreLike @@ -139,7 +139,7 @@ def array_metadata( ndim = len(shape) chunk_shape = draw(array_shapes(min_dims=ndim, max_dims=ndim)) np_dtype = draw(v3_dtypes()) - dtype = parse_data_type(np_dtype) + dtype = get_data_type_from_native_dtype(np_dtype) fill_value = draw(npst.from_dtype(np_dtype)) if zarr_format == 2: return ArrayV2Metadata( diff --git a/tests/conftest.py b/tests/conftest.py index c164168750..58f2be8e14 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -21,7 +21,7 @@ from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition from zarr.core.common import JSON, parse_shapelike from zarr.core.config import config as zarr_config -from zarr.core.dtype import get_data_type_from_numpy +from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync @@ -277,7 +277,7 @@ def create_array_metadata( """ Create array metadata """ - dtype_parsed = get_data_type_from_numpy(dtype) + dtype_parsed = get_data_type_from_native_dtype(dtype) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format diff --git a/tests/test_array.py b/tests/test_array.py index bebf6b8f0a..cb1fcace75 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -41,7 +41,7 @@ from zarr.core.buffer import NDArrayLike, NDArrayLikeOrScalar, default_buffer_prototype from zarr.core.chunk_grids import _auto_partition from zarr.core.common import JSON, MemoryOrder, ZarrFormat -from zarr.core.dtype import get_data_type_from_numpy +from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.group import AsyncGroup from 
zarr.core.indexing import BasicIndexer, ceildiv from zarr.core.sync import sync @@ -1204,7 +1204,7 @@ async def test_v2_chunk_encoding( filters=filters, ) filters_expected, compressor_expected = _parse_chunk_encoding_v2( - filters=filters, compressor=compressors, dtype=get_data_type_from_numpy(dtype) + filters=filters, compressor=compressors, dtype=get_data_type_from_native_dtype(dtype) ) assert arr.metadata.zarr_format == 2 # guard for mypy assert arr.metadata.compressor == compressor_expected @@ -1225,7 +1225,7 @@ async def test_default_filters_compressors( """ Test that the default ``filters`` and ``compressors`` are used when ``create_array`` is invoked with ``filters`` and ``compressors`` unspecified. """ - zdtype = get_data_type_from_numpy(dtype_str) + zdtype = get_data_type_from_native_dtype(dtype_str) arr = await create_array( store=store, dtype=dtype_str, diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index 723450b680..5879782354 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -8,7 +8,7 @@ from zarr.abc.codec import Codec from zarr.abc.store import Store from zarr.codecs import ZstdCodec -from zarr.core.dtype import get_data_type_from_numpy +from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING from zarr.storage import StorePath @@ -53,12 +53,12 @@ def test_vlen_string( else: a[:, :] = data assert np.array_equal(data, a[:, :]) - assert a.metadata.data_type == get_data_type_from_numpy(data.dtype) + assert a.metadata.data_type == get_data_type_from_native_dtype(data.dtype) assert a.dtype == data.dtype # test round trip b = Array.open(sp) assert isinstance(b.metadata, ArrayV3Metadata) # needed for mypy assert np.array_equal(data, b[:, :]) - assert b.metadata.data_type == get_data_type_from_numpy(data.dtype) + assert b.metadata.data_type == get_data_type_from_native_dtype(data.dtype) 
assert a.dtype == data.dtype diff --git a/tests/test_config.py b/tests/test_config.py index b1b6ae0ecd..38d8c1c0bd 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -24,7 +24,7 @@ from zarr.core.buffer.core import Buffer from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.config import BadConfigError, config -from zarr.core.dtype import get_data_type_from_numpy +from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.indexing import SelectorTuple from zarr.registry import ( fully_qualified_name, @@ -56,15 +56,15 @@ def test_config_defaults_set() -> None: "v2_default_compressor": {"default": {"id": "zstd", "level": 0, "checksum": False}}, "v2_default_filters": { "default": None, - "variable_length_utf8": [{"id": "vlen-utf8"}], - "fixed_length_ucs4": [{"id": "vlen-utf8"}], - "fixed_length_ascii": [{"id": "vlen-bytes"}], + "numpy__variable_length_utf8": [{"id": "vlen-utf8"}], + "numpy__fixed_length_ucs4": [{"id": "vlen-utf8"}], + "numpy__fixed_length_ascii": [{"id": "vlen-bytes"}], }, "v3_default_filters": {"default": []}, "v3_default_serializer": { "default": {"name": "bytes", "configuration": {"endian": "little"}}, - "variable_length_utf8": {"name": "vlen-utf8"}, - "fixed_length_ucs4": {"name": "vlen-utf8"}, + "numpy__variable_length_utf8": {"name": "vlen-utf8"}, + "numpy__fixed_length_ucs4": {"name": "vlen-utf8"}, "r*": {"name": "vlen-bytes"}, }, "v3_default_compressors": { @@ -318,10 +318,10 @@ async def test_default_codecs(dtype: str) -> None: """ Test that the default compressors are sensitive to the current setting of the config. 
""" - zdtype = get_data_type_from_numpy(dtype) + zdtype = get_data_type_from_native_dtype(dtype) expected_compressors = (GzipCodec(),) new_conf = { - f"array.v3_default_compressors.{zdtype._zarr_v3_name}": [ + f"array.v3_default_compressors.{zdtype._zarr_v3_name.replace('.', '__')}": [ c.to_dict() for c in expected_compressors ] } diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index 3a8cd5bb8b..395e036db2 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -504,7 +504,7 @@ async def test_consolidated_metadata_backwards_compatibility( async def test_consolidated_metadata_v2(self): store = zarr.storage.MemoryStore() g = await AsyncGroup.from_store(store, attributes={"key": "root"}, zarr_format=2) - dtype = parse_data_type("uint8") + dtype = parse_data_type("uint8", zarr_format=2) await g.create_array(name="a", shape=(1,), attributes={"key": "a"}, dtype=dtype) g1 = await g.create_group(name="g1", attributes={"key": "g1"}) await g1.create_group(name="g2", attributes={"key": "g2"}) diff --git a/tests/test_metadata/test_dtype.py b/tests/test_metadata/test_dtype.py index ee19cdf845..db575ee16a 100644 --- a/tests/test_metadata/test_dtype.py +++ b/tests/test_metadata/test_dtype.py @@ -1,14 +1,19 @@ from __future__ import annotations -from typing import Any, get_args +import re +from typing import TYPE_CHECKING, Any, get_args + +if TYPE_CHECKING: + from zarr.core.common import ZarrFormat + from zarr.core.dtype.wrapper import _BaseDType, _BaseScalar import numpy as np import pytest from zarr.core.dtype import ( DTYPE, - DTypeWrapper, VariableLengthString, + ZDType, data_type_registry, ) from zarr.core.dtype._numpy import ( @@ -16,9 +21,9 @@ Complex64, Complex128, DateTime64, - FixedLengthAsciiString, + FixedLengthAscii, FixedLengthBytes, - FixedLengthUnicodeString, + FixedLengthUnicode, Float16, Float32, Float64, @@ -37,7 +42,7 @@ @pytest.fixture -def dtype_registry() -> 
DataTypeRegistry: +def data_type_registry_fixture() -> DataTypeRegistry: return DataTypeRegistry() @@ -66,15 +71,15 @@ def dtype_registry() -> DataTypeRegistry: (Float64, "float64"), (Complex64, "complex64"), (Complex128, "complex128"), - (FixedLengthUnicodeString, "U"), - (FixedLengthAsciiString, "S"), + (FixedLengthUnicode, "U"), + (FixedLengthAscii, "S"), (FixedLengthBytes, "V"), (VariableLengthString, VLEN_STRING_CODE), (Structured, np.dtype([("a", np.float64), ("b", np.int8)])), (DateTime64, "datetime64[s]"), ], ) -def test_wrap(wrapper_cls: type[DTypeWrapper[Any, Any]], np_dtype: np.dtype | str) -> None: +def test_wrap(wrapper_cls: type[ZDType[_BaseDType, _BaseScalar]], np_dtype: np.dtype | str) -> None: """ Test that the wrapper class has the correct dtype class bound to the dtype_cls variable Test that the ``wrap`` method produces an instance of the wrapper class @@ -92,13 +97,13 @@ def test_wrap(wrapper_cls: type[DTypeWrapper[Any, Any]], np_dtype: np.dtype | st @pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) -def test_dict_serialization(wrapper_cls: DTYPE) -> None: +def test_dict_serialization(wrapper_cls: DTYPE, zarr_format: ZarrFormat) -> None: if issubclass(wrapper_cls, Structured): instance = wrapper_cls(fields=((("a", Bool()),))) else: instance = wrapper_cls() - as_dict = instance.to_dict() - assert wrapper_cls.from_dict(as_dict) == instance + as_dict = instance.to_json(zarr_format=zarr_format) + assert wrapper_cls.from_json(as_dict, zarr_format=zarr_format) == instance @pytest.mark.parametrize( @@ -118,9 +123,9 @@ def test_dict_serialization(wrapper_cls: DTYPE) -> None: (Float64(), np.float64(0)), (Complex64(), np.complex64(0)), (Complex128(), np.complex128(0)), - (FixedLengthAsciiString(length=3), np.bytes_(b"")), + (FixedLengthAscii(length=3), np.bytes_(b"")), (FixedLengthBytes(length=3), np.void(b"\x00\x00\x00")), - (FixedLengthUnicodeString(length=3), np.str_("")), + (FixedLengthUnicode(length=3), np.str_("")), ( Structured(fields=(("a", 
Float64()), ("b", Int8()))), np.array([0], dtype=[("a", np.float64), ("b", np.int8)])[0], @@ -129,7 +134,9 @@ def test_dict_serialization(wrapper_cls: DTYPE) -> None: (DateTime64(unit="s"), np.datetime64("NaT")), ], ) -def test_default_value(wrapper: type[DTypeWrapper[Any, Any]], expected_default: Any) -> None: +def test_default_value( + wrapper: type[ZDType[_BaseDType, _BaseScalar]], expected_default: Any +) -> None: """ Test that the default_value method is correctly set for each dtype wrapper. """ @@ -156,15 +163,15 @@ def test_default_value(wrapper: type[DTypeWrapper[Any, Any]], expected_default: (Float64(), np.float64(42.0), 42.0), (Complex64(), np.complex64(42.0 + 1.0j), (42.0, 1.0)), (Complex128(), np.complex128(42.0 + 1.0j), (42.0, 1.0)), - (FixedLengthAsciiString(length=4), np.bytes_(b"test"), "dGVzdA=="), + (FixedLengthAscii(length=4), np.bytes_(b"test"), "dGVzdA=="), (FixedLengthBytes(length=4), np.void(b"test"), "dGVzdA=="), - (FixedLengthUnicodeString(length=4), np.str_("test"), "test"), + (FixedLengthUnicode(length=4), np.str_("test"), "test"), (VariableLengthString(), "test", "test"), (DateTime64(unit="s"), np.datetime64("2021-01-01T00:00:00", "s"), 1609459200), ], ) def test_to_json_value_v2( - wrapper: type[DTypeWrapper[Any, Any]], input_value: Any, expected_json: Any + wrapper: type[ZDType[_BaseDType, _BaseScalar]], input_value: Any, expected_json: Any ) -> None: """ Test the to_json_value method for each dtype wrapper for zarr v2 @@ -189,15 +196,15 @@ def test_to_json_value_v2( (Float64(), 42.0, np.float64(42.0)), (Complex64(), (42.0, 1.0), np.complex64(42.0 + 1.0j)), (Complex128(), (42.0, 1.0), np.complex128(42.0 + 1.0j)), - (FixedLengthAsciiString(length=4), "dGVzdA==", np.bytes_(b"test")), + (FixedLengthAscii(length=4), "dGVzdA==", np.bytes_(b"test")), (FixedLengthBytes(length=4), "dGVzdA==", np.void(b"test")), - (FixedLengthUnicodeString(length=4), "test", np.str_("test")), + (FixedLengthUnicode(length=4), "test", np.str_("test")), 
(VariableLengthString(), "test", "test"), (DateTime64(unit="s"), 1609459200, np.datetime64("2021-01-01T00:00:00", "s")), ], ) def test_from_json_value( - wrapper: type[DTypeWrapper[Any, Any]], json_value: Any, expected_value: Any + wrapper: type[ZDType[_BaseDType, _BaseScalar]], json_value: Any, expected_value: Any ) -> None: """ Test the from_json_value method for each dtype wrapper. @@ -207,43 +214,45 @@ def test_from_json_value( class TestRegistry: @staticmethod - def test_register(dtype_registry: DataTypeRegistry) -> None: + def test_register(data_type_registry_fixture: DataTypeRegistry) -> None: """ Test that registering a dtype in a data type registry works. """ - dtype_registry.register(Bool._zarr_v3_name, Bool) - assert dtype_registry.get(Bool._zarr_v3_name) == Bool - assert isinstance(dtype_registry.match_dtype(np.dtype("bool")), Bool) + data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) + assert data_type_registry_fixture.get(Bool._zarr_v3_name) == Bool + assert isinstance(data_type_registry_fixture.match_dtype(np.dtype("bool")), Bool) @staticmethod - def test_override(dtype_registry: DataTypeRegistry) -> None: + def test_override(data_type_registry_fixture: DataTypeRegistry) -> None: """ Test that registering a new dtype with the same name works (overriding the previous one). 
""" - dtype_registry.register(Bool._zarr_v3_name, Bool) + data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) class NewBool(Bool): def default_value(self) -> np.bool_: return np.True_ - dtype_registry.register(NewBool._zarr_v3_name, NewBool) - assert isinstance(dtype_registry.match_dtype(np.dtype("bool")), NewBool) + data_type_registry_fixture.register(NewBool._zarr_v3_name, NewBool) + assert isinstance(data_type_registry_fixture.match_dtype(np.dtype("bool")), NewBool) @staticmethod @pytest.mark.parametrize( - ("wrapper_cls", "dtype_str"), [(Bool, "bool"), (FixedLengthUnicodeString, "|U4")] + ("wrapper_cls", "dtype_str"), [(Bool, "bool"), (FixedLengthUnicode, "|U4")] ) def test_match_dtype( - dtype_registry: DataTypeRegistry, wrapper_cls: type[DTypeWrapper[Any, Any]], dtype_str: str + data_type_registry_fixture: DataTypeRegistry, + wrapper_cls: type[ZDType[_BaseDType, _BaseScalar]], + dtype_str: str, ) -> None: """ Test that match_dtype resolves a numpy dtype into an instance of the correspond wrapper for that dtype. """ - dtype_registry.register(wrapper_cls._zarr_v3_name, wrapper_cls) - assert isinstance(dtype_registry.match_dtype(np.dtype(dtype_str)), wrapper_cls) + data_type_registry_fixture.register(wrapper_cls._zarr_v3_name, wrapper_cls) + assert isinstance(data_type_registry_fixture.match_dtype(np.dtype(dtype_str)), wrapper_cls) @staticmethod - def test_unregistered_dtype(dtype_registry: DataTypeRegistry) -> None: + def test_unregistered_dtype(data_type_registry_fixture: DataTypeRegistry) -> None: """ Test that match_dtype raises an error if the dtype is not registered. 
""" @@ -251,14 +260,16 @@ def test_unregistered_dtype(dtype_registry: DataTypeRegistry) -> None: with pytest.raises( ValueError, match=f"No data type wrapper found that matches dtype '{outside_dtype}'" ): - dtype_registry.match_dtype(np.dtype(outside_dtype)) + data_type_registry_fixture.match_dtype(np.dtype(outside_dtype)) with pytest.raises(KeyError): - dtype_registry.get(outside_dtype) + data_type_registry_fixture.get(outside_dtype) @staticmethod @pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) - def test_registered_dtypes(wrapper_cls: DTypeWrapper[Any, Any]) -> None: + def test_registered_dtypes( + wrapper_cls: ZDType[_BaseDType, _BaseScalar], zarr_format: ZarrFormat + ) -> None: """ Test that the registered dtypes can be retrieved from the registry. """ @@ -268,3 +279,40 @@ def test_registered_dtypes(wrapper_cls: DTypeWrapper[Any, Any]) -> None: instance = wrapper_cls() assert data_type_registry.match_dtype(instance.to_dtype()) == instance + assert ( + data_type_registry.match_json( + instance.to_json(zarr_format=zarr_format), zarr_format=zarr_format + ) + == instance + ) + + @staticmethod + @pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) + def test_match_dtype_unique( + wrapper_cls: ZDType[_BaseDType, _BaseScalar], + data_type_registry_fixture: DataTypeRegistry, + zarr_format: ZarrFormat, + ) -> None: + """ + Test that the match_dtype method uniquely specifies a registered data type. 
We create a local registry + that excludes the data type class being tested, and ensure that an instance of the wrapped data type + fails to match anything in the registry + """ + for _cls in get_args(DTYPE): + if _cls is not wrapper_cls: + data_type_registry_fixture.register(_cls._zarr_v3_name, _cls) + + if issubclass(wrapper_cls, Structured): + instance = wrapper_cls(fields=((("a", Bool()),))) + else: + instance = wrapper_cls() + dtype_instance = instance.to_dtype() + + msg = f"No data type wrapper found that matches dtype '{dtype_instance}'" + with pytest.raises(ValueError, match=re.escape(msg)): + data_type_registry_fixture.match_dtype(dtype_instance) + + instance_dict = instance.to_json(zarr_format=zarr_format) + msg = f"No data type wrapper found that matches {instance_dict}" + with pytest.raises(ValueError, match=re.escape(msg)): + data_type_registry_fixture.match_json(instance_dict, zarr_format=zarr_format) diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 478a1405e2..1ecdb58718 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -11,7 +11,7 @@ from zarr.core.buffer import default_buffer_prototype from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.core.config import config -from zarr.core.dtype import get_data_type_from_numpy +from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.dtype._numpy import DateTime64 from zarr.core.dtype.common import complex_from_json from zarr.core.group import GroupMetadata, parse_node_type @@ -130,7 +130,7 @@ def test_jsonify_fill_value_complex(fill_value: Any, dtype_str: str) -> None: as length-2 sequences """ zarr_format = 3 - dtype = get_data_type_from_numpy(dtype_str) + dtype = get_data_type_from_native_dtype(dtype_str) expected = dtype.to_dtype().type(complex(*fill_value)) observed = dtype.from_json_value(fill_value, zarr_format=zarr_format) assert observed == expected @@ -144,7 +144,7 @@ def 
test_complex_to_json_invalid(data: object, dtype_str: str) -> None: Test that parse_fill_value(fill_value, dtype) correctly rejects sequences with length not equal to 2 """ - dtype_instance = get_data_type_from_numpy(dtype_str) + dtype_instance = get_data_type_from_native_dtype(dtype_str) match = f"Invalid type: {data}. Expected a sequence of two numbers." with pytest.raises(TypeError, match=re.escape(match)): complex_from_json(data=data, dtype=dtype_instance, zarr_format=3) @@ -157,7 +157,7 @@ def test_parse_fill_value_invalid_type(fill_value: Any, dtype_str: str) -> None: Test that parse_fill_value(fill_value, dtype) raises TypeError for invalid non-sequential types. This test excludes bool because the bool constructor takes anything. """ - dtype_instance = get_data_type_from_numpy(dtype_str) + dtype_instance = get_data_type_from_native_dtype(dtype_str) with pytest.raises(TypeError, match=f"Invalid type: {fill_value}"): dtype_instance.from_json_value(fill_value, zarr_format=3) @@ -178,7 +178,7 @@ def test_parse_fill_value_invalid_type_sequence(fill_value: Any, dtype_str: str) This test excludes bool because the bool constructor takes anything, and complex because complex values can be created from length-2 sequences. 
""" - dtype_instance = get_data_type_from_numpy(dtype_str) + dtype_instance = get_data_type_from_native_dtype(dtype_str) with pytest.raises(TypeError, match=re.escape(f"Invalid type: {fill_value}")): dtype_instance.from_json_value(fill_value, zarr_format=3) @@ -279,10 +279,12 @@ async def test_datetime_metadata(fill_value: int, precision: str) -> None: "node_type": "array", "shape": (1,), "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": (1,)}}, - "data_type": dtype.to_dict(), + "data_type": dtype.to_json(zarr_format=3), "chunk_key_encoding": {"name": "default", "separator": "."}, "codecs": (BytesCodec(),), - "fill_value": dtype.to_json_value(dtype.cast_value(fill_value), zarr_format=3), + "fill_value": dtype.to_json_value( + dtype.to_dtype().type(fill_value, dtype.unit), zarr_format=3 + ), } metadata = ArrayV3Metadata.from_dict(metadata_dict) # ensure there isn't a TypeError here. From d90e6a0b888b46a2f08e3e90d6a292c26bc3bedc Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 19 Mar 2025 22:33:01 +0100 Subject: [PATCH 040/129] update code examples in docs; remove native endianness --- docs/user-guide/arrays.rst | 14 ++++------- docs/user-guide/data_types.rst | 6 ++--- docs/user-guide/groups.rst | 6 ++--- docs/user-guide/performance.rst | 11 ++++----- src/zarr/core/_info.py | 5 ++-- src/zarr/core/dtype/_numpy.py | 41 ++++++++++++++++----------------- src/zarr/core/dtype/common.py | 2 +- 7 files changed, 37 insertions(+), 48 deletions(-) diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index 67b134d442..ad2a1e9cc6 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -183,8 +183,7 @@ which can be used to print useful diagnostics, e.g.:: >>> z.info Type : Array Zarr format : 3 - Data type : Int32(endianness='little') - Fill value : 0 + Data type : int32 Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : C @@ -201,8 +200,7 @@ prints additional diagnostics, e.g.:: >>> z.info_complete() Type : 
Array Zarr format : 3 - Data type : Int32(endianness='little') - Fill value : 0 + Data type : int32 Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : C @@ -279,8 +277,7 @@ Here is an example using a delta filter with the Blosc compressor:: >>> z.info_complete() Type : Array Zarr format : 3 - Data type : Int32(endianness='little') - Fill value : 0 + Data type : int32 Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : C @@ -597,8 +594,7 @@ Sharded arrays can be created by providing the ``shards`` parameter to :func:`za >>> a.info_complete() Type : Array Zarr format : 3 - Data type : UInt8() - Fill value : 0 + Data type : uint8 Shape : (10000, 10000) Shard shape : (1000, 1000) Chunk shape : (100, 100) @@ -609,7 +605,7 @@ Sharded arrays can be created by providing the ``shards`` parameter to :func:`za Serializer : BytesCodec(endian=None) Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 100000000 (95.4M) - No. bytes stored : 3981473 (3.8M) + No. bytes stored : 3981473 Storage ratio : 25.1 Shards Initialized : 100 diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index fffd622209..b964439706 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -10,9 +10,9 @@ Zarr-Python supports creating arrays with Numpy data types:: >>> import zarr >>> import numpy as np - >>> zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) + >>> z = zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) >>> z - + Unlike Numpy arrays, Zarr arrays are designed to be persisted to storage and read by Zarr implementations in different programming languages. This means Zarr data types must be interpreted correctly when clients read an array. 
So each Zarr data type defines a procedure for @@ -34,7 +34,7 @@ Thus the JSON identifier for a Numpy-compatible data type is just the Numpy ``st >>> dtype_meta = json.loads(store['.zarray'].to_bytes())["dtype"] >>> assert dtype_meta == np_dtype.str # True >>> dtype_meta - `_, or "byte order", of the data type. Following Numpy's example, diff --git a/docs/user-guide/groups.rst b/docs/user-guide/groups.rst index a343c3617e..9b241d2455 100644 --- a/docs/user-guide/groups.rst +++ b/docs/user-guide/groups.rst @@ -128,8 +128,7 @@ property. E.g.:: >>> bar.info_complete() Type : Array Zarr format : 3 - Data type : Int64(endianness='little') - Fill value : 0 + Data type : int64 Shape : (1000000,) Chunk shape : (100000,) Order : C @@ -145,8 +144,7 @@ property. E.g.:: >>> baz.info Type : Array Zarr format : 3 - Data type : Float32(endianness='little') - Fill value : 0.0 + Data type : float32 Shape : (1000, 1000) Chunk shape : (100, 100) Order : C diff --git a/docs/user-guide/performance.rst b/docs/user-guide/performance.rst index 0f31e5d7be..6a60edbbd3 100644 --- a/docs/user-guide/performance.rst +++ b/docs/user-guide/performance.rst @@ -52,7 +52,7 @@ a chunk shape is based on simple heuristics and may be far from optimal. E.g.:: >>> z4 = zarr.create_array(store={}, shape=(10000, 10000), chunks='auto', dtype='int32') >>> z4.chunks - (625, 625) + (313, 625) If you know you are always going to be loading the entire array into memory, you can turn off chunks by providing ``chunks`` equal to ``shape``, in which case there @@ -91,8 +91,7 @@ To use sharding, you need to specify the ``shards`` parameter when creating the >>> z6.info Type : Array Zarr format : 3 - Data type : UInt8() - Fill value : 0 + Data type : uint8 Shape : (10000, 10000, 1000) Shard shape : (1000, 1000, 1000) Chunk shape : (100, 100, 100) @@ -122,8 +121,7 @@ ratios, depending on the correlation structure within the data. 
E.g.:: >>> c.info_complete() Type : Array Zarr format : 3 - Data type : Int32(endianness='little') - Fill value : 0 + Data type : int32 Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : C @@ -142,8 +140,7 @@ ratios, depending on the correlation structure within the data. E.g.:: >>> f.info_complete() Type : Array Zarr format : 3 - Data type : Int32(endianness='little') - Fill value : 0 + Data type : int32 Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : F diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index f6b51fdb3c..310ba27ea1 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -2,11 +2,10 @@ import dataclasses import textwrap -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Literal if TYPE_CHECKING: import numcodecs.abc - import numpy as np from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.core.common import ZarrFormat @@ -81,7 +80,7 @@ class ArrayInfo: _type: Literal["Array"] = "Array" _zarr_format: ZarrFormat - _data_type: np.dtype[Any] | ZDType[_BaseDType, _BaseScalar] + _data_type: ZDType[_BaseDType, _BaseScalar] _shape: tuple[int, ...] _shard_shape: tuple[int, ...] | None = None _chunk_shape: tuple[int, ...] 
| None = None diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index a8bd2b5951..f8ebc807d3 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -2,6 +2,7 @@ import base64 import re +import sys from collections.abc import Sequence from dataclasses import dataclass from typing import ( @@ -40,7 +41,7 @@ if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat -EndiannessNumpy = Literal[">", "<", "=", "|"] +EndiannessNumpy = Literal[">", "<", "|", "="] @dataclass(frozen=True, kw_only=True) @@ -57,7 +58,7 @@ class Bool(ZDType[np.dtypes.BoolDType, np.bool_]): """ _zarr_v3_name = "bool" - _zarr_v2_names: ClassVar[tuple[str,...]] = ("|b1",) + _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|b1",) dtype_cls = np.dtypes.BoolDType @classmethod @@ -314,7 +315,7 @@ class Int16(ZDType[np.dtypes.Int16DType, np.int16]): dtype_cls = np.dtypes.Int16DType _zarr_v3_name = "int16" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i2", " Self: @@ -370,7 +371,7 @@ class UInt16(ZDType[np.dtypes.UInt16DType, np.uint16]): dtype_cls = np.dtypes.UInt16DType _zarr_v3_name = "uint16" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u2", " Self: @@ -426,7 +427,7 @@ class Int32(ZDType[np.dtypes.Int32DType, np.int32]): dtype_cls = np.dtypes.Int32DType _zarr_v3_name = "int32" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i4", " Self: @@ -482,7 +483,7 @@ class UInt32(ZDType[np.dtypes.UInt32DType, np.uint32]): dtype_cls = np.dtypes.UInt32DType _zarr_v3_name = "uint32" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u4", " Self: @@ -538,7 +539,7 @@ class Int64(ZDType[np.dtypes.Int64DType, np.int64]): dtype_cls = np.dtypes.Int64DType _zarr_v3_name = "int64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i8", " Self: @@ -594,7 +595,7 @@ class UInt64(ZDType[np.dtypes.UInt64DType, np.uint64]): dtype_cls = np.dtypes.UInt64DType _zarr_v3_name = "uint64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u8", " Self: @@ -650,7 +651,7 @@ class 
Float16(ZDType[np.dtypes.Float16DType, np.float16]): dtype_cls = np.dtypes.Float16DType _zarr_v3_name = "float16" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f2", " Self: @@ -706,7 +707,7 @@ class Float32(ZDType[np.dtypes.Float32DType, np.float32]): dtype_cls = np.dtypes.Float32DType _zarr_v3_name = "float32" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f4", " Self: @@ -762,7 +763,7 @@ class Float64(ZDType[np.dtypes.Float64DType, np.float64]): dtype_cls = np.dtypes.Float64DType _zarr_v3_name = "float64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f8", " Self: @@ -818,7 +819,7 @@ class Complex64(ZDType[np.dtypes.Complex64DType, np.complex64]): dtype_cls = np.dtypes.Complex64DType _zarr_v3_name = "complex64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c8", " Self: @@ -876,7 +877,7 @@ class Complex128(ZDType[np.dtypes.Complex128DType, np.complex128]): dtype_cls = np.dtypes.Complex128DType _zarr_v3_name = "complex128" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c16", " Self: @@ -1079,7 +1080,7 @@ class FixedLengthUnicode(ZDType[np.dtypes.StrDType[int], np.str_]): dtype_cls = np.dtypes.StrDType _zarr_v3_name = "numpy.fixed_length_ucs4" item_size_bits: ClassVar[int] = 32 # UCS4 is 32 bits per code point - endianness: Endianness | None = "native" + endianness: Endianness | None = "little" length: int = 1 @classmethod @@ -1263,7 +1264,7 @@ class DateTime64(ZDType[np.dtypes.DateTime64DType, np.datetime64]): dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] _zarr_v3_name = "numpy.datetime64" unit: DateUnit | TimeUnit = "s" - endianness: Endianness | None = "native" + endianness: Endianness | None = "little" @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.DateTime64DType) -> Self: @@ -1457,7 +1458,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: raise TypeError(f"Invalid type: {data}. 
Expected a string.") as_bytes = bytes_from_json(data, zarr_format=zarr_format) dtype = self.to_dtype() - return cast(np.void, np.array([as_bytes], dtype=dtype.str).view(dtype)[0]) + return cast("np.void", np.array([as_bytes], dtype=dtype.str).view(dtype)[0]) def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: @@ -1471,7 +1472,7 @@ def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: Returns ------- - Literal[">", "<", "=", "|"] + Literal[">", "<", "|"] The numpy string representation of the endianness. Raises @@ -1484,8 +1485,6 @@ def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: return "<" case "big": return ">" - case "native": - return "=" case None: return "|" raise ValueError( @@ -1513,12 +1512,12 @@ def endianness_from_numpy_str(endianness: EndiannessNumpy) -> Endianness | None: If the endianness is invalid. """ match endianness: + case "=": + return sys.byteorder case "<": return "little" case ">": return "big" - case "=": - return "native" case "|": return None raise ValueError( diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 2c4910338e..4e24d64ad9 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -10,7 +10,7 @@ from zarr.core.common import JSON, ZarrFormat from zarr.core.dtype._numpy import DateUnit, TimeUnit -Endianness = Literal["little", "big", "native"] +Endianness = Literal["little", "big"] JSONFloat = float | Literal["NaN", "Infinity", "-Infinity"] From b66f077e64494ba2dd771c0196d8626433cff9bd Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 20 Mar 2025 15:34:45 +0100 Subject: [PATCH 041/129] adjust type annotations --- src/zarr/api/asynchronous.py | 2 +- src/zarr/core/_info.py | 6 +++--- src/zarr/core/array.py | 4 ++-- src/zarr/core/array_spec.py | 6 +++--- src/zarr/core/dtype/__init__.py | 4 ++++ src/zarr/core/dtype/wrapper.py | 4 ++-- src/zarr/core/metadata/v3.py | 9 +++++---- 
tests/test_array.py | 12 ++++++------ tests/test_info.py | 4 +--- 9 files changed, 27 insertions(+), 24 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index a65a469e8d..d83e51f954 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -1011,7 +1011,7 @@ async def create( chunks = shape default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype_wrapped) if filters is None: - filters = default_filters + filters = default_filters # type: ignore[assignment] if compressor is None: compressor = default_compressor elif zarr_format == 3 and chunk_shape is None: # type: ignore[redundant-expr] diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 310ba27ea1..610ae48382 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -2,14 +2,14 @@ import dataclasses import textwrap -from typing import TYPE_CHECKING, Literal +from typing import TYPE_CHECKING, Any, Literal if TYPE_CHECKING: import numcodecs.abc from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.core.common import ZarrFormat - from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import ZDType @dataclasses.dataclass(kw_only=True) @@ -80,7 +80,7 @@ class ArrayInfo: _type: Literal["Array"] = "Array" _zarr_format: ZarrFormat - _data_type: ZDType[_BaseDType, _BaseScalar] + _data_type: ZDType[Any, Any] _shape: tuple[int, ...] _shard_shape: tuple[int, ...] | None = None _chunk_shape: tuple[int, ...] 
| None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 34292aa045..3eeb9e4362 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -742,7 +742,7 @@ def _create_metadata_v3( chunk_grid=chunk_grid_parsed, chunk_key_encoding=chunk_key_encoding_parsed, fill_value=fill_value_parsed, - codecs=codecs_parsed, + codecs=codecs_parsed, # type: ignore[arg-type] dimension_names=tuple(dimension_names) if dimension_names else None, attributes=attributes or {}, ) @@ -1766,7 +1766,7 @@ def _info( ) -> Any: return ArrayInfo( _zarr_format=self.metadata.zarr_format, - _data_type=self.dtype, + _data_type=self._zdtype, _shape=self.shape, _order=self.order, _shard_shape=self.shards, diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index e8e451944f..f1eac930c4 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -17,7 +17,7 @@ from zarr.core.buffer import BufferPrototype from zarr.core.common import ChunkCoords - from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import ZDType class ArrayConfigParams(TypedDict): @@ -89,7 +89,7 @@ def parse_array_config(data: ArrayConfigLike | None) -> ArrayConfig: @dataclass(frozen=True) class ArraySpec: shape: ChunkCoords - dtype: ZDType[_BaseDType, _BaseScalar] + dtype: ZDType[Any, Any] fill_value: Any config: ArrayConfig prototype: BufferPrototype @@ -97,7 +97,7 @@ class ArraySpec: def __init__( self, shape: ChunkCoords, - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[Any, Any], fill_value: Any, config: ArrayConfig, prototype: BufferPrototype, diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 4e594f8796..fc494030f1 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -84,7 +84,11 @@ data_type_registry.register(dtype._zarr_v3_name, dtype) +# TODO: find a better name for this function def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> 
ZDType[_BaseDType, _BaseScalar]: + """ + Get a data type wrapper (an instance of ``ZDType``) from a native data type, e.g. a numpy dtype. + """ data_type_registry.lazy_load() if not isinstance(dtype, np.dtype): if dtype in (str, "str"): diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 8707c3cda0..3409fa7ca4 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -22,7 +22,7 @@ TDType = TypeVar("TDType", bound=_BaseDType) -@dataclass(frozen=True, kw_only=True) +@dataclass(frozen=True, kw_only=True, slots=True) class ZDType(Generic[TDType, TScalar], ABC): """ Abstract base class for wrapping native array data types, e.g. numpy dtypes @@ -62,7 +62,7 @@ def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[TDType]: return type(dtype) is cls.dtype_cls @classmethod - def from_dtype(cls: type[Self], dtype: TDType) -> Self: + def from_dtype(cls: type[Self], dtype: _BaseDType) -> Self: """ Wrap a dtype object. diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 81d255969c..26aa68a4c3 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -100,7 +100,8 @@ def validate_codecs(codecs: tuple[Codec, ...], dtype: ZDType[_BaseDType, _BaseSc # we need to have special codecs if we are decoding vlen strings or bytestrings # TODO: use codec ID instead of class name codec_class_name = abc.__class__.__name__ - if isinstance(dtype, VariableLengthString) and not codec_class_name == "VLenUTF8Codec": + # TODO: Fix typing here + if isinstance(dtype, VariableLengthString) and not codec_class_name == "VLenUTF8Codec": # type: ignore[unreachable] raise ValueError( f"For string dtype, ArrayBytesCodec must be `VLenUTF8Codec`, got `{codec_class_name}`." 
) @@ -321,11 +322,11 @@ def to_dict(self) -> dict[str, JSON]: # TODO: replace the `to_dict` / `from_dict` on the `Metadata`` class with # to_json, from_json, and have ZDType inherit from `Metadata` - # until then, we have this hack here + # until then, we have this hack here, which relies on the fact that to_dict will pass through + # any non-`Metadata` fields as-is. dtype_meta = out_dict["data_type"] - if isinstance(dtype_meta, ZDType): - out_dict["data_type"] = dtype_meta.to_json(zarr_format=3) + out_dict["data_type"] = dtype_meta.to_json(zarr_format=3) # type: ignore[unreachable] return out_dict diff --git a/tests/test_array.py b/tests/test_array.py index cb1fcace75..fed949b69e 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -42,6 +42,7 @@ from zarr.core.chunk_grids import _auto_partition from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.core.dtype import get_data_type_from_native_dtype +from zarr.core.dtype._numpy import Float64 from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv from zarr.core.sync import sync @@ -447,7 +448,7 @@ def test_info_v2(self, chunks: tuple[int, int], shards: tuple[int, int] | None) result = arr.info expected = ArrayInfo( _zarr_format=2, - _data_type=arr.dtype, + _data_type=arr._async_array._zdtype, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=None, @@ -464,7 +465,7 @@ def test_info_v3(self, chunks: tuple[int, int], shards: tuple[int, int] | None) result = arr.info expected = ArrayInfo( _zarr_format=3, - _data_type=arr.dtype, + _data_type=arr._async_array._zdtype, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, @@ -489,7 +490,7 @@ def test_info_complete(self, chunks: tuple[int, int], shards: tuple[int, int] | result = arr.info_complete() expected = ArrayInfo( _zarr_format=3, - _data_type=arr.dtype, + _data_type=arr._async_array._zdtype, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, @@ -525,7 +526,6 @@ async def test_info_v2_async( expected 
= ArrayInfo( _zarr_format=2, _data_type=Float64(), - _fill_value=arr.metadata.fill_value, _shape=(8, 8), _chunk_shape=(2, 2), _shard_shape=None, @@ -550,7 +550,7 @@ async def test_info_v3_async( result = arr.info expected = ArrayInfo( _zarr_format=3, - _data_type=arr.dtype, + _data_type=arr._zdtype, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, @@ -577,7 +577,7 @@ async def test_info_complete_async( result = await arr.info_complete() expected = ArrayInfo( _zarr_format=3, - _data_type=arr.dtype, + _data_type=arr._zdtype, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, diff --git a/tests/test_info.py b/tests/test_info.py index 28c8803c83..6d24863c14 100644 --- a/tests/test_info.py +++ b/tests/test_info.py @@ -5,7 +5,7 @@ from zarr.codecs.bytes import BytesCodec from zarr.core._info import ArrayInfo, GroupInfo, human_readable_size from zarr.core.common import ZarrFormat -from zarr.core.dtype.npy.int import Int32 +from zarr.core.dtype._numpy import Int32 ZARR_FORMATS = [2, 3] @@ -54,7 +54,6 @@ def test_array_info(zarr_format: ZarrFormat) -> None: info = ArrayInfo( _zarr_format=zarr_format, _data_type=Int32(), - _fill_value=0, _shape=(100, 100), _chunk_shape=(10, 100), _order="C", @@ -94,7 +93,6 @@ def test_array_info_complete( info = ArrayInfo( _zarr_format=zarr_format, _data_type=Int32(), - _fill_value=0, _shape=(100, 100), _chunk_shape=(10, 100), _order="C", From 9eae82a8d24cdff7138bf36cfa1c596610773576 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 20 Mar 2025 15:35:08 +0100 Subject: [PATCH 042/129] fix info tests to use zdtype --- tests/test_info.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_info.py b/tests/test_info.py index 6d24863c14..06ce8f1985 100644 --- a/tests/test_info.py +++ b/tests/test_info.py @@ -66,7 +66,6 @@ def test_array_info(zarr_format: ZarrFormat) -> None: Type : Array Zarr format : {zarr_format} Data type : Int32(endianness='little') - Fill value : 0 Shape : (100, 100) Chunk shape : (10, 100) 
Order : C @@ -108,7 +107,6 @@ def test_array_info_complete( Type : Array Zarr format : {zarr_format} Data type : Int32(endianness='little') - Fill value : 0 Shape : (100, 100) Chunk shape : (10, 100) Order : C From d6727a31962fe4b36c8fffa61d6778104a4f68ea Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 20 Mar 2025 16:31:05 +0100 Subject: [PATCH 043/129] remove dead code and add code coverage exemption to zarr format checks --- src/zarr/core/dtype/_numpy.py | 116 +++++++++++++++++----------------- src/zarr/core/dtype/common.py | 2 +- src/zarr/core/metadata/v3.py | 32 ++++++++-- 3 files changed, 86 insertions(+), 64 deletions(-) diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index f8ebc807d3..55bd86a61d 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -77,14 +77,14 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[Literal["b return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -162,14 +162,14 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[Literal["i return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return 
self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -247,14 +247,14 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[Literal["u return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -337,14 +337,14 @@ def check_json( return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -352,7 +352,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise 
ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.int16: return self.to_dtype().type(0) @@ -393,14 +393,14 @@ def check_json( return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -408,7 +408,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.uint16: return self.to_dtype().type(0) @@ -449,14 +449,14 @@ def check_json( return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -464,7 +464,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: 
ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.int32: return self.to_dtype().type(0) @@ -505,14 +505,14 @@ def check_json( return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -520,7 +520,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.uint32: return self.to_dtype().type(0) @@ -561,14 +561,14 @@ def check_json( return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got 
{zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -576,7 +576,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.int64: return self.to_dtype().type(0) @@ -617,14 +617,14 @@ def check_json( return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -632,7 +632,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.uint64: return self.to_dtype().type(0) @@ -673,14 +673,14 @@ def check_json( return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return 
self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -688,7 +688,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.float16: return self.to_dtype().type(0) @@ -729,14 +729,14 @@ def check_json( return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -744,7 +744,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.float32: return self.to_dtype().type(0) @@ -785,14 +785,14 @@ def check_json( return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 
3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -800,7 +800,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.float64: return self.to_dtype().type(0) @@ -841,14 +841,14 @@ def check_json( return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -856,7 +856,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.complex64: return 
self.to_dtype().type(0) @@ -899,14 +899,14 @@ def check_json( return data in cls._zarr_v2_names elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -914,7 +914,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.complex128: return self.to_dtype().type(0) @@ -962,7 +962,7 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: and "length_bits" in data["configuration"] and isinstance(data["configuration"]["length_bits"], int) ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> JSON: if zarr_format == 2: @@ -972,7 +972,7 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: "name": self._zarr_v3_name, "configuration": {"length_bits": self.length * self.item_size_bits}, } - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -980,7 +980,7 @@ def 
_from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls(length=data["configuration"]["length_bits"] // cls.item_size_bits) # type: ignore[arg-type, index, call-overload, operator] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.bytes_: return np.bytes_(b"") @@ -1025,14 +1025,14 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: and isinstance(data["name"], str) and (re.match(r"^r\d+$", data["name"]) is not None) ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> JSON: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return {"name": f"r{self.length * self.item_size_bits}"} - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -1040,7 +1040,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls(length=int(data["name"][1:]) // cls.item_size_bits) # type: ignore[arg-type, index, call-overload] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[Any]]: @@ -1113,7 +1113,7 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: and "length_bits" in data["configuration"] and 
isinstance(data["configuration"]["length_bits"], int) ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> JSON: if zarr_format == 2: @@ -1123,7 +1123,7 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: "name": self._zarr_v3_name, "configuration": {"length_bits": self.length * self.item_size_bits}, } - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -1131,7 +1131,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls(length=data["configuration"]["length_bits"] // cls.item_size_bits) # type: ignore[arg-type, index, call-overload, operator] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.str_: return np.str_("") @@ -1174,7 +1174,7 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: return data == "|O" elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> JSON: if zarr_format == 2: @@ -1185,7 +1185,7 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: return "|O" elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: 
ZarrFormat) -> Self: @@ -1227,14 +1227,14 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: return data == "|O" elif zarr_format == 3: return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> JSON: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -1306,7 +1306,7 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: and "unit" in data["configuration"] and data["configuration"]["unit"] in get_args(DateUnit) + get_args(TimeUnit) ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.datetime64: return np.datetime64("NaT") @@ -1316,7 +1316,7 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: return self.to_dtype().str elif zarr_format == 3: return {"name": self._zarr_v3_name, "configuration": {"unit": self.unit}} - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -1324,7 +1324,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls(unit=data["configuration"]["unit"]) # type: ignore[arg-type, index, call-overload] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise 
ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: if check_json_int(data): @@ -1391,7 +1391,7 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: base_dict = {"name": self._zarr_v3_name} base_dict["configuration"] = {"fields": fields} # type: ignore[assignment] return cast("JSON", base_dict) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def check_json( @@ -1416,7 +1416,7 @@ def check_json( and isinstance(data["configuration"], dict) and "fields" in data["configuration"] ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: @@ -1441,7 +1441,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: for f_name, f_dtype in meta_fields ) return cls(fields=fields) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") def to_dtype(self) -> np.dtypes.VoidDType[int]: diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 4e24d64ad9..78dc6bdacd 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -530,4 +530,4 @@ def datetime_from_json(data: int, unit: DateUnit | TimeUnit) -> np.datetime64: np.datetime64 The datetime64 value. 
""" - return cast(np.datetime64, np.int64(data).view(f"datetime64[{unit}]")) + return cast("np.datetime64", np.int64(data).view(f"datetime64[{unit}]")) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 26aa68a4c3..0eb472bbc8 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -22,10 +22,8 @@ import json from collections.abc import Iterable from dataclasses import dataclass, field, replace -from enum import Enum from typing import Any, Literal -import numcodecs.abc import numpy as np from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec @@ -134,6 +132,33 @@ def parse_storage_transformers(data: object) -> tuple[dict[str, JSON], ...]: ) +class V3JsonEncoder(json.JSONEncoder): + def __init__( + self, + *, + skipkeys: bool = False, + ensure_ascii: bool = True, + check_circular: bool = True, + allow_nan: bool = True, + sort_keys: bool = False, + indent: int | None = None, + separators: tuple[str, str] | None = None, + default: Callable[[object], object] | None = None, + ) -> None: + if indent is None: + indent = config.get("json_indent") + super().__init__( + skipkeys=skipkeys, + ensure_ascii=ensure_ascii, + check_circular=check_circular, + allow_nan=allow_nan, + sort_keys=sort_keys, + indent=indent, + separators=separators, + default=default, + ) + + class ArrayV3MetadataDict(TypedDict): """ A typed dictionary model for zarr v3 metadata. 
@@ -174,9 +199,6 @@ def __init__( Because the class is a frozen dataclass, we set attributes using object.__setattr__ """ - # TODO: remove this - if not isinstance(data_type, ZDType): - raise TypeError shape_parsed = parse_shapelike(shape) chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) From 8b517ee38844ce4d61df1cd3096f413af39ccbb9 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Thu, 20 Mar 2025 17:50:53 +0100 Subject: [PATCH 044/129] fix: add special check for resolving int32 on windows --- src/zarr/core/dtype/_numpy.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index 55bd86a61d..241626e6ac 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -429,6 +429,15 @@ class Int32(ZDType[np.dtypes.Int32DType, np.int32]): _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i4", " Self: + # We override the base implementation to address a windows-specific, pre-numpy 2 issue where + # ``np.dtype('i')`` is an instance of ``np.dtypes.IntDType`` that acts like `int32` instead of ``np.dtype('int32')`` + if dtype == np.dtypes.Int32DType(): + return cls._from_dtype_unsafe(np.dtypes.Int32DType().newbyteorder(dtype.byteorder)) + else: + return super().from_dtype(dtype) + @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.Int32DType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) From 97c645b45f85955b0cdf8c97627862dea5fe0b8c Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 20 Mar 2025 22:26:05 +0100 Subject: [PATCH 045/129] add dtype entry point test --- src/zarr/core/dtype/registry.py | 3 ++ tests/package_with_entrypoint/__init__.py | 36 ++++--------- tests/{test_metadata => }/test_dtype.py | 66 +++++++++++++++++------ 3 files changed, 64 insertions(+), 41 deletions(-) rename tests/{test_metadata => }/test_dtype.py (84%) diff --git a/src/zarr/core/dtype/registry.py 
b/src/zarr/core/dtype/registry.py index 0d07ab2b9d..4ad2158f96 100644 --- a/src/zarr/core/dtype/registry.py +++ b/src/zarr/core/dtype/registry.py @@ -12,6 +12,9 @@ from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar +# This class is different from the other registry classes, which inherit from +# dict. IMO it's simpler to just do a dataclass. But long-term we should +# have just 1 registry class in use. @dataclass(frozen=True, kw_only=True) class DataTypeRegistry: contents: dict[str, type[ZDType[_BaseDType, _BaseScalar]]] = field( diff --git a/tests/package_with_entrypoint/__init__.py b/tests/package_with_entrypoint/__init__.py index ae86378cb5..704ee2c41c 100644 --- a/tests/package_with_entrypoint/__init__.py +++ b/tests/package_with_entrypoint/__init__.py @@ -1,4 +1,5 @@ -from __future__ import annotations +from collections.abc import Iterable +from typing import Any, Literal, Self from typing import TYPE_CHECKING @@ -9,15 +10,8 @@ from zarr.abc.codec import ArrayBytesCodec, CodecInput, CodecPipeline from zarr.codecs import BytesCodec from zarr.core.buffer import Buffer, NDBuffer -from zarr.core.dtype.common import DataTypeValidationError, DTypeJSON, DTypeSpec_V2 -from zarr.core.dtype.npy.bool import Bool - -if TYPE_CHECKING: - from collections.abc import Iterable - from typing import Any, ClassVar, Literal, Self - - from zarr.core.array_spec import ArraySpec - from zarr.core.common import ZarrFormat +from zarr.core.common import BytesLike +from zarr.core.dtype import Bool class TestEntrypointCodec(ArrayBytesCodec): @@ -81,21 +75,13 @@ class TestDataType(Bool): This is a "data type" that serializes to "test" """ - _zarr_v3_name: ClassVar[Literal["test"]] = "test" # type: ignore[assignment] + _zarr_v3_name = "test" @classmethod - def from_json(cls, data: DTypeJSON, *, zarr_format: Literal[2, 3]) -> Self: - if zarr_format == 2 and data == {"name": cls._zarr_v3_name, "object_codec_id": None}: + def from_json(cls, data: Any, zarr_format: Literal[2, 
3]) -> Self: + if data == cls._zarr_v3_name: return cls() - if zarr_format == 3 and data == cls._zarr_v3_name: - return cls() - raise DataTypeValidationError( - f"Invalid JSON representation of {cls.__name__}. Got {data!r}" - ) - - def to_json(self, zarr_format: ZarrFormat) -> str | DTypeSpec_V2: # type: ignore[override] - if zarr_format == 2: - return {"name": self._zarr_v3_name, "object_codec_id": None} - if zarr_format == 3: - return self._zarr_v3_name - raise ValueError("zarr_format must be 2 or 3") + raise ValueError + + def to_json(self, zarr_format): + return self._zarr_v3_name diff --git a/tests/test_metadata/test_dtype.py b/tests/test_dtype.py similarity index 84% rename from tests/test_metadata/test_dtype.py rename to tests/test_dtype.py index db575ee16a..f690e6ce26 100644 --- a/tests/test_metadata/test_dtype.py +++ b/tests/test_dtype.py @@ -1,9 +1,16 @@ from __future__ import annotations +import os import re +import sys from typing import TYPE_CHECKING, Any, get_args +import zarr +from zarr.core.config import config + if TYPE_CHECKING: + from collections.abc import Generator + from zarr.core.common import ZarrFormat from zarr.core.dtype.wrapper import _BaseDType, _BaseScalar @@ -15,6 +22,7 @@ VariableLengthString, ZDType, data_type_registry, + get_data_type_from_json, ) from zarr.core.dtype._numpy import ( Bool, @@ -47,6 +55,7 @@ def data_type_registry_fixture() -> DataTypeRegistry: _NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") +VLEN_STRING_DTYPE: np.dtypes.StringDType | np.dtypes.ObjectDType if _NUMPY_SUPPORTS_VLEN_STRING: VLEN_STRING_DTYPE = np.dtypes.StringDType() VLEN_STRING_CODE = "T" @@ -79,7 +88,9 @@ def data_type_registry_fixture() -> DataTypeRegistry: (DateTime64, "datetime64[s]"), ], ) -def test_wrap(wrapper_cls: type[ZDType[_BaseDType, _BaseScalar]], np_dtype: np.dtype | str) -> None: +def test_wrap( + wrapper_cls: type[ZDType[_BaseDType, _BaseScalar]], np_dtype: np.dtype[np.generic] | str +) -> None: """ Test that the 
wrapper class has the correct dtype class bound to the dtype_cls variable Test that the ``wrap`` method produces an instance of the wrapper class @@ -90,14 +101,14 @@ def test_wrap(wrapper_cls: type[ZDType[_BaseDType, _BaseScalar]], np_dtype: np.d wrapped = wrapper_cls.from_dtype(dt) with pytest.raises(DataTypeValidationError, match="Invalid dtype"): - wrapper_cls.from_dtype("not a dtype") + wrapper_cls.from_dtype("not a dtype") # type: ignore[arg-type] assert isinstance(wrapped, wrapper_cls) assert wrapped.to_dtype() == dt @pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) -def test_dict_serialization(wrapper_cls: DTYPE, zarr_format: ZarrFormat) -> None: +def test_dict_serialization(wrapper_cls: Any, zarr_format: ZarrFormat) -> None: if issubclass(wrapper_cls, Structured): instance = wrapper_cls(fields=((("a", Bool()),))) else: @@ -127,16 +138,14 @@ def test_dict_serialization(wrapper_cls: DTYPE, zarr_format: ZarrFormat) -> None (FixedLengthBytes(length=3), np.void(b"\x00\x00\x00")), (FixedLengthUnicode(length=3), np.str_("")), ( - Structured(fields=(("a", Float64()), ("b", Int8()))), + Structured(fields=(("a", Float64()), ("b", Int8()))), # type: ignore[arg-type] np.array([0], dtype=[("a", np.float64), ("b", np.int8)])[0], ), (VariableLengthString(), ""), (DateTime64(unit="s"), np.datetime64("NaT")), ], ) -def test_default_value( - wrapper: type[ZDType[_BaseDType, _BaseScalar]], expected_default: Any -) -> None: +def test_default_value(wrapper: ZDType[Any, Any], expected_default: Any) -> None: """ Test that the default_value method is correctly set for each dtype wrapper. 
""" @@ -171,7 +180,7 @@ def test_default_value( ], ) def test_to_json_value_v2( - wrapper: type[ZDType[_BaseDType, _BaseScalar]], input_value: Any, expected_json: Any + wrapper: ZDType[_BaseDType, _BaseScalar], input_value: Any, expected_json: Any ) -> None: """ Test the to_json_value method for each dtype wrapper for zarr v2 @@ -204,7 +213,7 @@ def test_to_json_value_v2( ], ) def test_from_json_value( - wrapper: type[ZDType[_BaseDType, _BaseScalar]], json_value: Any, expected_value: Any + wrapper: ZDType[_BaseDType, _BaseScalar], json_value: Any, expected_value: Any ) -> None: """ Test the from_json_value method for each dtype wrapper. @@ -218,7 +227,7 @@ def test_register(data_type_registry_fixture: DataTypeRegistry) -> None: """ Test that registering a dtype in a data type registry works. """ - data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) + data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) # type: ignore[arg-type] assert data_type_registry_fixture.get(Bool._zarr_v3_name) == Bool assert isinstance(data_type_registry_fixture.match_dtype(np.dtype("bool")), Bool) @@ -227,13 +236,13 @@ def test_override(data_type_registry_fixture: DataTypeRegistry) -> None: """ Test that registering a new dtype with the same name works (overriding the previous one). 
""" - data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) + data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) # type: ignore[arg-type] class NewBool(Bool): def default_value(self) -> np.bool_: return np.True_ - data_type_registry_fixture.register(NewBool._zarr_v3_name, NewBool) + data_type_registry_fixture.register(NewBool._zarr_v3_name, NewBool) # type: ignore[arg-type] assert isinstance(data_type_registry_fixture.match_dtype(np.dtype("bool")), NewBool) @staticmethod @@ -268,13 +277,13 @@ def test_unregistered_dtype(data_type_registry_fixture: DataTypeRegistry) -> Non @staticmethod @pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) def test_registered_dtypes( - wrapper_cls: ZDType[_BaseDType, _BaseScalar], zarr_format: ZarrFormat + wrapper_cls: type[ZDType[_BaseDType, _BaseScalar]], zarr_format: ZarrFormat ) -> None: """ Test that the registered dtypes can be retrieved from the registry. """ if issubclass(wrapper_cls, Structured): - instance = wrapper_cls(fields=((("a", Bool()),))) + instance = wrapper_cls(fields=((("a", Bool()),))) # type: ignore[misc] else: instance = wrapper_cls() @@ -289,7 +298,7 @@ def test_registered_dtypes( @staticmethod @pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) def test_match_dtype_unique( - wrapper_cls: ZDType[_BaseDType, _BaseScalar], + wrapper_cls: type[ZDType[_BaseDType, _BaseScalar]], data_type_registry_fixture: DataTypeRegistry, zarr_format: ZarrFormat, ) -> None: @@ -303,7 +312,7 @@ def test_match_dtype_unique( data_type_registry_fixture.register(_cls._zarr_v3_name, _cls) if issubclass(wrapper_cls, Structured): - instance = wrapper_cls(fields=((("a", Bool()),))) + instance = wrapper_cls(fields=((("a", Bool()),))) # type: ignore[misc] else: instance = wrapper_cls() dtype_instance = instance.to_dtype() @@ -316,3 +325,28 @@ def test_match_dtype_unique( msg = f"No data type wrapper found that matches {instance_dict}" with pytest.raises(ValueError, match=re.escape(msg)): 
data_type_registry_fixture.match_json(instance_dict, zarr_format=zarr_format) + + +# this is copied from the registry tests -- we should deduplicate +here = os.path.abspath(os.path.dirname(__file__)) + + +@pytest.fixture +def set_path() -> Generator[None, None, None]: + sys.path.append(here) + zarr.registry._collect_entrypoints() + yield + sys.path.remove(here) + registries = zarr.registry._collect_entrypoints() + for registry in registries: + registry.lazy_load_list.clear() + config.reset() + + +@pytest.mark.usefixtures("set_path") +def test_entrypoint_codec(zarr_format: ZarrFormat) -> None: + from package_with_entrypoint import TestDataType + + instance = TestDataType() + dtype_json = instance.to_json(zarr_format=zarr_format) + assert get_data_type_from_json(dtype_json, zarr_format=zarr_format) == instance From 5d617e5d4071fd0f12227d68bf67ac586a8ab73f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 21 Mar 2025 11:50:17 +0100 Subject: [PATCH 046/129] remove default parameters for parametric dtypes; add mixin classes for numpy dtypes; define zdtypelike --- src/zarr/core/array.py | 3 +- src/zarr/core/dtype/__init__.py | 26 ++++++------ src/zarr/core/dtype/_numpy.py | 71 ++++++++++++++++++--------------- src/zarr/core/dtype/common.py | 7 ++-- 4 files changed, 56 insertions(+), 51 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 3eeb9e4362..a0e4f9c1ab 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -69,6 +69,7 @@ from zarr.core.config import config as zarr_config from zarr.core.dtype import ( ZDType, + ZDTypeLike, parse_data_type, ) from zarr.core.indexing import ( @@ -581,7 +582,7 @@ async def _create( *, # v2 and v3 shape: ShapeLike, - dtype: npt.DTypeLike | ZDType[_BaseDType, _BaseScalar], + dtype: ZDTypeLike | ZDType[_BaseDType, _BaseScalar], zarr_format: ZarrFormat = 3, fill_value: Any | None = DEFAULT_FILL_VALUE, attributes: dict[str, JSON] | None = None, diff --git a/src/zarr/core/dtype/__init__.py 
b/src/zarr/core/dtype/__init__.py index fc494030f1..021b6b48e2 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -1,19 +1,16 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, get_args - -import numpy as np - -from zarr.core.dtype._numpy import _NUMPY_SUPPORTS_VLEN_STRING -from zarr.core.dtype.wrapper import _BaseDType, _BaseScalar +from typing import TYPE_CHECKING, Any, TypeAlias, get_args if TYPE_CHECKING: - import numpy.typing as npt - - from zarr.core.common import JSON, ZarrFormat + from zarr.core.common import ZarrFormat +import numpy as np +import numpy.typing as npt +from zarr.core.common import JSON from zarr.core.dtype._numpy import ( + _NUMPY_SUPPORTS_VLEN_STRING, Bool, Complex64, Complex128, @@ -36,7 +33,7 @@ VariableLengthString, ) from zarr.core.dtype.registry import DataTypeRegistry -from zarr.core.dtype.wrapper import ZDType +from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar __all__ = [ "Complex64", @@ -80,6 +77,8 @@ | DateTime64 ) +ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[Any, Any] | dict[str, JSON] + for dtype in get_args(DTYPE): data_type_registry.register(dtype._zarr_v3_name, dtype) @@ -112,9 +111,10 @@ def get_data_type_from_json( return data_type_registry.match_json(dtype, zarr_format=zarr_format) -def parse_data_type( - dtype: npt.DTypeLike | ZDType[Any, Any] | dict[str, JSON], zarr_format: ZarrFormat -) -> ZDType[Any, Any]: +def parse_data_type(dtype: ZDTypeLike, zarr_format: ZarrFormat) -> ZDType[Any, Any]: + """ + Interpret the input as a ZDType instance. 
+ """ if isinstance(dtype, ZDType): return dtype elif isinstance(dtype, dict): diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index 241626e6ac..4094403c3f 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -44,7 +44,25 @@ EndiannessNumpy = Literal[">", "<", "|", "="] -@dataclass(frozen=True, kw_only=True) +@dataclass(frozen=True) +class HasEndianness: + """ + This is a mix-in class for data types with an endianness attribute + """ + + endianness: Endianness | None = "little" + + +@dataclass(frozen=True) +class HasLength: + """ + This is a mix-in class for data types with a length attribute + """ + + length: int + + +@dataclass(frozen=True, kw_only=True, slots=True) class Bool(ZDType[np.dtypes.BoolDType, np.bool_]): """ Wrapper for numpy boolean dtype. @@ -311,11 +329,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint8: @dataclass(frozen=True, kw_only=True) -class Int16(ZDType[np.dtypes.Int16DType, np.int16]): +class Int16(ZDType[np.dtypes.Int16DType, np.int16], HasEndianness): dtype_cls = np.dtypes.Int16DType _zarr_v3_name = "int16" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i2", " Self: @@ -367,11 +384,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int16: @dataclass(frozen=True, kw_only=True) -class UInt16(ZDType[np.dtypes.UInt16DType, np.uint16]): +class UInt16(ZDType[np.dtypes.UInt16DType, np.uint16], HasEndianness): dtype_cls = np.dtypes.UInt16DType _zarr_v3_name = "uint16" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u2", " Self: @@ -423,16 +439,18 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint16: @dataclass(frozen=True, kw_only=True) -class Int32(ZDType[np.dtypes.Int32DType, np.int32]): +class Int32(ZDType[np.dtypes.Int32DType, np.int32], HasEndianness): dtype_cls = np.dtypes.Int32DType _zarr_v3_name = "int32" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i4", " Self: # We override the base 
implementation to address a windows-specific, pre-numpy 2 issue where # ``np.dtype('i')`` is an instance of ``np.dtypes.IntDType`` that acts like `int32` instead of ``np.dtype('int32')`` + # In this case, ``type(np.dtype('i')) == np.dtypes.Int32DType`` will evaluate to ``True``, + # despite the two classes being different. Thus we will create an instance of `cls` with the + # latter dtype, after pulling in the byte order of the input if dtype == np.dtypes.Int32DType(): return cls._from_dtype_unsafe(np.dtypes.Int32DType().newbyteorder(dtype.byteorder)) else: @@ -488,11 +506,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int32: @dataclass(frozen=True, kw_only=True) -class UInt32(ZDType[np.dtypes.UInt32DType, np.uint32]): +class UInt32(ZDType[np.dtypes.UInt32DType, np.uint32], HasEndianness): dtype_cls = np.dtypes.UInt32DType _zarr_v3_name = "uint32" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u4", " Self: @@ -544,11 +561,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint32: @dataclass(frozen=True, kw_only=True) -class Int64(ZDType[np.dtypes.Int64DType, np.int64]): +class Int64(ZDType[np.dtypes.Int64DType, np.int64], HasEndianness): dtype_cls = np.dtypes.Int64DType _zarr_v3_name = "int64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i8", " Self: @@ -600,11 +616,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int64: @dataclass(frozen=True, kw_only=True) -class UInt64(ZDType[np.dtypes.UInt64DType, np.uint64]): +class UInt64(ZDType[np.dtypes.UInt64DType, np.uint64], HasEndianness): dtype_cls = np.dtypes.UInt64DType _zarr_v3_name = "uint64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u8", " Self: @@ -656,11 +671,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint64: @dataclass(frozen=True, kw_only=True) -class Float16(ZDType[np.dtypes.Float16DType, np.float16]): +class Float16(ZDType[np.dtypes.Float16DType, np.float16], HasEndianness): 
dtype_cls = np.dtypes.Float16DType _zarr_v3_name = "float16" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f2", " Self: @@ -712,11 +726,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float16: @dataclass(frozen=True, kw_only=True) -class Float32(ZDType[np.dtypes.Float32DType, np.float32]): +class Float32(ZDType[np.dtypes.Float32DType, np.float32], HasEndianness): dtype_cls = np.dtypes.Float32DType _zarr_v3_name = "float32" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f4", " Self: @@ -768,11 +781,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float32: @dataclass(frozen=True, kw_only=True) -class Float64(ZDType[np.dtypes.Float64DType, np.float64]): +class Float64(ZDType[np.dtypes.Float64DType, np.float64], HasEndianness): dtype_cls = np.dtypes.Float64DType _zarr_v3_name = "float64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f8", " Self: @@ -824,11 +836,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float64: @dataclass(frozen=True, kw_only=True) -class Complex64(ZDType[np.dtypes.Complex64DType, np.complex64]): +class Complex64(ZDType[np.dtypes.Complex64DType, np.complex64], HasEndianness): dtype_cls = np.dtypes.Complex64DType _zarr_v3_name = "complex64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c8", " Self: @@ -882,11 +893,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex6 @dataclass(frozen=True, kw_only=True) -class Complex128(ZDType[np.dtypes.Complex128DType, np.complex128]): +class Complex128(ZDType[np.dtypes.Complex128DType, np.complex128], HasEndianness): dtype_cls = np.dtypes.Complex128DType _zarr_v3_name = "complex128" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c16", " Self: @@ -940,11 +950,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex1 @dataclass(frozen=True, kw_only=True) -class FixedLengthAscii(ZDType[np.dtypes.BytesDType[int], np.bytes_]): +class 
FixedLengthAscii(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength): dtype_cls = np.dtypes.BytesDType _zarr_v3_name = "numpy.fixed_length_ascii" item_size_bits: ClassVar[int] = 8 - length: int = 1 @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.BytesDType[int]) -> Self: @@ -1004,14 +1013,13 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: @dataclass(frozen=True, kw_only=True) -class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void]): +class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength): # np.dtypes.VoidDType is specified in an odd way in numpy # it cannot be used to create instances of the dtype # so we have to tell mypy to ignore this here dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] _zarr_v3_name = "numpy.void" item_size_bits: ClassVar[int] = 8 - length: int = 1 @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType[int]) -> Self: @@ -1085,12 +1093,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: @dataclass(frozen=True, kw_only=True) -class FixedLengthUnicode(ZDType[np.dtypes.StrDType[int], np.str_]): +class FixedLengthUnicode(ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength): dtype_cls = np.dtypes.StrDType _zarr_v3_name = "numpy.fixed_length_ucs4" item_size_bits: ClassVar[int] = 32 # UCS4 is 32 bits per code point - endianness: Endianness | None = "little" - length: int = 1 @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.StrDType[int]) -> Self: @@ -1269,11 +1275,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: @dataclass(frozen=True, kw_only=True) -class DateTime64(ZDType[np.dtypes.DateTime64DType, np.datetime64]): +class DateTime64(ZDType[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] _zarr_v3_name = "numpy.datetime64" - unit: DateUnit | TimeUnit = "s" - endianness: Endianness | None 
= "little" + unit: DateUnit | TimeUnit @classmethod def _from_dtype_unsafe(cls, dtype: np.dtypes.DateTime64DType) -> Self: diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 78dc6bdacd..106da80a61 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -325,7 +325,7 @@ def bytes_to_json(data: bytes, zarr_format: ZarrFormat) -> str: Parameters ---------- data : bytes - The structured scalar value to convert. + The bytes to store. zarr_format : ZarrFormat The zarr format version. @@ -334,9 +334,8 @@ def bytes_to_json(data: bytes, zarr_format: ZarrFormat) -> str: str The bytes encoded as ascii using the base64 alphabet. """ - if zarr_format == 2: - return base64.b64encode(data).decode("ascii") - raise NotImplementedError(f"Invalid zarr format: {zarr_format}. Expected 2.") + # TODO: decide if we are going to make this implementation zarr format-specific + return base64.b64encode(data).decode("ascii") def bytes_from_json(data: str, zarr_format: ZarrFormat) -> bytes: From 2fef5b2f5d39efcfca7772d82780985cc4c184eb Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 24 Mar 2025 14:45:27 +0100 Subject: [PATCH 047/129] refactor: use inheritance to remove boilerplate in dtype definitions --- src/zarr/core/dtype/_numpy.py | 900 ++++++++++++++------------------- src/zarr/core/dtype/common.py | 33 +- src/zarr/core/dtype/wrapper.py | 74 ++- src/zarr/core/metadata/v2.py | 6 +- tests/conftest.py | 22 +- tests/test_array.py | 79 +-- tests/test_dtype.py | 105 ++-- tests/test_metadata/test_v3.py | 19 +- 8 files changed, 552 insertions(+), 686 deletions(-) diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index 4094403c3f..38597f8fee 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -11,7 +11,12 @@ ClassVar, Literal, Self, + SupportsComplex, + SupportsFloat, + SupportsIndex, + SupportsInt, TypeGuard, + TypeVar, cast, get_args, ) @@ -21,7 +26,6 @@ from 
zarr.core.dtype.common import ( DataTypeValidationError, Endianness, - JSONFloat, bytes_from_json, bytes_to_json, check_json_bool, @@ -29,8 +33,8 @@ check_json_float, check_json_int, check_json_str, - complex_from_json, - complex_to_json, + complex_float_from_json, + complex_float_to_json, datetime_from_json, datetime_to_json, float_from_json, @@ -42,6 +46,9 @@ from zarr.core.common import JSON, ZarrFormat EndiannessNumpy = Literal[">", "<", "|", "="] +IntLike = SupportsInt | SupportsIndex | bytes | str +FloatLike = SupportsIndex | SupportsFloat | bytes | str +ComplexLike = SupportsFloat | SupportsIndex | SupportsComplex | bytes | str | None @dataclass(frozen=True) @@ -80,7 +87,7 @@ class Bool(ZDType[np.dtypes.BoolDType, np.bool_]): dtype_cls = np.dtypes.BoolDType @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.BoolDType) -> Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: return cls() def to_dtype(self: Self) -> np.dtypes.BoolDType: @@ -119,9 +126,9 @@ def default_value(self) -> np.bool_: """ return np.False_ - def to_json_value(self, data: np.bool_, zarr_format: ZarrFormat) -> bool: + def to_json_value(self, data: object, zarr_format: ZarrFormat) -> bool: """ - Convert a boolean value to JSON-serializable format. + Convert a scalar to a python bool. Parameters ---------- @@ -154,188 +161,175 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: The numpy boolean scalar. """ if check_json_bool(data): - return np.bool_(data) + return self._cast_value_unsafe(data) raise TypeError(f"Invalid type: {data}. 
Expected a boolean.") + def check_value(self, data: object) -> bool: + # Anything can become a bool + return True -@dataclass(frozen=True, kw_only=True) -class Int8(ZDType[np.dtypes.Int8DType, np.int8]): - dtype_cls = np.dtypes.Int8DType - _zarr_v3_name = "int8" - _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|i1",) + def cast_value(self, value: object) -> np.bool_: + return self._cast_value_unsafe(value) - @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.Int8DType) -> Self: - return cls() + def _cast_value_unsafe(self, value: object) -> np.bool_: + return np.bool_(value) - def to_dtype(self: Self) -> np.dtypes.Int8DType: - return self.dtype_cls() - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[Literal["int8", "|i1"]]: +_NumpyIntDType = ( + np.dtypes.Int8DType + | np.dtypes.Int16DType + | np.dtypes.Int32DType + | np.dtypes.Int64DType + | np.dtypes.UInt8DType + | np.dtypes.UInt16DType + | np.dtypes.UInt32DType + | np.dtypes.UInt64DType +) +_NumpyIntScalar = ( + np.int8 | np.int16 | np.int32 | np.int64 | np.uint8 | np.uint16 | np.uint32 | np.uint64 +) +TIntDType_co = TypeVar("TIntDType_co", bound=_NumpyIntDType, covariant=True) +TIntScalar_co = TypeVar("TIntScalar_co", bound=_NumpyIntScalar, covariant=True) + + +@dataclass(frozen=True) +class BaseInt(ZDType[TIntDType_co, TIntScalar_co]): + # This attribute holds the possible zarr v2 JSON names for the data type + _zarr_v2_names: ClassVar[tuple[str, ...]] + + def to_json(self, zarr_format: ZarrFormat) -> str: """ - Check that the input is a valid JSON representation of a 8-bit integer. + Convert the wrapped data type to a JSON-serializable form. + + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. 
+ + Returns + ------- + str + The JSON-serializable representation of the wrapped data type """ if zarr_format == 2: - return data in cls._zarr_v2_names + return self.to_dtype().str elif zarr_format == 3: - return data == cls._zarr_v3_name + return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def to_json(self, zarr_format: ZarrFormat) -> str: + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of this data type. + """ if zarr_format == 2: - return self.to_dtype().str + return data in cls._zarr_v2_names elif zarr_format == 3: - return self._zarr_v3_name + return data == cls._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - return cls() + def check_value(self, value: object) -> TypeGuard[IntLike]: + return isinstance(value, IntLike) - def default_value(self) -> np.int8: + def _cast_value_unsafe(self, value: object) -> TIntScalar_co: + if self.check_value(value): + return self.to_dtype().type(value) # type: ignore[return-value] + raise TypeError(f"Invalid type: {value}. Expected a value castable to an integer.") + + def default_value(self) -> TIntScalar_co: """ - Get the default value. + Get the default value, which is 0 cast to this dtype Returns ------- - np.int8 + Int scalar The default value. """ - return np.int8(0) + return self._cast_value_unsafe(0) - def to_json_value(self, data: np.int8, zarr_format: ZarrFormat) -> int: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar_co: """ - Convert a numpy 8-bit int to JSON-serializable format. + Read a JSON-serializable value as a numpy int scalar. Parameters ---------- - data : np.int8 - The value to convert. + data : JSON + The JSON-serializable value. 
zarr_format : ZarrFormat The zarr format version. Returns ------- - int - The JSON-serializable form of the scalar. + TScalar_co + The numpy scalar. """ - return int(data) + if check_json_int(data): + return self._cast_value_unsafe(data) + raise TypeError(f"Invalid type: {data}. Expected an integer.") - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int8: + def to_json_value(self, data: object, zarr_format: ZarrFormat) -> int: """ - Read a JSON-serializable value as a numpy int8 scalar. + Convert an object to JSON-serializable scalar. Parameters ---------- - data : JSON - The JSON-serializable value. + data : _BaseScalar + The value to convert. zarr_format : ZarrFormat The zarr format version. Returns ------- - np.bool_ - The numpy boolean scalar. + int + The JSON-serializable form of the scalar. """ - if check_json_int(data): - return np.int8(data) - raise TypeError(f"Invalid type: {data}. Expected an integer.") + return int(self.cast_value(data)) @dataclass(frozen=True, kw_only=True) -class UInt8(ZDType[np.dtypes.UInt8DType, np.uint8]): - dtype_cls = np.dtypes.UInt8DType - _zarr_v3_name = "uint8" - _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|u1",) +class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): + dtype_cls = np.dtypes.Int8DType + _zarr_v3_name = "int8" + _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|i1",) @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.UInt8DType) -> Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: return cls() - def to_dtype(self: Self) -> np.dtypes.UInt8DType: + def to_dtype(self: Self) -> np.dtypes.Int8DType: return self.dtype_cls() - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[Literal["uint8", "|u1"]]: - """ - Check that the input is a valid JSON representation of an unsigned 8-bit integer. 
- """ - if zarr_format == 2: - return data in cls._zarr_v2_names - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def to_json(self, zarr_format: ZarrFormat) -> str: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() - def default_value(self) -> np.uint8: - """ - Get the default value for this data type. - - Returns - ------- - np.uint8 - The default value. - """ - return np.uint8(0) - - def to_json_value(self, data: np.uint8, zarr_format: ZarrFormat) -> int: - """ - Convert a numpy unsigned 8-bit integer to JSON-serializable format. - - Parameters - ---------- - data : np.uint8 - The value to convert. - zarr_format : ZarrFormat - The zarr format version. - Returns - ------- - int - The JSON-serializable form of the scalar. - """ - return int(data) +@dataclass(frozen=True, kw_only=True) +class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): + dtype_cls = np.dtypes.UInt8DType + _zarr_v3_name = "uint8" + _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|u1",) - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint8: - """ - Read a JSON-serializable value as a numpy boolean scalar. + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + return cls() - Parameters - ---------- - data : JSON - The JSON-serializable value. - zarr_format : ZarrFormat - The zarr format version. + def to_dtype(self: Self) -> np.dtypes.UInt8DType: + return self.dtype_cls() - Returns - ------- - np.bool_ - The numpy boolean scalar. - """ - if check_json_int(data): - return np.uint8(data) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + return cls() @dataclass(frozen=True, kw_only=True) -class Int16(ZDType[np.dtypes.Int16DType, np.int16], HasEndianness): +class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): dtype_cls = np.dtypes.Int16DType _zarr_v3_name = "int16" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i2", " Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -343,54 +337,24 @@ def to_dtype(self) -> np.dtypes.Int16DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) - @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[Literal["int16", ">i2", " str: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: + # This ensures that we get the endianness correct without annoying string parsing return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_value(self) -> np.int16: - return self.to_dtype().type(0) - - def to_json_value(self, data: np.int16, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int16: - if check_json_int(data): - return self.to_dtype().type(data) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") - @dataclass(frozen=True, kw_only=True) -class UInt16(ZDType[np.dtypes.UInt16DType, np.uint16], HasEndianness): +class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): dtype_cls = np.dtypes.UInt16DType _zarr_v3_name = "uint16" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u2", " Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -398,26 +362,6 @@ def to_dtype(self) -> np.dtypes.UInt16DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) - @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[Literal["uint16", ">u2", " str: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: @@ -426,20 +370,9 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_value(self) -> np.uint16: - return self.to_dtype().type(0) - - def to_json_value(self, data: np.uint16, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint16: - if check_json_int(data): - return self.to_dtype().type(data) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") - @dataclass(frozen=True, kw_only=True) -class Int32(ZDType[np.dtypes.Int32DType, np.int32], HasEndianness): +class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): dtype_cls = np.dtypes.Int32DType _zarr_v3_name = "int32" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i4", " Self: return super().from_dtype(dtype) @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.Int32DType) -> Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -465,26 +398,6 @@ def to_dtype(self) -> np.dtypes.Int32DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) - @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[Literal["int32", ">i4", " str: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: @@ -493,26 +406,15 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_value(self) -> np.int32: - return self.to_dtype().type(0) - - def to_json_value(self, data: np.int32, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int32: - if check_json_int(data): - return self.to_dtype().type(data) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") - @dataclass(frozen=True, kw_only=True) -class UInt32(ZDType[np.dtypes.UInt32DType, np.uint32], HasEndianness): +class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): dtype_cls = np.dtypes.UInt32DType _zarr_v3_name = "uint32" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u4", " Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -520,26 +422,6 @@ def to_dtype(self) -> np.dtypes.UInt32DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) - @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[Literal["uint32", ">u4", " str: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: @@ -548,26 +430,15 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_value(self) -> np.uint32: - return self.to_dtype().type(0) - - def to_json_value(self, data: np.uint32, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint32: - if check_json_int(data): - return self.to_dtype().type(data) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") - @dataclass(frozen=True, kw_only=True) -class Int64(ZDType[np.dtypes.Int64DType, np.int64], HasEndianness): +class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): dtype_cls = np.dtypes.Int64DType _zarr_v3_name = "int64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i8", " Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -575,26 +446,6 @@ def to_dtype(self) -> np.dtypes.Int64DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) - @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[Literal["int64", ">i8", " str: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: @@ -603,26 +454,15 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_value(self) -> np.int64: - return self.to_dtype().type(0) - - def to_json_value(self, data: np.int64, zarr_format: ZarrFormat) -> int: - return int(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.int64: - if check_json_int(data): - return self.to_dtype().type(data) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") - @dataclass(frozen=True, kw_only=True) -class UInt64(ZDType[np.dtypes.UInt64DType, np.uint64], HasEndianness): +class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): dtype_cls = np.dtypes.UInt64DType _zarr_v3_name = "uint64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u8", " Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -630,26 +470,6 @@ def to_dtype(self) -> np.dtypes.UInt64DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) - @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[Literal["uint64", ">u8", " str: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: @@ -658,47 +478,45 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_value(self) -> np.uint64: - return self.to_dtype().type(0) - - def to_json_value(self, data: np.uint64, zarr_format: ZarrFormat) -> int: - return int(data) - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.uint64: - if check_json_int(data): - return self.to_dtype().type(data) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") +TFloatDType_co = TypeVar( + "TFloatDType_co", + bound=np.dtypes.Float16DType | np.dtypes.Float32DType | np.dtypes.Float64DType, + covariant=True, +) +TFloatScalar_co = TypeVar( + "TFloatScalar_co", bound=np.float16 | np.float32 | np.float64, covariant=True +) -@dataclass(frozen=True, kw_only=True) -class Float16(ZDType[np.dtypes.Float16DType, np.float16], HasEndianness): - dtype_cls = np.dtypes.Float16DType - _zarr_v3_name = "float16" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f2", " Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) - def to_dtype(self) -> np.dtypes.Float16DType: + def to_dtype(self) -> TFloatDType_co: byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) + return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[Literal["float", ">f2", " str: """ - if zarr_format == 2: - return data in cls._zarr_v2_names - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + Convert the wrapped data type to a JSON-serializable form. - def to_json(self, zarr_format: ZarrFormat) -> str: + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. 
+ + Returns + ------- + str + The JSON-serializable representation of the wrapped data type + """ if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: @@ -713,39 +531,10 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_value(self) -> np.float16: - return self.to_dtype().type(0) - - def to_json_value(self, data: np.float16, zarr_format: ZarrFormat) -> JSONFloat: - return float_to_json(data, zarr_format) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float16: - if check_json_float(data, zarr_format=zarr_format): - return self.to_dtype().type(float_from_json(data, zarr_format)) - raise TypeError(f"Invalid type: {data}. Expected a float.") - - -@dataclass(frozen=True, kw_only=True) -class Float32(ZDType[np.dtypes.Float32DType, np.float32], HasEndianness): - dtype_cls = np.dtypes.Float32DType - _zarr_v3_name = "float32" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f4", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) - - def to_dtype(self) -> np.dtypes.Float32DType: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[Literal["float32", ">f4", " TypeGuard[JSON]: """ - Check that the input is a valid JSON representation of a signed 16-bit integer. + Check that the input is a valid JSON representation of this data type. 
""" if zarr_format == 2: return data in cls._zarr_v2_names @@ -753,62 +542,122 @@ def check_json( return data == cls._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def to_json(self, zarr_format: ZarrFormat) -> str: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def check_value(self, value: object) -> TypeGuard[FloatLike]: + return isinstance(value, FloatLike) - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _cast_value_unsafe(self, value: object) -> TFloatScalar_co: + if self.check_value(value): + return self.to_dtype().type(value) # type: ignore[return-value] + raise TypeError(f"Invalid type: {value}. Expected a value castable to a float.") + + def default_value(self) -> TFloatScalar_co: + """ + Get the default value, which is 0 cast to this dtype + + Returns + ------- + Int scalar + The default value. + """ + return self._cast_value_unsafe(0) - def default_value(self) -> np.float32: - return self.to_dtype().type(0) + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScalar_co: + """ + Read a JSON-serializable value as a numpy float. - def to_json_value(self, data: np.float32, zarr_format: ZarrFormat) -> JSONFloat: - return float_to_json(data, zarr_format) + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float32: + Returns + ------- + TScalar_co + The numpy float. 
+ """ if check_json_float(data, zarr_format=zarr_format): - return self.to_dtype().type(float_from_json(data, zarr_format)) - raise TypeError(f"Invalid type: {data}. Expected a float.") + return self._cast_value_unsafe(float_from_json(data, zarr_format=zarr_format)) + raise TypeError( + f"Invalid type: {data}. Expected a float or a special string encoding of a float." + ) + + def to_json_value(self, data: object, zarr_format: ZarrFormat) -> float | str: + """ + Convert an object to a JSON-serializable float. + + Parameters + ---------- + data : _BaseScalar + The value to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + JSON + The JSON-serializable form of the float, which is potentially a number or a string. + See the zarr specifications for details on the JSON encoding for floats. + """ + return float_to_json(self._cast_value_unsafe(data), zarr_format=zarr_format) + + +@dataclass(frozen=True, kw_only=True) +class Float16(BaseFloat[np.dtypes.Float16DType, np.float16]): + dtype_cls = np.dtypes.Float16DType + _zarr_v3_name = "float16" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f2", "f4", "f8", " Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) - def to_dtype(self) -> np.dtypes.Float64DType: + def to_dtype(self) -> TComplexDType_co: byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) + return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[Literal["float64", ">f8", " str: """ - if zarr_format == 2: - return data in cls._zarr_v2_names - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + Convert the wrapped data type to a 
JSON-serializable form. - def to_json(self, zarr_format: ZarrFormat) -> str: + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + str + The JSON-serializable representation of the wrapped data type + """ if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: @@ -823,39 +672,10 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_value(self) -> np.float64: - return self.to_dtype().type(0) - - def to_json_value(self, data: np.float64, zarr_format: ZarrFormat) -> JSONFloat: - return float_to_json(data, zarr_format) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.float64: - if check_json_float(data, zarr_format=zarr_format): - return self.to_dtype().type(float_from_json(data, zarr_format)) - raise TypeError(f"Invalid type: {data}. Expected a float.") - - -@dataclass(frozen=True, kw_only=True) -class Complex64(ZDType[np.dtypes.Complex64DType, np.complex64], HasEndianness): - dtype_cls = np.dtypes.Complex64DType - _zarr_v3_name = "complex64" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c8", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) - - def to_dtype(self) -> np.dtypes.Complex64DType: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[Literal["complex64", ">c8", " TypeGuard[JSON]: """ - Check that the input is a valid JSON representation of a signed 16-bit integer. + Check that the input is a valid JSON representation of this data type. 
""" if zarr_format == 2: return data in cls._zarr_v2_names @@ -863,90 +683,79 @@ def check_json( return data == cls._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def to_json(self, zarr_format: ZarrFormat) -> str: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def check_value(self, value: object) -> bool: + return isinstance(value, ComplexLike) - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _cast_value_unsafe(self, value: object) -> TComplexScalar_co: + if self.check_value(value): + return self.to_dtype().type(value) # type: ignore[arg-type, return-value] + raise TypeError(f"Invalid type: {value}. Expected a value castable to a complex scalar.") - def default_value(self) -> np.complex64: - return self.to_dtype().type(0) + def default_value(self) -> TComplexScalar_co: + """ + Get the default value, which is 0 cast to this dtype - def to_json_value( - self, data: np.complex64, zarr_format: ZarrFormat - ) -> tuple[JSONFloat, JSONFloat]: - return complex_to_json(data, zarr_format) + Returns + ------- + Int scalar + The default value. + """ + return self._cast_value_unsafe(0) - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex64: - if check_json_complex_float(data, zarr_format=zarr_format): - return complex_from_json(data, dtype=self.to_dtype(), zarr_format=zarr_format) - raise TypeError(f"Invalid type: {data}. 
Expected a complex float.") + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TComplexScalar_co: + """ + Read a JSON-serializable value as a numpy float. + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. -@dataclass(frozen=True, kw_only=True) -class Complex128(ZDType[np.dtypes.Complex128DType, np.complex128], HasEndianness): - dtype_cls = np.dtypes.Complex128DType - _zarr_v3_name = "complex128" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c16", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) + def to_json_value(self, data: object, zarr_format: ZarrFormat) -> JSON: + """ + Convert an object to a JSON-serializable float. - def to_dtype(self) -> np.dtypes.Complex128DType: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) + Parameters + ---------- + data : _BaseScalar + The value to convert. + zarr_format : ZarrFormat + The zarr format version. 
- @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[Literal["complex128", ">c16", " str: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def default_value(self) -> np.complex128: - return self.to_dtype().type(0) +@dataclass(frozen=True, kw_only=True) +class Complex64(BaseComplex[np.dtypes.Complex64DType, np.complex64]): + dtype_cls = np.dtypes.Complex64DType + _zarr_v3_name = "complex64" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c8", " tuple[JSONFloat, JSONFloat]: - return complex_to_json(data, zarr_format) - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex128: - if check_json_complex_float(data, zarr_format=zarr_format): - return complex_from_json(data, dtype=self.to_dtype(), zarr_format=zarr_format) - raise TypeError(f"Invalid type: {data}. 
Expected a complex float.") +@dataclass(frozen=True, kw_only=True) +class Complex128(BaseComplex[np.dtypes.Complex128DType, np.complex128], HasEndianness): + dtype_cls = np.dtypes.Complex128DType + _zarr_v3_name = "complex128" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c16", " Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) def to_dtype(self) -> np.dtypes.BytesDType[int]: @@ -1003,14 +812,20 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: def default_value(self) -> np.bytes_: return np.bytes_(b"") - def to_json_value(self, data: np.bytes_, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(data).decode("ascii") + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return base64.standard_b64encode(data).decode("ascii") # type: ignore[arg-type] def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: if check_json_str(data): return self.to_dtype().type(base64.standard_b64decode(data.encode("ascii"))) raise TypeError(f"Invalid type: {data}. 
Expected a string.") + def check_value(self, data: object) -> bool: + return isinstance(data, np.bytes_ | str | bytes) + + def _cast_value_unsafe(self, value: object) -> np.bytes_: + return self.to_dtype().type(value) + @dataclass(frozen=True, kw_only=True) class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength): @@ -1022,7 +837,7 @@ class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength): item_size_bits: ClassVar[int] = 8 @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType[int]) -> Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) def to_dtype(self) -> np.dtypes.VoidDType[int]: @@ -1083,14 +898,20 @@ def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidD def default_value(self) -> np.void: return self.to_dtype().type(("\x00" * self.length).encode("ascii")) - def to_json_value(self, data: np.void, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(data.tobytes()).decode("ascii") + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return base64.standard_b64encode(self.cast_value(data).tobytes()).decode("ascii") def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: if check_json_str(data): return self.to_dtype().type(base64.standard_b64decode(data)) raise DataTypeValidationError(f"Invalid type: {data}. 
Expected a string.") + def check_value(self, data: object) -> bool: + return isinstance(data, np.bytes_ | str | bytes) + + def _cast_value_unsafe(self, value: object) -> np.void: + return self.to_dtype().type(value) # type: ignore[call-overload, no-any-return] + @dataclass(frozen=True, kw_only=True) class FixedLengthUnicode(ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength): @@ -1099,7 +920,7 @@ class FixedLengthUnicode(ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness item_size_bits: ClassVar[int] = 32 # UCS4 is 32 bits per code point @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.StrDType[int]) -> Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls( length=dtype.itemsize // (cls.item_size_bits // 8), @@ -1151,7 +972,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: def default_value(self) -> np.str_: return np.str_("") - def to_json_value(self, data: np.str_, *, zarr_format: ZarrFormat) -> str: + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: return str(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: @@ -1159,6 +980,12 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: raise TypeError(f"Invalid type: {data}. 
Expected a string.") return self.to_dtype().type(data) + def check_value(self, data: object) -> bool: + return isinstance(data, str | np.str_ | bytes) + + def _cast_value_unsafe(self, value: object) -> np.str_: + return self.to_dtype().type(value) + _NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") @@ -1171,7 +998,7 @@ class VariableLengthString(ZDType[np.dtypes.StringDType, str]): # type: ignore[ _zarr_v3_name = "numpy.variable_length_utf8" @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.StringDType) -> Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: return cls() def to_dtype(self) -> np.dtypes.StringDType: @@ -1217,6 +1044,12 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: raise TypeError(f"Invalid type: {data}. Expected a string.") return data + def check_value(self, data: object) -> bool: + return isinstance(data, str) + + def _cast_value_unsafe(self, value: object) -> str: + return str(value) + else: @dataclass(frozen=True, kw_only=True) @@ -1225,7 +1058,7 @@ class VariableLengthString(ZDType[np.dtypes.ObjectDType, str]): # type: ignore[ _zarr_v3_name = "numpy.variable_length_utf8" @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.ObjectDType) -> Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: return cls() def to_dtype(self) -> np.dtypes.ObjectDType: @@ -1258,8 +1091,8 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: def default_value(self) -> str: return "" - def to_json_value(self, data: str, *, zarr_format: ZarrFormat) -> str: - return data + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return data # type: ignore[return-value] def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: """ @@ -1269,19 +1102,25 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: raise TypeError(f"Invalid type: {data}. 
Expected a string.") return data + def check_value(self, data: object) -> bool: + return isinstance(data, str) + + def _cast_value_unsafe(self, value: object) -> str: + return str(value) + DateUnit = Literal["Y", "M", "W", "D"] TimeUnit = Literal["h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] -@dataclass(frozen=True, kw_only=True) +@dataclass(frozen=True, kw_only=True, slots=True) class DateTime64(ZDType[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] _zarr_v3_name = "numpy.datetime64" unit: DateUnit | TimeUnit @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.DateTime64DType) -> Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: unit: DateUnit | TimeUnit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] # type: ignore[assignment] if unit not in get_args(DateUnit) and unit not in get_args(TimeUnit): raise DataTypeValidationError('Invalid unit for "numpy.datetime64"') @@ -1345,8 +1184,19 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime return datetime_from_json(data, self.unit) raise TypeError(f"Invalid type: {data}. Expected an integer.") - def to_json_value(self, data: np.datetime64, *, zarr_format: ZarrFormat) -> int: - return datetime_to_json(data) + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> int: + return datetime_to_json(data) # type: ignore[arg-type] + + def check_value(self, data: object) -> bool: + # not sure which values we should accept for structured dtypes. 
+ try: + np.array([data], dtype=self.to_dtype()) + return True # noqa: TRY300 + except ValueError: + return False + + def _cast_value_unsafe(self, value: object) -> np.datetime64: + return self.to_dtype().type(value) # type: ignore[no-any-return, call-overload] @dataclass(frozen=True, kw_only=True) @@ -1356,9 +1206,9 @@ class Structured(ZDType[np.dtypes.VoidDType[int], np.void]): fields: tuple[tuple[str, ZDType[_BaseDType, _BaseScalar]], ...] def default_value(self) -> np.void: - return self.cast_value(0) + return self._cast_value_unsafe(0) - def cast_value(self, value: object) -> np.void: + def _cast_value_unsafe(self, value: object) -> np.void: return cast("np.void", np.array([value], dtype=self.to_dtype())[0]) @classmethod @@ -1379,7 +1229,7 @@ def check_dtype(cls, dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: return super().check_dtype(dtype) and dtype.fields is not None @classmethod - def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType[int]) -> Self: + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: from zarr.core.dtype import get_data_type_from_native_dtype fields: list[tuple[str, ZDType[Any, Any]]] = [] @@ -1464,8 +1314,16 @@ def to_dtype(self) -> np.dtypes.VoidDType[int]: np.dtype([(key, dtype.to_dtype()) for (key, dtype) in self.fields]), ) - def to_json_value(self, data: np.generic, *, zarr_format: ZarrFormat) -> str: - return bytes_to_json(data.tobytes(), zarr_format) + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return bytes_to_json(self.cast_value(data).tobytes(), zarr_format) + + def check_value(self, data: object) -> bool: + # not sure which values we should accept for structured dtypes. 
+ try: + np.array([data], dtype=self.to_dtype()) + return True # noqa: TRY300 + except ValueError: + return False def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: if not check_json_str(data): diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 106da80a61..a53d2e7866 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -31,7 +31,7 @@ def check_json_bool(data: JSON) -> TypeGuard[bool]: Bool True if the data is a boolean, False otherwise. """ - return bool(isinstance(data, bool)) + return isinstance(data, bool) def check_json_str(data: JSON) -> TypeGuard[str]: @@ -293,7 +293,7 @@ def complex_to_json_v3(data: complex | np.complexfloating[Any, Any]) -> tuple[JS return float_to_json_v3(data.real), float_to_json_v3(data.imag) -def complex_to_json( +def complex_float_to_json( data: complex | np.complexfloating[Any, Any], zarr_format: ZarrFormat ) -> tuple[JSONFloat, JSONFloat]: """ @@ -424,9 +424,7 @@ def float_from_json(data: JSONFloat, zarr_format: ZarrFormat) -> float: return float_from_json_v3(data) -def complex_from_json_v2( - data: tuple[JSONFloat, JSONFloat], dtype: np.dtypes.Complex64DType | np.dtypes.Complex128DType -) -> np.complexfloating[Any, Any]: +def complex_float_from_json_v2(data: tuple[JSONFloat, JSONFloat]) -> complex: """ Convert a JSON complex float to a complex number (v2). @@ -434,20 +432,16 @@ def complex_from_json_v2( ---------- data : tuple[JSONFloat, JSONFloat] The JSON complex float to convert. - dtype : Any - The numpy dtype. Returns ------- np.complexfloating The complex number. 
""" - return dtype.type(complex(float_from_json_v2(data[0]), float_from_json_v2(data[1]))) + return complex(float_from_json_v2(data[0]), float_from_json_v2(data[1])) -def complex_from_json_v3( - data: tuple[JSONFloat, JSONFloat], dtype: np.dtypes.Complex64DType | np.dtypes.Complex128DType -) -> np.complexfloating[Any, Any]: +def complex_float_from_json_v3(data: tuple[JSONFloat, JSONFloat]) -> complex: """ Convert a JSON complex float to a complex number (v3). @@ -455,20 +449,16 @@ def complex_from_json_v3( ---------- data : tuple[JSONFloat, JSONFloat] The JSON complex float to convert. - dtype : Any - The numpy dtype. Returns ------- np.complexfloating The complex number. """ - return dtype.type(complex(float_from_json_v3(data[0]), float_from_json_v3(data[1]))) + return complex(float_from_json_v3(data[0]), float_from_json_v3(data[1])) -def complex_from_json( - data: tuple[JSONFloat, JSONFloat], dtype: Any, zarr_format: ZarrFormat -) -> np.complexfloating[Any, Any]: +def complex_float_from_json(data: tuple[JSONFloat, JSONFloat], zarr_format: ZarrFormat) -> complex: """ Convert a JSON complex float to a complex number based on zarr format. @@ -476,8 +466,6 @@ def complex_from_json( ---------- data : tuple[JSONFloat, JSONFloat] The JSON complex float to convert. - dtype : Any - The numpy dtype. zarr_format : ZarrFormat The zarr format version. @@ -487,12 +475,9 @@ def complex_from_json( The complex number. """ if zarr_format == 2: - return complex_from_json_v2(data, dtype) + return complex_float_from_json_v2(data) else: - if check_json_complex_float_v3(data): - return complex_from_json_v3(data, dtype) - else: - raise TypeError(f"Invalid type: {data}. Expected a sequence of two numbers.") + return complex_float_from_json_v3(data) raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 3409fa7ca4..74e7bf79e1 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -17,13 +17,14 @@ # This is the bound for the dtypes that we support. If we support non-numpy dtypes, # then this bound will need to be widened. _BaseDType = np.dtype[np.generic] -TScalar = TypeVar("TScalar", bound=_BaseScalar) +TScalar_co = TypeVar("TScalar_co", bound=_BaseScalar, covariant=True) # TODO: figure out an interface or protocol that non-numpy dtypes can use -TDType = TypeVar("TDType", bound=_BaseDType) +# These two type parameters are covariant because we want isinstance(ZDType[Subclass](), ZDType[BaseDType]) to be True +TDType_co = TypeVar("TDType_co", bound=_BaseDType, covariant=True) @dataclass(frozen=True, kw_only=True, slots=True) -class ZDType(Generic[TDType, TScalar], ABC): +class ZDType(Generic[TDType_co, TScalar_co], ABC): """ Abstract base class for wrapping native array data types, e.g. numpy dtypes @@ -41,11 +42,11 @@ class ZDType(Generic[TDType, TScalar], ABC): # mypy currently disallows class variables to contain type parameters # but it seems OK for us to use it here: # https://github.com/python/typing/discussions/1424#discussioncomment-7989934 - dtype_cls: ClassVar[type[TDType]] # type: ignore[misc] + dtype_cls: ClassVar[type[TDType_co]] # type: ignore[misc] _zarr_v3_name: ClassVar[str] @classmethod - def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[TDType]: + def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[TDType_co]: """ Check that a data type matches the dtype_cls class attribute. Used as a type guard. @@ -89,7 +90,7 @@ def from_dtype(cls: type[Self], dtype: _BaseDType) -> Self: @classmethod @abstractmethod - def _from_dtype_unsafe(cls: type[Self], dtype: TDType) -> Self: + def _from_dtype_unsafe(cls: type[Self], dtype: _BaseDType) -> Self: """ Wrap a native dtype without checking. 
@@ -106,7 +107,7 @@ def _from_dtype_unsafe(cls: type[Self], dtype: TDType) -> Self: ... @abstractmethod - def to_dtype(self: Self) -> TDType: + def to_dtype(self: Self) -> TDType_co: """ Return an instance of the wrapped dtype. @@ -117,8 +118,61 @@ def to_dtype(self: Self) -> TDType: """ ... + def cast_value(self, data: object) -> TScalar_co: + """ + Cast a value to the wrapped scalar type. The type is first checked for compatibility. If it's + incompatible with the associated scalar type, a ``TypeError`` will be raised. + + Parameters + ---------- + data : TScalar + The scalar value to cast. + + Returns + ------- + TScalar + The cast value. + """ + if self.check_value(data): + return self._cast_value_unsafe(data) + raise TypeError(f"Invalid value: {data}") + + @abstractmethod + def check_value(self, data: object) -> bool: + """ + Check that a value is a valid value for the wrapped data type. + + Parameters + ---------- + data : object + A value to check. + + Returns + ------- + Bool + True if the value is valid, False otherwise. + """ + ... + + @abstractmethod + def _cast_value_unsafe(self, data: object) -> TScalar_co: + """ + Cast a value to the wrapped data type. This method should not perform any input validation. + + Parameters + ---------- + data : TScalar + The scalar value to cast. + + Returns + ------- + TScalar + The cast value. + """ + ... + @abstractmethod - def default_value(self) -> TScalar: + def default_value(self) -> TScalar_co: """ Get the default value for the wrapped data type. This is a method, rather than an attribute, because the default value for some data types may depend on parameters that are not known @@ -216,7 +270,7 @@ def _from_json_unsafe(cls: type[Self], data: JSON, zarr_format: ZarrFormat) -> S ... @abstractmethod - def to_json_value(self, data: TScalar, *, zarr_format: ZarrFormat) -> JSON: + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> JSON: """ Convert a single value to JSON-serializable format. 
@@ -235,7 +289,7 @@ def to_json_value(self, data: TScalar, *, zarr_format: ZarrFormat) -> JSON: ... @abstractmethod - def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar: + def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar_co: """ Read a JSON-serializable value as a scalar. diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 02574440ff..90ef3c3192 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -8,7 +8,7 @@ from zarr.abc.metadata import Metadata from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.core.dtype.wrapper import TDType, TScalar, ZDType, _BaseDType, _BaseScalar +from zarr.core.dtype.wrapper import TDType_co, TScalar_co, ZDType, _BaseDType, _BaseScalar if TYPE_CHECKING: from typing import Literal, Self @@ -74,7 +74,7 @@ def __init__( self, *, shape: ChunkCoords, - dtype: ZDType[TDType, TScalar], + dtype: ZDType[TDType_co, TScalar_co], chunks: ChunkCoords, fill_value: Any, order: MemoryOrder, @@ -195,7 +195,7 @@ def to_dict(self) -> dict[str, JSON]: zarray_dict["filters"] = new_filters if self.fill_value is not None: - fill_value = self.dtype.to_json_value(self.fill_value, zarr_format=2) # type: ignore[arg-type] + fill_value = self.dtype.to_json_value(self.fill_value, zarr_format=2) zarray_dict["fill_value"] = fill_value zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2) diff --git a/tests/conftest.py b/tests/conftest.py index 58f2be8e14..d8ead82406 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -21,7 +21,8 @@ from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition from zarr.core.common import JSON, parse_shapelike from zarr.core.config import config as zarr_config -from zarr.core.dtype import get_data_type_from_native_dtype +from zarr.core.dtype import data_type_registry, get_data_type_from_native_dtype +from zarr.core.dtype._numpy import DateTime64, HasLength, Structured from 
zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync @@ -432,10 +433,15 @@ def meta_from_array( ) -def skip_object_dtype(dtype: ZDType[Any, Any]) -> None: - if dtype.dtype_cls is type(np.dtype("O")): - msg = ( - f"{dtype} uses the numpy object data type, which is not a valid target for data " - "type resolution" - ) - pytest.skip(msg) +# Generate a collection of zdtype instances for use in testing. +zdtype_examples: tuple[ZDType[Any, Any], ...] = () +for wrapper_cls in data_type_registry.contents.values(): + # The Structured dtype has to be constructed with some actual fields + if wrapper_cls is Structured: + zdtype_examples += (wrapper_cls.from_dtype(np.dtype([("a", np.float64), ("b", np.int8)])),) + elif issubclass(wrapper_cls, HasLength): + zdtype_examples += (wrapper_cls(length=1),) + elif issubclass(wrapper_cls, DateTime64): + zdtype_examples += (wrapper_cls(unit="s"),) + else: + zdtype_examples += (wrapper_cls(),) diff --git a/tests/test_array.py b/tests/test_array.py index fed949b69e..4f436a84f4 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -43,13 +43,14 @@ from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.dtype._numpy import Float64 +from zarr.core.dtype.wrapper import ZDType from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError from zarr.storage import LocalStore, MemoryStore, StorePath -from .test_dtype.conftest import zdtype_examples +from .conftest import zdtype_examples if TYPE_CHECKING: from zarr.core.array_spec import ArrayConfigLike @@ -205,7 +206,7 @@ def test_array_fill_value_default( ) else: arr = zarr.create_array(store=store, shape=shape, dtype=zdtype, zarr_format=3, chunks=shape) - expected_fill_value = zdtype.default_scalar() + 
expected_fill_value = zdtype.default_value() if isinstance(expected_fill_value, np.datetime64 | np.timedelta64): if np.isnat(expected_fill_value): assert np.isnat(arr.fill_value) @@ -1065,7 +1066,7 @@ async def test_v3_chunk_encoding( filters=filters, compressors=compressors, serializer="auto", - dtype=arr.metadata.data_type, # type: ignore[union-attr] + dtype=arr._zdtype, ) assert arr.filters == filters_expected assert arr.compressors == compressors_expected @@ -1659,74 +1660,4 @@ async def test_sharding_coordinate_selection() -> None: shards=(2, 4, 4), ) arr[:] = np.arange(2 * 3 * 4).reshape((2, 3, 4)) - result = arr[1, [0, 1]] # type: ignore[index] - assert isinstance(result, NDArrayLike) - assert (result == np.array([[12, 13, 14, 15], [16, 17, 18, 19]])).all() - - -@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) -def test_array_repr(store: Store) -> None: - shape = (2, 3, 4) - dtype = "uint8" - arr = zarr.create_array(store, shape=shape, dtype=dtype) - assert str(arr) == f"" - - -class UnknownObjectDtype(UTF8Base[np.dtypes.ObjectDType]): - object_codec_id = "unknown" # type: ignore[assignment] - - def to_native_dtype(self) -> np.dtypes.ObjectDType: - """ - Create a NumPy object dtype from this VariableLengthUTF8 ZDType. - - Returns - ------- - np.dtypes.ObjectDType - The NumPy object dtype. 
- """ - return np.dtype("o") # type: ignore[return-value] - - -@pytest.mark.parametrize( - "dtype", [VariableLengthUTF8(), VariableLengthBytes(), UnknownObjectDtype()] -) -def test_chunk_encoding_no_object_codec_errors(dtype: ZDType[Any, Any]) -> None: - """ - Test that a valuerror is raised when checking the chunk encoding for a v2 array with a - data type that requires an object codec, but where no object codec is specified - """ - if isinstance(dtype, VariableLengthUTF8): - codec_name = "the numcodecs.VLenUTF8 codec" - elif isinstance(dtype, VariableLengthBytes): - codec_name = "the numcodecs.VLenBytes codec" - else: - codec_name = f"an unknown object codec with id {dtype.object_codec_id!r}" # type: ignore[attr-defined] - msg = ( - f"Data type {dtype} requires {codec_name}, " - "but no such codec was specified in the filters or compressor parameters for " - "this array. " - ) - with pytest.raises(ValueError, match=re.escape(msg)): - _parse_chunk_encoding_v2(filters=None, compressor=None, dtype=dtype) - - -def test_unknown_object_codec_default_serializer_v3() -> None: - """ - Test that we get a valueerrror when trying to create the default serializer for a data type - that requires an unknown object codec - """ - dtype = UnknownObjectDtype() - msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}." - with pytest.raises(ValueError, match=re.escape(msg)): - default_serializer_v3(dtype) - - -def test_unknown_object_codec_default_filters_v2() -> None: - """ - Test that we get a valueerrror when trying to create the default serializer for a data type - that requires an unknown object codec - """ - dtype = UnknownObjectDtype() - msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}." 
- with pytest.raises(ValueError, match=re.escape(msg)): - default_filters_v2(dtype) + assert (arr[1, [0, 1]] == np.array([[12, 13, 14, 15], [16, 17, 18, 19]])).all() diff --git a/tests/test_dtype.py b/tests/test_dtype.py index f690e6ce26..122949664c 100644 --- a/tests/test_dtype.py +++ b/tests/test_dtype.py @@ -8,6 +8,8 @@ import zarr from zarr.core.config import config +from .conftest import zdtype_examples + if TYPE_CHECKING: from collections.abc import Generator @@ -64,6 +66,17 @@ def data_type_registry_fixture() -> DataTypeRegistry: VLEN_STRING_CODE = "O" +def test_zdtype_examples() -> None: + """ + Test that all the elements of the exported union type DTYPE have an example in the variable + zdtype_examples, which we use for testing. + + If this test fails, that means that either there is a data type that does not have an example, + or there is a data type that is missing from the DTYPE union type. + """ + assert set(map(type, zdtype_examples)) == set(get_args(DTYPE)) + + @pytest.mark.parametrize( ("wrapper_cls", "np_dtype"), [ @@ -88,9 +101,7 @@ def data_type_registry_fixture() -> DataTypeRegistry: (DateTime64, "datetime64[s]"), ], ) -def test_wrap( - wrapper_cls: type[ZDType[_BaseDType, _BaseScalar]], np_dtype: np.dtype[np.generic] | str -) -> None: +def test_wrap(wrapper_cls: type[ZDType[Any, Any]], np_dtype: np.dtype[np.generic] | str) -> None: """ Test that the wrapper class has the correct dtype class bound to the dtype_cls variable Test that the ``wrap`` method produces an instance of the wrapper class @@ -102,19 +113,17 @@ def test_wrap( with pytest.raises(DataTypeValidationError, match="Invalid dtype"): wrapper_cls.from_dtype("not a dtype") # type: ignore[arg-type] - assert isinstance(wrapped, wrapper_cls) assert wrapped.to_dtype() == dt -@pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) -def test_dict_serialization(wrapper_cls: Any, zarr_format: ZarrFormat) -> None: - if issubclass(wrapper_cls, Structured): - instance = 
wrapper_cls(fields=((("a", Bool()),))) - else: - instance = wrapper_cls() - as_dict = instance.to_json(zarr_format=zarr_format) - assert wrapper_cls.from_json(as_dict, zarr_format=zarr_format) == instance +@pytest.mark.parametrize("zdtype", zdtype_examples) +def test_to_json_roundtrip(zdtype: ZDType[Any, Any], zarr_format: ZarrFormat) -> None: + """ + Test that a zdtype instance can round-trip through its JSON form + """ + as_dict = zdtype.to_json(zarr_format=zarr_format) + assert zdtype.from_json(as_dict, zarr_format=zarr_format) == zdtype @pytest.mark.parametrize( @@ -138,7 +147,7 @@ def test_dict_serialization(wrapper_cls: Any, zarr_format: ZarrFormat) -> None: (FixedLengthBytes(length=3), np.void(b"\x00\x00\x00")), (FixedLengthUnicode(length=3), np.str_("")), ( - Structured(fields=(("a", Float64()), ("b", Int8()))), # type: ignore[arg-type] + Structured(fields=(("a", Float64()), ("b", Int8()))), np.array([0], dtype=[("a", np.float64), ("b", np.int8)])[0], ), (VariableLengthString(), ""), @@ -188,6 +197,42 @@ def test_to_json_value_v2( assert wrapper.to_json_value(input_value, zarr_format=2) == expected_json +# NOTE! This test is currently a direct copy of the v2 version. When or if we change JSON serialization +# in a v3-specific manner, this test must be changed. 
+# TODO: Apply zarr-v3-specific changes to this test as needed +@pytest.mark.parametrize( + ("wrapper", "input_value", "expected_json"), + [ + (Bool(), np.bool_(True), True), + (Int8(), np.int8(42), 42), + (UInt8(), np.uint8(42), 42), + (Int16(), np.int16(42), 42), + (UInt16(), np.uint16(42), 42), + (Int32(), np.int32(42), 42), + (UInt32(), np.uint32(42), 42), + (Int64(), np.int64(42), 42), + (UInt64(), np.uint64(42), 42), + (Float16(), np.float16(42.0), 42.0), + (Float32(), np.float32(42.0), 42.0), + (Float64(), np.float64(42.0), 42.0), + (Complex64(), np.complex64(42.0 + 1.0j), (42.0, 1.0)), + (Complex128(), np.complex128(42.0 + 1.0j), (42.0, 1.0)), + (FixedLengthAscii(length=4), np.bytes_(b"test"), "dGVzdA=="), + (FixedLengthBytes(length=4), np.void(b"test"), "dGVzdA=="), + (FixedLengthUnicode(length=4), np.str_("test"), "test"), + (VariableLengthString(), "test", "test"), + (DateTime64(unit="s"), np.datetime64("2021-01-01T00:00:00", "s"), 1609459200), + ], +) +def test_to_json_value_v3( + wrapper: ZDType[_BaseDType, _BaseScalar], input_value: Any, expected_json: Any +) -> None: + """ + Test the to_json_value method for each dtype wrapper for zarr v3 + """ + assert wrapper.to_json_value(input_value, zarr_format=3) == expected_json + + @pytest.mark.parametrize( ("wrapper", "json_value", "expected_value"), [ @@ -227,7 +272,7 @@ def test_register(data_type_registry_fixture: DataTypeRegistry) -> None: """ Test that registering a dtype in a data type registry works. """ - data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) # type: ignore[arg-type] + data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) assert data_type_registry_fixture.get(Bool._zarr_v3_name) == Bool assert isinstance(data_type_registry_fixture.match_dtype(np.dtype("bool")), Bool) @@ -236,13 +281,13 @@ def test_override(data_type_registry_fixture: DataTypeRegistry) -> None: """ Test that registering a new dtype with the same name works (overriding the previous one). 
""" - data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) # type: ignore[arg-type] + data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) class NewBool(Bool): def default_value(self) -> np.bool_: return np.True_ - data_type_registry_fixture.register(NewBool._zarr_v3_name, NewBool) # type: ignore[arg-type] + data_type_registry_fixture.register(NewBool._zarr_v3_name, NewBool) assert isinstance(data_type_registry_fixture.match_dtype(np.dtype("bool")), NewBool) @staticmethod @@ -275,30 +320,26 @@ def test_unregistered_dtype(data_type_registry_fixture: DataTypeRegistry) -> Non data_type_registry_fixture.get(outside_dtype) @staticmethod - @pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) + @pytest.mark.parametrize("zdtype", zdtype_examples) def test_registered_dtypes( - wrapper_cls: type[ZDType[_BaseDType, _BaseScalar]], zarr_format: ZarrFormat + zdtype: ZDType[_BaseDType, _BaseScalar], zarr_format: ZarrFormat ) -> None: """ Test that the registered dtypes can be retrieved from the registry. 
""" - if issubclass(wrapper_cls, Structured): - instance = wrapper_cls(fields=((("a", Bool()),))) # type: ignore[misc] - else: - instance = wrapper_cls() - assert data_type_registry.match_dtype(instance.to_dtype()) == instance + assert data_type_registry.match_dtype(zdtype.to_dtype()) == zdtype assert ( data_type_registry.match_json( - instance.to_json(zarr_format=zarr_format), zarr_format=zarr_format + zdtype.to_json(zarr_format=zarr_format), zarr_format=zarr_format ) - == instance + == zdtype ) @staticmethod - @pytest.mark.parametrize("wrapper_cls", get_args(DTYPE)) + @pytest.mark.parametrize("zdtype", zdtype_examples) def test_match_dtype_unique( - wrapper_cls: type[ZDType[_BaseDType, _BaseScalar]], + zdtype: ZDType[Any, Any], data_type_registry_fixture: DataTypeRegistry, zarr_format: ZarrFormat, ) -> None: @@ -308,20 +349,16 @@ def test_match_dtype_unique( fails to match anything in the registry """ for _cls in get_args(DTYPE): - if _cls is not wrapper_cls: + if _cls is not type(zdtype): data_type_registry_fixture.register(_cls._zarr_v3_name, _cls) - if issubclass(wrapper_cls, Structured): - instance = wrapper_cls(fields=((("a", Bool()),))) # type: ignore[misc] - else: - instance = wrapper_cls() - dtype_instance = instance.to_dtype() + dtype_instance = zdtype.to_dtype() msg = f"No data type wrapper found that matches dtype '{dtype_instance}'" with pytest.raises(ValueError, match=re.escape(msg)): data_type_registry_fixture.match_dtype(dtype_instance) - instance_dict = instance.to_json(zarr_format=zarr_format) + instance_dict = zdtype.to_json(zarr_format=zarr_format) msg = f"No data type wrapper found that matches {instance_dict}" with pytest.raises(ValueError, match=re.escape(msg)): data_type_registry_fixture.match_json(instance_dict, zarr_format=zarr_format) diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 1ecdb58718..47fc692f4f 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -13,7 +13,7 @@ 
from zarr.core.config import config from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.dtype._numpy import DateTime64 -from zarr.core.dtype.common import complex_from_json +from zarr.core.dtype.common import check_json_complex_float from zarr.core.group import GroupMetadata, parse_node_type from zarr.core.metadata.v3 import ( ArrayV3Metadata, @@ -28,7 +28,7 @@ from typing import Any from zarr.abc.codec import Codec - from zarr.core.common import JSON + from zarr.core.common import JSON, ZarrFormat from zarr.core.metadata.v3 import ( @@ -137,17 +137,12 @@ def test_jsonify_fill_value_complex(fill_value: Any, dtype_str: str) -> None: assert dtype.to_json_value(observed, zarr_format=zarr_format) == tuple(fill_value) -@pytest.mark.parametrize("dtype_str", [*complex_dtypes]) @pytest.mark.parametrize("data", [[1.0, 0.0, 3.0], [0, 1, 3], [1]]) -def test_complex_to_json_invalid(data: object, dtype_str: str) -> None: - """ - Test that parse_fill_value(fill_value, dtype) correctly rejects sequences with length not - equal to 2 - """ - dtype_instance = get_data_type_from_native_dtype(dtype_str) - match = f"Invalid type: {data}. Expected a sequence of two numbers." - with pytest.raises(TypeError, match=re.escape(match)): - complex_from_json(data=data, dtype=dtype_instance, zarr_format=3) +def test_complex_to_json_invalid(data: object, zarr_format: ZarrFormat) -> None: + assert not check_json_complex_float(data, zarr_format=zarr_format) + # match = f"Invalid type: {data}. Expected a sequence of two numbers." 
+ # with pytest.raises(TypeError, match=re.escape(match)): + # complex_float_from_json(data=data, zarr_format=3) @pytest.mark.parametrize("fill_value", [{"foo": 10}]) From 23150688d56a1c2756c21001968aabc62439b6b0 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Mon, 24 Mar 2025 11:39:12 +0100 Subject: [PATCH 048/129] Update docs/user-guide/data_types.rst Co-authored-by: Ilan Gold --- docs/user-guide/data_types.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index b964439706..36a9ea40f7 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -40,7 +40,7 @@ Thus the JSON identifier for a Numpy-compatible data type is just the Numpy ``st The ``<`` character in the data type metadata encodes the `endianness `_, or "byte order", of the data type. Following Numpy's example, in Zarr version 2 each data type has an endianness where applicable. However, Zarr version 3 data types do not store endianness information. 
-In addition to defining a representation of the data type itself (which in the example above was just a simple string ``" Date: Mon, 24 Mar 2025 17:20:37 +0100 Subject: [PATCH 049/129] update data types documentation, and expose core/dtype module to autodoc --- docs/conf.py | 5 +- docs/user-guide/data_types.rst | 203 ++++++++++----------------------- 2 files changed, 66 insertions(+), 142 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 803d2c4255..8a64836d5e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -68,7 +68,10 @@ def skip_submodules( ) -> bool: # Skip documenting zarr.codecs submodules # codecs are documented in the main zarr.codecs namespace - if what == "module" and name.startswith("zarr.codecs.") or name.startswith("zarr.core"): + # TODO: just document everything instead using this weak case-by-case logic + if what == "module" and name.startswith("zarr.core.dtype."): + skip = False + elif what == "module" and name.startswith("zarr.codecs.") or name.startswith("zarr.core"): skip = True return skip diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 36a9ea40f7..a281b349de 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -16,14 +16,14 @@ Zarr-Python supports creating arrays with Numpy data types:: Unlike Numpy arrays, Zarr arrays are designed to be persisted to storage and read by Zarr implementations in different programming languages. This means Zarr data types must be interpreted correctly when clients read an array. So each Zarr data type defines a procedure for -encoding / decoding that data type to / from Zarr array metadata, and also encoding / decoding **instances** of that data type to / from +encoding/decoding that data type to/from Zarr array metadata, and also encoding/decoding **instances** of that data type to/from array metadata. These serialization procedures depend on the Zarr format. 
Data types in Zarr version 2 ----------------------------- Version 2 of the Zarr format defined its data types relative to `Numpy's data types `_, and added a few non-Numpy data types as well. -Thus the JSON identifier for a Numpy-compatible data type is just the Numpy ``str`` attribute of that dtype: +Thus the JSON identifier for a Numpy-compatible data type is just the Numpy ``str`` attribute of that dtype:: >>> import zarr >>> import numpy as np @@ -32,158 +32,79 @@ Thus the JSON identifier for a Numpy-compatible data type is just the Numpy ``st >>> np_dtype = np.dtype('int64') >>> z = zarr.create_array(store=store, shape=(1,), dtype=np_dtype, zarr_format=2) >>> dtype_meta = json.loads(store['.zarray'].to_bytes())["dtype"] - >>> assert dtype_meta == np_dtype.str # True + >>> assert dtype_meta == np_dtype.str # True >>> dtype_meta '`_, or "byte order", of the data type. Following Numpy's example, - in Zarr version 2 each data type has an endianness where applicable. However, Zarr version 3 data types do not store endianness information. + The ``<`` character in the data type metadata encodes the `endianness `_, or "byte order", of the data type. Following Numpy's example, + in Zarr version 2 each data type has an endianness where applicable. However, Zarr version 3 data types do not store endianness information. In addition to defining a representation of the data type itself (which in the example above was just a simple string ``"i2``; Zarr V3 represents the same data type as ``int16``. -* No endianness -* A data type can be encoded in metadata as a string or a ``JSON`` object with the structure ``{"name": , "configuration": {...}}`` +----------------------------- + +Zarr V3 brings several key changes to how data types are represented: + +- Zarr V3 identifies the basic data types as strings like ``int8``, ``int16``, etc. 
In Zarr V2 ``int8`` would represented as ``|i1``, ``int16`` would be ``>i2`` **or** ``i2')`` should be saved as ``{..., "dtype" : ">i2"}`` in Zarr V2 metadata. - - In Zarr V3 metadata, the same Numpy data type would be saved as ``{..., "data_type": "int16", "codecs": [..., {"name": "bytes", "configuration": {"endian": "big"}, ...]}`` - -* Associate a default fill value with a native data type. This is not mandated by the Zarr specifications, but it's convenient for users - to have a useful default. For numeric types like integers and floats the default can be statically set to 0, but for - parametric data types like fixed-length strings the default can only be generated after the data type has been parametrized at runtime. - -* Round-trip native scalars to the ``fill_value`` field in Zarr V2 and V3 array metadata documents. The Zarr V2 and V3 specifications - define how scalars of each data type should be stored as JSON in array metadata documents, and in principle each data type - can define this encoding separately. - -* Do all of the above for *user-defined data types*. Zarr-Python should support data types added as extensions,so we cannot - hard-code the list of data types. We need to ensure that users can easily (or easily enough) define a python object - that models their custom data type and register this object with Zarr-Python, so that the above operations all succeed for their - custom data type. - -To achieve these goals, Zarr Python uses a class called :class:`zarr.core.dtype.DTypeWrapper` to wrap native data types. Each data type -supported by Zarr Python is modeled by a subclass of `DTypeWrapper`, which has the following structure: - -(attribute) ``dtype_cls`` -^^^^^^^^^^^^^^^^^^^^^^^^^ -The ``dtype_cls`` attribute is a **class variable** that is bound to a class that can produce -an instance of a native data type. 
For example, on the ``DTypeWrapper`` used to model the boolean -data type, the ``dtype_cls`` attribute is bound to the numpy bool data type class: ``np.dtypes.BoolDType``. -This attribute is used when we need to create an instance of the native data type, for example when -defining a Numpy array that will contain Zarr data. - -It might seem odd that ``DTypeWrapper.dtype_cls`` binds to a *class* that produces a native data type instead of an instance of that native data type -- -why not have a ``DTypeWrapper.dtype`` attribute that binds to ``np.dtypes.BoolDType()``? The reason why ``DTypeWrapper`` -doesn't wrap a concrete data type instance is because data type instances may have endianness information, but Zarr V3 -data types do not. To model Zarr V3 data types, we need endianness to be an **instance variable** which is -defined when creating an instance of the ```DTypeWrapper``. Subclasses of ``DTypeWrapper`` that model data types with -byte order semantics thus have ``endianness`` as an instance variable, and this value can be set when creating an instance of the wrapper. - - -(attribute) ``_zarr_v3_name`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The ``_zarr_v3_name`` attribute encodes the canonical name for a data type for Zarr V3. For many data types these names -are defined in the `Zarr V3 specification `_ For nearly all of the -data types defined in Zarr V3, this name can be used to uniquely specify a data type. The one exception is the ``r*`` data type, -which is parametrized by a number of bits, and so may take the form ``r8``, ``r16``, ... etc. - -(class method) ``from_dtype(cls, dtype) -> Self`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This method defines a procedure for safely converting a native dtype instance into an instance of ``DTypeWrapper``. It should perform -validation of its input to ensure that the native dtype is an instance of the ``dtype_cls`` class attribute, for example. 
For some -data types, additional checks are needed -- in Numpy "structured" data types and "void" data types use the same class, with different properties. -A ``DTypeWrapper`` that wraps Numpy structured data types must do additional checks to ensure that the input ``dtype`` is actually a structured data type. -If input validation succeeds, this method will call ``_from_dtype_unsafe``. - -(method) ``to_dtype(self) -> dtype`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This method produces a native data type consistent with the properties of the ``DTypeWrapper``. Together -with ``from_dtype``, this method allows round-trip conversion of a native data type in to a wrapper class and then out again. - -That is, for some ``DTypeWrapper`` class ``FooWrapper`` that wraps a native data type called ``foo``, ``FooWrapper.from_dtype(instance_of_foo).to_dtype() == instance_of_foo`` should be true. - -(method) ``to_dict(self) -> dict`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This method generates a JSON-serialiazable representation of the wrapped data type which can be stored in -Zarr metadata. - -(method) ``cast_value(self, value: object) -> scalar`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This method converts a python object to an instance of the wrapped data type. It is used for generating the default -value associated with this data type. - - -(method) ``default_value(self) -> scalar`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This method returns the default value for the wrapped data type. Zarr-Python uses this method to generate a default fill value -for an array when a user has not requested one. - -Why is this a method and not a static attribute? Although some data types -can have a static default value, parametrized data types like fixed-length strings or structured data types cannot. For these data types, -a default value must be calculated based on the attributes of the wrapped data type. 
- -(class method) ``check_dtype(cls, dtype) -> bool`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This class method checks if a native dtype is compatible with the ``DTypeWrapper`` class. It returns ``True`` -if ``dtype`` is compatible with the wrapper class, and ``False`` otherwise. For many data types, this check is as simple -as checking that ``cls.dtype_cls`` matches ``type(dtype)``, i.e. checking that the data type class wrapped -by the ``DTypeWrapper`` is the same as the class of ``dtype``. But there are some data types where this check alone is not sufficient, -in which case this method is overridden so that additional properties of ``dtype`` can be inspected and compared with -the expectations of ``cls``. - -(class method) ``from_dict(cls, dtype) -> Self`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This class method creates a ``DTypeWrapper`` from an appropriately structured dictionary. The default -implementation first checks that the dictionary has the correct structure, and then uses its data -to instantiate the ``DTypeWrapper`` instance. - -(method) ``to_dict(self) -> dict[str, JSON]`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Returns a dictionary form of the wrapped data type. This is used prior to writing array metadata. - -(class method) ``get_name(self, zarr_format: Literal[2, 3]) -> str`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This method generates a name for the wrapped data type, depending on the Zarr format. If ``zarr_format`` is -2 and the wrapped data type is a Numpy data type, then the Numpy string representation of that data type is returned. -If ``zarr_format`` is 3, then the Zarr V3 name for the wrapped data type is returned. For most data types -the Zarr V3 name will be stored as the ``_zarr_v3_name`` class attribute, but for parametric data types the -name must be computed at runtime based on the parameters of the data type. 
- - -(method) ``to_json_value(self, data: scalar, zarr_format: Literal[2, 3]) -> JSON`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This method converts a scalar instance of the data type into a JSON-serialiazable value. -For some data types like bool and integers this conversion is simple -- just return a JSON boolean -or number -- but other data types define a JSON serialization for scalars that is a bit more involved. -And this JSON serialization depends on the Zarr format. - -(method) ``from_json_value(self, data: JSON, zarr_format: Literal[2, 3]) -> scalar`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Convert a JSON-serialiazed scalar to a native scalar. This inverts the operation of ``to_json_value``. - -Using a custom data type ------------------------- - -TODO \ No newline at end of file +To abstract over these syntactical and semantic differences, Zarr-Python uses a class called `ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_ to wrap native data types (e.g., Numpy data types) and provide Zarr V2 and Zarr V3 compatibility routines. +Each data type supported by Zarr-Python is modeled by a subclass of ``ZDType``, which provides an API for the following operations: + +- Wrapping / unwrapping a native data type +- Encoding / decoding a data type to / from Zarr V2 and Zarr V3 array metadata. +- Encoding / decoding a scalar value to / from Zarr V2 and Zarr V3 array metadata. + + +Example Usage +~~~~~~~~~~~~~ + +.. 
code-block:: python + + from zarr.core.dtype.wrapper import Int8 + + # Create a ZDType instance from a native dtype + int8 = Int8.from_dtype(np.dtype('int8')) + + # Convert back to native dtype + native_dtype = int8.to_dtype() + assert native_dtype == np.dtype('int8') + + # Get the default value + default_value = int8.default_value() + assert default_value == np.int8(0) + + # Serialize to JSON + json_representation = int8.to_json(zarr_format=3) + + # Serialize a scalar value + json_value = int8.to_json_value(42, zarr_format=3) + assert json_value == 42 + + # Deserialize a scalar value + scalar_value = int8.from_json_value(42, zarr_format=3) + assert scalar_value == np.int8(42) + +Custom Data Types +~~~~~~~~~~~~~~~~~ + +Users can define custom data types by subclassing `ZDType` and implementing the required methods. +Once defined, the custom data type can be registered with Zarr-Python to enable seamless integration with the library. + + \ No newline at end of file From 9a2eb93670b5607f22d484553dfab65f047fc5e6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 24 Mar 2025 17:43:12 +0100 Subject: [PATCH 050/129] add failing endianness round-trip test --- tests/test_array.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tests/test_array.py b/tests/test_array.py index 4f436a84f4..98553860e3 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -7,7 +7,7 @@ import re import sys from itertools import accumulate -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any, Literal, get_args from unittest import mock import numcodecs @@ -42,7 +42,8 @@ from zarr.core.chunk_grids import _auto_partition from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.core.dtype._numpy import Float64 +from zarr.core.dtype._numpy import Float64, endianness_from_numpy_str +from zarr.core.dtype.common import Endianness from 
zarr.core.dtype.wrapper import ZDType from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv @@ -1661,3 +1662,20 @@ async def test_sharding_coordinate_selection() -> None: ) arr[:] = np.arange(2 * 3 * 4).reshape((2, 3, 4)) assert (arr[1, [0, 1]] == np.array([[12, 13, 14, 15], [16, 17, 18, 19]])).all() + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("endianness", get_args(Endianness)) +def test_endianness(store: Store, zarr_format: ZarrFormat, endianness: Endianness) -> None: + """ + Test that that endianness is correctly set when creating an array. + """ + if endianness == "little": + np_dtype = " Date: Mon, 24 Mar 2025 18:28:45 +0100 Subject: [PATCH 051/129] fix endianness --- src/zarr/core/array.py | 33 ++++++++++++++++ src/zarr/dtype.py | 88 +----------------------------------------- tests/test_array.py | 45 ++++++++++++++++----- 3 files changed, 70 insertions(+), 96 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index a0e4f9c1ab..1ac06f731e 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -72,6 +72,7 @@ ZDTypeLike, parse_data_type, ) +from zarr.core.dtype._numpy import HasEndianness from zarr.core.indexing import ( BasicIndexer, BasicSelection, @@ -4682,6 +4683,24 @@ def _get_default_chunk_encoding_v3( else: serializer = zarr_config.get("array.v3_default_serializer.default") + # Modify the default serializer so that it matches the endianness of the dtype, otherwise unset the + # endian key + + # This is effective problematic for many reasons: + # - we are assuming that endianness is set by the serializer, when it could also be changed + # by any one of the filters. + # - we are assuming that the serializer has a specific configuration. A different serializer that + # alters endianness might not use the same configuration structure. + # - we are mutating a configuration dictionary. 
It would be much better to work with the codec + # api for this. + # All of these things are acceptable right now because there is only 1 serializer that affects + # endianness, but this design will not last if this situation changes. + if "endian" in serializer["configuration"]: + if isinstance(dtype, HasEndianness): + serializer["configuration"]["endian"] = dtype.endianness + else: + serializer["configuration"].pop("endian") + return ( tuple(_parse_array_array_codec(f) for f in filters), _parse_array_bytes_codec(serializer), @@ -4816,6 +4835,20 @@ def _parse_chunk_encoding_v3( # array-array codecs. For example, if a sequence of array-array codecs produces an # array with a single-byte data type, then the serializer should not specify endiannesss. out_array_bytes = _parse_array_bytes_codec(serializer) + # check that the endianness of the requested serializer matches the dtype of the data, if applicable + if ( + isinstance(out_array_bytes, BytesCodec) + and isinstance(dtype, HasEndianness) + and ( + out_array_bytes.endian is None + or str(out_array_bytes.endian.value) != dtype.endianness + ) + ): + msg = ( + f"The endianness of the requested serializer ({out_array_bytes}) does not match the endianness of the dtype ({dtype.endianness}). " + "The endianness of the serializer and the dtype must match." + ) + raise ValueError(msg) if compressors is None: out_bytes_bytes: tuple[BytesBytesCodec, ...] 
= () diff --git a/src/zarr/dtype.py b/src/zarr/dtype.py index 79f3aa3a0f..6e3789543b 100644 --- a/src/zarr/dtype.py +++ b/src/zarr/dtype.py @@ -1,87 +1,3 @@ -from zarr.core.dtype import ( - Bool, - Complex64, - Complex128, - DataTypeValidationError, - DateTime64, - DateTime64JSON_V2, - DateTime64JSON_V3, - FixedLengthUTF32, - FixedLengthUTF32JSON_V2, - FixedLengthUTF32JSON_V3, - Float16, - Float32, - Float64, - Int8, - Int16, - Int32, - Int64, - NullTerminatedBytes, - NullterminatedBytesJSON_V2, - NullTerminatedBytesJSON_V3, - RawBytes, - RawBytesJSON_V2, - RawBytesJSON_V3, - Structured, - StructuredJSON_V2, - StructuredJSON_V3, - TimeDelta64, - TimeDelta64JSON_V2, - TimeDelta64JSON_V3, - UInt8, - UInt16, - UInt32, - UInt64, - VariableLengthBytes, - VariableLengthBytesJSON_V2, - VariableLengthUTF8, - VariableLengthUTF8JSON_V2, - ZDType, - data_type_registry, - parse_data_type, -) +from zarr.core.dtype import ZDType, data_type_registry -__all__ = [ - "Bool", - "Complex64", - "Complex128", - "DataTypeValidationError", - "DateTime64", - "DateTime64JSON_V2", - "DateTime64JSON_V3", - "FixedLengthUTF32", - "FixedLengthUTF32JSON_V2", - "FixedLengthUTF32JSON_V3", - "Float16", - "Float32", - "Float64", - "Int8", - "Int16", - "Int32", - "Int64", - "NullTerminatedBytes", - "NullTerminatedBytesJSON_V3", - "NullterminatedBytesJSON_V2", - "RawBytes", - "RawBytesJSON_V2", - "RawBytesJSON_V3", - "Structured", - "StructuredJSON_V2", - "StructuredJSON_V3", - "TimeDelta64", - "TimeDelta64", - "TimeDelta64JSON_V2", - "TimeDelta64JSON_V3", - "UInt8", - "UInt16", - "UInt32", - "UInt64", - "VariableLengthBytes", - "VariableLengthBytesJSON_V2", - "VariableLengthUTF8", - "VariableLengthUTF8JSON_V2", - "ZDType", - "data_type_registry", - "data_type_registry", - "parse_data_type", -] +__all__ = ["ZDType", "data_type_registry"] diff --git a/tests/test_array.py b/tests/test_array.py index 98553860e3..6163470c29 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -42,11 +42,12 @@ from 
zarr.core.chunk_grids import _auto_partition from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.core.dtype._numpy import Float64, endianness_from_numpy_str +from zarr.core.dtype._numpy import Float64, Int16, endianness_from_numpy_str from zarr.core.dtype.common import Endianness from zarr.core.dtype.wrapper import ZDType from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv +from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError from zarr.storage import LocalStore, MemoryStore, StorePath @@ -56,7 +57,6 @@ if TYPE_CHECKING: from zarr.core.array_spec import ArrayConfigLike from zarr.core.metadata.v2 import ArrayV2Metadata - from zarr.core.metadata.v3 import ArrayV3Metadata @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) @@ -1666,16 +1666,41 @@ async def test_sharding_coordinate_selection() -> None: @pytest.mark.parametrize("store", ["memory"], indirect=True) @pytest.mark.parametrize("endianness", get_args(Endianness)) -def test_endianness(store: Store, zarr_format: ZarrFormat, endianness: Endianness) -> None: +def test_default_endianness(store: Store, zarr_format: ZarrFormat, endianness: Endianness) -> None: """ - Test that that endianness is correctly set when creating an array. 
+ Test that that endianness is correctly set when creating an array when not specifying a serializer + """ + dtype = Int16(endianness=endianness) + arr = zarr.create_array(store=store, shape=(1,), dtype=dtype, zarr_format=zarr_format) + assert endianness_from_numpy_str(arr[:].dtype.byteorder) == endianness + if zarr_format == 3: + assert isinstance(arr.metadata, ArrayV3Metadata) # mypy + assert str(arr.metadata.codecs[0].endian.value) == endianness # type: ignore[union-attr] + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("endianness", get_args(Endianness)) +def test_explicit_endianness(store: Store, endianness: Endianness) -> None: + """ + Test that that a mismatch between the bytescodec endianness and the dtype endianness is an error """ if endianness == "little": - np_dtype = " Date: Mon, 24 Mar 2025 18:38:08 +0100 Subject: [PATCH 052/129] additional check in test_explicit_endianness --- tests/test_array.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_array.py b/tests/test_array.py index 6163470c29..714963a7cb 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1704,3 +1704,19 @@ def test_explicit_endianness(store: Store, endianness: Endianness) -> None: zarr_format=3, serializer=serializer, ) + + # additional check for the case where the serializer has endian=None + none_serializer = dataclasses.replace(serializer, endian=None) + msg = ( + f"The endianness of the requested serializer ({none_serializer}) does not match the endianness of the dtype ({dtype.endianness}). " + "The endianness of the serializer and the dtype must match." 
+ ) + + with pytest.raises(ValueError, match=re.escape(msg)): + _ = zarr.create_array( + store=store, + shape=(1,), + dtype=dtype, + zarr_format=3, + serializer=none_serializer, + ) From 4dc9cd2121e4232faab8ba29239a1215ee96818d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 24 Mar 2025 21:22:52 +0100 Subject: [PATCH 053/129] add failing test for round-tripping vlen strings --- src/zarr/core/array.py | 4 +- src/zarr/core/dtype/__init__.py | 7 +- src/zarr/core/dtype/_numpy.py | 2 +- src/zarr/core/dtype/common.py | 5 +- tests/test_array.py | 171 +++++++++++++++++++------------- 5 files changed, 116 insertions(+), 73 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 1ac06f731e..8aba3e564f 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4695,11 +4695,11 @@ def _get_default_chunk_encoding_v3( # api for this. # All of these things are acceptable right now because there is only 1 serializer that affects # endianness, but this design will not last if this situation changes. - if "endian" in serializer["configuration"]: + if serializer.get("configuration") is not None: if isinstance(dtype, HasEndianness): serializer["configuration"]["endian"] = dtype.endianness else: - serializer["configuration"].pop("endian") + serializer["configuration"].pop("endian", None) return ( tuple(_parse_array_array_codec(f) for f in filters), diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 021b6b48e2..f9b1364011 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -99,7 +99,12 @@ def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[_BaseDType, # this is a valid _VoidDTypeLike check na_dtype = np.dtype([tuple(d) for d in dtype]) else: - na_dtype = np.dtype(dtype) + if dtype == "|T16": + # `|T16` is the numpy dtype str form for variable length strings. 
unfortunately + # numpy cannot create these directly from np.dtype("|T16") + na_dtype = np.dtypes.StringDType() + else: + na_dtype = np.dtype(dtype) else: na_dtype = dtype return data_type_registry.match_dtype(na_dtype) diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index 38597f8fee..cab849cf74 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -907,7 +907,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: raise DataTypeValidationError(f"Invalid type: {data}. Expected a string.") def check_value(self, data: object) -> bool: - return isinstance(data, np.bytes_ | str | bytes) + return isinstance(data, np.bytes_ | str | bytes | np.void) def _cast_value_unsafe(self, value: object) -> np.void: return self.to_dtype().type(value) # type: ignore[call-overload, no-any-return] diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index a53d2e7866..900b3fddbd 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -356,7 +356,10 @@ def bytes_from_json(data: str, zarr_format: ZarrFormat) -> bytes: """ if zarr_format == 2: return base64.b64decode(data.encode("ascii")) - raise NotImplementedError(f"Invalid zarr format: {zarr_format}. Expected 2.") + # TODO: differentiate these as needed. This is a spec question. + if zarr_format == 3: + return base64.b64decode(data.encode("ascii")) + raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") def float_from_json_v2(data: JSONFloat) -> float: diff --git a/tests/test_array.py b/tests/test_array.py index 714963a7cb..0cd28e3e94 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -42,7 +42,13 @@ from zarr.core.chunk_grids import _auto_partition from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.core.dtype._numpy import Float64, Int16, endianness_from_numpy_str +from zarr.core.dtype._numpy import ( + DateTime64, + Float64, + Int16, + Structured, + endianness_from_numpy_str, +) from zarr.core.dtype.common import Endianness from zarr.core.dtype.wrapper import ZDType from zarr.core.group import AsyncGroup @@ -981,16 +987,58 @@ def test_chunks_and_shards(store: Store) -> None: @staticmethod @pytest.mark.parametrize("dtype", zdtype_examples) - @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") def test_default_fill_value(dtype: ZDType[Any, Any], store: Store) -> None: """ Test that the fill value of an array is set to the default value for the dtype object """ a = zarr.create_array(store, shape=(5,), chunks=(5,), dtype=dtype) - if isinstance(dtype, DateTime64 | TimeDelta64) and np.isnat(a.fill_value): - assert np.isnat(dtype.default_scalar()) + if isinstance(dtype, DateTime64) and np.isnat(a.fill_value): + assert np.isnat(dtype.default_value()) else: - assert a.fill_value == dtype.default_scalar() + assert a.fill_value == dtype.default_value() + + @staticmethod + @pytest.mark.parametrize("dtype", zdtype_examples) + def test_dtype_forms(dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFormat) -> None: + """ + Test that the same array is produced from a ZDType instance, a numpy dtype, or a numpy string + """ + a = zarr.create_array( + store, name="a", shape=(5,), chunks=(5,), dtype=dtype, zarr_format=zarr_format + ) + b = zarr.create_array( + store, + name="b", + shape=(5,), + chunks=(5,), + 
dtype=dtype.to_dtype(), + zarr_format=zarr_format, + ) + assert a.dtype == b.dtype + + # Structured dtypes do not have a numpy string representation that uniquely identifies them + if not isinstance(dtype, Structured): + c = zarr.create_array( + store, + name="c", + shape=(5,), + chunks=(5,), + dtype=dtype.to_dtype().str, + zarr_format=zarr_format, + ) + assert a.dtype == c.dtype + + @staticmethod + @pytest.mark.parametrize("dtype", zdtype_examples) + def test_dtype_roundtrip( + dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFormat + ) -> None: + """ + Test that creating an array, then opening it, gets the same array. + """ + a = zarr.create_array(store, shape=(5,), chunks=(5,), dtype=dtype, zarr_format=zarr_format) + b = zarr.open_array(store) + assert a.dtype == b.dtype @staticmethod @pytest.mark.parametrize("dtype", ["uint8", "float32", "str", "U3", "S4", "V1"]) @@ -1381,18 +1429,63 @@ async def test_name(store: Store, zarr_format: ZarrFormat, path: str | None) -> ) @staticmethod - @pytest.mark.parametrize("endianness", ENDIANNESS_STR) + @pytest.mark.parametrize("endianness", get_args(Endianness)) def test_default_endianness( - store: Store, zarr_format: ZarrFormat, endianness: EndiannessStr + store: Store, zarr_format: ZarrFormat, endianness: Endianness ) -> None: """ Test that that endianness is correctly set when creating an array when not specifying a serializer """ dtype = Int16(endianness=endianness) arr = zarr.create_array(store=store, shape=(1,), dtype=dtype, zarr_format=zarr_format) - byte_order: str = arr[:].dtype.byteorder # type: ignore[union-attr] - assert byte_order in NUMPY_ENDIANNESS_STR - assert endianness_from_numpy_str(byte_order) == endianness # type: ignore[arg-type] + assert endianness_from_numpy_str(arr[:].dtype.byteorder) == endianness + if zarr_format == 3: + assert isinstance(arr.metadata, ArrayV3Metadata) # mypy + assert str(arr.metadata.codecs[0].endian.value) == endianness # type: ignore[union-attr] + + @staticmethod + 
@pytest.mark.parametrize("endianness", get_args(Endianness)) + def test_explicit_endianness(store: Store, endianness: Endianness) -> None: + """ + Test that that a mismatch between the bytescodec endianness and the dtype endianness is an error + """ + if endianness == "little": + dtype = Int16(endianness="big") + else: + dtype = Int16(endianness="little") + + serializer = BytesCodec(endian=endianness) + + msg = ( + f"The endianness of the requested serializer ({serializer}) does not match the endianness of the dtype ({dtype.endianness}). " + "The endianness of the serializer and the dtype must match." + ) + + with pytest.raises(ValueError, match=re.escape(msg)): + _ = zarr.create_array( + store=store, + shape=(1,), + dtype=dtype, + zarr_format=3, + serializer=serializer, + ) + + # additional check for the case where the serializer has endian=None + none_serializer = dataclasses.replace(serializer, endian=None) + msg = ( + f"The endianness of the requested serializer ({none_serializer}) does not match the endianness of the dtype ({dtype.endianness}). " + "The endianness of the serializer and the dtype must match." 
+ ) + + with pytest.raises(ValueError, match=re.escape(msg)): + _ = zarr.create_array( + store=store, + shape=(1,), + dtype=dtype, + zarr_format=3, + serializer=none_serializer, + ) + @pytest.mark.parametrize("value", [1, 1.4, "a", b"a", np.array(1)]) @@ -1662,61 +1755,3 @@ async def test_sharding_coordinate_selection() -> None: ) arr[:] = np.arange(2 * 3 * 4).reshape((2, 3, 4)) assert (arr[1, [0, 1]] == np.array([[12, 13, 14, 15], [16, 17, 18, 19]])).all() - - -@pytest.mark.parametrize("store", ["memory"], indirect=True) -@pytest.mark.parametrize("endianness", get_args(Endianness)) -def test_default_endianness(store: Store, zarr_format: ZarrFormat, endianness: Endianness) -> None: - """ - Test that that endianness is correctly set when creating an array when not specifying a serializer - """ - dtype = Int16(endianness=endianness) - arr = zarr.create_array(store=store, shape=(1,), dtype=dtype, zarr_format=zarr_format) - assert endianness_from_numpy_str(arr[:].dtype.byteorder) == endianness - if zarr_format == 3: - assert isinstance(arr.metadata, ArrayV3Metadata) # mypy - assert str(arr.metadata.codecs[0].endian.value) == endianness # type: ignore[union-attr] - - -@pytest.mark.parametrize("store", ["memory"], indirect=True) -@pytest.mark.parametrize("endianness", get_args(Endianness)) -def test_explicit_endianness(store: Store, endianness: Endianness) -> None: - """ - Test that that a mismatch between the bytescodec endianness and the dtype endianness is an error - """ - if endianness == "little": - dtype = Int16(endianness="big") - else: - dtype = Int16(endianness="little") - - serializer = BytesCodec(endian=endianness) - - msg = ( - f"The endianness of the requested serializer ({serializer}) does not match the endianness of the dtype ({dtype.endianness}). " - "The endianness of the serializer and the dtype must match." 
- ) - - with pytest.raises(ValueError, match=re.escape(msg)): - _ = zarr.create_array( - store=store, - shape=(1,), - dtype=dtype, - zarr_format=3, - serializer=serializer, - ) - - # additional check for the case where the serializer has endian=None - none_serializer = dataclasses.replace(serializer, endian=None) - msg = ( - f"The endianness of the requested serializer ({none_serializer}) does not match the endianness of the dtype ({dtype.endianness}). " - "The endianness of the serializer and the dtype must match." - ) - - with pytest.raises(ValueError, match=re.escape(msg)): - _ = zarr.create_array( - store=store, - shape=(1,), - dtype=dtype, - zarr_format=3, - serializer=none_serializer, - ) From e31e8135bd2ae7d81b2a3752a487fd30e56f2dcc Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 25 Mar 2025 10:11:54 +0100 Subject: [PATCH 054/129] route object dtype arrays to vlen string dtype when numpy > 2 --- src/zarr/core/dtype/__init__.py | 12 +++++------- src/zarr/core/dtype/_numpy.py | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index f9b1364011..5483b21998 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -90,7 +90,10 @@ def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[_BaseDType, """ data_type_registry.lazy_load() if not isinstance(dtype, np.dtype): - if dtype in (str, "str"): + # TODO: This check has a lot of assumptions in it! Chiefly, we assume that the + # numpy object dtype contains variable length strings, which is not in general true + # When / if zarr python supports ragged arrays, for example, this check will fail! 
+ if dtype in (str, "str", "|T16", "O", "|O", np.dtypes.ObjectDType()): if _NUMPY_SUPPORTS_VLEN_STRING: na_dtype = np.dtype("T") else: @@ -99,12 +102,7 @@ def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[_BaseDType, # this is a valid _VoidDTypeLike check na_dtype = np.dtype([tuple(d) for d in dtype]) else: - if dtype == "|T16": - # `|T16` is the numpy dtype str form for variable length strings. unfortunately - # numpy cannot create these directly from np.dtype("|T16") - na_dtype = np.dtypes.StringDType() - else: - na_dtype = np.dtype(dtype) + na_dtype = np.dtype(dtype) else: na_dtype = dtype return data_type_registry.match_dtype(na_dtype) diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index cab849cf74..7c803ce1f0 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -1051,7 +1051,7 @@ def _cast_value_unsafe(self, value: object) -> str: return str(value) else: - + # Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead. @dataclass(frozen=True, kw_only=True) class VariableLengthString(ZDType[np.dtypes.ObjectDType, str]): # type: ignore[no-redef] dtype_cls = np.dtypes.ObjectDType From 844a94d60d9f237074353e5522f26f6a1ad0f729 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 25 Mar 2025 10:57:22 +0100 Subject: [PATCH 055/129] relax endianness mismatch to a warning instead of an error --- src/zarr/core/array.py | 4 ++-- tests/test_array.py | 21 +++------------------ 2 files changed, 5 insertions(+), 20 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 8aba3e564f..b2058df607 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4846,9 +4846,9 @@ def _parse_chunk_encoding_v3( ): msg = ( f"The endianness of the requested serializer ({out_array_bytes}) does not match the endianness of the dtype ({dtype.endianness}). " - "The endianness of the serializer and the dtype must match." 
+ "In this situation the serializer's endianness takes priority. To avoid this warning, ensure the endianness of the serializer matches the endianness of the dtype." ) - raise ValueError(msg) + warnings.warn(msg, UserWarning, stacklevel=2) if compressors is None: out_bytes_bytes: tuple[BytesBytesCodec, ...] = () diff --git a/tests/test_array.py b/tests/test_array.py index 0cd28e3e94..ea3b64d87d 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1458,10 +1458,11 @@ def test_explicit_endianness(store: Store, endianness: Endianness) -> None: msg = ( f"The endianness of the requested serializer ({serializer}) does not match the endianness of the dtype ({dtype.endianness}). " - "The endianness of the serializer and the dtype must match." + "In this situation the serializer's endianness takes priority. " + "To avoid this warning, ensure the endianness of the serializer matches the endianness of the dtype." ) - with pytest.raises(ValueError, match=re.escape(msg)): + with pytest.warns(UserWarning, match=re.escape(msg)): _ = zarr.create_array( store=store, shape=(1,), @@ -1470,22 +1471,6 @@ def test_explicit_endianness(store: Store, endianness: Endianness) -> None: serializer=serializer, ) - # additional check for the case where the serializer has endian=None - none_serializer = dataclasses.replace(serializer, endian=None) - msg = ( - f"The endianness of the requested serializer ({none_serializer}) does not match the endianness of the dtype ({dtype.endianness}). " - "The endianness of the serializer and the dtype must match." 
- ) - - with pytest.raises(ValueError, match=re.escape(msg)): - _ = zarr.create_array( - store=store, - shape=(1,), - dtype=dtype, - zarr_format=3, - serializer=none_serializer, - ) - @pytest.mark.parametrize("value", [1, 1.4, "a", b"a", np.array(1)]) From aa19ccaabd4aa2aef69d070c6a5a46717d3d393f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 25 Mar 2025 12:14:56 +0100 Subject: [PATCH 056/129] use public dtype module for docs instead of special-casing the core dype module --- docs/conf.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 8a64836d5e..a83461d54a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -51,7 +51,7 @@ issues_github_path = "zarr-developers/zarr-python" autoapi_dirs = ['../src/zarr'] -autoapi_add_toctree_entry = False +autoapi_add_toctree_entry = True autoapi_generate_api_docs = True autoapi_member_order = "groupwise" autoapi_root = "api" @@ -68,10 +68,7 @@ def skip_submodules( ) -> bool: # Skip documenting zarr.codecs submodules # codecs are documented in the main zarr.codecs namespace - # TODO: just document everything instead using this weak case-by-case logic - if what == "module" and name.startswith("zarr.core.dtype."): - skip = False - elif what == "module" and name.startswith("zarr.codecs.") or name.startswith("zarr.core"): + if what == "module" and name.startswith("zarr.codecs.") or name.startswith("zarr.core"): skip = True return skip From 528cf2834972d9cc0280b5cc9d587cc7db5af18d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 25 Mar 2025 12:15:08 +0100 Subject: [PATCH 057/129] use public dtype module for docs instead of special-casing the core dype module --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index a83461d54a..803d2c4255 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -51,7 +51,7 @@ issues_github_path = "zarr-developers/zarr-python" autoapi_dirs = ['../src/zarr'] 
-autoapi_add_toctree_entry = True +autoapi_add_toctree_entry = False autoapi_generate_api_docs = True autoapi_member_order = "groupwise" autoapi_root = "api" From cdc83a8ee30391204dd3b36117401702d7211cdd Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 25 Mar 2025 12:35:17 +0100 Subject: [PATCH 058/129] silence mypy error about array indexing --- tests/test_array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_array.py b/tests/test_array.py index ea3b64d87d..66806d017c 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1739,4 +1739,4 @@ async def test_sharding_coordinate_selection() -> None: shards=(2, 4, 4), ) arr[:] = np.arange(2 * 3 * 4).reshape((2, 3, 4)) - assert (arr[1, [0, 1]] == np.array([[12, 13, 14, 15], [16, 17, 18, 19]])).all() + assert (arr[1, [0, 1]] == np.array([[12, 13, 14, 15], [16, 17, 18, 19]])).all() # type: ignore[index] From 78747c920187f415173754e08e278591e101a37f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 25 Mar 2025 12:35:38 +0100 Subject: [PATCH 059/129] add release note --- changes/2874.feature.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 changes/2874.feature.rst diff --git a/changes/2874.feature.rst b/changes/2874.feature.rst new file mode 100644 index 0000000000..26eda3a257 --- /dev/null +++ b/changes/2874.feature.rst @@ -0,0 +1,2 @@ +Adds zarr-specific data type classes. This replaces the direct use of numpy data types for zarr +v2 and a fixed set of string enums for zarr v3. 
For more on this new feature, see the `documentation `_ \ No newline at end of file From 901be0d39942c7db810b7e5f5e1993b17ce8ece6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 25 Mar 2025 12:50:26 +0100 Subject: [PATCH 060/129] fix doctests, excluding config tests --- docs/user-guide/groups.rst | 4 ++-- docs/user-guide/performance.rst | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/user-guide/groups.rst b/docs/user-guide/groups.rst index 9b241d2455..f9633dd6c1 100644 --- a/docs/user-guide/groups.rst +++ b/docs/user-guide/groups.rst @@ -128,7 +128,7 @@ property. E.g.:: >>> bar.info_complete() Type : Array Zarr format : 3 - Data type : int64 + Data type : Int64(endianness='little') Shape : (1000000,) Chunk shape : (100000,) Order : C @@ -144,7 +144,7 @@ property. E.g.:: >>> baz.info Type : Array Zarr format : 3 - Data type : float32 + Data type : Float32(endianness='little') Shape : (1000, 1000) Chunk shape : (100, 100) Order : C diff --git a/docs/user-guide/performance.rst b/docs/user-guide/performance.rst index 6a60edbbd3..9f2e730785 100644 --- a/docs/user-guide/performance.rst +++ b/docs/user-guide/performance.rst @@ -91,7 +91,7 @@ To use sharding, you need to specify the ``shards`` parameter when creating the >>> z6.info Type : Array Zarr format : 3 - Data type : uint8 + Data type : UInt8() Shape : (10000, 10000, 1000) Shard shape : (1000, 1000, 1000) Chunk shape : (100, 100, 100) @@ -121,7 +121,7 @@ ratios, depending on the correlation structure within the data. E.g.:: >>> c.info_complete() Type : Array Zarr format : 3 - Data type : int32 + Data type : Int32(endianness='little') Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : C @@ -140,7 +140,7 @@ ratios, depending on the correlation structure within the data. 
E.g.:: >>> f.info_complete() Type : Array Zarr format : 3 - Data type : int32 + Data type : Int32(endianness='little') Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : F From 6eb707da999313eb4bfd38dfaf3d56415c8afef1 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 26 Mar 2025 10:17:28 +0100 Subject: [PATCH 061/129] revert addition of linkage between dtype endianness and bytes codec endianness --- src/zarr/core/array.py | 36 +++--------------------------------- tests/test_array.py | 33 +-------------------------------- 2 files changed, 4 insertions(+), 65 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index b2058df607..023e12747b 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -72,7 +72,6 @@ ZDTypeLike, parse_data_type, ) -from zarr.core.dtype._numpy import HasEndianness from zarr.core.indexing import ( BasicIndexer, BasicSelection, @@ -4683,24 +4682,6 @@ def _get_default_chunk_encoding_v3( else: serializer = zarr_config.get("array.v3_default_serializer.default") - # Modify the default serializer so that it matches the endianness of the dtype, otherwise unset the - # endian key - - # This is effective problematic for many reasons: - # - we are assuming that endianness is set by the serializer, when it could also be changed - # by any one of the filters. - # - we are assuming that the serializer has a specific configuration. A different serializer that - # alters endianness might not use the same configuration structure. - # - we are mutating a configuration dictionary. It would be much better to work with the codec - # api for this. - # All of these things are acceptable right now because there is only 1 serializer that affects - # endianness, but this design will not last if this situation changes. 
- if serializer.get("configuration") is not None: - if isinstance(dtype, HasEndianness): - serializer["configuration"]["endian"] = dtype.endianness - else: - serializer["configuration"].pop("endian", None) - return ( tuple(_parse_array_array_codec(f) for f in filters), _parse_array_bytes_codec(serializer), @@ -4835,20 +4816,6 @@ def _parse_chunk_encoding_v3( # array-array codecs. For example, if a sequence of array-array codecs produces an # array with a single-byte data type, then the serializer should not specify endiannesss. out_array_bytes = _parse_array_bytes_codec(serializer) - # check that the endianness of the requested serializer matches the dtype of the data, if applicable - if ( - isinstance(out_array_bytes, BytesCodec) - and isinstance(dtype, HasEndianness) - and ( - out_array_bytes.endian is None - or str(out_array_bytes.endian.value) != dtype.endianness - ) - ): - msg = ( - f"The endianness of the requested serializer ({out_array_bytes}) does not match the endianness of the dtype ({dtype.endianness}). " - "In this situation the serializer's endianness takes priority. To avoid this warning, ensure the endianness of the serializer matches the endianness of the dtype." - ) - warnings.warn(msg, UserWarning, stacklevel=2) if compressors is None: out_bytes_bytes: tuple[BytesBytesCodec, ...] = () @@ -4868,6 +4835,9 @@ def _parse_chunk_encoding_v3( # TODO: refactor so that the config only contains the name of the codec, and we use the dtype # to create the codec instance, instead of storing a dict representation of a full codec. + # TODO: ensure that the serializer is compatible with the ndarray produced by the + # array-array codecs. For example, if a sequence of array-array codecs produces an + # array with a single-byte data type, then the serializer should not specify endiannesss. 
if isinstance(out_array_bytes, BytesCodec) and dtype.to_dtype().itemsize == 1: # The default endianness in the bytescodec might not be None, so we need to replace it out_array_bytes = replace(out_array_bytes, endian=None) diff --git a/tests/test_array.py b/tests/test_array.py index 66806d017c..1c863c3ebe 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -53,7 +53,6 @@ from zarr.core.dtype.wrapper import ZDType from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv -from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError from zarr.storage import LocalStore, MemoryStore, StorePath @@ -63,6 +62,7 @@ if TYPE_CHECKING: from zarr.core.array_spec import ArrayConfigLike from zarr.core.metadata.v2 import ArrayV2Metadata + from zarr.core.metadata.v3 import ArrayV3Metadata @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) @@ -1439,37 +1439,6 @@ def test_default_endianness( dtype = Int16(endianness=endianness) arr = zarr.create_array(store=store, shape=(1,), dtype=dtype, zarr_format=zarr_format) assert endianness_from_numpy_str(arr[:].dtype.byteorder) == endianness - if zarr_format == 3: - assert isinstance(arr.metadata, ArrayV3Metadata) # mypy - assert str(arr.metadata.codecs[0].endian.value) == endianness # type: ignore[union-attr] - - @staticmethod - @pytest.mark.parametrize("endianness", get_args(Endianness)) - def test_explicit_endianness(store: Store, endianness: Endianness) -> None: - """ - Test that that a mismatch between the bytescodec endianness and the dtype endianness is an error - """ - if endianness == "little": - dtype = Int16(endianness="big") - else: - dtype = Int16(endianness="little") - - serializer = BytesCodec(endian=endianness) - - msg = ( - f"The endianness of the requested serializer ({serializer}) does not match the endianness of the dtype ({dtype.endianness}). 
" - "In this situation the serializer's endianness takes priority. " - "To avoid this warning, ensure the endianness of the serializer matches the endianness of the dtype." - ) - - with pytest.warns(UserWarning, match=re.escape(msg)): - _ = zarr.create_array( - store=store, - shape=(1,), - dtype=dtype, - zarr_format=3, - serializer=serializer, - ) From 5f0e60fc31ebf1e1b1b6083ebb15bdb660e212cb Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 26 Mar 2025 10:45:53 +0100 Subject: [PATCH 062/129] remove Any types --- src/zarr/core/_info.py | 6 +++--- src/zarr/core/array_spec.py | 6 +++--- src/zarr/core/dtype/__init__.py | 6 +++--- src/zarr/core/dtype/_numpy.py | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 610ae48382..310ba27ea1 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -2,14 +2,14 @@ import dataclasses import textwrap -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Literal if TYPE_CHECKING: import numcodecs.abc from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.core.common import ZarrFormat - from zarr.core.dtype.wrapper import ZDType + from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar @dataclasses.dataclass(kw_only=True) @@ -80,7 +80,7 @@ class ArrayInfo: _type: Literal["Array"] = "Array" _zarr_format: ZarrFormat - _data_type: ZDType[Any, Any] + _data_type: ZDType[_BaseDType, _BaseScalar] _shape: tuple[int, ...] _shard_shape: tuple[int, ...] | None = None _chunk_shape: tuple[int, ...] 
| None = None diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index f1eac930c4..e8e451944f 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -17,7 +17,7 @@ from zarr.core.buffer import BufferPrototype from zarr.core.common import ChunkCoords - from zarr.core.dtype.wrapper import ZDType + from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar class ArrayConfigParams(TypedDict): @@ -89,7 +89,7 @@ def parse_array_config(data: ArrayConfigLike | None) -> ArrayConfig: @dataclass(frozen=True) class ArraySpec: shape: ChunkCoords - dtype: ZDType[Any, Any] + dtype: ZDType[_BaseDType, _BaseScalar] fill_value: Any config: ArrayConfig prototype: BufferPrototype @@ -97,7 +97,7 @@ class ArraySpec: def __init__( self, shape: ChunkCoords, - dtype: ZDType[Any, Any], + dtype: ZDType[_BaseDType, _BaseScalar], fill_value: Any, config: ArrayConfig, prototype: BufferPrototype, diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 5483b21998..0aaf9ccf06 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, TypeAlias, get_args +from typing import TYPE_CHECKING, TypeAlias, get_args if TYPE_CHECKING: from zarr.core.common import ZarrFormat @@ -77,7 +77,7 @@ | DateTime64 ) -ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[Any, Any] | dict[str, JSON] +ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[_BaseDType, _BaseScalar] | dict[str, JSON] for dtype in get_args(DTYPE): data_type_registry.register(dtype._zarr_v3_name, dtype) @@ -114,7 +114,7 @@ def get_data_type_from_json( return data_type_registry.match_json(dtype, zarr_format=zarr_format) -def parse_data_type(dtype: ZDTypeLike, zarr_format: ZarrFormat) -> ZDType[Any, Any]: +def parse_data_type(dtype: ZDTypeLike, zarr_format: ZarrFormat) -> ZDType[_BaseDType, _BaseScalar]: """ Interpret the input as a ZDType instance. 
""" diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py index 7c803ce1f0..51be83b173 100644 --- a/src/zarr/core/dtype/_numpy.py +++ b/src/zarr/core/dtype/_numpy.py @@ -1232,7 +1232,7 @@ def check_dtype(cls, dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: from zarr.core.dtype import get_data_type_from_native_dtype - fields: list[tuple[str, ZDType[Any, Any]]] = [] + fields: list[tuple[str, ZDType[_BaseDType, _BaseScalar]]] = [] if dtype.fields is None: raise ValueError("numpy dtype has no fields") From d8bf27437d03434ee69df2665b321c21d9ff3414 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 26 Mar 2025 11:27:54 +0100 Subject: [PATCH 063/129] add docstring for wrapper module --- src/zarr/core/dtype/wrapper.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 74e7bf79e1..ba1b78f096 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -1,3 +1,25 @@ +""" +Wrapper for native array data types. + +The `ZDType` class is an abstract base class for wrapping native array data types, e.g. numpy dtypes. +It provides a common interface for working with data types in a way that is independent of the +underlying data type system. + +The wrapper class encapsulates a native data type. Instances of the class can be created from a +native data type instance, and a native data type instance can be created from an instance of the +wrapper class. + +The wrapper class is responsible for: +- Reversibly serializing a native data type to Zarr V2 or Zarr V3 metadata. + This ensures that the data type can be properly stored and retrieved from array metadata. +- Reversibly serializing scalar values to Zarr V2 or Zarr V3 metadata. This is important for + storing a fill value for an array in a manner that is valid for the data type. 
+ +To add support for a new data type in Zarr, you should subclass the wrapper class and adapt its methods +to support your native data type. The wrapper class must be added to a data type registry +(defined elsewhere) before ``create_array`` can properly handle the new data type. +""" + from __future__ import annotations from abc import ABC, abstractmethod @@ -17,9 +39,10 @@ # This is the bound for the dtypes that we support. If we support non-numpy dtypes, # then this bound will need to be widened. _BaseDType = np.dtype[np.generic] +# These two type parameters are covariant because we want +# x : ZDType[BaseDType, BaseScalar] = ZDType[SubDType, SubScalar] +# to type check TScalar_co = TypeVar("TScalar_co", bound=_BaseScalar, covariant=True) -# TODO: figure out an interface or protocol that non-numpy dtypes can use -# These two type parameters are covariant because we want isinstance(ZDType[Subclass](), ZDType[BaseDType]) to be True TDType_co = TypeVar("TDType_co", bound=_BaseDType, covariant=True) From 7d6b86eeae69daa05cd72a910ef415514965108d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 26 Mar 2025 15:23:08 +0100 Subject: [PATCH 064/129] simplify config and docs --- docs/user-guide/arrays.rst | 18 +++++++++++++---- docs/user-guide/config.rst | 24 ++++++++++++++++++++--- src/zarr/core/array.py | 40 +++++++------------------------------- src/zarr/core/config.py | 28 ++++++++++++++------------ 4 files changed, 58 insertions(+), 52 deletions(-) diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index ad2a1e9cc6..e4b253d812 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -183,7 +183,7 @@ which can be used to print useful diagnostics, e.g.:: >>> z.info Type : Array Zarr format : 3 - Data type : int32 + Data type : Int32(endianness='little') Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : C @@ -200,7 +200,7 @@ prints additional diagnostics, e.g.:: >>> z.info_complete() Type : Array Zarr format : 3 - 
Data type : int32 + Data type : Int32(endianness='little') Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : C @@ -244,6 +244,16 @@ built-in delta filter:: >>> z.compressors (LZMA(codec_name='numcodecs.lzma', codec_config={'filters': [{'id': 3, 'dist': 4}, {'id': 33, 'preset': 1}]}),) +The default compressor can be changed by setting the value of the using Zarr's +:ref:`user-guide-config`, e.g.:: + + >>> with zarr.config.set({'array.v2_default_compressor.default': {'id': 'blosc'}}): + ... z = zarr.create_array(store={}, shape=(100000000,), chunks=(1000000,), dtype='int32', zarr_format=2) + >>> z.filters + () + >>> z.compressors + (Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),) + To disable compression, set ``compressors=None`` when creating an array, e.g.:: >>> z = zarr.create_array(store='data/example-8.zarr', shape=(100000000,), chunks=(1000000,), dtype='int32', compressors=None) @@ -277,7 +287,7 @@ Here is an example using a delta filter with the Blosc compressor:: >>> z.info_complete() Type : Array Zarr format : 3 - Data type : int32 + Data type : Int32(endianness='little') Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : C @@ -594,7 +604,7 @@ Sharded arrays can be created by providing the ``shards`` parameter to :func:`za >>> a.info_complete() Type : Array Zarr format : 3 - Data type : uint8 + Data type : UInt8() Shape : (10000, 10000) Shard shape : (1000, 1000) Chunk shape : (100, 100) diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst index 0ae8017ca9..4479e30619 100644 --- a/docs/user-guide/config.rst +++ b/docs/user-guide/config.rst @@ -43,9 +43,27 @@ This is the current default configuration:: >>> zarr.config.pprint() {'array': {'order': 'C', - 'write_empty_chunks': False}, + 'v2_default_compressor': {'default': {'checksum': False, + 'id': 'zstd', + 'level': 0}, + 'variable-length-string': {'checksum': False, + 'id': 'zstd', + 'level': 0}}, + 'v2_default_filters': {'default': None, + 
'variable-length-string': [{'id': 'vlen-utf8'}]}, + 'v3_default_compressors': {'default': [{'configuration': {'checksum': False, + 'level': 0}, + 'name': 'zstd'}], + 'variable-length-string': [{'configuration': {'checksum': False, + 'level': 0}, + 'name': 'zstd'}]}, + 'v3_default_filters': {'default': [], 'variable-length-string': []}, + 'v3_default_serializer': {'default': {'configuration': {'endian': 'little'}, + 'name': 'bytes'}, + 'variable-length-string': {'name': 'vlen-utf8'}}, + 'write_empty_chunks': False}, 'async': {'concurrency': 10, 'timeout': None}, - 'buffer': 'zarr.buffer.cpu.Buffer', + 'buffer': 'zarr.core.buffer.cpu.Buffer', 'codec_pipeline': {'batch_size': 1, 'path': 'zarr.core.codec_pipeline.BatchedCodecPipeline'}, 'codecs': {'blosc': 'zarr.codecs.blosc.BloscCodec', @@ -60,5 +78,5 @@ This is the current default configuration:: 'zstd': 'zarr.codecs.zstd.ZstdCodec'}, 'default_zarr_format': 3, 'json_indent': 2, - 'ndbuffer': 'zarr.buffer.cpu.NDBuffer', + 'ndbuffer': 'zarr.core.buffer.cpu.NDBuffer', 'threading': {'max_workers': None}} diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 023e12747b..83d9763915 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4660,27 +4660,12 @@ def _get_default_chunk_encoding_v3( """ Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype. """ - # the config will not allow keys to have "." characters in them - # so we will access the config by transforming "." 
to "__" - dtype_name_conf = dtype._zarr_v3_name.replace(".", "__") + dtype_category = categorize_data_type(dtype) - # TODO: find a registry-style solution for this that isn't bloated - # We need to associate specific dtypes with specific encoding schemes - - if dtype_name_conf in zarr_config.get("array.v3_default_filters"): - filters = zarr_config.get(f"array.v3_default_filters.{dtype_name_conf}") - else: - filters = zarr_config.get("array.v3_default_filters.default") - - if dtype_name_conf in zarr_config.get("array.v3_default_compressors"): - compressors = zarr_config.get(f"array.v3_default_compressors.{dtype_name_conf}") - else: - compressors = zarr_config.get("array.v3_default_compressors.default") - if dtype_name_conf in zarr_config.get("array.v3_default_serializer"): - serializer = zarr_config.get(f"array.v3_default_serializer.{dtype_name_conf}") - else: - serializer = zarr_config.get("array.v3_default_serializer.default") + filters = zarr_config.get("array.v3_default_filters").get(dtype_category) + compressors = zarr_config.get("array.v3_default_compressors").get(dtype_category) + serializer = zarr_config.get("array.v3_default_serializer").get(dtype_category) return ( tuple(_parse_array_array_codec(f) for f in filters), @@ -4697,20 +4682,9 @@ def _get_default_chunk_encoding_v2( This is an empty tuple. No data types have default filters. """ - # the config will not allow keys to have "." characters in them - # so we will access the config by transforming "." 
to "__" - dtype_name_conf = dtype._zarr_v3_name.replace(".", "__") - - if dtype_name_conf in zarr_config.get("array.v2_default_filters"): - filters = zarr_config.get(f"array.v2_default_filters.{dtype_name_conf}") - else: - filters = zarr_config.get("array.v2_default_filters.default") - - if dtype_name_conf in zarr_config.get("array.v2_default_compressor"): - compressor = zarr_config.get(f"array.v2_default_compressor.{dtype_name_conf}") - else: - compressor = zarr_config.get("array.v2_default_compressor.default") - + dtype_category = categorize_data_type(dtype) + filters = zarr_config.get("array.v2_default_filters").get(dtype_category) + compressor = zarr_config.get("array.v2_default_compressor").get(dtype_category) if filters is not None: filters = tuple(numcodecs.get_codec(f) for f in filters) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 8e0a55b8d0..06fa8536ae 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -36,6 +36,8 @@ if TYPE_CHECKING: from donfig.config_obj import ConfigSet + from zarr.core.dtype.wrapper import ZDType + class BadConfigError(ValueError): _msg = "bad Config: %r" @@ -104,24 +106,26 @@ def enable_gpu(self) -> ConfigSet: "array": { "order": "C", "write_empty_chunks": False, - "v2_default_compressor": {"default": {"id": "zstd", "level": 0, "checksum": False}}, + "v2_default_compressor": { + "default": {"id": "zstd", "level": 0, "checksum": False}, + "variable-length-string": {"id": "zstd", "level": 0, "checksum": False}, + }, "v2_default_filters": { "default": None, - "numpy__variable_length_utf8": [{"id": "vlen-utf8"}], - "numpy__fixed_length_ucs4": [{"id": "vlen-utf8"}], - "numpy__fixed_length_ascii": [{"id": "vlen-bytes"}], + "variable-length-string": [{"id": "vlen-utf8"}], }, - "v3_default_filters": {"default": []}, + "v3_default_filters": {"default": [], "variable-length-string": []}, "v3_default_serializer": { "default": {"name": "bytes", "configuration": {"endian": "little"}}, - 
"numpy__variable_length_utf8": {"name": "vlen-utf8"}, - "numpy__fixed_length_ucs4": {"name": "vlen-utf8"}, - "r*": {"name": "vlen-bytes"}, + "variable-length-string": {"name": "vlen-utf8"}, }, "v3_default_compressors": { "default": [ {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, - ] + ], + "variable-length-string": [ + {"name": "zstd", "configuration": {"level": 0, "checksum": False}} + ], }, }, "async": {"concurrency": 10, "timeout": None}, @@ -160,13 +164,13 @@ def parse_indexing_order(data: Any) -> Literal["C", "F"]: def categorize_data_type(dtype: ZDType[Any, Any]) -> DTypeCategory: """ - Classify a ZDType. The return value is a string which belongs to the type ``DTypeCategory``. + Classify a ZDType. The return value is a string which belongs to the type ``DTypeKind``. This is used by the config system to determine how to encode arrays with the associated data type when the user has not specified a particular serialization scheme. """ - from zarr.core.dtype import VariableLengthUTF8 + from zarr.core.dtype._numpy import VariableLengthString - if isinstance(dtype, VariableLengthUTF8): + if isinstance(dtype, VariableLengthString): return "variable-length-string" return "default" From 5382e181b3d54c5e4343e83c3439b59076c6b7cd Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 26 Mar 2025 17:32:03 +0100 Subject: [PATCH 065/129] update config test --- tests/test_config.py | 112 +++++++++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 52 deletions(-) diff --git a/tests/test_config.py b/tests/test_config.py index 38d8c1c0bd..53db9e5208 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -24,7 +24,7 @@ from zarr.core.buffer.core import Buffer from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.config import BadConfigError, config -from zarr.core.dtype import get_data_type_from_native_dtype +from zarr.core.dtype._numpy import Int8, VariableLengthString from zarr.core.indexing import 
SelectorTuple from zarr.registry import ( fully_qualified_name, @@ -47,55 +47,60 @@ def test_config_defaults_set() -> None: # regression test for available defaults - assert config.defaults == [ - { - "default_zarr_format": 3, - "array": { - "order": "C", - "write_empty_chunks": False, - "v2_default_compressor": {"default": {"id": "zstd", "level": 0, "checksum": False}}, - "v2_default_filters": { - "default": None, - "numpy__variable_length_utf8": [{"id": "vlen-utf8"}], - "numpy__fixed_length_ucs4": [{"id": "vlen-utf8"}], - "numpy__fixed_length_ascii": [{"id": "vlen-bytes"}], + assert ( + config.defaults + == [ + { + "default_zarr_format": 3, + "array": { + "order": "C", + "write_empty_chunks": False, + "v2_default_compressor": { + "default": {"id": "zstd", "level": 0, "checksum": False}, + "variable-length-string": {"id": "zstd", "level": 0, "checksum": False}, + }, + "v2_default_filters": { + "default": None, + "variable-length-string": [{"id": "vlen-utf8"}], + }, + "v3_default_filters": {"default": [], "variable-length-string": []}, + "v3_default_serializer": { + "default": {"name": "bytes", "configuration": {"endian": "little"}}, + "variable-length-string": {"name": "vlen-utf8"}, + }, + "v3_default_compressors": { + "default": [ + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, + ], + "variable-length-string": [ + {"name": "zstd", "configuration": {"level": 0, "checksum": False}} + ], + }, }, - "v3_default_filters": {"default": []}, - "v3_default_serializer": { - "default": {"name": "bytes", "configuration": {"endian": "little"}}, - "numpy__variable_length_utf8": {"name": "vlen-utf8"}, - "numpy__fixed_length_ucs4": {"name": "vlen-utf8"}, - "r*": {"name": "vlen-bytes"}, + "async": {"concurrency": 10, "timeout": None}, + "threading": {"max_workers": None}, + "json_indent": 2, + "codec_pipeline": { + "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", + "batch_size": 1, }, - "v3_default_compressors": { - "default": [ - {"name": "zstd", 
"configuration": {"level": 0, "checksum": False}}, - ] + "codecs": { + "blosc": "zarr.codecs.blosc.BloscCodec", + "gzip": "zarr.codecs.gzip.GzipCodec", + "zstd": "zarr.codecs.zstd.ZstdCodec", + "bytes": "zarr.codecs.bytes.BytesCodec", + "endian": "zarr.codecs.bytes.BytesCodec", # compatibility with earlier versions of ZEP1 + "crc32c": "zarr.codecs.crc32c_.Crc32cCodec", + "sharding_indexed": "zarr.codecs.sharding.ShardingCodec", + "transpose": "zarr.codecs.transpose.TransposeCodec", + "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", + "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", }, - }, - "async": {"concurrency": 10, "timeout": None}, - "threading": {"max_workers": None}, - "json_indent": 2, - "codec_pipeline": { - "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", - "batch_size": 1, - }, - "buffer": "zarr.core.buffer.cpu.Buffer", - "ndbuffer": "zarr.core.buffer.cpu.NDBuffer", - "codecs": { - "blosc": "zarr.codecs.blosc.BloscCodec", - "gzip": "zarr.codecs.gzip.GzipCodec", - "zstd": "zarr.codecs.zstd.ZstdCodec", - "bytes": "zarr.codecs.bytes.BytesCodec", - "endian": "zarr.codecs.bytes.BytesCodec", - "crc32c": "zarr.codecs.crc32c_.Crc32cCodec", - "sharding_indexed": "zarr.codecs.sharding.ShardingCodec", - "transpose": "zarr.codecs.transpose.TransposeCodec", - "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", - "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", - }, - } - ] + "buffer": "zarr.core.buffer.cpu.Buffer", + "ndbuffer": "zarr.core.buffer.cpu.NDBuffer", + } + ] + ) assert config.get("array.order") == "C" assert config.get("async.concurrency") == 10 assert config.get("async.timeout") is None @@ -313,15 +318,18 @@ class NewCodec2(BytesCodec): get_codec_class("new_codec") -@pytest.mark.parametrize("dtype", ["int", "bytes", "str"]) -async def test_default_codecs(dtype: str) -> None: +@pytest.mark.parametrize("dtype_category", ["variable-length-string", "default"]) +async def test_default_codecs(dtype_category: str) -> None: """ Test that the 
default compressors are sensitive to the current setting of the config. """ - zdtype = get_data_type_from_native_dtype(dtype) + if dtype_category == "variable-length-string": + zdtype = VariableLengthString() + else: + zdtype = Int8() expected_compressors = (GzipCodec(),) new_conf = { - f"array.v3_default_compressors.{zdtype._zarr_v3_name.replace('.', '__')}": [ + f"array.v3_default_compressors.{dtype_category}": [ c.to_dict() for c in expected_compressors ] } @@ -329,7 +337,7 @@ async def test_default_codecs(dtype: str) -> None: arr = await create_array( shape=(100,), chunks=(100,), - dtype=dtype, + dtype=zdtype, zarr_format=3, store=MemoryStore(), ) From 233e051fa5e533826c057e4958832c5dd44eeb52 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 26 Mar 2025 17:39:55 +0100 Subject: [PATCH 066/129] fix S dtype test for v2 --- tests/test_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_v2.py b/tests/test_v2.py index dfd81f525a..f71ba84f01 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -92,7 +92,7 @@ async def test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_js "compressor": None, "dtype": expected_dtype, "fill_value": fill_value_json, - "filters": [{"id": "vlen-bytes"}] if dtype == "|S" else None, + "filters": None, "order": "C", "shape": [3], "zarr_format": 2, From a3a17df4bf70a32dd9b9e5b180a65612528c4cf1 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 28 Apr 2025 16:30:02 +0200 Subject: [PATCH 067/129] fully remove v3jsonencoder --- src/zarr/core/metadata/v2.py | 28 ++++++++++++++--- src/zarr/core/metadata/v3.py | 39 +++++------------------- tests/test_metadata/test_consolidated.py | 2 +- tests/test_properties.py | 6 ++-- 4 files changed, 35 insertions(+), 40 deletions(-) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 90ef3c3192..6d51079025 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -1,8 +1,8 @@ from __future__ 
import annotations import warnings -from collections.abc import Iterable -from typing import TYPE_CHECKING, TypedDict +from collections.abc import Iterable, Sequence +from typing import TYPE_CHECKING, Any, TypedDict import numcodecs.abc @@ -125,7 +125,7 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: json_indent = config.get("json_indent") return { ZARRAY_JSON: prototype.buffer.from_bytes( - json.dumps(zarray_dict, indent=json_indent).encode() + json.dumps(zarray_dict, indent=json_indent, allow_nan=False).encode() ), ZATTRS_JSON: prototype.buffer.from_bytes( json.dumps(zattrs_dict, indent=json_indent, allow_nan=False).encode() @@ -194,10 +194,12 @@ def to_dict(self) -> dict[str, JSON]: new_filters.append(f) zarray_dict["filters"] = new_filters + # serialize the fill value after dtype-specific JSON encoding if self.fill_value is not None: fill_value = self.dtype.to_json_value(self.fill_value, zarr_format=2) zarray_dict["fill_value"] = fill_value + # serialize the dtype after fill value-specific JSON encoding zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2) return zarray_dict @@ -287,7 +289,25 @@ def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata: return data -def get_object_codec_id(maybe_object_codecs: Sequence[JSON]) -> str | None: +def _parse_structured_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any: + """Handle structured dtype/fill value pairs""" + try: + if isinstance(fill_value, list): + return np.array([tuple(fill_value)], dtype=dtype)[0] + elif isinstance(fill_value, tuple): + return np.array([fill_value], dtype=dtype)[0] + elif isinstance(fill_value, bytes): + return np.frombuffer(fill_value, dtype=dtype)[0] + elif isinstance(fill_value, str): + decoded = base64.standard_b64decode(fill_value) + return np.frombuffer(decoded, dtype=dtype)[0] + else: + return np.array(fill_value, dtype=dtype)[()] + except Exception as e: + raise ValueError(f"Fill_value {fill_value} is not valid for dtype {dtype}.") 
from e + + +def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any: """ Inspect a sequence of codecs / filters for an "object codec", i.e. a codec that can serialize object arrays to contiguous bytes. Zarr python diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 0eb472bbc8..05679263c5 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -24,8 +24,6 @@ from dataclasses import dataclass, field, replace from typing import Any, Literal -import numpy as np - from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.core.array_spec import ArrayConfig, ArraySpec from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid @@ -132,33 +130,6 @@ def parse_storage_transformers(data: object) -> tuple[dict[str, JSON], ...]: ) -class V3JsonEncoder(json.JSONEncoder): - def __init__( - self, - *, - skipkeys: bool = False, - ensure_ascii: bool = True, - check_circular: bool = True, - allow_nan: bool = True, - sort_keys: bool = False, - indent: int | None = None, - separators: tuple[str, str] | None = None, - default: Callable[[object], object] | None = None, - ) -> None: - if indent is None: - indent = config.get("json_indent") - super().__init__( - skipkeys=skipkeys, - ensure_ascii=ensure_ascii, - check_circular=check_circular, - allow_nan=allow_nan, - sort_keys=sort_keys, - indent=indent, - separators=separators, - default=default, - ) - - class ArrayV3MetadataDict(TypedDict): """ A typed dictionary model for zarr v3 metadata. 
@@ -251,7 +222,7 @@ def ndim(self) -> int: return len(self.shape) @property - def dtype(self) -> ZDType[TBaseDType, TBaseScalar]: + def dtype(self) -> ZDType[_BaseDType, _BaseScalar]: return self.data_type @property @@ -301,9 +272,13 @@ def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: return self.chunk_key_encoding.encode_chunk_key(chunk_coords) def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: + json_indent = config.get("json_indent") d = self.to_dict() - # d = _replace_special_floats(self.to_dict()) - return {ZARR_JSON: prototype.buffer.from_bytes(json.dumps(d, cls=V3JsonEncoder).encode())} + return { + ZARR_JSON: prototype.buffer.from_bytes( + json.dumps(d, allow_nan=False, indent=json_indent).encode() + ) + } @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index 395e036db2..7c82662052 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -638,7 +638,7 @@ async def test_consolidated_metadata_encodes_special_chars( "consolidated_metadata" ]["metadata"] - expected_fill_value = _time._zdtype.to_json_scalar(fill_value, zarr_format=2) + expected_fill_value = _time._zdtype.to_json_value(fill_value, zarr_format=2) if zarr_format == 2: assert root_metadata["time/.zarray"]["fill_value"] == expected_fill_value diff --git a/tests/test_properties.py b/tests/test_properties.py index 27f847fa69..df384f187f 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -326,11 +326,11 @@ def test_array_metadata_meets_spec(meta: ArrayV2Metadata | ArrayV3Metadata) -> N assert asdict_dict["zarr_format"] == 3 # version-agnostic validations - dtype_native = meta.dtype.to_native_dtype() + dtype_native = meta.dtype.to_dtype() if dtype_native.kind == "f": assert serialized_float_is_valid(asdict_dict["fill_value"]) elif dtype_native.kind == "c": # fill_value should be a 
two-element array [real, imag]. assert serialized_complex_float_is_valid(asdict_dict["fill_value"]) - elif dtype_native.kind in ("M", "m") and np.isnat(meta.fill_value): - assert asdict_dict["fill_value"] == -9223372036854775808 + elif dtype_native.kind == "M" and np.isnat(meta.fill_value): + assert asdict_dict["fill_value"] == "NaT" From 421cf0b285e0139dfda82e0451e9a025f2828966 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 29 Apr 2025 15:06:11 +0200 Subject: [PATCH 068/129] refactor dtype module structure --- src/zarr/codecs/bytes.py | 2 +- src/zarr/codecs/sharding.py | 2 +- src/zarr/core/config.py | 4 +- src/zarr/core/dtype/__init__.py | 33 +- src/zarr/core/dtype/_numpy.py | 1397 ------------------- src/zarr/core/dtype/common.py | 513 +------ src/zarr/core/dtype/npy/bool.py | 291 +--- src/zarr/core/dtype/npy/common.py | 333 +++-- src/zarr/core/dtype/npy/complex.py | 373 +---- src/zarr/core/dtype/npy/float.py | 380 +---- src/zarr/core/dtype/npy/int.py | 1537 +++------------------ src/zarr/core/dtype/npy/sized.py | 382 +++++ src/zarr/core/dtype/npy/string.py | 849 ++---------- src/zarr/core/dtype/npy/time.py | 907 ++---------- src/zarr/core/metadata/v2.py | 2 +- src/zarr/core/metadata/v3.py | 1 + tests/conftest.py | 4 +- tests/package_with_entrypoint/__init__.py | 2 +- tests/test_array.py | 12 +- tests/test_config.py | 2 +- tests/test_dtype.py | 28 +- tests/test_info.py | 2 +- tests/test_metadata/test_v2.py | 3 +- tests/test_metadata/test_v3.py | 4 +- 24 files changed, 1147 insertions(+), 5916 deletions(-) delete mode 100644 src/zarr/core/dtype/_numpy.py create mode 100644 src/zarr/core/dtype/npy/sized.py diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 80972096c2..6c28bfe543 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -10,7 +10,7 @@ from zarr.abc.codec import ArrayBytesCodec from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer from zarr.core.common import JSON, parse_enum, 
parse_named_configuration -from zarr.core.dtype._numpy import endianness_to_numpy_str +from zarr.core.dtype.npy.common import endianness_to_numpy_str from zarr.registry import register_codec if TYPE_CHECKING: diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 3a90fdfcca..5089baeff5 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -43,7 +43,7 @@ parse_shapelike, product, ) -from zarr.core.dtype._numpy import UInt64 +from zarr.core.dtype.npy.int import UInt64 from zarr.core.indexing import ( BasicIndexer, SelectorTuple, diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 06fa8536ae..b53bc525cd 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -164,12 +164,12 @@ def parse_indexing_order(data: Any) -> Literal["C", "F"]: def categorize_data_type(dtype: ZDType[Any, Any]) -> DTypeCategory: """ - Classify a ZDType. The return value is a string which belongs to the type ``DTypeKind``. + Classify a ZDType. The return value is a string which belongs to the type ``DTypeCategory``. This is used by the config system to determine how to encode arrays with the associated data type when the user has not specified a particular serialization scheme. 
""" - from zarr.core.dtype._numpy import VariableLengthString + from zarr.core.dtype import VariableLengthString if isinstance(dtype, VariableLengthString): return "variable-length-string" diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 0aaf9ccf06..63b593fd28 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -2,6 +2,18 @@ from typing import TYPE_CHECKING, TypeAlias, get_args +from zarr.core.dtype.npy.bool import Bool +from zarr.core.dtype.npy.complex import Complex64, Complex128 +from zarr.core.dtype.npy.float import Float16, Float32, Float64 +from zarr.core.dtype.npy.int import Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 +from zarr.core.dtype.npy.sized import ( + FixedLengthAscii, + FixedLengthBytes, + FixedLengthUnicode, + Structured, +) +from zarr.core.dtype.npy.time import DateTime64 + if TYPE_CHECKING: from zarr.core.common import ZarrFormat @@ -9,27 +21,8 @@ import numpy.typing as npt from zarr.core.common import JSON -from zarr.core.dtype._numpy import ( +from zarr.core.dtype.npy.string import ( _NUMPY_SUPPORTS_VLEN_STRING, - Bool, - Complex64, - Complex128, - DateTime64, - FixedLengthAscii, - FixedLengthBytes, - FixedLengthUnicode, - Float16, - Float32, - Float64, - Int8, - Int16, - Int32, - Int64, - Structured, - UInt8, - UInt16, - UInt32, - UInt64, VariableLengthString, ) from zarr.core.dtype.registry import DataTypeRegistry diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py deleted file mode 100644 index 51be83b173..0000000000 --- a/src/zarr/core/dtype/_numpy.py +++ /dev/null @@ -1,1397 +0,0 @@ -from __future__ import annotations - -import base64 -import re -import sys -from collections.abc import Sequence -from dataclasses import dataclass -from typing import ( - TYPE_CHECKING, - Any, - ClassVar, - Literal, - Self, - SupportsComplex, - SupportsFloat, - SupportsIndex, - SupportsInt, - TypeGuard, - TypeVar, - cast, - get_args, -) - -import numpy 
as np - -from zarr.core.dtype.common import ( - DataTypeValidationError, - Endianness, - bytes_from_json, - bytes_to_json, - check_json_bool, - check_json_complex_float, - check_json_float, - check_json_int, - check_json_str, - complex_float_from_json, - complex_float_to_json, - datetime_from_json, - datetime_to_json, - float_from_json, - float_to_json, -) -from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar - -if TYPE_CHECKING: - from zarr.core.common import JSON, ZarrFormat - -EndiannessNumpy = Literal[">", "<", "|", "="] -IntLike = SupportsInt | SupportsIndex | bytes | str -FloatLike = SupportsIndex | SupportsFloat | bytes | str -ComplexLike = SupportsFloat | SupportsIndex | SupportsComplex | bytes | str | None - - -@dataclass(frozen=True) -class HasEndianness: - """ - This is a mix-in class for data types with an endianness attribute - """ - - endianness: Endianness | None = "little" - - -@dataclass(frozen=True) -class HasLength: - """ - This is a mix-in class for data types with a length attribute - """ - - length: int - - -@dataclass(frozen=True, kw_only=True, slots=True) -class Bool(ZDType[np.dtypes.BoolDType, np.bool_]): - """ - Wrapper for numpy boolean dtype. - - Attributes - ---------- - name : str - The name of the dtype. - dtype_cls : ClassVar[type[np.dtypes.BoolDType]] - The numpy dtype class. - """ - - _zarr_v3_name = "bool" - _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|b1",) - dtype_cls = np.dtypes.BoolDType - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - return cls() - - def to_dtype(self: Self) -> np.dtypes.BoolDType: - return self.dtype_cls() - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[Literal["bool", "|b1"]]: - """ - Check that the input is a valid JSON representation of a bool. 
- """ - if zarr_format == 2: - return data in cls._zarr_v2_names - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def to_json(self, zarr_format: ZarrFormat) -> str: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - return cls() - - def default_value(self) -> np.bool_: - """ - Get the default value for the boolean dtype. - - Returns - ------- - np.bool_ - The default value. - """ - return np.False_ - - def to_json_value(self, data: object, zarr_format: ZarrFormat) -> bool: - """ - Convert a scalar to a python bool. - - Parameters - ---------- - data : object - The value to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - bool - The JSON-serializable format. - """ - return bool(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: - """ - Read a JSON-serializable value as a numpy boolean scalar. - - Parameters - ---------- - data : JSON - The JSON-serializable value. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - np.bool_ - The numpy boolean scalar. - """ - if check_json_bool(data): - return self._cast_value_unsafe(data) - raise TypeError(f"Invalid type: {data}. 
Expected a boolean.") - - def check_value(self, data: object) -> bool: - # Anything can become a bool - return True - - def cast_value(self, value: object) -> np.bool_: - return self._cast_value_unsafe(value) - - def _cast_value_unsafe(self, value: object) -> np.bool_: - return np.bool_(value) - - -_NumpyIntDType = ( - np.dtypes.Int8DType - | np.dtypes.Int16DType - | np.dtypes.Int32DType - | np.dtypes.Int64DType - | np.dtypes.UInt8DType - | np.dtypes.UInt16DType - | np.dtypes.UInt32DType - | np.dtypes.UInt64DType -) -_NumpyIntScalar = ( - np.int8 | np.int16 | np.int32 | np.int64 | np.uint8 | np.uint16 | np.uint32 | np.uint64 -) -TIntDType_co = TypeVar("TIntDType_co", bound=_NumpyIntDType, covariant=True) -TIntScalar_co = TypeVar("TIntScalar_co", bound=_NumpyIntScalar, covariant=True) - - -@dataclass(frozen=True) -class BaseInt(ZDType[TIntDType_co, TIntScalar_co]): - # This attribute holds the possible zarr v2 JSON names for the data type - _zarr_v2_names: ClassVar[tuple[str, ...]] - - def to_json(self, zarr_format: ZarrFormat) -> str: - """ - Convert the wrapped data type to a JSON-serializable form. - - Parameters - ---------- - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - str - The JSON-serializable representation of the wrapped data type - """ - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - """ - Check that the input is a valid JSON representation of this data type. 
- """ - if zarr_format == 2: - return data in cls._zarr_v2_names - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def check_value(self, value: object) -> TypeGuard[IntLike]: - return isinstance(value, IntLike) - - def _cast_value_unsafe(self, value: object) -> TIntScalar_co: - if self.check_value(value): - return self.to_dtype().type(value) # type: ignore[return-value] - raise TypeError(f"Invalid type: {value}. Expected a value castable to an integer.") - - def default_value(self) -> TIntScalar_co: - """ - Get the default value, which is 0 cast to this dtype - - Returns - ------- - Int scalar - The default value. - """ - return self._cast_value_unsafe(0) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar_co: - """ - Read a JSON-serializable value as a numpy int scalar. - - Parameters - ---------- - data : JSON - The JSON-serializable value. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - TScalar_co - The numpy scalar. - """ - if check_json_int(data): - return self._cast_value_unsafe(data) - raise TypeError(f"Invalid type: {data}. Expected an integer.") - - def to_json_value(self, data: object, zarr_format: ZarrFormat) -> int: - """ - Convert an object to JSON-serializable scalar. - - Parameters - ---------- - data : _BaseScalar - The value to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - int - The JSON-serializable form of the scalar. 
- """ - return int(self.cast_value(data)) - - -@dataclass(frozen=True, kw_only=True) -class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): - dtype_cls = np.dtypes.Int8DType - _zarr_v3_name = "int8" - _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|i1",) - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - return cls() - - def to_dtype(self: Self) -> np.dtypes.Int8DType: - return self.dtype_cls() - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - return cls() - - -@dataclass(frozen=True, kw_only=True) -class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): - dtype_cls = np.dtypes.UInt8DType - _zarr_v3_name = "uint8" - _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|u1",) - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - return cls() - - def to_dtype(self: Self) -> np.dtypes.UInt8DType: - return self.dtype_cls() - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - return cls() - - -@dataclass(frozen=True, kw_only=True) -class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): - dtype_cls = np.dtypes.Int16DType - _zarr_v3_name = "int16" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i2", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) - - def to_dtype(self) -> np.dtypes.Int16DType: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - # This ensures that we get the endianness correct without annoying string parsing - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - -@dataclass(frozen=True, kw_only=True) -class 
UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): - dtype_cls = np.dtypes.UInt16DType - _zarr_v3_name = "uint16" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u2", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) - - def to_dtype(self) -> np.dtypes.UInt16DType: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - -@dataclass(frozen=True, kw_only=True) -class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): - dtype_cls = np.dtypes.Int32DType - _zarr_v3_name = "int32" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i4", " Self: - # We override the base implementation to address a windows-specific, pre-numpy 2 issue where - # ``np.dtype('i')`` is an instance of ``np.dtypes.IntDType`` that acts like `int32` instead of ``np.dtype('int32')`` - # In this case, ``type(np.dtype('i')) == np.dtypes.Int32DType`` will evaluate to ``True``, - # despite the two classes being different. 
Thus we will create an instance of `cls` with the - # latter dtype, after pulling in the byte order of the input - if dtype == np.dtypes.Int32DType(): - return cls._from_dtype_unsafe(np.dtypes.Int32DType().newbyteorder(dtype.byteorder)) - else: - return super().from_dtype(dtype) - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) - - def to_dtype(self) -> np.dtypes.Int32DType: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - -@dataclass(frozen=True, kw_only=True) -class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): - dtype_cls = np.dtypes.UInt32DType - _zarr_v3_name = "uint32" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u4", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) - - def to_dtype(self) -> np.dtypes.UInt32DType: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - -@dataclass(frozen=True, kw_only=True) -class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): - dtype_cls = np.dtypes.Int64DType - _zarr_v3_name = "int64" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i8", " Self: - 
byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) - - def to_dtype(self) -> np.dtypes.Int64DType: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - -@dataclass(frozen=True, kw_only=True) -class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): - dtype_cls = np.dtypes.UInt64DType - _zarr_v3_name = "uint64" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u8", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) - - def to_dtype(self) -> np.dtypes.UInt64DType: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - -TFloatDType_co = TypeVar( - "TFloatDType_co", - bound=np.dtypes.Float16DType | np.dtypes.Float32DType | np.dtypes.Float64DType, - covariant=True, -) -TFloatScalar_co = TypeVar( - "TFloatScalar_co", bound=np.float16 | np.float32 | np.float64, covariant=True -) - - -@dataclass(frozen=True) -class BaseFloat(ZDType[TFloatDType_co, TFloatScalar_co], HasEndianness): - # This attribute holds the possible zarr v2 JSON names for the data type - _zarr_v2_names: ClassVar[tuple[str, ...]] - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - byte_order = 
cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) - - def to_dtype(self) -> TFloatDType_co: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - - def to_json(self, zarr_format: ZarrFormat) -> str: - """ - Convert the wrapped data type to a JSON-serializable form. - - Parameters - ---------- - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - str - The JSON-serializable representation of the wrapped data type - """ - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - """ - Check that the input is a valid JSON representation of this data type. - """ - if zarr_format == 2: - return data in cls._zarr_v2_names - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def check_value(self, value: object) -> TypeGuard[FloatLike]: - return isinstance(value, FloatLike) - - def _cast_value_unsafe(self, value: object) -> TFloatScalar_co: - if self.check_value(value): - return self.to_dtype().type(value) # type: ignore[return-value] - raise TypeError(f"Invalid type: {value}. Expected a value castable to a float.") - - def default_value(self) -> TFloatScalar_co: - """ - Get the default value, which is 0 cast to this dtype - - Returns - ------- - Int scalar - The default value. 
- """ - return self._cast_value_unsafe(0) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScalar_co: - """ - Read a JSON-serializable value as a numpy float. - - Parameters - ---------- - data : JSON - The JSON-serializable value. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - TScalar_co - The numpy float. - """ - if check_json_float(data, zarr_format=zarr_format): - return self._cast_value_unsafe(float_from_json(data, zarr_format=zarr_format)) - raise TypeError( - f"Invalid type: {data}. Expected a float or a special string encoding of a float." - ) - - def to_json_value(self, data: object, zarr_format: ZarrFormat) -> float | str: - """ - Convert an object to a JSON-serializable float. - - Parameters - ---------- - data : _BaseScalar - The value to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - JSON - The JSON-serializable form of the float, which is potentially a number or a string. - See the zarr specifications for details on the JSON encoding for floats. - """ - return float_to_json(self._cast_value_unsafe(data), zarr_format=zarr_format) - - -@dataclass(frozen=True, kw_only=True) -class Float16(BaseFloat[np.dtypes.Float16DType, np.float16]): - dtype_cls = np.dtypes.Float16DType - _zarr_v3_name = "float16" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f2", "f4", "f8", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) - - def to_dtype(self) -> TComplexDType_co: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - - def to_json(self, zarr_format: ZarrFormat) -> str: - """ - Convert the wrapped data type to a JSON-serializable form. - - Parameters - ---------- - zarr_format : ZarrFormat - The zarr format version. 
- - Returns - ------- - str - The JSON-serializable representation of the wrapped data type - """ - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - """ - Check that the input is a valid JSON representation of this data type. - """ - if zarr_format == 2: - return data in cls._zarr_v2_names - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def check_value(self, value: object) -> bool: - return isinstance(value, ComplexLike) - - def _cast_value_unsafe(self, value: object) -> TComplexScalar_co: - if self.check_value(value): - return self.to_dtype().type(value) # type: ignore[arg-type, return-value] - raise TypeError(f"Invalid type: {value}. Expected a value castable to a complex scalar.") - - def default_value(self) -> TComplexScalar_co: - """ - Get the default value, which is 0 cast to this dtype - - Returns - ------- - Int scalar - The default value. - """ - return self._cast_value_unsafe(0) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TComplexScalar_co: - """ - Read a JSON-serializable value as a numpy float. - - Parameters - ---------- - data : JSON - The JSON-serializable value. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - TScalar_co - The numpy float. 
- """ - if check_json_complex_float(data, zarr_format=zarr_format): - return self._cast_value_unsafe(complex_float_from_json(data, zarr_format=zarr_format)) - raise TypeError( - f"Invalid type: {data}. Expected a float or a special string encoding of a float." - ) - - def to_json_value(self, data: object, zarr_format: ZarrFormat) -> JSON: - """ - Convert an object to a JSON-serializable float. - - Parameters - ---------- - data : _BaseScalar - The value to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - JSON - The JSON-serializable form of the complex number, which is a list of two floats, - each of which is encoding according to a zarr-format-specific encoding. - """ - return complex_float_to_json(self.cast_value(data), zarr_format=zarr_format) - - -@dataclass(frozen=True, kw_only=True) -class Complex64(BaseComplex[np.dtypes.Complex64DType, np.complex64]): - dtype_cls = np.dtypes.Complex64DType - _zarr_v3_name = "complex64" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c8", "c16", " Self: - return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) - - def to_dtype(self) -> np.dtypes.BytesDType[int]: - return self.dtype_cls(self.length) - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - """ - Check that the input is a valid JSON representation of a numpy S dtype. 
- """ - if zarr_format == 2: - # match |S1, |S2, etc - return isinstance(data, str) and re.match(r"^\|S\d+$", data) is not None - elif zarr_format == 3: - return ( - isinstance(data, dict) - and "name" in data - and data["name"] == cls._zarr_v3_name - and "configuration" in data - and isinstance(data["configuration"], dict) - and "length_bits" in data["configuration"] - and isinstance(data["configuration"]["length_bits"], int) - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def to_json(self, zarr_format: ZarrFormat) -> JSON: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return { - "name": self._zarr_v3_name, - "configuration": {"length_bits": self.length * self.item_size_bits}, - } - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls(length=data["configuration"]["length_bits"] // cls.item_size_bits) # type: ignore[arg-type, index, call-overload, operator] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def default_value(self) -> np.bytes_: - return np.bytes_(b"") - - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(data).decode("ascii") # type: ignore[arg-type] - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: - if check_json_str(data): - return self.to_dtype().type(base64.standard_b64decode(data.encode("ascii"))) - raise TypeError(f"Invalid type: {data}. 
Expected a string.") - - def check_value(self, data: object) -> bool: - return isinstance(data, np.bytes_ | str | bytes) - - def _cast_value_unsafe(self, value: object) -> np.bytes_: - return self.to_dtype().type(value) - - -@dataclass(frozen=True, kw_only=True) -class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength): - # np.dtypes.VoidDType is specified in an odd way in numpy - # it cannot be used to create instances of the dtype - # so we have to tell mypy to ignore this here - dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] - _zarr_v3_name = "numpy.void" - item_size_bits: ClassVar[int] = 8 - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) - - def to_dtype(self) -> np.dtypes.VoidDType[int]: - # Numpy does not allow creating a void type - # by invoking np.dtypes.VoidDType directly - return cast("np.dtypes.VoidDType[int]", np.dtype(f"V{self.length}")) - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - if zarr_format == 2: - # Check that the dtype is |V1, |V2, ... 
- return isinstance(data, str) and re.match(r"^\|V\d+$", data) is not None - elif zarr_format == 3: - return ( - isinstance(data, dict) - and "name" in data - and isinstance(data["name"], str) - and (re.match(r"^r\d+$", data["name"]) is not None) - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def to_json(self, zarr_format: ZarrFormat) -> JSON: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return {"name": f"r{self.length * self.item_size_bits}"} - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls(length=int(data["name"][1:]) // cls.item_size_bits) # type: ignore[arg-type, index, call-overload] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[Any]]: - """ - Numpy void dtype comes in two forms: - * If the ``fields`` attribute is ``None``, then the dtype represents N raw bytes. - * If the ``fields`` attribute is not ``None``, then the dtype represents a structured dtype, - - In this check we ensure that ``fields`` is ``None``. - - Parameters - ---------- - dtype : TDType - The dtype to check. - - Returns - ------- - Bool - True if the dtype matches, False otherwise. 
- """ - return cls.dtype_cls is type(dtype) and dtype.fields is None # type: ignore[has-type] - - def default_value(self) -> np.void: - return self.to_dtype().type(("\x00" * self.length).encode("ascii")) - - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(self.cast_value(data).tobytes()).decode("ascii") - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: - if check_json_str(data): - return self.to_dtype().type(base64.standard_b64decode(data)) - raise DataTypeValidationError(f"Invalid type: {data}. Expected a string.") - - def check_value(self, data: object) -> bool: - return isinstance(data, np.bytes_ | str | bytes | np.void) - - def _cast_value_unsafe(self, value: object) -> np.void: - return self.to_dtype().type(value) # type: ignore[call-overload, no-any-return] - - -@dataclass(frozen=True, kw_only=True) -class FixedLengthUnicode(ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength): - dtype_cls = np.dtypes.StrDType - _zarr_v3_name = "numpy.fixed_length_ucs4" - item_size_bits: ClassVar[int] = 32 # UCS4 is 32 bits per code point - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls( - length=dtype.itemsize // (cls.item_size_bits // 8), - endianness=endianness_from_numpy_str(byte_order), - ) - - def to_dtype(self) -> np.dtypes.StrDType[int]: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls(self.length).newbyteorder(byte_order) - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - """ - Check that the input is a valid JSON representation of a numpy S dtype. 
- """ - if zarr_format == 2: - # match >U1, <]U\d+$", data) is not None - elif zarr_format == 3: - return ( - isinstance(data, dict) - and "name" in data - and data["name"] == cls._zarr_v3_name - and "configuration" in data - and isinstance(data["configuration"], dict) - and "length_bits" in data["configuration"] - and isinstance(data["configuration"]["length_bits"], int) - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def to_json(self, zarr_format: ZarrFormat) -> JSON: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return { - "name": self._zarr_v3_name, - "configuration": {"length_bits": self.length * self.item_size_bits}, - } - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls(length=data["configuration"]["length_bits"] // cls.item_size_bits) # type: ignore[arg-type, index, call-overload, operator] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def default_value(self) -> np.str_: - return np.str_("") - - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: - return str(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. 
Expected a string.") - return self.to_dtype().type(data) - - def check_value(self, data: object) -> bool: - return isinstance(data, str | np.str_ | bytes) - - def _cast_value_unsafe(self, value: object) -> np.str_: - return self.to_dtype().type(value) - - -_NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") - - -if _NUMPY_SUPPORTS_VLEN_STRING: - - @dataclass(frozen=True, kw_only=True) - class VariableLengthString(ZDType[np.dtypes.StringDType, str]): # type: ignore[type-var] - dtype_cls = np.dtypes.StringDType - _zarr_v3_name = "numpy.variable_length_utf8" - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - return cls() - - def to_dtype(self) -> np.dtypes.StringDType: - return self.dtype_cls() - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - """ - Check that the input is a valid JSON representation of a numpy string dtype. - """ - if zarr_format == 2: - # TODO: take the entire metadata document in here, and - # check the compressors / filters for vlen-utf8 - # Note that we are checking for the object dtype name. - return data == "|O" - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def to_json(self, zarr_format: ZarrFormat) -> JSON: - if zarr_format == 2: - # Note: unlike many other numpy data types, we don't serialize the .str attribute - # of the data type to JSON. 
This is because Zarr was using `|O` for strings before the - # numpy variable length string data type existed, and we want to be consistent with - # that practice - return "|O" - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - return cls() - - def default_value(self) -> str: - return "" - - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: - return str(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. Expected a string.") - return data - - def check_value(self, data: object) -> bool: - return isinstance(data, str) - - def _cast_value_unsafe(self, value: object) -> str: - return str(value) - -else: - # Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead. - @dataclass(frozen=True, kw_only=True) - class VariableLengthString(ZDType[np.dtypes.ObjectDType, str]): # type: ignore[no-redef] - dtype_cls = np.dtypes.ObjectDType - _zarr_v3_name = "numpy.variable_length_utf8" - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - return cls() - - def to_dtype(self) -> np.dtypes.ObjectDType: - return self.dtype_cls() - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - """ - Check that the input is a valid JSON representation of a numpy O dtype. 
- """ - if zarr_format == 2: - # TODO: take the entire metadata document in here, and - # check the compressors / filters for vlen-utf8 - return data == "|O" - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def to_json(self, zarr_format: ZarrFormat) -> JSON: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - return cls() - - def default_value(self) -> str: - return "" - - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: - return data # type: ignore[return-value] - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: - """ - Strings pass through - """ - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. 
Expected a string.") - return data - - def check_value(self, data: object) -> bool: - return isinstance(data, str) - - def _cast_value_unsafe(self, value: object) -> str: - return str(value) - - -DateUnit = Literal["Y", "M", "W", "D"] -TimeUnit = Literal["h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] - - -@dataclass(frozen=True, kw_only=True, slots=True) -class DateTime64(ZDType[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): - dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] - _zarr_v3_name = "numpy.datetime64" - unit: DateUnit | TimeUnit - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - unit: DateUnit | TimeUnit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] # type: ignore[assignment] - if unit not in get_args(DateUnit) and unit not in get_args(TimeUnit): - raise DataTypeValidationError('Invalid unit for "numpy.datetime64"') - byteorder = cast("EndiannessNumpy", dtype.byteorder) - return cls(unit=unit, endianness=endianness_from_numpy_str(byteorder)) - - def to_dtype(self) -> np.dtypes.DateTime64DType: - # Numpy does not allow creating datetime64 via - # np.dtypes.DateTime64Dtype() - return cast( - "np.dtypes.DateTime64DType", - np.dtype(f"datetime64[{self.unit}]").newbyteorder( - endianness_to_numpy_str(self.endianness) - ), - ) - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - if zarr_format == 2: - # match M[M], etc - # consider making this a standalone function - return ( - isinstance(data, str) - and len(data) in (6, 7) - and data[0] in (">", "<") - and data[1:4] == "M8[" - and data[4:-1] in get_args(TimeUnit) + get_args(DateUnit) - and data[-1] == "]" - ) - elif zarr_format == 3: - return ( - isinstance(data, dict) - and "name" in data - and data["name"] == cls._zarr_v3_name - and "configuration" in data - and "unit" in data["configuration"] - and data["configuration"]["unit"] in get_args(DateUnit) + get_args(TimeUnit) - ) - raise 
ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def default_value(self) -> np.datetime64: - return np.datetime64("NaT") - - def to_json(self, zarr_format: ZarrFormat) -> JSON: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return {"name": self._zarr_v3_name, "configuration": {"unit": self.unit}} - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls(unit=data["configuration"]["unit"]) # type: ignore[arg-type, index, call-overload] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: - if check_json_int(data): - return datetime_from_json(data, self.unit) - raise TypeError(f"Invalid type: {data}. Expected an integer.") - - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> int: - return datetime_to_json(data) # type: ignore[arg-type] - - def check_value(self, data: object) -> bool: - # not sure which values we should accept for structured dtypes. - try: - np.array([data], dtype=self.to_dtype()) - return True # noqa: TRY300 - except ValueError: - return False - - def _cast_value_unsafe(self, value: object) -> np.datetime64: - return self.to_dtype().type(value) # type: ignore[no-any-return, call-overload] - - -@dataclass(frozen=True, kw_only=True) -class Structured(ZDType[np.dtypes.VoidDType[int], np.void]): - dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] - _zarr_v3_name = "structured" - fields: tuple[tuple[str, ZDType[_BaseDType, _BaseScalar]], ...] 
- - def default_value(self) -> np.void: - return self._cast_value_unsafe(0) - - def _cast_value_unsafe(self, value: object) -> np.void: - return cast("np.void", np.array([value], dtype=self.to_dtype())[0]) - - @classmethod - def check_dtype(cls, dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: - """ - Check that this dtype is a numpy structured dtype - - Parameters - ---------- - dtype : np.dtypes.DTypeLike - The dtype to check. - - Returns - ------- - TypeGuard[np.dtypes.VoidDType] - True if the dtype matches, False otherwise. - """ - return super().check_dtype(dtype) and dtype.fields is not None - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - from zarr.core.dtype import get_data_type_from_native_dtype - - fields: list[tuple[str, ZDType[_BaseDType, _BaseScalar]]] = [] - - if dtype.fields is None: - raise ValueError("numpy dtype has no fields") - - # fields of a structured numpy dtype are either 2-tuples or 3-tuples. we only - # care about the first element in either case. 
- for key, (dtype_instance, *_) in dtype.fields.items(): - dtype_wrapped = get_data_type_from_native_dtype(dtype_instance) - fields.append((key, dtype_wrapped)) - - return cls(fields=tuple(fields)) - - def to_json(self, zarr_format: ZarrFormat) -> JSON: - fields = [ - (f_name, f_dtype.to_json(zarr_format=zarr_format)) for f_name, f_dtype in self.fields - ] - if zarr_format == 2: - return fields - elif zarr_format == 3: - base_dict = {"name": self._zarr_v3_name} - base_dict["configuration"] = {"fields": fields} # type: ignore[assignment] - return cast("JSON", base_dict) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[dict[str, JSON] | list[Any]]: - # the actual JSON form is recursive and hard to annotate, so we give up and do - # list[Any] for now - if zarr_format == 2: - return ( - not isinstance(data, str) - and isinstance(data, Sequence) - and all( - not isinstance(field, str) and isinstance(field, Sequence) and len(field) == 2 - for field in data - ) - ) - elif zarr_format == 3: - return ( - isinstance(data, dict) - and "name" in data - and "configuration" in data - and isinstance(data["configuration"], dict) - and "fields" in data["configuration"] - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - from zarr.core.dtype import get_data_type_from_json - - if cls.check_json(data, zarr_format=zarr_format): - if zarr_format == 2: - # structured dtypes are constructed directly from a list of lists - return cls( - fields=tuple( # type: ignore[misc] - (f_name, get_data_type_from_json(f_dtype, zarr_format=zarr_format)) - for f_name, f_dtype in data - ) - ) - elif zarr_format == 3: # noqa: SIM102 - if isinstance(data, dict) and "configuration" in data: - config = data["configuration"] - if 
isinstance(config, dict) and "fields" in config: - meta_fields = config["fields"] - fields = tuple( - (f_name, get_data_type_from_json(f_dtype, zarr_format=zarr_format)) - for f_name, f_dtype in meta_fields - ) - return cls(fields=fields) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") - - def to_dtype(self) -> np.dtypes.VoidDType[int]: - return cast( - "np.dtypes.VoidDType[int]", - np.dtype([(key, dtype.to_dtype()) for (key, dtype) in self.fields]), - ) - - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: - return bytes_to_json(self.cast_value(data).tobytes(), zarr_format) - - def check_value(self, data: object) -> bool: - # not sure which values we should accept for structured dtypes. - try: - np.array([data], dtype=self.to_dtype()) - return True # noqa: TRY300 - except ValueError: - return False - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. Expected a string.") - as_bytes = bytes_from_json(data, zarr_format=zarr_format) - dtype = self.to_dtype() - return cast("np.void", np.array([as_bytes], dtype=dtype.str).view(dtype)[0]) - - -def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: - """ - Convert an endianness literal to its numpy string representation. - - Parameters - ---------- - endianness : Endianness or None - The endianness to convert. - - Returns - ------- - Literal[">", "<", "|"] - The numpy string representation of the endianness. - - Raises - ------ - ValueError - If the endianness is invalid. - """ - match endianness: - case "little": - return "<" - case "big": - return ">" - case None: - return "|" - raise ValueError( - f"Invalid endianness: {endianness}. 
Expected one of {get_args(Endianness)} or None" - ) - - -def endianness_from_numpy_str(endianness: EndiannessNumpy) -> Endianness | None: - """ - Convert a numpy endianness string literal to a human-readable literal value. - - Parameters - ---------- - endianness : Literal[">", "<", "=", "|"] - The numpy string representation of the endianness. - - Returns - ------- - Endianness or None - The human-readable representation of the endianness. - - Raises - ------ - ValueError - If the endianness is invalid. - """ - match endianness: - case "=": - return sys.byteorder - case "<": - return "little" - case ">": - return "big" - case "|": - return None - raise ValueError( - f"Invalid endianness: {endianness}. Expected one of {get_args(EndiannessNumpy)}" - ) diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 900b3fddbd..657f56bfb7 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -1,14 +1,7 @@ from __future__ import annotations -import base64 -from collections.abc import Sequence -from typing import TYPE_CHECKING, Any, Literal, TypeGuard, cast - -import numpy as np - -if TYPE_CHECKING: - from zarr.core.common import JSON, ZarrFormat - from zarr.core.dtype._numpy import DateUnit, TimeUnit +from dataclasses import dataclass +from typing import Literal Endianness = Literal["little", "big"] JSONFloat = float | Literal["NaN", "Infinity", "-Infinity"] @@ -17,504 +10,20 @@ class DataTypeValidationError(ValueError): ... -def check_json_bool(data: JSON) -> TypeGuard[bool]: - """ - Check if a JSON value is a boolean. - - Parameters - ---------- - data : JSON - The JSON value to check. - - Returns - ------- - Bool - True if the data is a boolean, False otherwise. - """ - return isinstance(data, bool) - - -def check_json_str(data: JSON) -> TypeGuard[str]: - """ - Check if a JSON value is a string. - - Parameters - ---------- - data : JSON - The JSON value to check. 
- - Returns - ------- - Bool - True if the data is a string, False otherwise. - """ - return bool(isinstance(data, str)) - - -def check_json_int(data: JSON) -> TypeGuard[int]: - """ - Check if a JSON value is an integer. - - Parameters - ---------- - data : JSON - The JSON value to check. - - Returns - ------- - Bool - True if the data is an integer, False otherwise. - """ - return bool(isinstance(data, int)) - - -def check_json_float_v2(data: JSON) -> TypeGuard[JSONFloat]: - """ - Check if a JSON value represents a float (v2). - - Parameters - ---------- - data : JSON - The JSON value to check. - - Returns - ------- - Bool - True if the data is a float, False otherwise. - """ - if data == "NaN" or data == "Infinity" or data == "-Infinity": - return True - return isinstance(data, float | int) - - -def check_json_float_v3(data: JSON) -> TypeGuard[JSONFloat]: - """ - Check if a JSON value represents a float (v3). - - Parameters - ---------- - data : JSON - The JSON value to check. - - Returns - ------- - Bool - True if the data is a float, False otherwise. - """ - # TODO: handle the special JSON serialization of different NaN values - return check_json_float_v2(data) - - -def check_json_float(data: JSON, zarr_format: ZarrFormat) -> TypeGuard[float]: - """ - Check if a JSON value represents a float based on zarr format. - - Parameters - ---------- - data : JSON - The JSON value to check. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - Bool - True if the data is a float, False otherwise. - """ - if zarr_format == 2: - return check_json_float_v2(data) - else: - return check_json_float_v3(data) - - -def check_json_complex_float_v3(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: - """ - Check if a JSON value represents a complex float, as per the zarr v3 spec - - Parameters - ---------- - data : JSON - The JSON value to check. - - Returns - ------- - Bool - True if the data is a complex float, False otherwise. 
- """ - return ( - not isinstance(data, str) - and isinstance(data, Sequence) - and len(data) == 2 - and check_json_float_v3(data[0]) - and check_json_float_v3(data[1]) - ) - - -def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: - """ - Check if a JSON value represents a complex float, as per the behavior of zarr-python 2.x - - Parameters - ---------- - data : JSON - The JSON value to check. - - Returns - ------- - Bool - True if the data is a complex float, False otherwise. - """ - return ( - not isinstance(data, str) - and isinstance(data, Sequence) - and len(data) == 2 - and check_json_float_v2(data[0]) - and check_json_float_v2(data[1]) - ) - - -def check_json_complex_float( - data: JSON, zarr_format: ZarrFormat -) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: - """ - Check if a JSON value represents a complex float based on zarr format. - - Parameters - ---------- - data : JSON - The JSON value to check. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - Bool - True if the data represents a complex float, False otherwise. - """ - if zarr_format == 2: - return check_json_complex_float_v2(data) - return check_json_complex_float_v3(data) - - -def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloat: - """ - Convert a float to JSON (v2). - - Parameters - ---------- - data : float or np.floating - The float value to convert. - - Returns - ------- - JSONFloat - The JSON representation of the float. - """ - if np.isnan(data): - return "NaN" - elif np.isinf(data): - return "Infinity" if data > 0 else "-Infinity" - return float(data) - - -def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloat: - """ - Convert a float to JSON (v3). - - Parameters - ---------- - data : float or np.floating - The float value to convert. - - Returns - ------- - JSONFloat - The JSON representation of the float. 
- """ - # v3 can in principle handle distinct NaN values, but numpy does not represent these explicitly - # so we just reuse the v2 routine here - return float_to_json_v2(data) - - -def float_to_json(data: float | np.floating[Any], zarr_format: ZarrFormat) -> JSONFloat: - """ - Convert a float to JSON, parametrized by the zarr format version. - - Parameters - ---------- - data : float or np.floating - The float value to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - JSONFloat - The JSON representation of the float. +@dataclass(frozen=True) +class HasLength: """ - if zarr_format == 2: - return float_to_json_v2(data) - else: - return float_to_json_v3(data) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") - - -def complex_to_json_v2(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: - """ - Convert a complex number to JSON (v2). - - Parameters - ---------- - data : complex or np.complexfloating - The complex value to convert. - - Returns - ------- - tuple[JSONFloat, JSONFloat] - The JSON representation of the complex number. - """ - return float_to_json_v2(data.real), float_to_json_v2(data.imag) - - -def complex_to_json_v3(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: - """ - Convert a complex number to JSON (v3). - - Parameters - ---------- - data : complex or np.complexfloating - The complex value to convert. - - Returns - ------- - tuple[JSONFloat, JSONFloat] - The JSON representation of the complex number. - """ - return float_to_json_v3(data.real), float_to_json_v3(data.imag) - - -def complex_float_to_json( - data: complex | np.complexfloating[Any, Any], zarr_format: ZarrFormat -) -> tuple[JSONFloat, JSONFloat]: - """ - Convert a complex number to JSON, parametrized by the zarr format version. - - Parameters - ---------- - data : complex or np.complexfloating - The complex value to convert. 
- zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - tuple[JSONFloat, JSONFloat] or JSONFloat - The JSON representation of the complex number. - """ - if zarr_format == 2: - return complex_to_json_v2(data) - else: - return complex_to_json_v3(data) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") - - -def bytes_to_json(data: bytes, zarr_format: ZarrFormat) -> str: - """ - Convert bytes to JSON. - - Parameters - ---------- - data : bytes - The bytes to store. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - str - The bytes encoded as ascii using the base64 alphabet. - """ - # TODO: decide if we are going to make this implementation zarr format-specific - return base64.b64encode(data).decode("ascii") - - -def bytes_from_json(data: str, zarr_format: ZarrFormat) -> bytes: - """ - Convert a JSON string to bytes - - Parameters - ---------- - data : str - The JSON string to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - bytes - The bytes. + A mix-in class for data types with a length attribute, such as fixed-size collections + of unicode strings, or bytes. """ - if zarr_format == 2: - return base64.b64decode(data.encode("ascii")) - # TODO: differentiate these as needed. This is a spec question. - if zarr_format == 3: - return base64.b64decode(data.encode("ascii")) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") + length: int -def float_from_json_v2(data: JSONFloat) -> float: - """ - Convert a JSON float to a float (Zarr v2). - Parameters - ---------- - data : JSONFloat - The JSON float to convert. - - Returns - ------- - float - The float value. 
+@dataclass(frozen=True) +class HasEndianness: """ - match data: - case "NaN": - return float("nan") - case "Infinity": - return float("inf") - case "-Infinity": - return float("-inf") - case _: - return float(data) - - -def float_from_json_v3(data: JSONFloat) -> float: - """ - Convert a JSON float to a float (v3). - - Parameters - ---------- - data : JSONFloat - The JSON float to convert. - - Returns - ------- - float - The float value. + A mix-in class for data types with an endianness attribute """ - # todo: support the v3-specific NaN handling - return float_from_json_v2(data) - -def float_from_json(data: JSONFloat, zarr_format: ZarrFormat) -> float: - """ - Convert a JSON float to a float based on zarr format. - - Parameters - ---------- - data : JSONFloat - The JSON float to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - float - The float value. - """ - if zarr_format == 2: - return float_from_json_v2(data) - else: - return float_from_json_v3(data) - - -def complex_float_from_json_v2(data: tuple[JSONFloat, JSONFloat]) -> complex: - """ - Convert a JSON complex float to a complex number (v2). - - Parameters - ---------- - data : tuple[JSONFloat, JSONFloat] - The JSON complex float to convert. - - Returns - ------- - np.complexfloating - The complex number. - """ - return complex(float_from_json_v2(data[0]), float_from_json_v2(data[1])) - - -def complex_float_from_json_v3(data: tuple[JSONFloat, JSONFloat]) -> complex: - """ - Convert a JSON complex float to a complex number (v3). - - Parameters - ---------- - data : tuple[JSONFloat, JSONFloat] - The JSON complex float to convert. - - Returns - ------- - np.complexfloating - The complex number. - """ - return complex(float_from_json_v3(data[0]), float_from_json_v3(data[1])) - - -def complex_float_from_json(data: tuple[JSONFloat, JSONFloat], zarr_format: ZarrFormat) -> complex: - """ - Convert a JSON complex float to a complex number based on zarr format. 
- - Parameters - ---------- - data : tuple[JSONFloat, JSONFloat] - The JSON complex float to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - np.complexfloating - The complex number. - """ - if zarr_format == 2: - return complex_float_from_json_v2(data) - else: - return complex_float_from_json_v3(data) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") - - -def datetime_to_json(data: np.datetime64) -> int: - """ - Convert a datetime64 to a JSON integer. - - Parameters - ---------- - data : np.datetime64 - The datetime64 value to convert. - - Returns - ------- - int - The JSON representation of the datetime64. - """ - return data.view(np.int64).item() - - -def datetime_from_json(data: int, unit: DateUnit | TimeUnit) -> np.datetime64: - """ - Convert a JSON integer to a datetime64. - - Parameters - ---------- - data : int - The JSON integer to convert. - unit : DateUnit or TimeUnit - The unit of the datetime64. - - Returns - ------- - np.datetime64 - The datetime64 value. 
- """ - return cast("np.datetime64", np.int64(data).view(f"datetime64[{unit}]")) + endianness: Endianness | None = "little" diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index 37371cd0cd..293d8383c0 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -1,269 +1,71 @@ -from __future__ import annotations - from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypeGuard, overload +from typing import ClassVar, Literal, Self, TypeGuard import numpy as np -from zarr.core.dtype.common import ( - DataTypeValidationError, - DTypeConfig_V2, - DTypeJSON, - HasItemSize, - check_dtype_spec_v2, -) -from zarr.core.dtype.wrapper import TBaseDType, ZDType - -if TYPE_CHECKING: - from zarr.core.common import JSON, ZarrFormat +from zarr.core.common import JSON, ZarrFormat +from zarr.core.dtype.npy.common import check_json_bool +from zarr.core.dtype.wrapper import ZDType, _BaseDType @dataclass(frozen=True, kw_only=True, slots=True) -class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): +class Bool(ZDType[np.dtypes.BoolDType, np.bool_]): """ - A Zarr data type for arrays containing booleans. - - Wraps the ``np.dtypes.BoolDType`` data type. Scalars for this data type are instances of - ``np.bool_``. + Wrapper for numpy boolean dtype. Attributes ---------- - - _zarr_v3_name : Literal["bool"] = "bool" - The Zarr v3 name of the dtype. - _zarr_v2_name : ``Literal["|b1"]`` = ``"|b1"`` - The Zarr v2 name of the dtype, which is also a string representation - of the boolean dtype used by NumPy. - dtype_cls : ClassVar[type[np.dtypes.BoolDType]] = np.dtypes.BoolDType - The NumPy dtype class. - - References - ---------- - This class implements the boolean data type defined in Zarr V2 and V3. - - See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. + name : str + The name of the dtype. + dtype_cls : ClassVar[type[np.dtypes.BoolDType]] + The numpy dtype class. 
""" - _zarr_v3_name: ClassVar[Literal["bool"]] = "bool" - _zarr_v2_name: ClassVar[Literal["|b1"]] = "|b1" + _zarr_v3_name = "bool" + _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|b1",) dtype_cls = np.dtypes.BoolDType @classmethod - def from_native_dtype(cls, dtype: TBaseDType) -> Self: - """ - Create an instance of Bool from an instance of np.dtypes.BoolDType. - - Parameters - ---------- - dtype : TBaseDType - The NumPy boolean dtype instance to convert. - - Returns - ------- - Bool - An instance of Bool. + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + return cls() - Raises - ------ - DataTypeValidationError - If the provided dtype is not compatible with this ZDType. - """ - if cls._check_native_dtype(dtype): - return cls() - raise DataTypeValidationError( - f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" - ) - - def to_native_dtype(self: Self) -> np.dtypes.BoolDType: - """ - Create a NumPy boolean dtype instance from this ZDType. - - Returns - ------- - np.dtypes.BoolDType - The NumPy boolean dtype. - """ + def to_dtype(self: Self) -> np.dtypes.BoolDType: return self.dtype_cls() @classmethod - def _check_json_v2( - cls, - data: DTypeJSON, - ) -> TypeGuard[DTypeConfig_V2[Literal["|b1"], None]]: - """ - Check that the input is a valid JSON representation of a Bool. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - ``TypeGuard[DTypeConfig_V2[Literal["|b1"], None]]`` - True if the input is a valid JSON representation, False otherwise. - """ - return ( - check_dtype_spec_v2(data) - and data["name"] == cls._zarr_v2_name - and data["object_codec_id"] is None - ) - - @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["bool"]]: - """ - Check that the input is a valid JSON representation of this class in Zarr V3. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. 
- - Returns - ------- - bool - True if the input is a valid JSON representation, False otherwise. - """ - return data == cls._zarr_v3_name - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of Bool from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Bool - An instance of Bool. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v2(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_name!r}" - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls: type[Self], data: DTypeJSON) -> Self: + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[Literal["bool", "|b1"]]: """ - Create an instance of Bool from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Bool - An instance of Bool. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. + Check that the input is a valid JSON representation of a bool. """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["|b1"], None]: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["bool"]: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal["|b1"], None] | Literal["bool"]: - """ - Serialize this Bool instance to JSON. - - Parameters - ---------- - zarr_format : ZarrFormat - The Zarr format version (2 or 3). - - Returns - ------- - ``DTypeConfig_V2[Literal["|b1"], None] | Literal["bool"]`` - The JSON representation of the Bool instance. 
+ if zarr_format == 2: + return data in cls._zarr_v2_names + elif zarr_format == 3: + return data == cls._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - Raises - ------ - ValueError - If the zarr_format is not 2 or 3. - """ + def to_json(self, zarr_format: ZarrFormat) -> str: if zarr_format == 2: - return {"name": self._zarr_v2_name, "object_codec_id": None} + return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def _check_scalar(self, data: object) -> bool: - """ - Check if the input can be cast to a boolean scalar. - - Parameters - ---------- - data : object - The data to check. - - Returns - ------- - bool - True if the input can be cast to a boolean scalar, False otherwise. - """ - return True - - def cast_scalar(self, data: object) -> np.bool_: - """ - Cast the input to a numpy boolean scalar. - - Parameters - ---------- - data : object - The data to cast. - - Returns - ------- - ``np.bool_`` - The numpy boolean scalar. - - Raises - ------ - TypeError - If the input cannot be converted to a numpy boolean. - """ - if self._check_scalar(data): - return np.bool_(data) - msg = ( # pragma: no cover - f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " - f"data type {self}." - ) - raise TypeError(msg) # pragma: no cover + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + return cls() - def default_scalar(self) -> np.bool_: + def default_value(self) -> np.bool_: """ Get the default value for the boolean dtype. Returns ------- - ``np.bool_`` + np.bool_ The default value. """ return np.False_ - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> bool: + def to_json_value(self, data: object, zarr_format: ZarrFormat) -> bool: """ Convert a scalar to a python bool. 
@@ -281,7 +83,7 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> bool: """ return bool(data) - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: """ Read a JSON-serializable value as a numpy boolean scalar. @@ -294,26 +96,19 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: Returns ------- - ``np.bool_`` + np.bool_ The numpy boolean scalar. - - Raises - ------ - TypeError - If the input is not a valid boolean type. """ - if self._check_scalar(data): - return np.bool_(data) - raise TypeError(f"Invalid type: {data}. Expected a boolean.") # pragma: no cover + if check_json_bool(data): + return self._cast_value_unsafe(data) + raise TypeError(f"Invalid type: {data}. Expected a boolean.") - @property - def item_size(self) -> int: - """ - The size of a single scalar in bytes. + def check_value(self, data: object) -> bool: + # Anything can become a bool + return True - Returns - ------- - int - The size of a single scalar in bytes. 
- """ - return 1 + def cast_value(self, value: object) -> np.bool_: + return self._cast_value_unsafe(value) + + def _cast_value_unsafe(self, value: object) -> np.bool_: + return np.bool_(value) diff --git a/src/zarr/core/dtype/npy/common.py b/src/zarr/core/dtype/npy/common.py index 67644449a0..6571002bbb 100644 --- a/src/zarr/core/dtype/npy/common.py +++ b/src/zarr/core/dtype/npy/common.py @@ -1,13 +1,11 @@ from __future__ import annotations import base64 -import struct import sys from collections.abc import Sequence from typing import ( TYPE_CHECKING, Any, - Final, Literal, SupportsComplex, SupportsFloat, @@ -15,17 +13,12 @@ SupportsInt, TypeGuard, TypeVar, + get_args, ) import numpy as np -from zarr.core.dtype.common import ( - ENDIANNESS_STR, - SPECIAL_FLOAT_STRINGS, - EndiannessStr, - JSONFloatV2, - JSONFloatV3, -) +from zarr.core.dtype.common import Endianness, JSONFloat if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat @@ -33,29 +26,9 @@ IntLike = SupportsInt | SupportsIndex | bytes | str FloatLike = SupportsIndex | SupportsFloat | bytes | str ComplexLike = SupportsFloat | SupportsIndex | SupportsComplex | bytes | str | None -DateTimeUnit = Literal[ - "Y", "M", "W", "D", "h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as", "generic" -] -DATETIME_UNIT: Final = ( - "Y", - "M", - "W", - "D", - "h", - "m", - "s", - "ms", - "us", - "μs", - "ns", - "ps", - "fs", - "as", - "generic", -) - -NumpyEndiannessStr = Literal[">", "<", "="] -NUMPY_ENDIANNESS_STR: Final = ">", "<", "=" +DateUnit = Literal["Y", "M", "W", "D"] +TimeUnit = Literal["h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] +EndiannessNumpy = Literal[">", "<", "|", "="] TFloatDType_co = TypeVar( "TFloatDType_co", @@ -72,18 +45,18 @@ TComplexScalar_co = TypeVar("TComplexScalar_co", bound=np.complex64 | np.complex128, covariant=True) -def endianness_from_numpy_str(endianness: NumpyEndiannessStr) -> EndiannessStr: +def endianness_from_numpy_str(endianness: EndiannessNumpy) -> 
Endianness | None: """ Convert a numpy endianness string literal to a human-readable literal value. Parameters ---------- - endianness : Literal[">", "<", "="] + endianness : Literal[">", "<", "=", "|"] The numpy string representation of the endianness. Returns ------- - Endianness + Endianness or None The human-readable representation of the endianness. Raises @@ -99,21 +72,26 @@ def endianness_from_numpy_str(endianness: NumpyEndiannessStr) -> EndiannessStr: return "little" case ">": return "big" - raise ValueError(f"Invalid endianness: {endianness!r}. Expected one of {NUMPY_ENDIANNESS_STR}") + case "|": + # for dtypes without byte ordering semantics + return None + raise ValueError( + f"Invalid endianness: {endianness}. Expected one of {get_args(EndiannessNumpy)}" + ) -def endianness_to_numpy_str(endianness: EndiannessStr) -> NumpyEndiannessStr: +def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: """ Convert an endianness literal to its numpy string representation. Parameters ---------- - endianness : Endianness + endianness : Endianness or None The endianness to convert. Returns ------- - Literal[">", "<"] + Literal[">", "<", "|"] The numpy string representation of the endianness. Raises @@ -126,23 +104,14 @@ def endianness_to_numpy_str(endianness: EndiannessStr) -> NumpyEndiannessStr: return "<" case "big": return ">" + case None: + return "|" raise ValueError( - f"Invalid endianness: {endianness!r}. Expected one of {ENDIANNESS_STR} or None" + f"Invalid endianness: {endianness}. Expected one of {get_args(Endianness)} or None" ) -def get_endianness_from_numpy_dtype(dtype: np.dtype[np.generic]) -> EndiannessStr: - """ - Gets the endianness from a numpy dtype that has an endianness. This function will - raise a ValueError if the numpy data type does not have a concrete endianness. 
- """ - endianness = dtype.byteorder - if dtype.byteorder in NUMPY_ENDIANNESS_STR: - return endianness_from_numpy_str(endianness) # type: ignore [arg-type] - raise ValueError(f"The dtype {dtype} has an unsupported endianness: {endianness}") - - -def float_from_json_v2(data: JSONFloatV2) -> float: +def float_from_json_v2(data: JSONFloat) -> float: """ Convert a JSON float to a float (Zarr v2). @@ -167,7 +136,7 @@ def float_from_json_v2(data: JSONFloatV2) -> float: return float(data) -def float_from_json_v3(data: JSONFloatV3) -> float: +def float_from_json_v3(data: JSONFloat) -> float: """ Convert a JSON float to a float (v3). @@ -180,41 +149,34 @@ def float_from_json_v3(data: JSONFloatV3) -> float: ------- float The float value. - - Notes - ----- - Zarr V3 allows floats to be stored as hex strings. To quote the spec: - "...for float32, "NaN" is equivalent to "0x7fc00000". - This representation is the only way to specify a NaN value other than the specific NaN value - denoted by "NaN"." - """ - - if isinstance(data, str): - if data in SPECIAL_FLOAT_STRINGS: - return float_from_json_v2(data) # type: ignore[arg-type] - if not data.startswith("0x"): - msg = ( - f"Invalid float value: {data!r}. Expected a string starting with the hex prefix" - " '0x', or one of 'NaN', 'Infinity', or '-Infinity'." - ) - raise ValueError(msg) - if len(data[2:]) == 4: - dtype_code = ">e" - elif len(data[2:]) == 8: - dtype_code = ">f" - elif len(data[2:]) == 16: - dtype_code = ">d" - else: - msg = ( - f"Invalid hexadecimal float value: {data!r}. 
" - "Expected the '0x' prefix to be followed by 4, 8, or 16 numeral characters" - ) - raise ValueError(msg) - return float(struct.unpack(dtype_code, bytes.fromhex(data[2:]))[0]) + """ + # todo: support the v3-specific NaN handling return float_from_json_v2(data) -def bytes_from_json(data: str, *, zarr_format: ZarrFormat) -> bytes: +def float_from_json(data: JSONFloat, zarr_format: ZarrFormat) -> float: + """ + Convert a JSON float to a float based on zarr format. + + Parameters + ---------- + data : JSONFloat + The JSON float to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + float + The float value. + """ + if zarr_format == 2: + return float_from_json_v2(data) + else: + return float_from_json_v3(data) + + +def bytes_from_json(data: str, zarr_format: ZarrFormat) -> bytes: """ Convert a JSON string to bytes @@ -235,7 +197,7 @@ def bytes_from_json(data: str, *, zarr_format: ZarrFormat) -> bytes: # TODO: differentiate these as needed. This is a spec question. if zarr_format == 3: return base64.b64decode(data.encode("ascii")) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") # pragma: no cover + raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") def bytes_to_json(data: bytes, zarr_format: ZarrFormat) -> str: @@ -258,7 +220,7 @@ def bytes_to_json(data: bytes, zarr_format: ZarrFormat) -> str: return base64.b64encode(data).decode("ascii") -def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloatV2: +def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloat: """ Convert a float to JSON (v2). @@ -279,7 +241,7 @@ def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloatV2: return float(data) -def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloatV3: +def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloat: """ Convert a float to JSON (v3). 
@@ -298,11 +260,9 @@ def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloatV3: return float_to_json_v2(data) -def complex_float_to_json_v3( - data: complex | np.complexfloating[Any, Any], -) -> tuple[JSONFloatV3, JSONFloatV3]: +def complex_to_json_v3(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: """ - Convert a complex number to JSON as defined by the Zarr V3 spec. + Convert a complex number to JSON (v3). Parameters ---------- @@ -317,15 +277,13 @@ def complex_float_to_json_v3( return float_to_json_v3(data.real), float_to_json_v3(data.imag) -def complex_float_to_json_v2( - data: complex | np.complexfloating[Any, Any], -) -> tuple[JSONFloatV2, JSONFloatV2]: +def complex_to_json_v2(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: """ - Convert a complex number to JSON as defined by the Zarr V2 spec. + Convert a complex number to JSON (v2). Parameters ---------- - data : complex | np.complexfloating + data : complex or np.complexfloating The complex value to convert. Returns @@ -336,41 +294,55 @@ def complex_float_to_json_v2( return float_to_json_v2(data.real), float_to_json_v2(data.imag) -def complex_float_from_json_v2(data: tuple[JSONFloatV2, JSONFloatV2]) -> complex: +def complex_float_to_json( + data: complex | np.complexfloating[Any, Any], zarr_format: ZarrFormat +) -> tuple[JSONFloat, JSONFloat]: """ - Convert a JSON complex float to a complex number (v2). + Convert a complex number to JSON, parametrized by the zarr format version. Parameters ---------- - data : tuple[JSONFloat, JSONFloat] - The JSON complex float to convert. + data : complex or np.complexfloating + The complex value to convert. + zarr_format : ZarrFormat + The zarr format version. Returns ------- - np.complexfloating - The complex number. + tuple[JSONFloat, JSONFloat] or JSONFloat + The JSON representation of the complex number. 
""" - return complex(float_from_json_v2(data[0]), float_from_json_v2(data[1])) + if zarr_format == 2: + return complex_to_json_v2(data) + else: + return complex_to_json_v3(data) + raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") -def complex_float_from_json_v3(data: tuple[JSONFloatV3, JSONFloatV3]) -> complex: +def float_to_json(data: float | np.floating[Any], zarr_format: ZarrFormat) -> JSONFloat: """ - Convert a JSON complex float to a complex number (v3). + Convert a float to JSON, parametrized by the zarr format version. Parameters ---------- - data : tuple[JSONFloat, JSONFloat] - The JSON complex float to convert. + data : float or np.floating + The float value to convert. + zarr_format : ZarrFormat + The zarr format version. Returns ------- - np.complexfloating - The complex number. + JSONFloat + The JSON representation of the float. """ - return complex(float_from_json_v3(data[0]), float_from_json_v3(data[1])) + if zarr_format == 2: + return float_to_json_v2(data) + else: + return float_to_json_v3(data) + raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") -def check_json_float_v2(data: JSON) -> TypeGuard[JSONFloatV2]: +def check_json_float_v2(data: JSON) -> TypeGuard[JSONFloat]: """ Check if a JSON value represents a float (v2). @@ -384,12 +356,14 @@ def check_json_float_v2(data: JSON) -> TypeGuard[JSONFloatV2]: Bool True if the data is a float, False otherwise. """ - return data in ("NaN", "Infinity", "-Infinity") or isinstance(data, float | int) + if data == "NaN" or data == "Infinity" or data == "-Infinity": + return True + return isinstance(data, float | int) -def check_json_float_v3(data: JSON) -> TypeGuard[JSONFloatV3]: +def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: """ - Check if a JSON value represents a float (v3). 
+ Check if a JSON value represents a complex float, as per the behavior of zarr-python 2.x Parameters ---------- @@ -399,14 +373,20 @@ def check_json_float_v3(data: JSON) -> TypeGuard[JSONFloatV3]: Returns ------- Bool - True if the data is a float, False otherwise. + True if the data is a complex float, False otherwise. """ - return check_json_float_v2(data) or (isinstance(data, str) and data.startswith("0x")) + return ( + not isinstance(data, str) + and isinstance(data, Sequence) + and len(data) == 2 + and check_json_float_v2(data[0]) + and check_json_float_v2(data[1]) + ) -def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloatV2, JSONFloatV2]]: +def check_json_float_v3(data: JSON) -> TypeGuard[JSONFloat]: """ - Check if a JSON value represents a complex float, as per the behavior of zarr-python 2.x + Check if a JSON value represents a float (v3). Parameters ---------- @@ -416,18 +396,13 @@ def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloatV2, JSON Returns ------- Bool - True if the data is a complex float, False otherwise. + True if the data is a float, False otherwise. """ - return ( - not isinstance(data, str) - and isinstance(data, Sequence) - and len(data) == 2 - and check_json_float_v2(data[0]) - and check_json_float_v2(data[1]) - ) + # TODO: handle the special JSON serialization of different NaN values + return check_json_float_v2(data) -def check_json_complex_float_v3(data: JSON) -> TypeGuard[tuple[JSONFloatV3, JSONFloatV3]]: +def check_json_complex_float_v3(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: """ Check if a JSON value represents a complex float, as per the zarr v3 spec @@ -450,6 +425,51 @@ def check_json_complex_float_v3(data: JSON) -> TypeGuard[tuple[JSONFloatV3, JSON ) +def check_json_complex_float( + data: JSON, zarr_format: ZarrFormat +) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: + """ + Check if a JSON value represents a complex float based on zarr format. 
+ + Parameters + ---------- + data : JSON + The JSON value to check. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + Bool + True if the data represents a complex float, False otherwise. + """ + if zarr_format == 2: + return check_json_complex_float_v2(data) + return check_json_complex_float_v3(data) + + +def check_json_float(data: JSON, zarr_format: ZarrFormat) -> TypeGuard[float]: + """ + Check if a JSON value represents a float based on zarr format. + + Parameters + ---------- + data : JSON + The JSON value to check. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + Bool + True if the data is a float, False otherwise. + """ + if zarr_format == 2: + return check_json_float_v2(data) + else: + return check_json_float_v3(data) + + def check_json_int(data: JSON) -> TypeGuard[int]: """ Check if a JSON value is an integer. @@ -499,3 +519,60 @@ def check_json_bool(data: JSON) -> TypeGuard[bool]: True if the data is a boolean, False otherwise. """ return isinstance(data, bool) + + +def complex_float_from_json_v2(data: tuple[JSONFloat, JSONFloat]) -> complex: + """ + Convert a JSON complex float to a complex number (v2). + + Parameters + ---------- + data : tuple[JSONFloat, JSONFloat] + The JSON complex float to convert. + + Returns + ------- + np.complexfloating + The complex number. + """ + return complex(float_from_json_v2(data[0]), float_from_json_v2(data[1])) + + +def complex_float_from_json_v3(data: tuple[JSONFloat, JSONFloat]) -> complex: + """ + Convert a JSON complex float to a complex number (v3). + + Parameters + ---------- + data : tuple[JSONFloat, JSONFloat] + The JSON complex float to convert. + + Returns + ------- + np.complexfloating + The complex number. 
+ """ + return complex(float_from_json_v3(data[0]), float_from_json_v3(data[1])) + + +def complex_float_from_json(data: tuple[JSONFloat, JSONFloat], zarr_format: ZarrFormat) -> complex: + """ + Convert a JSON complex float to a complex number based on zarr format. + + Parameters + ---------- + data : tuple[JSONFloat, JSONFloat] + The JSON complex float to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + np.complexfloating + The complex number. + """ + if zarr_format == 2: + return complex_float_from_json_v2(data) + else: + return complex_float_from_json_v3(data) + raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index 2f432a9e0a..22e1bd66a3 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -1,289 +1,94 @@ -from __future__ import annotations - from dataclasses import dataclass from typing import ( TYPE_CHECKING, ClassVar, - Literal, Self, TypeGuard, - overload, + cast, ) import numpy as np -from zarr.core.dtype.common import ( - DataTypeValidationError, - DTypeConfig_V2, - DTypeJSON, - HasEndianness, - HasItemSize, - check_dtype_spec_v2, -) +from zarr.core.common import JSON, ZarrFormat +from zarr.core.dtype.common import HasEndianness from zarr.core.dtype.npy.common import ( ComplexLike, TComplexDType_co, TComplexScalar_co, - check_json_complex_float_v2, - check_json_complex_float_v3, - complex_float_from_json_v2, - complex_float_from_json_v3, - complex_float_to_json_v2, - complex_float_to_json_v3, + check_json_complex_float, + complex_float_from_json, + complex_float_to_json, + endianness_from_numpy_str, endianness_to_numpy_str, - get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import TBaseDType, ZDType +from zarr.core.dtype.wrapper import ZDType, _BaseDType if TYPE_CHECKING: - from zarr.core.common import JSON, ZarrFormat + from zarr.core.dtype.npy.common 
import EndiannessNumpy @dataclass(frozen=True) -class BaseComplex(ZDType[TComplexDType_co, TComplexScalar_co], HasEndianness, HasItemSize): - """ - A base class for Zarr data types that wrap NumPy complex float data types. - """ - +class BaseComplex(ZDType[TComplexDType_co, TComplexScalar_co], HasEndianness): # This attribute holds the possible zarr v2 JSON names for the data type _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod - def from_native_dtype(cls, dtype: TBaseDType) -> Self: - """ - Create an instance of this data type from a NumPy complex dtype. - - Parameters - ---------- - dtype : TBaseDType - The native dtype to convert. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the dtype is not compatible with this data type. - """ - if cls._check_native_dtype(dtype): - return cls(endianness=get_endianness_from_numpy_dtype(dtype)) - raise DataTypeValidationError( - f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" - ) - - def to_native_dtype(self) -> TComplexDType_co: - """ - Convert this class to a NumPy complex dtype with the appropriate byte order. - - Returns - ------- - TComplexDType_co - A NumPy data type object representing the complex data type with the specified byte order. - """ + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) + def to_dtype(self) -> TComplexDType_co: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - @classmethod - def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: - """ - Check that the input is a valid JSON representation of this data type. - - The input data must be a mapping that contains a "name" key that is one of - the strings from cls._zarr_v2_names and an "object_codec_id" key that is None. 
- - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - bool - True if the input is a valid JSON representation, False otherwise. - """ - return ( - check_dtype_spec_v2(data) - and data["name"] in cls._zarr_v2_names - and data["object_codec_id"] is None - ) - - @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[str]: - """ - Check that the input is a valid JSON representation of this data type in Zarr V3. - - This method verifies that the provided data matches the expected Zarr V3 - representation, which is the string specified by the class-level attribute _zarr_v3_name. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - TypeGuard[str] - True if the input is a valid representation of this class in Zarr V3, False otherwise. + def to_json(self, zarr_format: ZarrFormat) -> str: """ - - return data == cls._zarr_v3_name - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this class from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this class. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v2(data): - # Going via numpy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this class from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. 
- - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> str: ... - - def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | str: - """ - Serialize this object to a JSON-serializable representation. + Convert the wrapped data type to a JSON-serializable form. Parameters ---------- zarr_format : ZarrFormat - The Zarr format version. Supported values are 2 and 3. + The zarr format version. Returns ------- - DTypeConfig_V2[str, None] | str - If ``zarr_format`` is 2, a dictionary with ``"name"`` and ``"object_codec_id"`` keys is - returned. - If ``zarr_format`` is 3, a string representation of the complex data type is returned. - - Raises - ------ - ValueError - If `zarr_format` is not 2 or 3. + str + The JSON-serializable representation of the wrapped data type """ - if zarr_format == 2: - return {"name": self.to_native_dtype().str, "object_codec_id": None} + return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def _check_scalar(self, data: object) -> TypeGuard[ComplexLike]: - """ - Check that the input is a scalar complex value. - - Parameters - ---------- - data : object - The value to check. - - Returns - ------- - TypeGuard[ComplexLike] - True if the input is a scalar complex value, False otherwise. - """ - return isinstance(data, ComplexLike) - - def _cast_scalar_unchecked(self, data: ComplexLike) -> TComplexScalar_co: - """ - Cast the provided scalar data to the native scalar type of this class. 
- - Parameters - ---------- - data : ComplexLike - The data to cast. - - Returns - ------- - TComplexScalar_co - The casted data as a numpy complex scalar. + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - Notes - ----- - This method does not perform any type checking. - The input data must be a scalar complex value. + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: """ - return self.to_native_dtype().type(data) # type: ignore[return-value] - - def cast_scalar(self, data: object) -> TComplexScalar_co: + Check that the input is a valid JSON representation of this data type. """ - Attempt to cast a given object to a numpy complex scalar. - - Parameters - ---------- - data : object - The data to be cast to a numpy complex scalar. + if zarr_format == 2: + return data in cls._zarr_v2_names + elif zarr_format == 3: + return data == cls._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - Returns - ------- - TComplexScalar_co - The data cast as a numpy complex scalar. + def check_value(self, value: object) -> bool: + return isinstance(value, ComplexLike) - Raises - ------ - TypeError - If the data cannot be converted to a numpy complex scalar. - """ - if self._check_scalar(data): - return self._cast_scalar_unchecked(data) - msg = ( - f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " - f"data type {self}." - ) - raise TypeError(msg) + def _cast_value_unsafe(self, value: object) -> TComplexScalar_co: + if self.check_value(value): + return self.to_dtype().type(value) # type: ignore[arg-type, return-value] + raise TypeError(f"Invalid type: {value}. 
Expected a value castable to a complex scalar.") - def default_scalar(self) -> TComplexScalar_co: + def default_value(self) -> TComplexScalar_co: """ Get the default value, which is 0 cast to this dtype @@ -292,9 +97,9 @@ def default_scalar(self) -> TComplexScalar_co: Int scalar The default value. """ - return self._cast_scalar_unchecked(0) + return self._cast_value_unsafe(0) - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TComplexScalar_co: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TComplexScalar_co: """ Read a JSON-serializable value as a numpy float. @@ -310,21 +115,13 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TComplexSc TScalar_co The numpy float. """ - if zarr_format == 2: - if check_json_complex_float_v2(data): - return self._cast_scalar_unchecked(complex_float_from_json_v2(data)) - raise TypeError( - f"Invalid type: {data}. Expected a float or a special string encoding of a float." - ) - elif zarr_format == 3: - if check_json_complex_float_v3(data): - return self._cast_scalar_unchecked(complex_float_from_json_v3(data)) - raise TypeError( - f"Invalid type: {data}. Expected a float or a special string encoding of a float." - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + if check_json_complex_float(data, zarr_format=zarr_format): + return self._cast_value_unsafe(complex_float_from_json(data, zarr_format=zarr_format)) + raise TypeError( + f"Invalid type: {data}. Expected a float or a special string encoding of a float." + ) - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> JSON: + def to_json_value(self, data: object, zarr_format: ZarrFormat) -> JSON: """ Convert an object to a JSON-serializable float. 
@@ -341,78 +138,18 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> JSON: The JSON-serializable form of the complex number, which is a list of two floats, each of which is encoding according to a zarr-format-specific encoding. """ - if zarr_format == 2: - return complex_float_to_json_v2(self.cast_scalar(data)) - elif zarr_format == 3: - return complex_float_to_json_v3(self.cast_scalar(data)) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + return complex_float_to_json(self.cast_value(data), zarr_format=zarr_format) @dataclass(frozen=True, kw_only=True) class Complex64(BaseComplex[np.dtypes.Complex64DType, np.complex64]): - """ - A Zarr data type for arrays containing 64 bit complex floats. - - Wraps the ``np.dtypes.Complex64DType`` data type. Scalars for this data type - are instances of ``np.complex64``. - - Attributes - ---------- - dtype_cls : Type[np.dtypes.Complex64DType] - The numpy dtype class for this data type. - _zarr_v3_name : ClassVar[Literal["complex64"]] - The name of this data type in Zarr V3. - _zarr_v2_names : ClassVar[tuple[Literal[">c8"], Literal["c8"], Literal["c8", " int: - """ - The size of a single scalar in bytes. - - Returns - ------- - int - The size of a single scalar in bytes. - """ - return 8 + _zarr_v3_name = "complex64" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c8", "c16"], Literal["c16"], Literal["c16", " int: - """ - The size of a single scalar in bytes. - - Returns - ------- - int - The size of a single scalar in bytes. - """ - return 16 + _zarr_v3_name = "complex128" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c16", " Self: - """ - Create an instance of this ZDType from a NumPy data type. + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) - Parameters - ---------- - dtype : TBaseDType - The NumPy data type. 
- - Returns - ------- - Self - An instance of this data type. - """ - if cls._check_native_dtype(dtype): - return cls(endianness=get_endianness_from_numpy_dtype(dtype)) - raise DataTypeValidationError( - f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" - ) - - def to_native_dtype(self) -> TFloatDType_co: - """ - Convert the wrapped data type to a NumPy data type. - - Returns - ------- - TFloatDType_co - The NumPy data type. - """ + def to_dtype(self) -> TFloatDType_co: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - @classmethod - def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: - """ - Check that the input is a valid JSON representation of this data type. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - TypeGuard[DTypeConfig_V2[str, None]] - True if the input is a valid JSON representation of this data type, False otherwise. - """ - return ( - check_dtype_spec_v2(data) - and data["name"] in cls._zarr_v2_names - and data["object_codec_id"] is None - ) - - @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[str]: - """ - Check that the input is a valid JSON representation of this class in Zarr V3. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - TypeGuard[str] - True if the input is a valid JSON representation of this class, False otherwise. - """ - return data == cls._zarr_v3_name - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this ZDType from Zarr v2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - """ - if cls._check_json_v2(data): - # Going via NumPy ensures that we get the endianness correct without - # annoying string parsing. 
- name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this ZDType from Zarr v3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> str: ... - - def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | str: + def to_json(self, zarr_format: ZarrFormat) -> str: """ Convert the wrapped data type to a JSON-serializable form. @@ -172,88 +44,56 @@ def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | str: Returns ------- - DTypeConfig_V2[str, None] or str - The JSON-serializable representation of the wrapped data type. - - Raises - ------ - ValueError - If zarr_format is not 2 or 3. + str + The JSON-serializable representation of the wrapped data type """ if zarr_format == 2: - return {"name": self.to_native_dtype().str, "object_codec_id": None} + return self.to_dtype().str elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def _check_scalar(self, data: object) -> TypeGuard[FloatLike]: - """ - Check that the input is a valid scalar value. - - Parameters - ---------- - data : object - The input to check. - - Returns - ------- - TypeGuard[FloatLike] - True if the input is a valid scalar value, False otherwise. 
- """ - return isinstance(data, FloatLike) - - def _cast_scalar_unchecked(self, data: FloatLike) -> TFloatScalar_co: - """ - Cast a scalar value to a NumPy float scalar. - - Parameters - ---------- - data : FloatLike - The scalar value to cast. + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - Returns - ------- - TFloatScalar_co - The NumPy float scalar. + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: """ - return self.to_native_dtype().type(data) # type: ignore[return-value] - - def cast_scalar(self, data: object) -> TFloatScalar_co: + Check that the input is a valid JSON representation of this data type. """ - Cast a scalar value to a NumPy float scalar. + if zarr_format == 2: + return data in cls._zarr_v2_names + elif zarr_format == 3: + return data == cls._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - Parameters - ---------- - data : object - The scalar value to cast. + def check_value(self, value: object) -> TypeGuard[FloatLike]: + return isinstance(value, FloatLike) - Returns - ------- - TFloatScalar_co - The NumPy float scalar. - """ - if self._check_scalar(data): - return self._cast_scalar_unchecked(data) - msg = ( - f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " - f"data type {self}." - ) - raise TypeError(msg) + def _cast_value_unsafe(self, value: object) -> TFloatScalar_co: + if self.check_value(value): + return self.to_dtype().type(value) # type: ignore[return-value] + raise TypeError(f"Invalid type: {value}. 
Expected a value castable to a float.") - def default_scalar(self) -> TFloatScalar_co: + def default_value(self) -> TFloatScalar_co: """ - Get the default value, which is 0 cast to this zdtype. + Get the default value, which is 0 cast to this dtype Returns ------- - TFloatScalar_co + Int scalar The default value. """ - return self._cast_scalar_unchecked(0) + return self._cast_value_unsafe(0) - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScalar_co: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScalar_co: """ - Read a JSON-serializable value as a NumPy float scalar. + Read a JSON-serializable value as a numpy float. Parameters ---------- @@ -264,27 +104,16 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScal Returns ------- - TFloatScalar_co - The NumPy float scalar. + TScalar_co + The numpy float. """ - if zarr_format == 2: - if check_json_float_v2(data): - return self._cast_scalar_unchecked(float_from_json_v2(data)) - else: - raise TypeError( - f"Invalid type: {data}. Expected a float or a special string encoding of a float." - ) - elif zarr_format == 3: - if check_json_float_v3(data): - return self._cast_scalar_unchecked(float_from_json_v3(data)) - else: - raise TypeError( - f"Invalid type: {data}. Expected a float or a special string encoding of a float." - ) - else: - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + if check_json_float(data, zarr_format=zarr_format): + return self._cast_value_unsafe(float_from_json(data, zarr_format=zarr_format)) + raise TypeError( + f"Invalid type: {data}. Expected a float or a special string encoding of a float." + ) - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> float | str: + def to_json_value(self, data: object, zarr_format: ZarrFormat) -> float | str: """ Convert an object to a JSON-serializable float. 
@@ -301,120 +130,25 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> float | st The JSON-serializable form of the float, which is potentially a number or a string. See the zarr specifications for details on the JSON encoding for floats. """ - if zarr_format == 2: - return float_to_json_v2(self.cast_scalar(data)) - elif zarr_format == 3: - return float_to_json_v3(self.cast_scalar(data)) - else: - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + return float_to_json(self._cast_value_unsafe(data), zarr_format=zarr_format) @dataclass(frozen=True, kw_only=True) class Float16(BaseFloat[np.dtypes.Float16DType, np.float16]): - """ - A Zarr data type for arrays containing 16-bit floating point numbers. - - Wraps the ``np.dtypes.Float16DType`` data type. Scalars for this data type are instances - of ``np.float16``. - - Attributes - ---------- - dtype_cls : Type[np.dtypes.Float16DType] - The NumPy dtype class for this data type. - - References - ---------- - This class implements the float16 data type defined in Zarr V2 and V3. - - See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. - """ - dtype_cls = np.dtypes.Float16DType _zarr_v3_name = "float16" - _zarr_v2_names: ClassVar[tuple[Literal[">f2"], Literal["f2", " int: - """ - The size of a single scalar in bytes. - - Returns - ------- - int - The size of a single scalar in bytes. - """ - return 2 + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f2", "`__ and `Zarr V3 `__ specification documents for details. - """ - dtype_cls = np.dtypes.Float32DType _zarr_v3_name = "float32" - _zarr_v2_names: ClassVar[tuple[Literal[">f4"], Literal["f4", " int: - """ - The size of a single scalar in bytes. - - Returns - ------- - int - The size of a single scalar in bytes. - """ - return 4 + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f4", "`__ and `Zarr V3 `__ specification documents for details. 
- """ - dtype_cls = np.dtypes.Float64DType _zarr_v3_name = "float64" - _zarr_v2_names: ClassVar[tuple[Literal[">f8"], Literal["f8", " int: - """ - The size of a single scalar in bytes. - - Returns - ------- - int - The size of a single scalar in bytes. - """ - return 8 + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f8", " TypeGuard[DTypeConfig_V2[str, None]]: + def to_json(self, zarr_format: ZarrFormat) -> str: """ - Check that the input is a valid JSON representation of this integer data type in Zarr V2. - - This method verifies that the provided data matches the expected Zarr V2 representation - for this data type. The input data must be a mapping that contains a "name" key that is - one of the strings from cls._zarr_v2_names and an "object_codec_id" key that is None. + Convert the wrapped data type to a JSON-serializable form. Parameters ---------- - data : object - The JSON data to check. + zarr_format : ZarrFormat + The zarr format version. Returns ------- - TypeGuard[DTypeConfig_V2[str, None]] - True if the input is a valid representation of this class in Zarr V2, - False otherwise. + str + The JSON-serializable representation of the wrapped data type """ - - return ( - check_dtype_spec_v2(data) - and data["name"] in cls._zarr_v2_names - and data["object_codec_id"] is None - ) + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def _check_json_v3(cls, data: object) -> TypeGuard[str]: - """ - Check that the input is a valid JSON representation of this class in Zarr V3. - - Parameters - ---------- - data : object - The JSON data to check. - - Returns - ------- - TypeGuard[str] - True if the input is a valid representation of this class in Zarr v3, - False otherwise. 
- """ - return data == cls._zarr_v3_name - - def _check_scalar(self, data: object) -> TypeGuard[IntLike]: - """ - Check if the input object is of an IntLike type. - - This method verifies whether the provided data can be considered as an integer-like - value, which includes objects supporting integer conversion. - - Parameters - ---------- - data : object - The data to check. - - Returns - ------- - TypeGuard[IntLike] - True if the data is IntLike, False otherwise. - """ - - return isinstance(data, IntLike) - - def _cast_scalar_unchecked(self, data: IntLike) -> TIntScalar_co: - """ - Casts a given scalar value to the native integer scalar type without type checking. - - Parameters - ---------- - data : IntLike - The scalar value to cast. - - Returns - ------- - TIntScalar_co - The casted integer scalar of the native dtype. + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: """ - - return self.to_native_dtype().type(data) # type: ignore[return-value] - - def cast_scalar(self, data: object) -> TIntScalar_co: + Check that the input is a valid JSON representation of this data type. """ - Attempt to cast a given object to a NumPy integer scalar. - - Parameters - ---------- - data : object - The data to be cast to a NumPy integer scalar. - - Returns - ------- - TIntScalar_co - The data cast as a NumPy integer scalar. + if zarr_format == 2: + return data in cls._zarr_v2_names + elif zarr_format == 3: + return data == cls._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - Raises - ------ - TypeError - If the data cannot be converted to a NumPy integer scalar. - """ + def check_value(self, value: object) -> TypeGuard[IntLike]: + return isinstance(value, IntLike) - if self._check_scalar(data): - return self._cast_scalar_unchecked(data) - msg = ( - f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " - f"data type {self}." 
- ) - raise TypeError(msg) + def _cast_value_unsafe(self, value: object) -> TIntScalar_co: + if self.check_value(value): + return self.to_dtype().type(value) # type: ignore[return-value] + raise TypeError(f"Invalid type: {value}. Expected a value castable to an integer.") - def default_scalar(self) -> TIntScalar_co: + def default_value(self) -> TIntScalar_co: """ - Get the default value, which is 0 cast to this dtype. + Get the default value, which is 0 cast to this dtype Returns ------- - TIntScalar_co + Int scalar The default value. """ - return self._cast_scalar_unchecked(0) + return self._cast_value_unsafe(0) - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar_co: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar_co: """ - Read a JSON-serializable value as a NumPy int scalar. + Read a JSON-serializable value as a numpy int scalar. Parameters ---------- data : JSON The JSON-serializable value. zarr_format : ZarrFormat - The Zarr format version. + The zarr format version. Returns ------- - TIntScalar_co - The NumPy int scalar. - - Raises - ------ - TypeError - If the input is not a valid integer type. + TScalar_co + The numpy scalar. """ if check_json_int(data): - return self._cast_scalar_unchecked(data) + return self._cast_value_unsafe(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: + def to_json_value(self, data: object, zarr_format: ZarrFormat) -> int: """ - Convert an object to a JSON serializable scalar. For the integer data types, - the JSON form is a plain integer. + Convert an object to JSON-serializable scalar. Parameters ---------- - data : object + data : _BaseScalar The value to convert. zarr_format : ZarrFormat - The Zarr format version. + The zarr format version. Returns ------- int The JSON-serializable form of the scalar. 
""" - return int(self.cast_scalar(data)) + return int(self.cast_value(data)) @dataclass(frozen=True, kw_only=True) class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): - """ - A Zarr data type for arrays containing 8-bit signed integers. - - Wraps the ``np.dtypes.Int8DType`` data type. Scalars for this data type are - instances of ``np.int8``. - - Attributes - ---------- - dtype_cls : np.dtypes.Int8DType - The class of the underlying NumPy dtype. - - References - ---------- - This class implements the 8-bit signed integer data type defined in Zarr V2 and V3. - - See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. - """ - dtype_cls = np.dtypes.Int8DType - _zarr_v3_name: ClassVar[Literal["int8"]] = "int8" - _zarr_v2_names: ClassVar[tuple[Literal["|i1"]]] = ("|i1",) + _zarr_v3_name = "int8" + _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|i1",) @classmethod - def from_native_dtype(cls, dtype: TBaseDType) -> Self: - """ - Create an Int8 from a np.dtype('int8') instance. - - Parameters - ---------- - dtype : TBaseDType - The np.dtype('int8') instance. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input data type is not a valid representation of this class Int8. - """ - if cls._check_native_dtype(dtype): - return cls() - raise DataTypeValidationError( - f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" - ) - - def to_native_dtype(self: Self) -> np.dtypes.Int8DType: - """ - Convert the Int8 instance to a np.dtype('int8') instance. + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + return cls() - Returns - ------- - np.dtypes.Int8DType - The np.dtype('int8') instance. - """ + def to_dtype(self: Self) -> np.dtypes.Int8DType: return self.dtype_cls() @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an Int8 from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. 
+ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + return cls() - Returns - ------- - Self - An instance of this data type. - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class Int8. - """ - if cls._check_json_v2(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_names[0]!r}" - raise DataTypeValidationError(msg) +@dataclass(frozen=True, kw_only=True) +class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): + dtype_cls = np.dtypes.UInt8DType + _zarr_v3_name = "uint8" + _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|u1",) @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an Int8 from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + return cls() - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class Int8. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) + def to_dtype(self: Self) -> np.dtypes.UInt8DType: + return self.dtype_cls() - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["|i1"], None]: ... + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + return cls() - @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["int8"]: ... - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal["|i1"], None] | Literal["int8"]: - """ - Convert the data type to a JSON-serializable form. 
+@dataclass(frozen=True, kw_only=True) +class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): + dtype_cls = np.dtypes.Int16DType + _zarr_v3_name = "int16" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i2", " Self: + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) - Returns - ------- - ``DTypeConfig_V2[Literal["|i1"], None] | Literal["int8"]`` - The JSON-serializable representation of the data type. + def to_dtype(self) -> np.dtypes.Int16DType: + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) - Raises - ------ - ValueError - If the zarr_format is not 2 or 3. - """ + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: - return {"name": self._zarr_v2_names[0], "object_codec_id": None} + # This ensures that we get the endianness correct without annoying string parsing + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: - return self._zarr_v3_name + return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @property - def item_size(self) -> int: - """ - The size of a single scalar in bytes. - - Returns - ------- - int - The size of a single scalar in bytes. - """ - return 1 - @dataclass(frozen=True, kw_only=True) -class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): - """ - A Zarr data type for arrays containing 8-bit unsigned integers. - - Wraps the ``np.dtypes.UInt8DType`` data type. Scalars for this data type are instances of ``np.uint8``. - - Attributes - ---------- - dtype_cls : np.dtypes.UInt8DType - The class of the underlying NumPy dtype. - - References - ---------- - This class implements the 8-bit unsigned integer data type defined in Zarr V2 and V3. 
+class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): + dtype_cls = np.dtypes.UInt16DType + _zarr_v3_name = "uint16" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u2", "`__ and `Zarr V3 `__ specification documents for details. - """ + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) - dtype_cls = np.dtypes.UInt8DType - _zarr_v3_name: ClassVar[Literal["uint8"]] = "uint8" - _zarr_v2_names: ClassVar[tuple[Literal["|u1"]]] = ("|u1",) + def to_dtype(self) -> np.dtypes.UInt16DType: + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) @classmethod - def from_native_dtype(cls, dtype: TBaseDType) -> Self: - """ - Create a UInt8 from a np.dtype('uint8') instance. - """ - if cls._check_native_dtype(dtype): + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: return cls() - raise DataTypeValidationError( - f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" - ) - - def to_native_dtype(self: Self) -> np.dtypes.UInt8DType: - """ - Create a NumPy unsigned 8-bit integer dtype instance from this UInt8 ZDType. + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - Returns - ------- - np.dtypes.UInt8DType - The NumPy unsigned 8-bit integer dtype. - """ - return self.dtype_cls() +@dataclass(frozen=True, kw_only=True) +class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): + dtype_cls = np.dtypes.Int32DType + _zarr_v3_name = "int32" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i4", " Self: - """ - Create an instance of this data type from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. 
- - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - - if cls._check_json_v2(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_names[0]!r}" - raise DataTypeValidationError(msg) + def from_dtype(cls: type[Self], dtype: _BaseDType) -> Self: + # We override the base implementation to address a windows-specific, pre-numpy 2 issue where + # ``np.dtype('i')`` is an instance of ``np.dtypes.IntDType`` that acts like `int32` instead of ``np.dtype('int32')`` + # In this case, ``type(np.dtype('i')) == np.dtypes.Int32DType`` will evaluate to ``True``, + # despite the two classes being different. Thus we will create an instance of `cls` with the + # latter dtype, after pulling in the byte order of the input + if dtype == np.dtypes.Int32DType(): + return cls._from_dtype_unsafe(np.dtypes.Int32DType().newbyteorder(dtype.byteorder)) + else: + return super().from_dtype(dtype) @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["|u1"], None]: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["uint8"]: ... 
- - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal["|u1"], None] | Literal["uint8"]: - """ - Convert the data type to a JSON-serializable form. + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) - Parameters - ---------- - zarr_format : ZarrFormat - The Zarr format version. Supported values are 2 and 3. - - Returns - ------- - ``DTypeConfig_V2[Literal["|u1"], None] | Literal["uint8"]`` - The JSON-serializable representation of the data type. + def to_dtype(self) -> np.dtypes.Int32DType: + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) - Raises - ------ - ValueError - If `zarr_format` is not 2 or 3. - """ + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: - # For Zarr format version 2, return a dictionary with the name and object codec ID. - return {"name": self._zarr_v2_names[0], "object_codec_id": None} + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: - # For Zarr format version 3, return the v3 name as a string. - return self._zarr_v3_name - # Raise an error if the zarr_format is neither 2 nor 3. + return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @property - def item_size(self) -> int: - """ - The size of a single scalar in bytes. - - Returns - ------- - int - The size of a single scalar in bytes. - """ - return 1 - @dataclass(frozen=True, kw_only=True) -class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): - """ - A Zarr data type for arrays containing 16-bit signed integers. - - Wraps the ``np.dtypes.Int16DType`` data type. Scalars for this data type are instances of - ``np.int16``. - - Attributes - ---------- - dtype_cls : np.dtypes.Int16DType - The class of the underlying NumPy dtype. 
- - References - ---------- - This class implements the 16-bit signed integer data type defined in Zarr V2 and V3. - - See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. - """ - - dtype_cls = np.dtypes.Int16DType - _zarr_v3_name: ClassVar[Literal["int16"]] = "int16" - _zarr_v2_names: ClassVar[tuple[Literal[">i2"], Literal["i2", "u4", " Self: - """ - Create an instance of this data type from a np.dtype('int16') instance. - - Parameters - ---------- - dtype : np.dtype - The instance of np.dtype('int16') to create from. + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input data type is not an instance of np.dtype('int16'). - """ - if cls._check_native_dtype(dtype): - return cls(endianness=get_endianness_from_numpy_dtype(dtype)) - raise DataTypeValidationError( - f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" - ) - - def to_native_dtype(self) -> np.dtypes.Int16DType: - """ - Convert the data type to a np.dtype('int16') instance. - - Returns - ------- - np.dtype - The np.dtype('int16') instance. - """ + def to_dtype(self) -> np.dtypes.UInt32DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. 
+ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - Returns - ------- - Self - An instance of this data type. - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v2(data): - # Going via NumPy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names!r}." - raise DataTypeValidationError(msg) +@dataclass(frozen=True, kw_only=True) +class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): + dtype_cls = np.dtypes.Int64DType + _zarr_v3_name = "int64" + _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i8", " Self: - """ - Create an instance of this data type from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) - Returns - ------- - Self - An instance of this data type. + def to_dtype(self) -> np.dtypes.Int64DType: + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. 
- """ - if cls._check_json_v3(data): + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">i2", " Literal["int16"]: ... - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal[">i2", "u8", " Self: + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls(endianness=endianness_from_numpy_str(byte_order)) - Returns - ------- - DTypeConfig_V2[Literal[">i2", " np.dtypes.UInt64DType: + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) - Raises - ------ - ValueError - If the zarr_format is not 2 or 3. - """ + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: - name = self.to_native_dtype().str - return {"name": name, "object_codec_id": None} + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: - return self._zarr_v3_name + return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @property - def item_size(self) -> int: - """ - The size of a single scalar in bytes. - - Returns - ------- - int - The size of a single scalar in bytes. - """ - return 2 - - -@dataclass(frozen=True, kw_only=True) -class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): - """ - A Zarr data type for arrays containing 16-bit unsigned integers. - - Wraps the ``np.dtypes.UInt16DType`` data type. Scalars for this data type are instances of - ``np.uint16``. - - Attributes - ---------- - dtype_cls : np.dtypes.UInt16DType - The class of the underlying NumPy dtype. 
- - References - ---------- - This class implements the unsigned 16-bit unsigned integer data type defined in Zarr V2 and V3. - - See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. - """ - - dtype_cls = np.dtypes.UInt16DType - _zarr_v3_name: ClassVar[Literal["uint16"]] = "uint16" - _zarr_v2_names: ClassVar[tuple[Literal[">u2"], Literal["u2", " Self: - """ - Create an instance of this data type from a np.dtype('uint16') instance. - - Parameters - ---------- - dtype : np.dtype - The NumPy data type. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input data type is not an instance of np.dtype('uint16'). - """ - if cls._check_native_dtype(dtype): - return cls(endianness=get_endianness_from_numpy_dtype(dtype)) - raise DataTypeValidationError( - f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" - ) - - def to_native_dtype(self) -> np.dtypes.UInt16DType: - """ - Convert the data type to a np.dtype('uint16') instance. - - Returns - ------- - np.dtype - The np.dtype('uint16') instance. - """ - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v2(data): - # Going via NumPy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of UInt16. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." 
- raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of UInt16. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">u2", " Literal["uint16"]: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal[">u2", "u2", " int: - """ - The size of a single scalar in bytes. - - Returns - ------- - int - The size of a single scalar in bytes. - """ - return 2 - - -@dataclass(frozen=True, kw_only=True) -class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): - """ - A Zarr data type for arrays containing 32-bit signed integers. - - Wraps the ``np.dtypes.Int32DType`` data type. Scalars for this data type are instances of - ``np.int32``. - - Attributes - ---------- - dtype_cls : np.dtypes.Int32DType - The class of the underlying NumPy dtype. - - References - ---------- - This class implements the 32-bit signed integer data type defined in Zarr V2 and V3. - - See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. 
- """ - - dtype_cls = np.dtypes.Int32DType - _zarr_v3_name: ClassVar[Literal["int32"]] = "int32" - _zarr_v2_names: ClassVar[tuple[Literal[">i4"], Literal["i4", " TypeGuard[np.dtypes.Int32DType]: - """ - A type guard that checks if the input is assignable to the type of ``cls.dtype_class`` - - This method is overridden for this particular data type because of a Windows-specific issue - where np.dtype('i') creates an instance of ``np.dtypes.IntDType``, rather than an - instance of ``np.dtypes.Int32DType``, even though both represent 32-bit signed integers. - - Parameters - ---------- - dtype : TDType - The dtype to check. - - Returns - ------- - Bool - True if the dtype matches, False otherwise. - """ - return super()._check_native_dtype(dtype) or dtype == np.dtypes.Int32DType() - - @classmethod - def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self: - """ - Create an Int32 from a np.dtype('int32') instance. - - Parameters - ---------- - dtype : TBaseDType - The np.dtype('int32') instance. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class Int32. - """ - if cls._check_native_dtype(dtype): - return cls(endianness=get_endianness_from_numpy_dtype(dtype)) - raise DataTypeValidationError( - f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" - ) - - def to_native_dtype(self: Self) -> np.dtypes.Int32DType: - """ - Convert the Int32 instance to a np.dtype('int32') instance. - - Returns - ------- - np.dtypes.Int32DType - The np.dtype('int32') instance. - """ - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an Int32 from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. 
- - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class Int32. - """ - if cls._check_json_v2(data): - # Going via NumPy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names!r}." - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an Int32 from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class Int32. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">i4", " Literal["int32"]: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal[">i4", "i4", " int: - """ - The size of a single scalar in bytes. - - Returns - ------- - int - The size of a single scalar in bytes. - """ - return 4 - - -@dataclass(frozen=True, kw_only=True) -class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): - """ - A Zarr data type for arrays containing 32-bit unsigned integers. - - Wraps the ``np.dtypes.UInt32DType`` data type. Scalars for this data type are instances of - ``np.uint32``. - - Attributes - ---------- - dtype_cls : np.dtypes.UInt32DType - The class of the underlying NumPy dtype. - - References - ---------- - This class implements the 32-bit unsigned integer data type defined in Zarr V2 and V3. - - See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. 
- """ - - dtype_cls = np.dtypes.UInt32DType - _zarr_v3_name: ClassVar[Literal["uint32"]] = "uint32" - _zarr_v2_names: ClassVar[tuple[Literal[">u4"], Literal["u4", " Self: - """ - Create a UInt32 from a np.dtype('uint32') instance. - - Parameters - ---------- - dtype : TBaseDType - The NumPy data type. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input data type is not a valid representation of this class 32-bit unsigned - integer. - """ - if cls._check_native_dtype(dtype): - return cls(endianness=get_endianness_from_numpy_dtype(dtype)) - raise DataTypeValidationError( - f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" - ) - - def to_native_dtype(self) -> np.dtypes.UInt32DType: - """ - Create a NumPy unsigned 32-bit integer dtype instance from this UInt32 ZDType. - - Returns - ------- - np.dtypes.UInt32DType - The NumPy unsigned 32-bit integer dtype. - """ - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class 32-bit unsigned - integer. - """ - if cls._check_json_v2(data): - # Going via NumPy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V3-flavored JSON. 
- - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class 32-bit unsigned - integer. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">u4", " Literal["uint32"]: ... - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal[">u4", "u4", " int: - """ - The size of a single scalar in bytes. - - Returns - ------- - int - The size of a single scalar in bytes. - """ - return 4 - - -@dataclass(frozen=True, kw_only=True) -class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): - """ - A Zarr data type for arrays containing 64-bit signed integers. - - Wraps the ``np.dtypes.Int64DType`` data type. Scalars for this data type are instances of - ``np.int64``. - - Attributes - ---------- - dtype_cls : np.dtypes.Int64DType - The class of the underlying NumPy dtype. - - References - ---------- - This class implements the 64-bit signed integer data type defined in Zarr V2 and V3. - - See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. - """ - - dtype_cls = np.dtypes.Int64DType - _zarr_v3_name: ClassVar[Literal["int64"]] = "int64" - _zarr_v2_names: ClassVar[tuple[Literal[">i8"], Literal["i8", " Self: - """ - Create an Int64 from a np.dtype('int64') instance. - - Parameters - ---------- - dtype : TBaseDType - The NumPy data type. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input data type is not a valid representation of this class 64-bit signed - integer. 
- """ - if cls._check_native_dtype(dtype): - return cls(endianness=get_endianness_from_numpy_dtype(dtype)) - raise DataTypeValidationError( - f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" - ) - - def to_native_dtype(self) -> np.dtypes.Int64DType: - """ - Create a NumPy signed 64-bit integer dtype instance from this Int64 ZDType. - - Returns - ------- - np.dtypes.Int64DType - The NumPy signed 64-bit integer dtype. - """ - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class 64-bit signed - integer. - """ - if cls._check_json_v2(data): - # Going via NumPy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class 64-bit signed - integer. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">i8", " Literal["int64"]: ... - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal[">i8", "i8", " int: - """ - The size of a single scalar in bytes. - - Returns - ------- - int - The size of a single scalar in bytes. - """ - return 8 - - -@dataclass(frozen=True, kw_only=True) -class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): - """ - A Zarr data type for arrays containing 64-bit unsigned integers. - - Wraps the ``np.dtypes.UInt64DType`` data type. Scalars for this data type - are instances of ``np.uint64``. - - Attributes - ---------- - dtype_cls: np.dtypes.UInt64DType - The class of the underlying NumPy dtype. - - References - ---------- - This class implements the unsigned 64-bit integer data type defined in Zarr V2 and V3. - - See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. - """ - - dtype_cls = np.dtypes.UInt64DType - _zarr_v3_name: ClassVar[Literal["uint64"]] = "uint64" - _zarr_v2_names: ClassVar[tuple[Literal[">u8"], Literal["u8", " np.dtypes.UInt64DType: - """ - Convert the data type to a native NumPy dtype. - - Returns - ------- - np.dtypes.UInt64DType - The native NumPy dtype.eeeeeeeeeeeeeeeee - """ - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class unsigned 64-bit - integer. 
- """ - if cls._check_json_v2(data): - # Going via NumPy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class unsigned 64-bit - integer. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">u8", " Literal["uint64"]: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal[">u8", "u8", " Self: - """ - Create an instance of this data type from a native NumPy dtype. - - Parameters - ---------- - dtype : TBaseDType - The native NumPy dtype. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input dtype is not a valid representation of this class unsigned 64-bit - integer. - """ - if cls._check_native_dtype(dtype): - return cls(endianness=get_endianness_from_numpy_dtype(dtype)) - raise DataTypeValidationError( - f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" - ) - - @property - def item_size(self) -> int: - """ - The size of a single scalar in bytes. - - Returns - ------- - int - The size of a single scalar in bytes. 
- """ - return 8 diff --git a/src/zarr/core/dtype/npy/sized.py b/src/zarr/core/dtype/npy/sized.py new file mode 100644 index 0000000000..8d8ff57800 --- /dev/null +++ b/src/zarr/core/dtype/npy/sized.py @@ -0,0 +1,382 @@ +import base64 +import re +from collections.abc import Sequence +from dataclasses import dataclass +from typing import Any, ClassVar, Self, TypeGuard, cast + +import numpy as np + +from zarr.core.common import JSON, ZarrFormat +from zarr.core.dtype.common import DataTypeValidationError, HasEndianness, HasLength +from zarr.core.dtype.npy.common import ( + EndiannessNumpy, + bytes_from_json, + bytes_to_json, + check_json_str, + endianness_from_numpy_str, + endianness_to_numpy_str, +) +from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar + + +@dataclass(frozen=True, kw_only=True) +class FixedLengthAscii(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength): + dtype_cls = np.dtypes.BytesDType + _zarr_v3_name = "numpy.fixed_length_ascii" + item_size_bits: ClassVar[int] = 8 + + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) + + def to_dtype(self) -> np.dtypes.BytesDType[int]: + return self.dtype_cls(self.length) + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of a numpy S dtype. 
+ """ + if zarr_format == 2: + # match |S1, |S2, etc + return isinstance(data, str) and re.match(r"^\|S\d+$", data) is not None + elif zarr_format == 3: + return ( + isinstance(data, dict) + and "name" in data + and data["name"] == cls._zarr_v3_name + and "configuration" in data + and isinstance(data["configuration"], dict) + and "length_bits" in data["configuration"] + and isinstance(data["configuration"]["length_bits"], int) + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return { + "name": self._zarr_v3_name, + "configuration": {"length_bits": self.length * self.item_size_bits}, + } + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(length=data["configuration"]["length_bits"] // cls.item_size_bits) # type: ignore[arg-type, index, call-overload, operator] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def default_value(self) -> np.bytes_: + return np.bytes_(b"") + + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return base64.standard_b64encode(data).decode("ascii") # type: ignore[arg-type] + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: + if check_json_str(data): + return self.to_dtype().type(base64.standard_b64decode(data.encode("ascii"))) + raise TypeError(f"Invalid type: {data}. 
Expected a string.") + + def check_value(self, data: object) -> bool: + return isinstance(data, np.bytes_ | str | bytes) + + def _cast_value_unsafe(self, value: object) -> np.bytes_: + return self.to_dtype().type(value) + + +@dataclass(frozen=True, kw_only=True) +class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength): + # np.dtypes.VoidDType is specified in an odd way in numpy + # it cannot be used to create instances of the dtype + # so we have to tell mypy to ignore this here + dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] + _zarr_v3_name = "numpy.void" + item_size_bits: ClassVar[int] = 8 + + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) + + def to_dtype(self) -> np.dtypes.VoidDType[int]: + # Numpy does not allow creating a void type + # by invoking np.dtypes.VoidDType directly + return cast("np.dtypes.VoidDType[int]", np.dtype(f"V{self.length}")) + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + if zarr_format == 2: + # Check that the dtype is |V1, |V2, ... 
+ return isinstance(data, str) and re.match(r"^\|V\d+$", data) is not None + elif zarr_format == 3: + return ( + isinstance(data, dict) + and "name" in data + and isinstance(data["name"], str) + and (re.match(r"^r\d+$", data["name"]) is not None) + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return {"name": f"r{self.length * self.item_size_bits}"} + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(length=int(data["name"][1:]) // cls.item_size_bits) # type: ignore[arg-type, index, call-overload] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[Any]]: + """ + Numpy void dtype comes in two forms: + * If the ``fields`` attribute is ``None``, then the dtype represents N raw bytes. + * If the ``fields`` attribute is not ``None``, then the dtype represents a structured dtype, + + In this check we ensure that ``fields`` is ``None``. + + Parameters + ---------- + dtype : TDType + The dtype to check. + + Returns + ------- + Bool + True if the dtype matches, False otherwise. 
+ """ + return cls.dtype_cls is type(dtype) and dtype.fields is None # type: ignore[has-type] + + def default_value(self) -> np.void: + return self.to_dtype().type(("\x00" * self.length).encode("ascii")) + + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return base64.standard_b64encode(self.cast_value(data).tobytes()).decode("ascii") + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: + if check_json_str(data): + return self.to_dtype().type(base64.standard_b64decode(data)) + raise DataTypeValidationError(f"Invalid type: {data}. Expected a string.") + + def check_value(self, data: object) -> bool: + return isinstance(data, np.bytes_ | str | bytes | np.void) + + def _cast_value_unsafe(self, value: object) -> np.void: + return self.to_dtype().type(value) # type: ignore[call-overload, no-any-return] + + +@dataclass(frozen=True, kw_only=True) +class FixedLengthUnicode(ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength): + dtype_cls = np.dtypes.StrDType + _zarr_v3_name = "numpy.fixed_length_ucs4" + item_size_bits: ClassVar[int] = 32 # UCS4 is 32 bits per code point + + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls( + length=dtype.itemsize // (cls.item_size_bits // 8), + endianness=endianness_from_numpy_str(byte_order), + ) + + def to_dtype(self) -> np.dtypes.StrDType[int]: + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls(self.length).newbyteorder(byte_order) + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of a numpy S dtype. 
+ """ + if zarr_format == 2: + # match >U1, <]U\d+$", data) is not None + elif zarr_format == 3: + return ( + isinstance(data, dict) + and "name" in data + and data["name"] == cls._zarr_v3_name + and "configuration" in data + and isinstance(data["configuration"], dict) + and "length_bits" in data["configuration"] + and isinstance(data["configuration"]["length_bits"], int) + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return { + "name": self._zarr_v3_name, + "configuration": {"length_bits": self.length * self.item_size_bits}, + } + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(length=data["configuration"]["length_bits"] // cls.item_size_bits) # type: ignore[arg-type, index, call-overload, operator] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def default_value(self) -> np.str_: + return np.str_("") + + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return str(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: + if not check_json_str(data): + raise TypeError(f"Invalid type: {data}. 
Expected a string.") + return self.to_dtype().type(data) + + def check_value(self, data: object) -> bool: + return isinstance(data, str | np.str_ | bytes) + + def _cast_value_unsafe(self, value: object) -> np.str_: + return self.to_dtype().type(value) + + +@dataclass(frozen=True, kw_only=True) +class Structured(ZDType[np.dtypes.VoidDType[int], np.void]): + dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] + _zarr_v3_name = "structured" + fields: tuple[tuple[str, ZDType[_BaseDType, _BaseScalar]], ...] + + def default_value(self) -> np.void: + return self._cast_value_unsafe(0) + + def _cast_value_unsafe(self, value: object) -> np.void: + return cast("np.void", np.array([value], dtype=self.to_dtype())[0]) + + @classmethod + def check_dtype(cls, dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: + """ + Check that this dtype is a numpy structured dtype + + Parameters + ---------- + dtype : np.dtypes.DTypeLike + The dtype to check. + + Returns + ------- + TypeGuard[np.dtypes.VoidDType] + True if the dtype matches, False otherwise. + """ + return super().check_dtype(dtype) and dtype.fields is not None + + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + from zarr.core.dtype import get_data_type_from_native_dtype + + fields: list[tuple[str, ZDType[_BaseDType, _BaseScalar]]] = [] + + if dtype.fields is None: + raise ValueError("numpy dtype has no fields") + + # fields of a structured numpy dtype are either 2-tuples or 3-tuples. we only + # care about the first element in either case. 
+ for key, (dtype_instance, *_) in dtype.fields.items(): + dtype_wrapped = get_data_type_from_native_dtype(dtype_instance) + fields.append((key, dtype_wrapped)) + + return cls(fields=tuple(fields)) + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + fields = [ + (f_name, f_dtype.to_json(zarr_format=zarr_format)) for f_name, f_dtype in self.fields + ] + if zarr_format == 2: + return fields + elif zarr_format == 3: + base_dict = {"name": self._zarr_v3_name} + base_dict["configuration"] = {"fields": fields} # type: ignore[assignment] + return cast("JSON", base_dict) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def check_json( + cls, data: JSON, zarr_format: ZarrFormat + ) -> TypeGuard[dict[str, JSON] | list[Any]]: + # the actual JSON form is recursive and hard to annotate, so we give up and do + # list[Any] for now + if zarr_format == 2: + return ( + not isinstance(data, str) + and isinstance(data, Sequence) + and all( + not isinstance(field, str) and isinstance(field, Sequence) and len(field) == 2 + for field in data + ) + ) + elif zarr_format == 3: + return ( + isinstance(data, dict) + and "name" in data + and "configuration" in data + and isinstance(data["configuration"], dict) + and "fields" in data["configuration"] + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + from zarr.core.dtype import get_data_type_from_json + + if cls.check_json(data, zarr_format=zarr_format): + if zarr_format == 2: + # structured dtypes are constructed directly from a list of lists + return cls( + fields=tuple( # type: ignore[misc] + (f_name, get_data_type_from_json(f_dtype, zarr_format=zarr_format)) + for f_name, f_dtype in data + ) + ) + elif zarr_format == 3: # noqa: SIM102 + if isinstance(data, dict) and "configuration" in data: + config = data["configuration"] + if 
isinstance(config, dict) and "fields" in config: + meta_fields = config["fields"] + fields = tuple( + (f_name, get_data_type_from_json(f_dtype, zarr_format=zarr_format)) + for f_name, f_dtype in meta_fields + ) + return cls(fields=fields) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") + + def to_dtype(self) -> np.dtypes.VoidDType[int]: + return cast( + "np.dtypes.VoidDType[int]", + np.dtype([(key, dtype.to_dtype()) for (key, dtype) in self.fields]), + ) + + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return bytes_to_json(self.cast_value(data).tobytes(), zarr_format) + + def check_value(self, data: object) -> bool: + # not sure which values we should accept for structured dtypes. + try: + np.array([data], dtype=self.to_dtype()) + return True # noqa: TRY300 + except ValueError: + return False + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: + if not check_json_str(data): + raise TypeError(f"Invalid type: {data}. 
Expected a string.") + as_bytes = bytes_from_json(data, zarr_format=zarr_format) + dtype = self.to_dtype() + return cast("np.void", np.array([as_bytes], dtype=dtype.str).view(dtype)[0]) diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index 32375a1c71..15ccfb30f1 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -1,787 +1,134 @@ from __future__ import annotations -import re from dataclasses import dataclass -from typing import ( - TYPE_CHECKING, - ClassVar, - Literal, - Protocol, - Self, - TypedDict, - TypeGuard, - overload, - runtime_checkable, -) +from typing import TYPE_CHECKING, Self, TypeGuard import numpy as np -from zarr.core.common import NamedConfig -from zarr.core.dtype.common import ( - DataTypeValidationError, - DTypeConfig_V2, - DTypeJSON, - HasEndianness, - HasItemSize, - HasLength, - HasObjectCodec, - check_dtype_spec_v2, - v3_unstable_dtype_warning, -) -from zarr.core.dtype.npy.common import ( - check_json_str, - endianness_to_numpy_str, - get_endianness_from_numpy_dtype, -) -from zarr.core.dtype.wrapper import TDType_co, ZDType +from zarr.core.dtype.npy.common import check_json_str +from zarr.core.dtype.wrapper import ZDType if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat - from zarr.core.dtype.wrapper import TBaseDType + from zarr.core.dtype.wrapper import _BaseDType _NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") -@runtime_checkable -class SupportsStr(Protocol): - def __str__(self) -> str: ... - - -class LengthBytesConfig(TypedDict): - """ - Configuration for a fixed-length string data type in Zarr V3. - - Attributes - ---------- - length_bytes : int - The length in bytes of the data associated with this configuration. - """ - - length_bytes: int - - -class FixedLengthUTF32JSON_V2(DTypeConfig_V2[str, None]): - """ - A wrapper around the JSON representation of the ``FixedLengthUTF32`` data type in Zarr V2. 
- - The ``name`` field of this class contains the value that would appear under the - ``dtype`` field in Zarr V2 array metadata. - - References - ---------- - The structure of the ``name`` field is defined in the Zarr V2 - `specification document `__. - - Examples - -------- - - .. code-block:: python - - { - "name": " None: - """ - We don't allow instances of this class with length less than 1 because there is no way such - a data type can contain actual data. - """ - if self.length < 1: - raise ValueError(f"length must be >= 1, got {self.length}.") - - @classmethod - def from_native_dtype(cls, dtype: TBaseDType) -> Self: - """ - Create a FixedLengthUTF32 from a NumPy data type. - - Parameters - ---------- - dtype : TBaseDType - The NumPy data type. - - Returns - ------- - Self - An instance of this data type. - """ - if cls._check_native_dtype(dtype): - endianness = get_endianness_from_numpy_dtype(dtype) - return cls( - length=dtype.itemsize // (cls.code_point_bytes), - endianness=endianness, - ) - raise DataTypeValidationError( - f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" - ) - - def to_native_dtype(self) -> np.dtypes.StrDType[int]: - """ - Convert the FixedLengthUTF32 instance to a NumPy data type. - - Returns - ------- - np.dtypes.StrDType[int] - The NumPy data type. - """ - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls(self.length).newbyteorder(byte_order) - - @classmethod - def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSON_V2]: - """ - Check that the input is a valid JSON representation of a NumPy U dtype. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - TypeGuard[FixedLengthUTF32JSON_V2] - Whether the input is a valid JSON representation of a NumPy U dtype. 
- """ - return ( - check_dtype_spec_v2(data) - and isinstance(data["name"], str) - and re.match(r"^[><]U\d+$", data["name"]) is not None - and data["object_codec_id"] is None - ) - - @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSON_V3]: - """ - Check that the input is a valid JSON representation of this class in Zarr V3. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - TypeGuard[FixedLengthUTF32JSONV3] - Whether the input is a valid JSON representation of a NumPy U dtype. - """ - return ( - isinstance(data, dict) - and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name - and "configuration" in data - and isinstance(data["configuration"], dict) - and set(data["configuration"].keys()) == {"length_bytes"} - and isinstance(data["configuration"]["length_bytes"], int) - ) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> FixedLengthUTF32JSON_V3: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[str, None] | FixedLengthUTF32JSON_V3: - """ - Convert the FixedLengthUTF32 instance to a JSON representation. - - Parameters - ---------- - zarr_format : ZarrFormat - The Zarr format to use. - - Returns - ------- - DTypeConfig_V2[str, None] | FixedLengthUTF32JSON_V3 - The JSON representation of the data type. - """ - if zarr_format == 2: - return {"name": self.to_native_dtype().str, "object_codec_id": None} - elif zarr_format == 3: - v3_unstable_dtype_warning(self) - return { - "name": self._zarr_v3_name, - "configuration": {"length_bytes": self.length * self.code_point_bytes}, - } - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create a FixedLengthUTF32 from a JSON representation of a NumPy U dtype. 
- - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - """ - if cls._check_json_v2(data): - # Construct the NumPy dtype instead of string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - raise DataTypeValidationError( - f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string representation of a NumPy U dtype." - ) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create a FixedLengthUTF32 from a JSON representation of a NumPy U dtype. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - """ - if cls._check_json_v3(data): - return cls(length=data["configuration"]["length_bytes"] // cls.code_point_bytes) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." - raise DataTypeValidationError(msg) - - def default_scalar(self) -> np.str_: - """ - Return the default scalar value for this data type. - - Returns - ------- - ``np.str_`` - The default scalar value. - """ - return np.str_("") - - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: - """ - Convert the scalar value to a JSON representation. - - Parameters - ---------- - data : object - The scalar value. - zarr_format : ZarrFormat - The Zarr format to use. - - Returns - ------- - str - The JSON representation of the scalar value. - """ - return str(data) - - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: - """ - Convert the JSON representation of a scalar value to the native scalar value. - - Parameters - ---------- - data : JSON - The JSON data. - zarr_format : ZarrFormat - The Zarr format to use. - - Returns - ------- - ``np.str_`` - The native scalar value. - """ - if check_json_str(data): - return self.to_native_dtype().type(data) - raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover - - def _check_scalar(self, data: object) -> TypeGuard[SupportsStr]: - """ - Check that the input is a valid scalar value for this data type. - - Parameters - ---------- - data : object - The scalar value. - - Returns - ------- - TypeGuard[SupportsStr] - Whether the input is a valid scalar value for this data type. - """ - # this is generous for backwards compatibility - return isinstance(data, SupportsStr) - - def cast_scalar(self, data: object) -> np.str_: - """ - Cast the scalar value to the native scalar value. - - Parameters - ---------- - data : object - The scalar value. - - Returns - ------- - ``np.str_`` - The native scalar value. - """ - if self._check_scalar(data): - # We explicitly truncate before casting because of the following NumPy behavior: - # >>> x = np.dtype('U3').type('hello world') - # >>> x - # np.str_('hello world') - # >>> x.dtype - # dtype('U11') - - return self.to_native_dtype().type(str(data)[: self.length]) - - msg = ( # pragma: no cover - f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " - f"data type {self}." - ) - raise TypeError(msg) # pragma: no-cover - - @property - def item_size(self) -> int: - """ - The size of a single scalar in bytes. - - Returns - ------- - int - The size of a single scalar in bytes. - """ - return self.length * self.code_point_bytes - - -def check_vlen_string_json_scalar(data: object) -> TypeGuard[int | str | float]: - """ - Check if the input is a valid JSON scalar for a variable-length string. - - This function is generous for backwards compatibility, as Zarr Python v2 would use ints for - variable-length string fill values. - - Parameters - ---------- - data : object - The JSON value to check. - - Returns - ------- - TypeGuard[int | str | float] - True if the input is a valid scalar for a variable-length string. 
- """ - return isinstance(data, int | str | float) - - -class VariableLengthUTF8JSON_V2(DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]]): - """ - A wrapper around the JSON representation of the ``VariableLengthUTF8`` data type in Zarr V2. - - The ``name`` field of this class contains the value that would appear under the - ``dtype`` field in Zarr V2 array metadata. The ``object_codec_id`` field is always ``"vlen-utf8"``. - - References - ---------- - The structure of the ``name`` field is defined in the Zarr V2 - `specification document `__. - - - Examples - -------- - .. code-block:: python - - { - "name": "|O", - "object_codec_id": "vlen-utf8" - } - """ - - -# VariableLengthUTF8 is defined in two places, conditioned on the version of NumPy. -# If NumPy 2 is installed, then VariableLengthUTF8 is defined with the NumPy variable length -# string dtype as the native dtype. Otherwise, VariableLengthUTF8 is defined with the NumPy object -# dtype as the native dtype. -class UTF8Base(ZDType[TDType_co, str], HasObjectCodec): - """ - A base class for variable-length UTF-8 string data types. - - Not intended for direct use, but as a base for concrete implementations. - - Attributes - ---------- - object_codec_id : ClassVar[Literal["vlen-utf8"]] - The object codec ID for this data type. - - References - ---------- - This data type does not have a Zarr V3 specification. - - The Zarr V2 data type specification can be found `here `__. - """ - - _zarr_v3_name: ClassVar[Literal["string"]] = "string" - object_codec_id: ClassVar[Literal["vlen-utf8"]] = "vlen-utf8" - - @classmethod - def from_native_dtype(cls, dtype: TBaseDType) -> Self: - """ - Create an instance of this data type from a compatible NumPy data type. - - - Parameters - ---------- - dtype : TBaseDType - The native data type. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input is not compatible with this data type. 
- """ - if cls._check_native_dtype(dtype): - return cls() - raise DataTypeValidationError( - f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" - ) - - @classmethod - def _check_json_v2( - cls, - data: DTypeJSON, - ) -> TypeGuard[VariableLengthUTF8JSON_V2]: - """ - "Check if the input is a valid JSON representation of a variable-length UTF-8 string dtype - for Zarr v2." - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - ``TypeGuard[VariableLengthUTF8JSON_V2]`` - Whether the input is a valid JSON representation of a NumPy "object" data type, and that the - object codec id is appropriate for variable-length UTF-8 strings. - """ - return ( - check_dtype_spec_v2(data) - and data["name"] == "|O" - and data["object_codec_id"] == cls.object_codec_id - ) - - @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["variable_length_utf8"]]: - """ - Check that the input is a valid JSON representation of this class in Zarr V3. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - TypeGuard[Literal["variable_length_utf8"]] - Whether the input is a valid JSON representation of a variable length UTF-8 string - data type. - """ - return data == cls._zarr_v3_name - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this class from a JSON representation of a NumPy "object" dtype. +if _NUMPY_SUPPORTS_VLEN_STRING: - Parameters - ---------- - data : DTypeJSON - The JSON data to create an instance from. + @dataclass(frozen=True, kw_only=True) + class VariableLengthString(ZDType[np.dtypes.StringDType, str]): # type: ignore[type-var] + dtype_cls = np.dtypes.StringDType + _zarr_v3_name = "numpy.variable_length_utf8" - Returns - ------- - Self - An instance of this data type. 
- """ - if cls._check_json_v2(data): + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: return cls() - msg = ( - f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string '|O'" - ) - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this class from a JSON representation of a variable length UTF-8 - string data type. - Parameters - ---------- - data : DTypeJSON - The JSON data to create an instance from. + def to_dtype(self) -> np.dtypes.StringDType: + return self.dtype_cls() - Returns - ------- - Self - An instance of this data type. - """ - if cls._check_json_v3(data): + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of a numpy string dtype. + """ + if zarr_format == 2: + # TODO: take the entire metadata document in here, and + # check the compressors / filters for vlen-utf8 + # Note that we are checking for the object dtype name. + return data == "|O" + elif zarr_format == 3: + return data == cls._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + # Note: unlike many other numpy data types, we don't serialize the .str attribute + # of the data type to JSON. This is because Zarr was using `|O` for strings before the + # numpy variable length string data type existed, and we want to be consistent with + # that practice + return "|O" + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." 
- raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> VariableLengthUTF8JSON_V2: ... - @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["string"]: ... - - def to_json(self, zarr_format: ZarrFormat) -> VariableLengthUTF8JSON_V2 | Literal["string"]: - """ - Convert this data type to a JSON representation. - - Parameters - ---------- - zarr_format : int - The zarr format to use for the JSON representation. - - Returns - ------- - ``VariableLengthUTF8JSON_V2 | Literal["string"]`` - The JSON representation of this data type. - """ - if zarr_format == 2: - return {"name": "|O", "object_codec_id": self.object_codec_id} - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def default_scalar(self) -> str: - """ - Return the default scalar value for this data type. - - Returns - ------- - str - The default scalar value. - """ - return "" - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: - """ - Convert a scalar value to a JSON representation. + def default_value(self) -> str: + return "" - Parameters - ---------- - data : object - The scalar value to convert. - zarr_format : int - The zarr format to use for the JSON representation. + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return str(data) - Returns - ------- - str - The JSON representation of the scalar value. - """ - if self._check_scalar(data): - return self._cast_scalar_unchecked(data) - raise TypeError(f"Invalid type: {data}. Expected a string.") + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + if not check_json_str(data): + raise TypeError(f"Invalid type: {data}. Expected a string.") + return data - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> str: - """ - Convert a JSON representation of a scalar value to the native scalar type. 
+ def check_value(self, data: object) -> bool: + return isinstance(data, str) - Parameters - ---------- - data : JSON - The JSON representation of the scalar value. - zarr_format : int - The zarr format to use for the JSON representation. - - Returns - ------- - str - The native scalar type of the scalar value. - """ - if not check_vlen_string_json_scalar(data): - raise TypeError(f"Invalid type: {data}. Expected a string or number.") - return str(data) - - def _check_scalar(self, data: object) -> TypeGuard[SupportsStr]: - """ - Check that the input is a valid scalar value for this data type. - - Parameters - ---------- - data : object - The scalar value to check. - - Returns - ------- - TypeGuard[SupportsStr] - Whether the input is a valid scalar value for this data type. - """ - return isinstance(data, SupportsStr) - - def _cast_scalar_unchecked(self, data: SupportsStr) -> str: - """ - Cast a scalar value to a string. - - Parameters - ---------- - data : object - The scalar value to cast. - - Returns - ------- - str - The string representation of the scalar value. - """ - return str(data) - - def cast_scalar(self, data: object) -> str: - """ - Cast an object to a string. - - Parameters - ---------- - data : object - The value to cast. - - Returns - ------- - str - The input cast to str. - """ - if self._check_scalar(data): - return self._cast_scalar_unchecked(data) - msg = ( # pragma: no cover - f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " - f"data type {self}." - ) - raise TypeError(msg) # pragma: no cover - - -if _NUMPY_SUPPORTS_VLEN_STRING: + def _cast_value_unsafe(self, value: object) -> str: + return str(value) +else: + # Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead. @dataclass(frozen=True, kw_only=True) - class VariableLengthUTF8(UTF8Base[np.dtypes.StringDType]): # type: ignore[type-var] - """ - A Zarr data type for arrays containing variable-length UTF-8 strings. 
- - Wraps the ``np.dtypes.StringDType`` data type. Scalars for this data type are instances - of ``str``. - + class VariableLengthString(ZDType[np.dtypes.ObjectDType, str]): # type: ignore[no-redef] + dtype_cls = np.dtypes.ObjectDType + _zarr_v3_name = "numpy.variable_length_utf8" - Attributes - ---------- - dtype_cls : Type[np.dtypes.StringDType] - The NumPy dtype class for this data type. - _zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" - The name of this data type in Zarr V3. - object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8" - The object codec ID for this data type. - """ + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + return cls() - dtype_cls = np.dtypes.StringDType + def to_dtype(self) -> np.dtypes.ObjectDType: + return self.dtype_cls() - def to_native_dtype(self) -> np.dtypes.StringDType: + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: """ - Create a NumPy string dtype from this VariableLengthUTF8 ZDType. - - Returns - ------- - np.dtypes.StringDType - The NumPy string dtype. + Check that the input is a valid JSON representation of a numpy O dtype. """ - return self.dtype_cls() - -else: - # Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead. - @dataclass(frozen=True, kw_only=True) - class VariableLengthUTF8(UTF8Base[np.dtypes.ObjectDType]): # type: ignore[no-redef] - """ - A Zarr data type for arrays containing variable-length UTF-8 strings. - - Wraps the ``np.dtypes.ObjectDType`` data type. Scalars for this data type are instances - of ``str``. 
- + if zarr_format == 2: + # TODO: take the entire metadata document in here, and + # check the compressors / filters for vlen-utf8 + return data == "|O" + elif zarr_format == 3: + return data == cls._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + return cls() - Attributes - ---------- - dtype_cls : Type[np.dtypes.ObjectDType] - The NumPy dtype class for this data type. - _zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" - The name of this data type in Zarr V3. - object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8" - The object codec ID for this data type. - """ + def default_value(self) -> str: + return "" - dtype_cls = np.dtypes.ObjectDType + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return data # type: ignore[return-value] - def to_native_dtype(self) -> np.dtypes.ObjectDType: + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: """ - Create a NumPy object dtype from this VariableLengthUTF8 ZDType. - - Returns - ------- - np.dtypes.ObjectDType - The NumPy object dtype. + Strings pass through """ - return self.dtype_cls() + if not check_json_str(data): + raise TypeError(f"Invalid type: {data}. 
Expected a string.") + return data + + def check_value(self, data: object) -> bool: + return isinstance(data, str) + + def _cast_value_unsafe(self, value: object) -> str: + return str(value) diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index d523e16940..a10b9ae8a3 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -1,885 +1,142 @@ from __future__ import annotations from dataclasses import dataclass -from datetime import datetime, timedelta -from typing import ( - TYPE_CHECKING, - ClassVar, - Literal, - Self, - TypedDict, - TypeGuard, - TypeVar, - cast, - get_args, - overload, -) +from typing import TYPE_CHECKING, Self, TypeGuard, cast, get_args import numpy as np -from typing_extensions import ReadOnly -from zarr.core.common import NamedConfig -from zarr.core.dtype.common import ( - DataTypeValidationError, - DTypeConfig_V2, - DTypeJSON, - HasEndianness, - HasItemSize, - check_dtype_spec_v2, -) +from zarr.core.dtype.common import DataTypeValidationError, HasEndianness from zarr.core.dtype.npy.common import ( - DATETIME_UNIT, - DateTimeUnit, + DateUnit, + EndiannessNumpy, + TimeUnit, check_json_int, + endianness_from_numpy_str, endianness_to_numpy_str, - get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import TBaseDType, ZDType +from zarr.core.dtype.wrapper import ZDType, _BaseDType if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat -TimeDeltaLike = str | int | bytes | np.timedelta64 | timedelta | None -DateTimeLike = str | int | bytes | np.datetime64 | datetime | None - -def datetime_from_int(data: int, *, unit: DateTimeUnit, scale_factor: int) -> np.datetime64: +def datetime_from_json(data: int, unit: DateUnit | TimeUnit) -> np.datetime64: """ - Convert an integer to a datetime64. + Convert a JSON integer to a datetime64. Parameters ---------- data : int - The integer to convert. - unit : DateTimeUnit + The JSON integer to convert. 
+ unit : DateUnit or TimeUnit The unit of the datetime64. - scale_factor : int - The scale factor of the datetime64. Returns ------- - numpy.datetime64 + np.datetime64 The datetime64 value. """ - dtype_name = f"datetime64[{scale_factor}{unit}]" - return cast("np.datetime64", np.int64(data).view(dtype_name)) + return cast("np.datetime64", np.int64(data).view(f"datetime64[{unit}]")) -def datetimelike_to_int(data: np.datetime64 | np.timedelta64) -> int: +def datetime_to_json(data: np.datetime64) -> int: """ - Convert a datetime64 or a timedelta64 to an integer. + Convert a datetime64 to a JSON integer. Parameters ---------- - data : np.datetime64 | numpy.timedelta64 - The value to convert. + data : np.datetime64 + The datetime64 value to convert. Returns ------- int - An integer representation of the scalar. + The JSON representation of the datetime64. """ return data.view(np.int64).item() -def check_json_time(data: JSON) -> TypeGuard[Literal["NaT"] | int]: - """ - Type guard to check if the input JSON data is the literal string "NaT" - or an integer. - """ - return check_json_int(data) or data == "NaT" - - -BaseTimeDType_co = TypeVar( - "BaseTimeDType_co", - bound=np.dtypes.TimeDelta64DType | np.dtypes.DateTime64DType, - covariant=True, -) -BaseTimeScalar_co = TypeVar( - "BaseTimeScalar_co", bound=np.timedelta64 | np.datetime64, covariant=True -) - - -class TimeConfig(TypedDict): - """ - The configuration for the numpy.timedelta64 or numpy.datetime64 data type in Zarr V3. - - Attributes - ---------- - unit : ReadOnly[DateTimeUnit] - A string encoding a unit of time. - scale_factor : ReadOnly[int] - A scale factor. - - Examples - -------- - .. code-block:: python - - {"unit": "ms", "scale_factor": 1} - """ - - unit: ReadOnly[DateTimeUnit] - scale_factor: ReadOnly[int] - - -class DateTime64JSON_V3(NamedConfig[Literal["numpy.datetime64"], TimeConfig]): - """ - The JSON representation of the ``numpy.datetime64`` data type in Zarr V3. 
- - References - ---------- - This representation is defined in the ``numpy.datetime64`` - `specification document `__. - - Examples - -------- - .. code-block:: python - - { - "name": "numpy.datetime64", - "configuration": { - "unit": "ms", - "scale_factor": 1 - } - } - """ - - -class TimeDelta64JSON_V3(NamedConfig[Literal["numpy.timedelta64"], TimeConfig]): - """ - The JSON representation of the ``TimeDelta64`` data type in Zarr V3. - - References - ---------- - This representation is defined in the numpy.timedelta64 - `specification document `__. - - Examples - -------- - .. code-block:: python - - { - "name": "numpy.timedelta64", - "configuration": { - "unit": "ms", - "scale_factor": 1 - } - } - """ - - -class TimeDelta64JSON_V2(DTypeConfig_V2[str, None]): - """ - A wrapper around the JSON representation of the ``TimeDelta64`` data type in Zarr V2. - - The ``name`` field of this class contains the value that would appear under the - ``dtype`` field in Zarr V2 array metadata. - - References - ---------- - The structure of the ``name`` field is defined in the Zarr V2 - `specification document `__. - - - Examples - -------- - .. code-block:: python - - { - "name": "`__. - - - Examples - -------- - .. code-block:: python - - { - "name": " None: - if self.scale_factor < 1: - raise ValueError(f"scale_factor must be > 0, got {self.scale_factor}.") - if self.scale_factor >= 2**31: - raise ValueError(f"scale_factor must be < 2147483648, got {self.scale_factor}.") - if self.unit not in get_args(DateTimeUnit): - raise ValueError(f"unit must be one of {get_args(DateTimeUnit)}, got {self.unit!r}.") - - @classmethod - def from_native_dtype(cls, dtype: TBaseDType) -> Self: - """ - Create an instance of this class from a native NumPy data type. - - Parameters - ---------- - dtype : TBaseDType - The native NumPy dtype to convert. - - Returns - ------- - Self - An instance of this data type. 
- - Raises - ------ - DataTypeValidationError - If the dtype is not a valid representation of this class. - """ - - if cls._check_native_dtype(dtype): - unit, scale_factor = np.datetime_data(dtype.name) - unit = cast("DateTimeUnit", unit) - return cls( - unit=unit, - scale_factor=scale_factor, - endianness=get_endianness_from_numpy_dtype(dtype), - ) - raise DataTypeValidationError( - f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" - ) - - def to_native_dtype(self) -> BaseTimeDType_co: - # Numpy does not allow creating datetime64 or timedelta64 via - # np.dtypes.{dtype_name}() - # so we use np.dtype with a formatted string. - """ - Convert this data type to a NumPy temporal data type with the appropriate - unit and scale factor. - - Returns - ------- - BaseTimeDType_co - A NumPy data type object representing the time data type with - the specified unit, scale factor, and byte order. - """ - - dtype_string = f"{self._numpy_name}[{self.scale_factor}{self.unit}]" - return np.dtype(dtype_string).newbyteorder(endianness_to_numpy_str(self.endianness)) # type: ignore[return-value] - - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: - """ - Convert a python object to a JSON representation of a datetime64 or timedelta64 scalar. - - Parameters - ---------- - data : object - The python object to convert. - zarr_format : ZarrFormat - The Zarr format version (2 or 3). - - Returns - ------- - int - The JSON representation of the scalar. - """ - return datetimelike_to_int(data) # type: ignore[arg-type] - - @property - def item_size(self) -> int: - """ - The size of a single scalar in bytes. - - Returns - ------- - int - The size of a single scalar in bytes. - """ - return 8 - - -@dataclass(frozen=True, kw_only=True, slots=True) -class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], HasEndianness): - """ - A Zarr data type for arrays containing NumPy TimeDelta64 data. 
- - Wraps the ``np.dtypesTimeDelta64DType`` data type. Scalars for this data type - are instances of `np.timedelta64`. - - Attributes - ---------- - dtype_cls : Type[np.dtypesTimeDelta64DType] - The NumPy dtype class for this data type. - scale_factor : int - The scale factor for this data type. - unit : DateTimeUnit - The unit for this data type. - - References - ---------- - The Zarr V2 representation of this data type is defined in the Zarr V2 - `specification document `__. - - The Zarr V3 representation of this data type is defined in the ``numpy.timedelta64`` - `specification document `__ - """ - - # mypy infers the type of np.dtypes.TimeDelta64DType to be - # "Callable[[Literal['Y', 'M', 'W', 'D'] | Literal['h', 'm', 's', 'ms', 'us', 'ns', 'ps', 'fs', 'as']], Never]" - dtype_cls = np.dtypes.TimeDelta64DType # type: ignore[assignment] - unit: DateTimeUnit = "generic" - scale_factor: int = 1 - _zarr_v3_name: ClassVar[Literal["numpy.timedelta64"]] = "numpy.timedelta64" - _zarr_v2_names: ClassVar[tuple[Literal[">m8"], Literal["m8", " TypeGuard[TimeDelta64JSON_V2]: - """ - Validate that the provided JSON input accurately represents a NumPy timedelta64 data type, - which could be in the form of strings like "m8[10s]". This method serves as a type - guard, helping to refine the type of unknown JSON input by confirming its adherence to the - expected format for NumPy timedelta64 data types. - - The JSON input should contain a "name" key with a value that matches the expected string - pattern for NumPy timedelta64 data types. The pattern includes an optional unit enclosed - within square brackets, following the base type identifier. - - Returns - ------- - bool - True if the JSON input is a valid representation of this class, - otherwise False. 
- """ - if not check_dtype_spec_v2(data): - return False - name = data["name"] - # match m[M], etc - # consider making this a standalone function - if not isinstance(name, str): - return False - if not name.startswith(cls._zarr_v2_names): - return False - if len(name) == 3: - # no unit, and - # we already checked that this string is either m8 - return True - else: - return name[4:-1].endswith(DATETIME_UNIT) and name[-1] == "]" - - @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSON_V3]: - """ - Check that the input is a valid JSON representation of this class in Zarr V3. - - Returns - ------- - TypeGuard[DateTime64JSON_V3] - True if the JSON input is a valid representation of this class, - otherwise False. - """ - return ( - isinstance(data, dict) - and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name - and isinstance(data["configuration"], dict) - and set(data["configuration"].keys()) == {"unit", "scale_factor"} - ) +class DateTime64(ZDType[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): + dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] + _zarr_v3_name = "numpy.datetime64" + unit: DateUnit | TimeUnit @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create a TimeDelta64 from a Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - TimeDelta64 - An instance of TimeDelta64. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v2(data): - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = ( - f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected a string " - f"representation of an instance of {cls.dtype_cls}" + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + unit: DateUnit | TimeUnit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] # type: ignore[assignment] + if unit not in get_args(DateUnit) and unit not in get_args(TimeUnit): + raise DataTypeValidationError('Invalid unit for "numpy.datetime64"') + byteorder = cast("EndiannessNumpy", dtype.byteorder) + return cls(unit=unit, endianness=endianness_from_numpy_str(byteorder)) + + def to_dtype(self) -> np.dtypes.DateTime64DType: + # Numpy does not allow creating datetime64 via + # np.dtypes.DateTime64Dtype() + return cast( + "np.dtypes.DateTime64DType", + np.dtype(f"datetime64[{self.unit}]").newbyteorder( + endianness_to_numpy_str(self.endianness) + ), ) - raise DataTypeValidationError(msg) @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create a TimeDelta64 from a Zarr V3-flavored JSON. - - The JSON representation of a TimeDelta64 in Zarr V3 is a dict with a 'name' key - with the value 'numpy.timedelta64', and a 'configuration' key with a value of a dict - with a 'unit' key and a 'scale_factor' key. - - For example: - - .. code-block:: json - - { - "name": "numpy.timedelta64", - "configuration": { - "unit": "generic", - "scale_factor": 1 - } - } - - """ - if cls._check_json_v3(data): - unit = data["configuration"]["unit"] - scale_factor = data["configuration"]["scale_factor"] - return cls(unit=unit, scale_factor=scale_factor) - msg = ( - f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a dict " - f"with a 'name' key with the value 'numpy.timedelta64', " - "and a 'configuration' key with a value of a dict with a 'unit' key and a " - "'scale_factor' key" - ) - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> TimeDelta64JSON_V2: ... - @overload - def to_json(self, zarr_format: Literal[3]) -> TimeDelta64JSON_V3: ... 
- - def to_json(self, zarr_format: ZarrFormat) -> TimeDelta64JSON_V2 | TimeDelta64JSON_V3: - """ - Serialize this data type to JSON. - - Parameters - ---------- - zarr_format : ZarrFormat - The Zarr format version (2 or 3). - - Returns - ------- - TimeDelta64JSON_V2 | TimeDelta64JSON_V3 - The JSON representation of the data type. - - Raises - ------ - ValueError - If the zarr_format is not 2 or 3. - """ + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: if zarr_format == 2: - name = self.to_native_dtype().str - return {"name": name, "object_codec_id": None} + # match M[M], etc + # consider making this a standalone function + return ( + isinstance(data, str) + and len(data) in (6, 7) + and data[0] in (">", "<") + and data[1:4] == "M8[" + and data[4:-1] in get_args(TimeUnit) + get_args(DateUnit) + and data[-1] == "]" + ) elif zarr_format == 3: - return { - "name": self._zarr_v3_name, - "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, - } + return ( + isinstance(data, dict) + and "name" in data + and data["name"] == cls._zarr_v3_name + and "configuration" in data + and "unit" in data["configuration"] + and data["configuration"]["unit"] in get_args(DateUnit) + get_args(TimeUnit) + ) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def _check_scalar(self, data: object) -> TypeGuard[TimeDeltaLike]: - """ - Check if the input is a scalar of this data type. - - Parameters - ---------- - data : object - The object to check. - - Returns - ------- - TypeGuard[TimeDeltaLike] - True if the input is a scalar of this data type, False otherwise. - """ - if data is None: - return True - return isinstance(data, str | int | bytes | np.timedelta64 | timedelta) - - def _cast_scalar_unchecked(self, data: TimeDeltaLike) -> np.timedelta64: - """ - Cast the provided scalar input to a numpy timedelta64 without any type checking. 
- - This method assumes that the input data is already a valid scalar of this data type, - and does not perform any validation or type checks. It directly casts the input - to a numpy timedelta64 scalar using the unit and scale factor defined in the class. - - Parameters - ---------- - data : TimeDeltaLike - The scalar input data to cast. - - Returns - ------- - numpy.timedelta64 - The input data cast as a numpy timedelta64 scalar. - """ - return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") - - def cast_scalar(self, data: object) -> np.timedelta64: - """ - Cast the input to a numpy timedelta64 scalar. If the input is not a scalar of this data type, - raise a TypeError. - """ - if self._check_scalar(data): - return self._cast_scalar_unchecked(data) - msg = ( - f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " - f"data type {self}." - ) - raise TypeError(msg) - - def default_scalar(self) -> np.timedelta64: - """ - Return a default scalar of this data type. - - This method provides a default value for the timedelta64 scalar, which is - a 'Not-a-Time' (NaT) value. - """ - return np.timedelta64("NaT") - - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: - """ - Create a scalar of this data type from JSON input. - - Parameters - ---------- - data : JSON - The JSON representation of the scalar value. - zarr_format : int - The zarr format to use for the JSON representation. - - Returns - ------- - numpy.timedelta64 - The scalar value of this data type. - - Raises - ------ - TypeError - If the input JSON is not a valid representation of a scalar for this data type. - """ - if check_json_time(data): - return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") # pragma: no cover - - -@dataclass(frozen=True, kw_only=True, slots=True) -class DateTime64(TimeDTypeBase[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): - """ - A Zarr data type for arrays containing NumPy Datetime64 data. - - Wraps the ``np.dtypes.TimeDelta64DType`` data type. Scalars for this data type - are instances of ``np.datetime64``. - - Attributes - ---------- - dtype_cls : Type[np.dtypesTimeDelta64DType] - The numpy dtype class for this data type. - unit : DateTimeUnit - The unit of time for this data type. - scale_factor : int - The scale factor for the time unit. - - References - ---------- - The Zarr V2 representation of this data type is defined in the Zarr V2 - `specification document `__. - - The Zarr V3 representation of this data type is defined in the ``numpy.datetime64`` - `specification document `__ - """ - - dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] - _zarr_v3_name: ClassVar[Literal["numpy.datetime64"]] = "numpy.datetime64" - _zarr_v2_names: ClassVar[tuple[Literal[">M8"], Literal["M8", " TypeGuard[DateTime64JSON_V2]: - """ - Check that the input is a valid JSON representation of this data type. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - TypeGuard[DateTime64JSON_V2] - True if the input is a valid JSON representation of a NumPy datetime64 data type, - otherwise False. - """ - if not check_dtype_spec_v2(data): - return False - name = data["name"] - if not isinstance(name, str): - return False - if not name.startswith(cls._zarr_v2_names): - return False - if len(name) == 3: - # no unit, and - # we already checked that this string is either M8 - return True - else: - return name[4:-1].endswith(DATETIME_UNIT) and name[-1] == "]" - - @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSON_V3]: - """ - Check that the input is a valid JSON representation of this class in Zarr V3. 
- - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - TypeGuard[DateTime64JSON_V3] - True if the input is a valid JSON representation of a numpy datetime64 data type in Zarr V3, False otherwise. - """ - - return ( - isinstance(data, dict) - and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name - and isinstance(data["configuration"], dict) - and set(data["configuration"].keys()) == {"unit", "scale_factor"} - ) - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from a Zarr V2-flavored JSON representation. - - This method checks if the provided JSON data is a valid representation of this class. - If valid, it creates an instance using the native NumPy dtype. Otherwise, it raises a - DataTypeValidationError. - - Parameters - ---------- - data : DTypeJSON - The JSON data to parse. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ + def default_value(self) -> np.datetime64: + return np.datetime64("NaT") - if cls._check_json_v2(data): - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = ( - f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string " - f"representation of an instance of {cls.dtype_cls}" - ) - raise DataTypeValidationError(msg) + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return {"name": self._zarr_v3_name, "configuration": {"unit": self.unit}} + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from a Zarr V3-flavored JSON representation. 
- - This method checks if the provided JSON data is a valid representation of this class. - If valid, it creates an instance using the native NumPy dtype. Otherwise, it raises a - DataTypeValidationError. - - Parameters - ---------- - data : DTypeJSON - The JSON data to parse. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v3(data): - unit = data["configuration"]["unit"] - scale_factor = data["configuration"]["scale_factor"] - return cls(unit=unit, scale_factor=scale_factor) - msg = ( - f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a dict " - f"with a 'name' key with the value 'numpy.datetime64', " - "and a 'configuration' key with a value of a dict with a 'unit' key and a " - "'scale_factor' key" - ) - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DateTime64JSON_V2: ... - @overload - def to_json(self, zarr_format: Literal[3]) -> DateTime64JSON_V3: ... - - def to_json(self, zarr_format: ZarrFormat) -> DateTime64JSON_V2 | DateTime64JSON_V3: - """ - Serialize this data type to JSON. - - Parameters - ---------- - zarr_format : ZarrFormat - The Zarr format version (2 or 3). - - Returns - ------- - DateTime64JSON_V2 | DateTime64JSON_V3 - The JSON representation of the data type. - - Raises - ------ - ValueError - If the zarr_format is not 2 or 3. 
- """ + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: - name = self.to_native_dtype().str - return {"name": name, "object_codec_id": None} + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: - return { - "name": self._zarr_v3_name, - "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, - } + return cls(unit=data["configuration"]["unit"]) # type: ignore[arg-type, index, call-overload] raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def _check_scalar(self, data: object) -> TypeGuard[DateTimeLike]: - """ - Check if the input is convertible to a scalar of this data type. - - Parameters - ---------- - data : object - The object to check. - - Returns - ------- - TypeGuard[DateTimeLike] - True if the input is a scalar of this data type, False otherwise. - """ - if data is None: - return True - return isinstance(data, str | int | bytes | np.datetime64 | datetime) - - def _cast_scalar_unchecked(self, data: DateTimeLike) -> np.datetime64: - """ - Cast the input to a scalar of this data type without any type checking. - - Parameters - ---------- - data : DateTimeLike - The scalar data to cast. + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: + if check_json_int(data): + return datetime_from_json(data, self.unit) + raise TypeError(f"Invalid type: {data}. Expected an integer.") - Returns - ------- - numpy.datetime64 - The input cast to a NumPy datetime scalar. - """ - return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> int: + return datetime_to_json(data) # type: ignore[arg-type] - def cast_scalar(self, data: object) -> np.datetime64: - """ - Cast the input to a scalar of this data type after a type check. - - Parameters - ---------- - data : object - The scalar value to cast. 
- - Returns - ------- - numpy.datetime64 - The input cast to a NumPy datetime scalar. - - Raises - ------ - TypeError - If the data cannot be converted to a numpy datetime scalar. - """ - if self._check_scalar(data): - return self._cast_scalar_unchecked(data) - msg = ( - f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " - f"data type {self}." - ) - raise TypeError(msg) - - def default_scalar(self) -> np.datetime64: - """ - Return the default scalar value for this data type. - - Returns - ------- - numpy.datetime64 - The default scalar value, which is a 'Not-a-Time' (NaT) value - """ - - return np.datetime64("NaT") - - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: - """ - Read a JSON-serializable value as a scalar. - - Parameters - ---------- - data : JSON - The JSON-serializable value. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - numpy.datetime64 - The numpy datetime scalar. + def check_value(self, data: object) -> bool: + # TODO: decide which values we should accept for datetimes. + try: + np.array([data], dtype=self.to_dtype()) + return True # noqa: TRY300 + except ValueError: + return False - Raises - ------ - TypeError - If the input is not a valid integer type. - """ - if check_json_time(data): - return self._cast_scalar_unchecked(data) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") # pragma: no cover + def _cast_value_unsafe(self, value: object) -> np.datetime64: + return self.to_dtype().type(value) # type: ignore[no-any-return, call-overload] diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 6d51079025..cd252fa181 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -307,7 +307,7 @@ def _parse_structured_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any: raise ValueError(f"Fill_value {fill_value} is not valid for dtype {dtype}.") from e -def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any: +def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any: """ Inspect a sequence of codecs / filters for an "object codec", i.e. a codec that can serialize object arrays to contiguous bytes. Zarr python diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 05679263c5..bd8e8193cc 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -174,6 +174,7 @@ def __init__( chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid) chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) + # Note: relying on a type method is numpy-specific fill_value_parsed = data_type.to_dtype().type(fill_value) attributes_parsed = parse_attributes(attributes) codecs_parsed_partial = parse_codecs(codecs) diff --git a/tests/conftest.py b/tests/conftest.py index d8ead82406..00fa8c9268 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -22,7 +22,9 @@ from zarr.core.common import JSON, parse_shapelike from zarr.core.config import config as zarr_config from zarr.core.dtype import data_type_registry, get_data_type_from_native_dtype -from zarr.core.dtype._numpy import DateTime64, HasLength, Structured +from zarr.core.dtype.common import HasLength +from zarr.core.dtype.npy.sized import Structured +from zarr.core.dtype.npy.time import DateTime64 
from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync diff --git a/tests/package_with_entrypoint/__init__.py b/tests/package_with_entrypoint/__init__.py index 704ee2c41c..84df5f38d8 100644 --- a/tests/package_with_entrypoint/__init__.py +++ b/tests/package_with_entrypoint/__init__.py @@ -11,7 +11,7 @@ from zarr.codecs import BytesCodec from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import BytesLike -from zarr.core.dtype import Bool +from zarr.core.dtype.npy.bool import Bool class TestEntrypointCodec(ArrayBytesCodec): diff --git a/tests/test_array.py b/tests/test_array.py index 1c863c3ebe..b0a7992589 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -42,14 +42,14 @@ from zarr.core.chunk_grids import _auto_partition from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.core.dtype._numpy import ( - DateTime64, - Float64, - Int16, +from zarr.core.dtype.common import Endianness +from zarr.core.dtype.npy.common import endianness_from_numpy_str +from zarr.core.dtype.npy.float import Float64 +from zarr.core.dtype.npy.int import Int16 +from zarr.core.dtype.npy.sized import ( Structured, - endianness_from_numpy_str, ) -from zarr.core.dtype.common import Endianness +from zarr.core.dtype.npy.time import DateTime64 from zarr.core.dtype.wrapper import ZDType from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv diff --git a/tests/test_config.py b/tests/test_config.py index 53db9e5208..e9b3921339 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -24,7 +24,7 @@ from zarr.core.buffer.core import Buffer from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.config import BadConfigError, config -from zarr.core.dtype._numpy import Int8, VariableLengthString +from zarr.core.dtype import Int8, VariableLengthString from 
zarr.core.indexing import SelectorTuple from zarr.registry import ( fully_qualified_name, diff --git a/tests/test_dtype.py b/tests/test_dtype.py index 122949664c..2b520383b1 100644 --- a/tests/test_dtype.py +++ b/tests/test_dtype.py @@ -7,6 +7,12 @@ import zarr from zarr.core.config import config +from zarr.core.dtype.npy.bool import Bool +from zarr.core.dtype.npy.complex import Complex64, Complex128 +from zarr.core.dtype.npy.float import Float16, Float32, Float64 +from zarr.core.dtype.npy.int import Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 +from zarr.core.dtype.npy.sized import FixedLengthAscii, FixedLengthBytes, FixedLengthUnicode +from zarr.core.dtype.npy.time import DateTime64 from .conftest import zdtype_examples @@ -26,28 +32,10 @@ data_type_registry, get_data_type_from_json, ) -from zarr.core.dtype._numpy import ( - Bool, - Complex64, - Complex128, - DateTime64, - FixedLengthAscii, - FixedLengthBytes, - FixedLengthUnicode, - Float16, - Float32, - Float64, - Int8, - Int16, - Int32, - Int64, +from zarr.core.dtype.common import DataTypeValidationError +from zarr.core.dtype.npy.sized import ( Structured, - UInt8, - UInt16, - UInt32, - UInt64, ) -from zarr.core.dtype.common import DataTypeValidationError from zarr.core.dtype.registry import DataTypeRegistry diff --git a/tests/test_info.py b/tests/test_info.py index 06ce8f1985..f7369b565a 100644 --- a/tests/test_info.py +++ b/tests/test_info.py @@ -5,7 +5,7 @@ from zarr.codecs.bytes import BytesCodec from zarr.core._info import ArrayInfo, GroupInfo, human_readable_size from zarr.core.common import ZarrFormat -from zarr.core.dtype._numpy import Int32 +from zarr.core.dtype.npy.int import Int32 ZARR_FORMATS = [2, 3] diff --git a/tests/test_metadata/test_v2.py b/tests/test_metadata/test_v2.py index 627d615e74..88fb107433 100644 --- a/tests/test_metadata/test_v2.py +++ b/tests/test_metadata/test_v2.py @@ -9,7 +9,8 @@ import zarr.storage from zarr.core.buffer import cpu from zarr.core.buffer.core import 
default_buffer_prototype -from zarr.core.dtype._numpy import Float32, Float64, Int16 +from zarr.core.dtype.npy.float import Float32, Float64 +from zarr.core.dtype.npy.int import Int16 from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV2Metadata from zarr.core.metadata.v2 import parse_zarr_format diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 47fc692f4f..d70095c045 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -12,8 +12,8 @@ from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.core.config import config from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.core.dtype._numpy import DateTime64 -from zarr.core.dtype.common import check_json_complex_float +from zarr.core.dtype.npy.common import check_json_complex_float +from zarr.core.dtype.npy.time import DateTime64 from zarr.core.group import GroupMetadata, parse_node_type from zarr.core.metadata.v3 import ( ArrayV3Metadata, From 317f5cc0b7a718cc222e5f17e1c21e1ad279aefb Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 29 Apr 2025 21:50:15 +0200 Subject: [PATCH 069/129] add timedelta64 --- src/zarr/core/dtype/__init__.py | 4 +- src/zarr/core/dtype/npy/common.py | 3 +- src/zarr/core/dtype/npy/time.py | 239 +++++++++++++++++++++++++----- src/zarr/testing/strategies.py | 18 ++- tests/test_array.py | 6 +- 5 files changed, 222 insertions(+), 48 deletions(-) diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 63b593fd28..4cd71bb8bc 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -12,7 +12,7 @@ FixedLengthUnicode, Structured, ) -from zarr.core.dtype.npy.time import DateTime64 +from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 if TYPE_CHECKING: from zarr.core.common import ZarrFormat @@ -43,6 +43,7 @@ "Int32", "Int64", "Structured", + "TimeDelta64", 
"UInt8", "UInt16", "UInt32", @@ -68,6 +69,7 @@ | FixedLengthBytes | Structured | DateTime64 + | TimeDelta64 ) ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[_BaseDType, _BaseScalar] | dict[str, JSON] diff --git a/src/zarr/core/dtype/npy/common.py b/src/zarr/core/dtype/npy/common.py index 6571002bbb..c079664aa5 100644 --- a/src/zarr/core/dtype/npy/common.py +++ b/src/zarr/core/dtype/npy/common.py @@ -26,8 +26,7 @@ IntLike = SupportsInt | SupportsIndex | bytes | str FloatLike = SupportsIndex | SupportsFloat | bytes | str ComplexLike = SupportsFloat | SupportsIndex | SupportsComplex | bytes | str | None -DateUnit = Literal["Y", "M", "W", "D"] -TimeUnit = Literal["h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] +DateTimeUnit = Literal["Y", "M", "W", "D", "h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] EndiannessNumpy = Literal[">", "<", "|", "="] TFloatDType_co = TypeVar( diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index a10b9ae8a3..030b01c769 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -1,15 +1,15 @@ from __future__ import annotations +import re from dataclasses import dataclass -from typing import TYPE_CHECKING, Self, TypeGuard, cast, get_args +from typing import TYPE_CHECKING, Literal, Self, TypeGuard, cast, get_args import numpy as np from zarr.core.dtype.common import DataTypeValidationError, HasEndianness from zarr.core.dtype.npy.common import ( - DateUnit, + DateTimeUnit, EndiannessNumpy, - TimeUnit, check_json_int, endianness_from_numpy_str, endianness_to_numpy_str, @@ -19,15 +19,58 @@ if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat +_DTypeName = Literal["datetime64", "timedelta64"] -def datetime_from_json(data: int, unit: DateUnit | TimeUnit) -> np.datetime64: + +def parse_timedtype_name(name: str) -> tuple[_DTypeName, DateTimeUnit | None]: + """ + Parse a string like "datetime64[s]" into a tuple like ("datetime64", "s"). 
""" - Convert a JSON integer to a datetime64. + dtype_name: _DTypeName + unit: DateTimeUnit | None + + if name.startswith("datetime64"): + dtype_name = "datetime64" + elif name.startswith("timedelta64"): + dtype_name = "timedelta64" + else: + msg = ( + f"Invalid dtype name. Expected a string starting with on of {get_args(_DTypeName)}. " + f"Got {name!r} instead." + ) + raise ValueError(msg) + + regex = re.search(r"\[(.*?)\]", name) + + if regex is None: + if dtype_name == "timedelta64": + unit = None + else: + msg = ( + "The name of a datetime64 dtype must end with a specification of a unit. " + 'For example, "datetime64[s].' + f"Got {name!r}, which does not follow this pattern." + ) + raise ValueError(msg) + else: + maybe_unit = regex.group(1) + unit_expected = get_args(DateTimeUnit) + if maybe_unit not in unit_expected: + msg = f"Invalid unit. Expected one of {unit_expected}. Got {maybe_unit} instead." + raise ValueError(msg) + unit = maybe_unit # type: ignore[assignment] + + return dtype_name, unit + + +def datetime_from_int(data: int, unit: DateTimeUnit) -> np.datetime64: + """ + Convert an integer to a datetime64. Parameters ---------- data : int - The JSON integer to convert. + The integer to convert. unit : DateUnit or TimeUnit The unit of the datetime64. @@ -39,33 +82,150 @@ def datetime_from_json(data: int, unit: DateUnit | TimeUnit) -> np.datetime64: return cast("np.datetime64", np.int64(data).view(f"datetime64[{unit}]")) -def datetime_to_json(data: np.datetime64) -> int: +def datetimelike_to_int(data: np.datetime64 | np.timedelta64) -> int: """ - Convert a datetime64 to a JSON integer. + Convert a datetime64 or a timedelta64 to an integer. Parameters ---------- - data : np.datetime64 - The datetime64 value to convert. + data : np.datetime64 | np.timedelta64 + The value to convert. Returns ------- int - The JSON representation of the datetime64. + An integer representation of the scalar. 
""" return data.view(np.int64).item() +def timedelta_from_int(data: int, unit: DateTimeUnit | None) -> np.timedelta64: + """ + Convert an integer to a timedelta64. + + Parameters + ---------- + data : int + The integer to convert. + unit : DateUnit or TimeUnit + The unit of the timedelta64. + + Returns + ------- + np.timedelta64 + The timedelta64 value. + """ + if unit is not None: + dtype_name = f"timedelta64[{unit}]" + else: + dtype_name = "timedelta64" + return cast("np.timedelta64", np.int64(data).view(dtype_name)) + + +@dataclass(frozen=True, kw_only=True, slots=True) +class TimeDelta64(ZDType[np.dtypes.TimeDelta64DType, np.timedelta64], HasEndianness): + """ + A wrapper for the ``TimeDelta64`` data type defined in numpy. + Scalars of this type can be created by performing arithmetic with ``DateTime64`` scalars. + Like ``DateTime64``, ``TimeDelta64`` is parametrized by a unit, but unlike ``DateTime64``, the + unit for ``TimeDelta64`` is optional. + """ + + dtype_cls = np.dtypes.TimeDelta64DType # type: ignore[assignment] + _zarr_v3_name = "numpy.timedelta64" + unit: DateTimeUnit | None = None + + @classmethod + def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + _, unit = parse_timedtype_name(dtype.name) + byteorder = cast("EndiannessNumpy", dtype.byteorder) + return cls(unit=unit, endianness=endianness_from_numpy_str(byteorder)) + + def to_dtype(self) -> np.dtypes.TimeDelta64DType: + # Numpy does not allow creating timedelta64 via + # np.dtypes.TimeDelta64DType() + if self.unit is not None: + dtype_string = f"timedelta64[{self.unit}]" + else: + dtype_string = "timedelta64" + dt = np.dtype(dtype_string).newbyteorder(endianness_to_numpy_str(self.endianness)) + return cast("np.dtypes.TimeDelta64DType", dt) + + def default_value(self) -> np.timedelta64: + return np.timedelta64("NaT") + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return {"name": self._zarr_v3_name, 
"configuration": {"unit": self.unit}} + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(unit=data["configuration"]["unit"]) # type: ignore[arg-type, index, call-overload] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: + if check_json_int(data): + return timedelta_from_int(data, self.unit) + raise TypeError(f"Invalid type: {data}. Expected an integer.") + + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> int: + return datetimelike_to_int(data) # type: ignore[arg-type] + + def check_value(self, data: object) -> bool: + # TODO: decide which values we should accept for datetimes. + try: + np.array([data], dtype=self.to_dtype()) + return True # noqa: TRY300 + except ValueError: + return False + + def _cast_value_unsafe(self, value: object) -> np.timedelta64: + return self.to_dtype().type(value) # type: ignore[arg-type] + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + if zarr_format == 2: + # match m[M], etc + # consider making this a standalone function + if not (isinstance(data, str) and data[0] in (">", "<") and data[1:3] == "m8"): + return False + if len(data) == 3: + # no unit, and + # we already checked that this string is either m8 + return True + if len(data) in (6, 7): + return data[4:-1] in get_args(DateTimeUnit) and data[-1] == "]" + else: + return False + elif zarr_format == 3: + return ( + isinstance(data, dict) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and set(data.keys()) == {"name", "configuration"} + and isinstance(data["configuration"], dict) + and 
set(data["configuration"].keys()) in ({"unit"}, {}) + and data["configuration"].get("unit", None) in (*get_args(DateTimeUnit), None) + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @dataclass(frozen=True, kw_only=True, slots=True) class DateTime64(ZDType[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] _zarr_v3_name = "numpy.datetime64" - unit: DateUnit | TimeUnit + unit: DateTimeUnit @classmethod def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - unit: DateUnit | TimeUnit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] # type: ignore[assignment] - if unit not in get_args(DateUnit) and unit not in get_args(TimeUnit): + unit: DateTimeUnit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] # type: ignore[assignment] + if unit not in get_args(DateTimeUnit): raise DataTypeValidationError('Invalid unit for "numpy.datetime64"') byteorder = cast("EndiannessNumpy", dtype.byteorder) return cls(unit=unit, endianness=endianness_from_numpy_str(byteorder)) @@ -80,30 +240,6 @@ def to_dtype(self) -> np.dtypes.DateTime64DType: ), ) - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - if zarr_format == 2: - # match M[M], etc - # consider making this a standalone function - return ( - isinstance(data, str) - and len(data) in (6, 7) - and data[0] in (">", "<") - and data[1:4] == "M8[" - and data[4:-1] in get_args(TimeUnit) + get_args(DateUnit) - and data[-1] == "]" - ) - elif zarr_format == 3: - return ( - isinstance(data, dict) - and "name" in data - and data["name"] == cls._zarr_v3_name - and "configuration" in data - and "unit" in data["configuration"] - and data["configuration"]["unit"] in get_args(DateUnit) + get_args(TimeUnit) - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_value(self) -> np.datetime64: return 
np.datetime64("NaT") @@ -124,11 +260,11 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: if check_json_int(data): - return datetime_from_json(data, self.unit) + return datetime_from_int(data, self.unit) raise TypeError(f"Invalid type: {data}. Expected an integer.") def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> int: - return datetime_to_json(data) # type: ignore[arg-type] + return datetimelike_to_int(data) # type: ignore[arg-type] def check_value(self, data: object) -> bool: # TODO: decide which values we should accept for datetimes. @@ -140,3 +276,26 @@ def check_value(self, data: object) -> bool: def _cast_value_unsafe(self, value: object) -> np.datetime64: return self.to_dtype().type(value) # type: ignore[no-any-return, call-overload] + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + if zarr_format == 2: + # match M[M], etc + # consider making this a standalone function + return ( + isinstance(data, str) + and len(data) in (6, 7) + and data[0] in (">", "<") + and data[1:4] == "M8[" + and data[4:-1] in get_args(DateTimeUnit) + and data[-1] == "]" + ) + elif zarr_format == 3: + return ( + isinstance(data, dict) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and set(data["configuration"].keys()) == {"unit"} + and data["configuration"]["unit"] in get_args(DateTimeUnit) + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index 96c4ec749d..fd8e37ee6d 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -43,7 +43,21 @@ def paths(draw: st.DrawFn, *, max_num_nodes: int | None = None) -> str: return draw(st.just("/") | keys(max_num_nodes=max_num_nodes)) -def dtypes() -> st.SearchStrategy[np.dtype[Any]]: 
+def v3_dtypes() -> st.SearchStrategy[np.dtype]: + return ( + npst.boolean_dtypes() + | npst.integer_dtypes(endianness="=") + | npst.unsigned_integer_dtypes(endianness="=") + | npst.floating_dtypes(endianness="=") + | npst.complex_number_dtypes(endianness="=") + # | npst.byte_string_dtypes(endianness="=") + # | npst.unicode_string_dtypes() + # | npst.datetime64_dtypes() + # | npst.timedelta64_dtypes() + ) + + +def v2_dtypes() -> st.SearchStrategy[np.dtype[Any]]: return ( npst.boolean_dtypes() | npst.integer_dtypes(endianness="=") @@ -53,7 +67,7 @@ def dtypes() -> st.SearchStrategy[np.dtype[Any]]: | npst.byte_string_dtypes(endianness="=") | npst.unicode_string_dtypes(endianness="=") | npst.datetime64_dtypes(endianness="=") - | npst.timedelta64_dtypes(endianness="=") + | npst.timedelta64_dtypes(endianness="?") ) diff --git a/tests/test_array.py b/tests/test_array.py index b0a7992589..4a99730f7c 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -49,7 +49,7 @@ from zarr.core.dtype.npy.sized import ( Structured, ) -from zarr.core.dtype.npy.time import DateTime64 +from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 from zarr.core.dtype.wrapper import ZDType from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv @@ -992,7 +992,7 @@ def test_default_fill_value(dtype: ZDType[Any, Any], store: Store) -> None: Test that the fill value of an array is set to the default value for the dtype object """ a = zarr.create_array(store, shape=(5,), chunks=(5,), dtype=dtype) - if isinstance(dtype, DateTime64) and np.isnat(a.fill_value): + if isinstance(dtype, DateTime64 | TimeDelta64) and np.isnat(a.fill_value): assert np.isnat(dtype.default_value()) else: assert a.fill_value == dtype.default_value() @@ -1438,7 +1438,7 @@ def test_default_endianness( """ dtype = Int16(endianness=endianness) arr = zarr.create_array(store=store, shape=(1,), dtype=dtype, zarr_format=zarr_format) - assert 
endianness_from_numpy_str(arr[:].dtype.byteorder) == endianness + assert endianness_from_numpy_str(arr[:].dtype.byteorder) == endianness # type: ignore[union-attr] From 1dd36b351d444456525a3efda51ce01468971a12 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 30 Apr 2025 22:20:40 +0200 Subject: [PATCH 070/129] refactor time dtypes --- src/zarr/core/buffer/core.py | 2 +- src/zarr/core/dtype/__init__.py | 1 + src/zarr/core/dtype/npy/common.py | 4 +- src/zarr/core/dtype/npy/time.py | 285 +++++++++++++----------------- src/zarr/testing/strategies.py | 4 +- tests/conftest.py | 14 +- tests/test_properties.py | 4 +- 7 files changed, 137 insertions(+), 177 deletions(-) diff --git a/src/zarr/core/buffer/core.py b/src/zarr/core/buffer/core.py index 19125b838f..a8577b282b 100644 --- a/src/zarr/core/buffer/core.py +++ b/src/zarr/core/buffer/core.py @@ -475,7 +475,7 @@ def as_scalar(self) -> ScalarType: """Returns the buffer as a scalar value""" if self._data.size != 1: raise ValueError("Buffer does not contain a single scalar value") - return cast("ScalarType", self.as_numpy_array()[()]) + return cast(ScalarType, self.as_numpy_array()[()]) @property def dtype(self) -> np.dtype[Any]: diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 4cd71bb8bc..f535f62f35 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -44,6 +44,7 @@ "Int64", "Structured", "TimeDelta64", + "TimeDelta64", "UInt8", "UInt16", "UInt32", diff --git a/src/zarr/core/dtype/npy/common.py b/src/zarr/core/dtype/npy/common.py index c079664aa5..857c515c19 100644 --- a/src/zarr/core/dtype/npy/common.py +++ b/src/zarr/core/dtype/npy/common.py @@ -26,7 +26,9 @@ IntLike = SupportsInt | SupportsIndex | bytes | str FloatLike = SupportsIndex | SupportsFloat | bytes | str ComplexLike = SupportsFloat | SupportsIndex | SupportsComplex | bytes | str | None -DateTimeUnit = Literal["Y", "M", "W", "D", "h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", 
"as"] +DateTimeUnit = Literal[ + "Y", "M", "W", "D", "h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as", "generic" +] EndiannessNumpy = Literal[">", "<", "|", "="] TFloatDType_co = TypeVar( diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 030b01c769..056836a105 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -1,12 +1,23 @@ from __future__ import annotations -import re +from collections.abc import Mapping from dataclasses import dataclass -from typing import TYPE_CHECKING, Literal, Self, TypeGuard, cast, get_args +from typing import ( + TYPE_CHECKING, + ClassVar, + Generic, + Literal, + Self, + TypedDict, + TypeGuard, + TypeVar, + cast, + get_args, +) import numpy as np -from zarr.core.dtype.common import DataTypeValidationError, HasEndianness +from zarr.core.dtype.common import HasEndianness from zarr.core.dtype.npy.common import ( DateTimeUnit, EndiannessNumpy, @@ -22,48 +33,7 @@ _DTypeName = Literal["datetime64", "timedelta64"] -def parse_timedtype_name(name: str) -> tuple[_DTypeName, DateTimeUnit | None]: - """ - Parse a string like "datetime64[s]" into a tuple like ("datetime64", "s"). - """ - dtype_name: _DTypeName - unit: DateTimeUnit | None - - if name.startswith("datetime64"): - dtype_name = "datetime64" - elif name.startswith("timedelta64"): - dtype_name = "timedelta64" - else: - msg = ( - f"Invalid dtype name. Expected a string starting with on of {get_args(_DTypeName)}. " - f"Got {name!r} instead." - ) - raise ValueError(msg) - - regex = re.search(r"\[(.*?)\]", name) - - if regex is None: - if dtype_name == "timedelta64": - unit = None - else: - msg = ( - "The name of a datetime64 dtype must end with a specification of a unit. " - 'For example, "datetime64[s].' - f"Got {name!r}, which does not follow this pattern." 
- ) - raise ValueError(msg) - else: - maybe_unit = regex.group(1) - unit_expected = get_args(DateTimeUnit) - if maybe_unit not in unit_expected: - msg = f"Invalid unit. Expected one of {unit_expected}. Got {maybe_unit} instead." - raise ValueError(msg) - unit = maybe_unit # type: ignore[assignment] - - return dtype_name, unit - - -def datetime_from_int(data: int, unit: DateTimeUnit) -> np.datetime64: +def datetime_from_int(data: int, *, unit: DateTimeUnit, interval: int) -> np.datetime64: """ Convert an integer to a datetime64. @@ -71,15 +41,18 @@ def datetime_from_int(data: int, unit: DateTimeUnit) -> np.datetime64: ---------- data : int The integer to convert. - unit : DateUnit or TimeUnit + unit : DateTimeUnit The unit of the datetime64. + interval : int + The interval of the datetime64. Returns ------- np.datetime64 The datetime64 value. """ - return cast("np.datetime64", np.int64(data).view(f"datetime64[{unit}]")) + dtype_name = f"datetime64[{interval}{unit}]" + return cast("np.datetime64", np.int64(data).view(dtype_name)) def datetimelike_to_int(data: np.datetime64 | np.timedelta64) -> int: @@ -99,80 +72,74 @@ def datetimelike_to_int(data: np.datetime64 | np.timedelta64) -> int: return data.view(np.int64).item() -def timedelta_from_int(data: int, unit: DateTimeUnit | None) -> np.timedelta64: - """ - Convert an integer to a timedelta64. +_BaseTimeDType_co = TypeVar( + "_BaseTimeDType_co", + bound=np.dtypes.TimeDelta64DType | np.dtypes.DateTime64DType, + covariant=True, +) +_BaseTimeScalar = TypeVar("_BaseTimeScalar", bound=np.timedelta64 | np.datetime64) - Parameters - ---------- - data : int - The integer to convert. - unit : DateUnit or TimeUnit - The unit of the timedelta64. +TName = TypeVar("TName", bound=str) +TConfig = TypeVar("TConfig", bound=Mapping[str, object]) - Returns - ------- - np.timedelta64 - The timedelta64 value. 
- """ - if unit is not None: - dtype_name = f"timedelta64[{unit}]" - else: - dtype_name = "timedelta64" - return cast("np.timedelta64", np.int64(data).view(dtype_name)) +class NamedConfig(TypedDict, Generic[TName, TConfig]): + name: TName + configuration: TConfig -@dataclass(frozen=True, kw_only=True, slots=True) -class TimeDelta64(ZDType[np.dtypes.TimeDelta64DType, np.timedelta64], HasEndianness): - """ - A wrapper for the ``TimeDelta64`` data type defined in numpy. - Scalars of this type can be created by performing arithmetic with ``DateTime64`` scalars. - Like ``DateTime64``, ``TimeDelta64`` is parametrized by a unit, but unlike ``DateTime64``, the - unit for ``TimeDelta64`` is optional. - """ - dtype_cls = np.dtypes.TimeDelta64DType # type: ignore[assignment] - _zarr_v3_name = "numpy.timedelta64" - unit: DateTimeUnit | None = None +class TimeConfig(TypedDict): + unit: DateTimeUnit + interval: int + + +# aspirational +DateTime64MetaParams = NamedConfig[Literal["numpy.datetime64"], TimeConfig] +TimeDelta64MetaParams = NamedConfig[Literal["numpy.timedelta64"], TimeConfig] + + +@dataclass(frozen=True, kw_only=True, slots=True) +class TimeDTypeBase(ZDType[_BaseTimeDType_co, _BaseTimeScalar], HasEndianness): + _zarr_v2_names: ClassVar[tuple[str, ...]] + # this attribute exists so that we can programmatically create a numpy dtype instance + # because the particular numpy dtype we are wrapping does not allow direct construction via + # cls.dtype_cls() + _numpy_name: ClassVar[_DTypeName] + interval: int + unit: DateTimeUnit @classmethod def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - _, unit = parse_timedtype_name(dtype.name) + unit, interval = np.datetime_data(dtype.name) byteorder = cast("EndiannessNumpy", dtype.byteorder) - return cls(unit=unit, endianness=endianness_from_numpy_str(byteorder)) - - def to_dtype(self) -> np.dtypes.TimeDelta64DType: - # Numpy does not allow creating timedelta64 via - # np.dtypes.TimeDelta64DType() - if self.unit is not None: 
- dtype_string = f"timedelta64[{self.unit}]" - else: - dtype_string = "timedelta64" - dt = np.dtype(dtype_string).newbyteorder(endianness_to_numpy_str(self.endianness)) - return cast("np.dtypes.TimeDelta64DType", dt) + return cls(unit=unit, interval=interval, endianness=endianness_from_numpy_str(byteorder)) # type: ignore[arg-type] - def default_value(self) -> np.timedelta64: - return np.timedelta64("NaT") - - def to_json(self, zarr_format: ZarrFormat) -> JSON: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return {"name": self._zarr_v3_name, "configuration": {"unit": self.unit}} - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def to_dtype(self) -> _BaseTimeDType_co: + # Numpy does not allow creating datetime64 or timedelta64 via + # np.dtypes.{dtype_name}() + # so we use np.dtype with a formatted string. + dtype_string = f"{self._numpy_name}[{self.interval}{self.unit}]" + return np.dtype(dtype_string).newbyteorder(endianness_to_numpy_str(self.endianness)) # type: ignore[return-value] @classmethod def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: - return cls(unit=data["configuration"]["unit"]) # type: ignore[arg-type, index, call-overload] + unit = data["configuration"]["unit"] # type: ignore[index, call-overload] + interval = data["configuration"]["interval"] # type: ignore[index, call-overload] + return cls(unit=unit, interval=interval) # type: ignore[arg-type] raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: - if check_json_int(data): - return timedelta_from_int(data, self.unit) - raise TypeError(f"Invalid type: {data}. 
Expected an integer.") + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return cast("str", self.to_dtype().str) + elif zarr_format == 3: + return { + "name": self._zarr_v3_name, + "configuration": {"unit": self.unit, "interval": self.interval}, + } + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> int: return datetimelike_to_int(data) # type: ignore[arg-type] @@ -185,6 +152,31 @@ def check_value(self, data: object) -> bool: except ValueError: return False + +@dataclass(frozen=True, kw_only=True, slots=True) +class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], HasEndianness): + """ + A wrapper for the ``TimeDelta64`` data type defined in numpy. + Scalars of this type can be created by performing arithmetic with ``DateTime64`` scalars. + Like ``DateTime64``, ``TimeDelta64`` is parametrized by a unit, but unlike ``DateTime64``, the + unit for ``TimeDelta64`` is optional. + """ + + dtype_cls = np.dtypes.TimeDelta64DType + _zarr_v3_name = "numpy.timedelta64" + _zarr_v2_names = (">m8", " np.timedelta64: + return np.timedelta64("NaT") + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: + if check_json_int(data): + return self.to_dtype().type(data, f"{self.interval}{self.unit}") + raise TypeError(f"Invalid type: {data}. 
Expected an integer.") + def _cast_value_unsafe(self, value: object) -> np.timedelta64: return self.to_dtype().type(value) # type: ignore[arg-type] @@ -193,16 +185,16 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: if zarr_format == 2: # match m[M], etc # consider making this a standalone function - if not (isinstance(data, str) and data[0] in (">", "<") and data[1:3] == "m8"): + if not isinstance(data, str): + return False + if not data.startswith(cls._zarr_v2_names): return False if len(data) == 3: # no unit, and # we already checked that this string is either m8 return True - if len(data) in (6, 7): - return data[4:-1] in get_args(DateTimeUnit) and data[-1] == "]" else: - return False + return data[4:-1].endswith(get_args(DateTimeUnit)) and data[-1] == "]" elif zarr_format == 3: return ( isinstance(data, dict) @@ -210,70 +202,29 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: and data["name"] == cls._zarr_v3_name and set(data.keys()) == {"name", "configuration"} and isinstance(data["configuration"], dict) - and set(data["configuration"].keys()) in ({"unit"}, {}) - and data["configuration"].get("unit", None) in (*get_args(DateTimeUnit), None) + and set(data["configuration"].keys()) == {"unit", "interval"} + and data["configuration"]["unit"] in get_args(DateTimeUnit) ) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @dataclass(frozen=True, kw_only=True, slots=True) -class DateTime64(ZDType[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): - dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] +class DateTime64(TimeDTypeBase[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): + dtype_cls = np.dtypes.DateTime64DType _zarr_v3_name = "numpy.datetime64" - unit: DateTimeUnit - - @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: - unit: DateTimeUnit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")] # type: 
ignore[assignment] - if unit not in get_args(DateTimeUnit): - raise DataTypeValidationError('Invalid unit for "numpy.datetime64"') - byteorder = cast("EndiannessNumpy", dtype.byteorder) - return cls(unit=unit, endianness=endianness_from_numpy_str(byteorder)) - - def to_dtype(self) -> np.dtypes.DateTime64DType: - # Numpy does not allow creating datetime64 via - # np.dtypes.DateTime64Dtype() - return cast( - "np.dtypes.DateTime64DType", - np.dtype(f"datetime64[{self.unit}]").newbyteorder( - endianness_to_numpy_str(self.endianness) - ), - ) + _zarr_v2_names = (">M8", " np.datetime64: return np.datetime64("NaT") - def to_json(self, zarr_format: ZarrFormat) -> JSON: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return {"name": self._zarr_v3_name, "configuration": {"unit": self.unit}} - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls(unit=data["configuration"]["unit"]) # type: ignore[arg-type, index, call-overload] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: if check_json_int(data): - return datetime_from_int(data, self.unit) + return self.to_dtype().type(data, f"{self.interval}{self.unit}") raise TypeError(f"Invalid type: {data}. Expected an integer.") - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> int: - return datetimelike_to_int(data) # type: ignore[arg-type] - - def check_value(self, data: object) -> bool: - # TODO: decide which values we should accept for datetimes. 
- try: - np.array([data], dtype=self.to_dtype()) - return True # noqa: TRY300 - except ValueError: - return False - def _cast_value_unsafe(self, value: object) -> np.datetime64: return self.to_dtype().type(value) # type: ignore[no-any-return, call-overload] @@ -282,20 +233,22 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: if zarr_format == 2: # match M[M], etc # consider making this a standalone function - return ( - isinstance(data, str) - and len(data) in (6, 7) - and data[0] in (">", "<") - and data[1:4] == "M8[" - and data[4:-1] in get_args(DateTimeUnit) - and data[-1] == "]" - ) + if not isinstance(data, str): + return False + if not data.startswith(cls._zarr_v2_names): + return False + if len(data) == 3: + # no unit, and + # we already checked that this string is either M8 + return True + else: + return data[4:-1].endswith(get_args(DateTimeUnit)) and data[-1] == "]" elif zarr_format == 3: return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} and data["name"] == cls._zarr_v3_name - and set(data["configuration"].keys()) == {"unit"} + and set(data["configuration"].keys()) == {"unit", "interval"} and data["configuration"]["unit"] in get_args(DateTimeUnit) ) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index fd8e37ee6d..f111569450 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -52,8 +52,8 @@ def v3_dtypes() -> st.SearchStrategy[np.dtype]: | npst.complex_number_dtypes(endianness="=") # | npst.byte_string_dtypes(endianness="=") # | npst.unicode_string_dtypes() - # | npst.datetime64_dtypes() - # | npst.timedelta64_dtypes() + | npst.datetime64_dtypes() + | npst.timedelta64_dtypes() ) diff --git a/tests/conftest.py b/tests/conftest.py index 00fa8c9268..78f44c3822 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -21,10 +21,14 @@ from 
zarr.core.chunk_grids import RegularChunkGrid, _auto_partition from zarr.core.common import JSON, parse_shapelike from zarr.core.config import config as zarr_config -from zarr.core.dtype import data_type_registry, get_data_type_from_native_dtype +from zarr.core.dtype import ( + DateTime64, + Structured, + TimeDelta64, + data_type_registry, + get_data_type_from_native_dtype, +) from zarr.core.dtype.common import HasLength -from zarr.core.dtype.npy.sized import Structured -from zarr.core.dtype.npy.time import DateTime64 from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync @@ -443,7 +447,7 @@ def meta_from_array( zdtype_examples += (wrapper_cls.from_dtype(np.dtype([("a", np.float64), ("b", np.int8)])),) elif issubclass(wrapper_cls, HasLength): zdtype_examples += (wrapper_cls(length=1),) - elif issubclass(wrapper_cls, DateTime64): - zdtype_examples += (wrapper_cls(unit="s"),) + elif issubclass(wrapper_cls, DateTime64 | TimeDelta64): + zdtype_examples += (wrapper_cls(unit="s", interval=10),) else: zdtype_examples += (wrapper_cls(),) diff --git a/tests/test_properties.py b/tests/test_properties.py index df384f187f..31fa17ce93 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -332,5 +332,5 @@ def test_array_metadata_meets_spec(meta: ArrayV2Metadata | ArrayV3Metadata) -> N elif dtype_native.kind == "c": # fill_value should be a two-element array [real, imag]. 
assert serialized_complex_float_is_valid(asdict_dict["fill_value"]) - elif dtype_native.kind == "M" and np.isnat(meta.fill_value): - assert asdict_dict["fill_value"] == "NaT" + elif dtype_native.kind in ("M", "m") and np.isnat(meta.fill_value): + assert asdict_dict["fill_value"] == -9223372036854775808 From b91ebb610939b7f0091a7b8f9e0732600caade0f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 1 May 2025 11:12:35 +0200 Subject: [PATCH 071/129] widen dtype test strategies --- src/zarr/testing/strategies.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index f111569450..38ef7119db 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -43,17 +43,17 @@ def paths(draw: st.DrawFn, *, max_num_nodes: int | None = None) -> str: return draw(st.just("/") | keys(max_num_nodes=max_num_nodes)) -def v3_dtypes() -> st.SearchStrategy[np.dtype]: +def v3_dtypes() -> st.SearchStrategy[np.dtype[Any]]: return ( npst.boolean_dtypes() | npst.integer_dtypes(endianness="=") | npst.unsigned_integer_dtypes(endianness="=") | npst.floating_dtypes(endianness="=") | npst.complex_number_dtypes(endianness="=") - # | npst.byte_string_dtypes(endianness="=") - # | npst.unicode_string_dtypes() - | npst.datetime64_dtypes() - | npst.timedelta64_dtypes() + | npst.byte_string_dtypes(endianness="=") + | npst.unicode_string_dtypes(endianness="=") + | npst.datetime64_dtypes(endianness="=") + | npst.timedelta64_dtypes(endianness="=") ) @@ -67,7 +67,7 @@ def v2_dtypes() -> st.SearchStrategy[np.dtype[Any]]: | npst.byte_string_dtypes(endianness="=") | npst.unicode_string_dtypes(endianness="=") | npst.datetime64_dtypes(endianness="=") - | npst.timedelta64_dtypes(endianness="?") + | npst.timedelta64_dtypes(endianness="=") ) From 5a2c48d2be714f562fd5b629a97d4a3827f86e21 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 2 May 2025 16:48:49 +0200 Subject: [PATCH 
072/129] wip: begin creating isomorphic test suite for dtypes --- src/zarr/abc/codec.py | 6 +- src/zarr/codecs/sharding.py | 4 +- src/zarr/codecs/transpose.py | 4 +- src/zarr/core/_info.py | 4 +- src/zarr/core/array.py | 24 +-- src/zarr/core/array_spec.py | 6 +- src/zarr/core/codec_pipeline.py | 4 +- src/zarr/core/dtype/__init__.py | 12 +- src/zarr/core/dtype/common.py | 3 +- src/zarr/core/dtype/npy/bool.py | 4 +- src/zarr/core/dtype/npy/common.py | 36 ++-- src/zarr/core/dtype/npy/complex.py | 4 +- src/zarr/core/dtype/npy/float.py | 4 +- src/zarr/core/dtype/npy/int.py | 20 +-- src/zarr/core/dtype/npy/sized.py | 18 +- src/zarr/core/dtype/npy/string.py | 6 +- src/zarr/core/dtype/npy/time.py | 4 +- src/zarr/core/dtype/registry.py | 12 +- src/zarr/core/dtype/wrapper.py | 14 +- src/zarr/core/metadata/v2.py | 4 +- src/zarr/core/metadata/v3.py | 10 +- tests/package_with_entrypoint/__init__.py | 4 +- tests/{ => test_dtype}/test_dtype.py | 185 +++----------------- tests/test_dtype/test_npy/test_common.py | 203 ++++++++-------------- tests/test_dtype_registry.py | 85 +++------ 25 files changed, 225 insertions(+), 455 deletions(-) rename tests/{ => test_dtype}/test_dtype.py (58%) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index f064fad02e..47664abced 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -15,7 +15,7 @@ from zarr.abc.store import ByteGetter, ByteSetter from zarr.core.array_spec import ArraySpec from zarr.core.chunk_grids import ChunkGrid - from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType from zarr.core.indexing import SelectorTuple from zarr.core.metadata import ArrayMetadata @@ -97,7 +97,7 @@ def validate( self, *, shape: ChunkCoords, - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], chunk_grid: ChunkGrid, ) -> None: """Validates that the codec configuration is compatible with the array metadata. 
@@ -311,7 +311,7 @@ def supports_partial_encode(self) -> bool: ... @abstractmethod def validate( - self, *, shape: ChunkCoords, dtype: ZDType[_BaseDType, _BaseScalar], chunk_grid: ChunkGrid + self, *, shape: ChunkCoords, dtype: ZDType[TBaseDType, TBaseScalar], chunk_grid: ChunkGrid ) -> None: """Validates that all codec configurations are compatible with the array metadata. Raises errors when a codec configuration is not compatible. diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 5089baeff5..b3ff0953d4 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -59,7 +59,7 @@ from typing import Self from zarr.core.common import JSON - from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType MAX_UINT_64 = 2**64 - 1 ShardMapping = Mapping[ChunkCoords, Buffer] @@ -409,7 +409,7 @@ def validate( self, *, shape: ChunkCoords, - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], chunk_grid: ChunkGrid, ) -> None: if len(self.chunk_shape) != len(shape): diff --git a/src/zarr/codecs/transpose.py b/src/zarr/codecs/transpose.py index 7715d06265..be89690441 100644 --- a/src/zarr/codecs/transpose.py +++ b/src/zarr/codecs/transpose.py @@ -16,7 +16,7 @@ from zarr.core.buffer import NDBuffer from zarr.core.chunk_grids import ChunkGrid - from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType def parse_transpose_order(data: JSON | Iterable[int]) -> tuple[int, ...]: @@ -49,7 +49,7 @@ def to_dict(self) -> dict[str, JSON]: def validate( self, shape: tuple[int, ...], - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], chunk_grid: ChunkGrid, ) -> None: if len(self.order) != len(shape): diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 310ba27ea1..1a16a4808a 100644 --- a/src/zarr/core/_info.py +++ 
b/src/zarr/core/_info.py @@ -9,7 +9,7 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.core.common import ZarrFormat - from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType @dataclasses.dataclass(kw_only=True) @@ -80,7 +80,7 @@ class ArrayInfo: _type: Literal["Array"] = "Array" _zarr_format: ZarrFormat - _data_type: ZDType[_BaseDType, _BaseScalar] + _data_type: ZDType[TBaseDType, TBaseScalar] _shape: tuple[int, ...] _shard_shape: tuple[int, ...] | None = None _chunk_shape: tuple[int, ...] | None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 83d9763915..6861111bab 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -130,7 +130,7 @@ from zarr.abc.codec import CodecPipeline from zarr.codecs.sharding import ShardingCodecIndexLocation - from zarr.core.dtype.wrapper import _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar from zarr.core.group import AsyncGroup from zarr.storage import StoreLike @@ -582,7 +582,7 @@ async def _create( *, # v2 and v3 shape: ShapeLike, - dtype: ZDTypeLike | ZDType[_BaseDType, _BaseScalar], + dtype: ZDTypeLike | ZDType[TBaseDType, TBaseScalar], zarr_format: ZarrFormat = 3, fill_value: Any | None = DEFAULT_FILL_VALUE, attributes: dict[str, JSON] | None = None, @@ -703,7 +703,7 @@ async def _create( @staticmethod def _create_metadata_v3( shape: ShapeLike, - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], chunk_shape: ChunkCoords, fill_value: Any | None = DEFAULT_FILL_VALUE, chunk_key_encoding: ChunkKeyEncodingLike | None = None, @@ -754,7 +754,7 @@ async def _create_v3( store_path: StorePath, *, shape: ShapeLike, - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], chunk_shape: ChunkCoords, config: ArrayConfig, fill_value: Any | None = DEFAULT_FILL_VALUE, @@ -802,7 +802,7 @@ 
async def _create_v3( @staticmethod def _create_metadata_v2( shape: ChunkCoords, - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], chunks: ChunkCoords, order: MemoryOrder, dimension_separator: Literal[".", "/"] | None = None, @@ -832,7 +832,7 @@ async def _create_v2( store_path: StorePath, *, shape: ChunkCoords, - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], chunks: ChunkCoords, order: MemoryOrder, config: ArrayConfig, @@ -1079,7 +1079,7 @@ def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec ) @property - def _zdtype(self) -> ZDType[_BaseDType, _BaseScalar]: + def _zdtype(self) -> ZDType[TBaseDType, TBaseScalar]: """ The zarr-specific representation of the array data type """ @@ -1089,7 +1089,7 @@ def _zdtype(self) -> ZDType[_BaseDType, _BaseScalar]: return self.metadata.data_type @property - def dtype(self) -> _BaseDType: + def dtype(self) -> TBaseDType: """Returns the data type of the array. Returns @@ -4655,7 +4655,7 @@ def _parse_chunk_key_encoding( def _get_default_chunk_encoding_v3( - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: """ Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype. @@ -4675,7 +4675,7 @@ def _get_default_chunk_encoding_v3( def _get_default_chunk_encoding_v2( - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], ) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: """ Given a data type, return the default filters for that data type. @@ -4695,7 +4695,7 @@ def _parse_chunk_encoding_v2( *, compressor: CompressorsLike, filters: FiltersLike, - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], ) -> tuple[tuple[numcodecs.abc.Codec, ...] 
| None, numcodecs.abc.Codec | None]: """ Generate chunk encoding classes for Zarr format 2 arrays with optional defaults. @@ -4765,7 +4765,7 @@ def _parse_chunk_encoding_v3( compressors: CompressorsLike, filters: FiltersLike, serializer: SerializerLike, - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: """ Generate chunk encoding classes for v3 arrays with optional defaults. diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index e8e451944f..279bf6edf0 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -17,7 +17,7 @@ from zarr.core.buffer import BufferPrototype from zarr.core.common import ChunkCoords - from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType class ArrayConfigParams(TypedDict): @@ -89,7 +89,7 @@ def parse_array_config(data: ArrayConfigLike | None) -> ArrayConfig: @dataclass(frozen=True) class ArraySpec: shape: ChunkCoords - dtype: ZDType[_BaseDType, _BaseScalar] + dtype: ZDType[TBaseDType, TBaseScalar] fill_value: Any config: ArrayConfig prototype: BufferPrototype @@ -97,7 +97,7 @@ class ArraySpec: def __init__( self, shape: ChunkCoords, - dtype: ZDType[_BaseDType, _BaseScalar], + dtype: ZDType[TBaseDType, TBaseScalar], fill_value: Any, config: ArrayConfig, prototype: BufferPrototype, diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 71600fee90..3d00fe5467 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -27,7 +27,7 @@ from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, BufferPrototype, NDBuffer from zarr.core.chunk_grids import ChunkGrid - from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType T = TypeVar("T") U = 
TypeVar("U") @@ -133,7 +133,7 @@ def __iter__(self) -> Iterator[Codec]: yield from self.bytes_bytes_codecs def validate( - self, *, shape: ChunkCoords, dtype: ZDType[_BaseDType, _BaseScalar], chunk_grid: ChunkGrid + self, *, shape: ChunkCoords, dtype: ZDType[TBaseDType, TBaseScalar], chunk_grid: ChunkGrid ) -> None: for codec in self: codec.validate(shape=shape, dtype=dtype, chunk_grid=chunk_grid) diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index f535f62f35..1a18849a13 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING, TypeAlias, get_args +from zarr.core.dtype.common import DataTypeValidationError from zarr.core.dtype.npy.bool import Bool from zarr.core.dtype.npy.complex import Complex64, Complex128 from zarr.core.dtype.npy.float import Float16, Float32, Float64 @@ -26,11 +27,12 @@ VariableLengthString, ) from zarr.core.dtype.registry import DataTypeRegistry -from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar +from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType __all__ = [ "Complex64", "Complex128", + "DataTypeValidationError", "DateTime64", "FixedLengthAscii", "FixedLengthBytes", @@ -73,14 +75,14 @@ | TimeDelta64 ) -ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[_BaseDType, _BaseScalar] | dict[str, JSON] +ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | dict[str, JSON] for dtype in get_args(DTYPE): data_type_registry.register(dtype._zarr_v3_name, dtype) # TODO: find a better name for this function -def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[_BaseDType, _BaseScalar]: +def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[TBaseDType, TBaseScalar]: """ Get a data type wrapper (an instance of ``ZDType``) from a native data type, e.g. a numpy dtype. 
""" @@ -106,11 +108,11 @@ def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[_BaseDType, def get_data_type_from_json( dtype: JSON, zarr_format: ZarrFormat -) -> ZDType[_BaseDType, _BaseScalar]: +) -> ZDType[TBaseDType, TBaseScalar]: return data_type_registry.match_json(dtype, zarr_format=zarr_format) -def parse_data_type(dtype: ZDTypeLike, zarr_format: ZarrFormat) -> ZDType[_BaseDType, _BaseScalar]: +def parse_data_type(dtype: ZDTypeLike, zarr_format: ZarrFormat) -> ZDType[TBaseDType, TBaseScalar]: """ Interpret the input as a ZDType instance. """ diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 657f56bfb7..4249c57b1f 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -4,7 +4,8 @@ from typing import Literal Endianness = Literal["little", "big"] -JSONFloat = float | Literal["NaN", "Infinity", "-Infinity"] +SpecialFloats = Literal["NaN", "Infinity", "-Infinity"] +JSONFloat = float | SpecialFloats class DataTypeValidationError(ValueError): ... 
diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index 293d8383c0..776acf4f8c 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -5,7 +5,7 @@ from zarr.core.common import JSON, ZarrFormat from zarr.core.dtype.npy.common import check_json_bool -from zarr.core.dtype.wrapper import ZDType, _BaseDType +from zarr.core.dtype.wrapper import TBaseDType, ZDType @dataclass(frozen=True, kw_only=True, slots=True) @@ -26,7 +26,7 @@ class Bool(ZDType[np.dtypes.BoolDType, np.bool_]): dtype_cls = np.dtypes.BoolDType @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls() def to_dtype(self: Self) -> np.dtypes.BoolDType: diff --git a/src/zarr/core/dtype/npy/common.py b/src/zarr/core/dtype/npy/common.py index 857c515c19..8ef1286e6f 100644 --- a/src/zarr/core/dtype/npy/common.py +++ b/src/zarr/core/dtype/npy/common.py @@ -77,7 +77,7 @@ def endianness_from_numpy_str(endianness: EndiannessNumpy) -> Endianness | None: # for dtypes without byte ordering semantics return None raise ValueError( - f"Invalid endianness: {endianness}. Expected one of {get_args(EndiannessNumpy)}" + f"Invalid endianness: {endianness!r}. Expected one of {get_args(EndiannessNumpy)}" ) @@ -108,7 +108,7 @@ def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: case None: return "|" raise ValueError( - f"Invalid endianness: {endianness}. Expected one of {get_args(Endianness)} or None" + f"Invalid endianness: {endianness!r}. Expected one of {get_args(Endianness)} or None" ) @@ -155,7 +155,7 @@ def float_from_json_v3(data: JSONFloat) -> float: return float_from_json_v2(data) -def float_from_json(data: JSONFloat, zarr_format: ZarrFormat) -> float: +def float_from_json(data: JSONFloat, *, zarr_format: ZarrFormat) -> float: """ Convert a JSON float to a float based on zarr format. 
@@ -177,7 +177,7 @@ def float_from_json(data: JSONFloat, zarr_format: ZarrFormat) -> float: return float_from_json_v3(data) -def bytes_from_json(data: str, zarr_format: ZarrFormat) -> bytes: +def bytes_from_json(data: str, *, zarr_format: ZarrFormat) -> bytes: """ Convert a JSON string to bytes @@ -198,7 +198,7 @@ def bytes_from_json(data: str, zarr_format: ZarrFormat) -> bytes: # TODO: differentiate these as needed. This is a spec question. if zarr_format == 3: return base64.b64decode(data.encode("ascii")) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") + raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") # pragma: no cover def bytes_to_json(data: bytes, zarr_format: ZarrFormat) -> str: @@ -261,9 +261,11 @@ def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloat: return float_to_json_v2(data) -def complex_to_json_v3(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: +def complex_float_to_json_v3( + data: complex | np.complexfloating[Any, Any], +) -> tuple[JSONFloat, JSONFloat]: """ - Convert a complex number to JSON (v3). + Convert a complex number to JSON as defined by the Zarr V3 spec. Parameters ---------- @@ -278,13 +280,15 @@ def complex_to_json_v3(data: complex | np.complexfloating[Any, Any]) -> tuple[JS return float_to_json_v3(data.real), float_to_json_v3(data.imag) -def complex_to_json_v2(data: complex | np.complexfloating[Any, Any]) -> tuple[JSONFloat, JSONFloat]: +def complex_float_to_json_v2( + data: complex | np.complexfloating[Any, Any], +) -> tuple[JSONFloat, JSONFloat]: """ - Convert a complex number to JSON (v2). + Convert a complex number to JSON as defined by the Zarr V2 spec. Parameters ---------- - data : complex or np.complexfloating + data : complex | np.complexfloating The complex value to convert. 
Returns @@ -296,14 +300,14 @@ def complex_to_json_v2(data: complex | np.complexfloating[Any, Any]) -> tuple[JS def complex_float_to_json( - data: complex | np.complexfloating[Any, Any], zarr_format: ZarrFormat + data: complex | np.complexfloating[Any, Any], *, zarr_format: ZarrFormat ) -> tuple[JSONFloat, JSONFloat]: """ Convert a complex number to JSON, parametrized by the zarr format version. Parameters ---------- - data : complex or np.complexfloating + data : complex | np.complexfloating The complex value to convert. zarr_format : ZarrFormat The zarr format version. @@ -314,19 +318,19 @@ def complex_float_to_json( The JSON representation of the complex number. """ if zarr_format == 2: - return complex_to_json_v2(data) + return complex_float_to_json_v2(data) else: - return complex_to_json_v3(data) + return complex_float_to_json_v3(data) raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") -def float_to_json(data: float | np.floating[Any], zarr_format: ZarrFormat) -> JSONFloat: +def float_to_json(data: float | np.floating[Any], *, zarr_format: ZarrFormat) -> JSONFloat: """ Convert a float to JSON, parametrized by the zarr format version. Parameters ---------- - data : float or np.floating + data : float | np.floating The float value to convert. zarr_format : ZarrFormat The zarr format version. 
diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index 22e1bd66a3..6e19266660 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -21,7 +21,7 @@ endianness_from_numpy_str, endianness_to_numpy_str, ) -from zarr.core.dtype.wrapper import ZDType, _BaseDType +from zarr.core.dtype.wrapper import TBaseDType, ZDType if TYPE_CHECKING: from zarr.core.dtype.npy.common import EndiannessNumpy @@ -33,7 +33,7 @@ class BaseComplex(ZDType[TComplexDType_co, TComplexScalar_co], HasEndianness): _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index 3f56919cf4..15baaaadaa 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -16,7 +16,7 @@ float_from_json, float_to_json, ) -from zarr.core.dtype.wrapper import ZDType, _BaseDType +from zarr.core.dtype.wrapper import TBaseDType, ZDType @dataclass(frozen=True) @@ -25,7 +25,7 @@ class BaseFloat(ZDType[TFloatDType_co, TFloatScalar_co], HasEndianness): _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index 500f98bb73..7da7245162 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -11,7 +11,7 @@ endianness_from_numpy_str, endianness_to_numpy_str, ) -from zarr.core.dtype.wrapper import ZDType, _BaseDType +from zarr.core.dtype.wrapper import TBaseDType, ZDType _NumpyIntDType = ( 
np.dtypes.Int8DType @@ -132,7 +132,7 @@ class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|i1",) @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls() def to_dtype(self: Self) -> np.dtypes.Int8DType: @@ -150,7 +150,7 @@ class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|u1",) @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls() def to_dtype(self: Self) -> np.dtypes.UInt8DType: @@ -168,7 +168,7 @@ class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i2", " Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -193,7 +193,7 @@ class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u2", " Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -217,7 +217,7 @@ class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i4", " Self: + def from_dtype(cls: type[Self], dtype: TBaseDType) -> Self: # We override the base implementation to address a windows-specific, pre-numpy 2 issue where # ``np.dtype('i')`` is an instance of ``np.dtypes.IntDType`` that acts like `int32` instead of ``np.dtype('int32')`` # In this case, ``type(np.dtype('i')) == np.dtypes.Int32DType`` will evaluate to ``True``, @@ -229,7 +229,7 @@ def from_dtype(cls: type[Self], dtype: _BaseDType) -> Self: return super().from_dtype(dtype) @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + 
def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -253,7 +253,7 @@ class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u4", " Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -277,7 +277,7 @@ class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i8", " Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) @@ -301,7 +301,7 @@ class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u8", " Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) diff --git a/src/zarr/core/dtype/npy/sized.py b/src/zarr/core/dtype/npy/sized.py index 8d8ff57800..d9524a4891 100644 --- a/src/zarr/core/dtype/npy/sized.py +++ b/src/zarr/core/dtype/npy/sized.py @@ -16,7 +16,7 @@ endianness_from_numpy_str, endianness_to_numpy_str, ) -from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar +from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType @dataclass(frozen=True, kw_only=True) @@ -26,7 +26,7 @@ class FixedLengthAscii(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength): item_size_bits: ClassVar[int] = 8 @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) def to_dtype(self) -> np.dtypes.BytesDType[int]: @@ -98,7 +98,7 @@ class 
FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength): item_size_bits: ClassVar[int] = 8 @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) def to_dtype(self) -> np.dtypes.VoidDType[int]: @@ -136,7 +136,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[Any]]: + def check_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[Any]]: """ Numpy void dtype comes in two forms: * If the ``fields`` attribute is ``None``, then the dtype represents N raw bytes. @@ -181,7 +181,7 @@ class FixedLengthUnicode(ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness item_size_bits: ClassVar[int] = 32 # UCS4 is 32 bits per code point @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls( length=dtype.itemsize // (cls.item_size_bits // 8), @@ -252,7 +252,7 @@ def _cast_value_unsafe(self, value: object) -> np.str_: class Structured(ZDType[np.dtypes.VoidDType[int], np.void]): dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] _zarr_v3_name = "structured" - fields: tuple[tuple[str, ZDType[_BaseDType, _BaseScalar]], ...] + fields: tuple[tuple[str, ZDType[TBaseDType, TBaseScalar]], ...] 
def default_value(self) -> np.void: return self._cast_value_unsafe(0) @@ -261,7 +261,7 @@ def _cast_value_unsafe(self, value: object) -> np.void: return cast("np.void", np.array([value], dtype=self.to_dtype())[0]) @classmethod - def check_dtype(cls, dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: + def check_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: """ Check that this dtype is a numpy structured dtype @@ -278,10 +278,10 @@ def check_dtype(cls, dtype: _BaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: return super().check_dtype(dtype) and dtype.fields is not None @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: from zarr.core.dtype import get_data_type_from_native_dtype - fields: list[tuple[str, ZDType[_BaseDType, _BaseScalar]]] = [] + fields: list[tuple[str, ZDType[TBaseDType, TBaseScalar]]] = [] if dtype.fields is None: raise ValueError("numpy dtype has no fields") diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index 15ccfb30f1..3849fd05ce 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -10,7 +10,7 @@ if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat - from zarr.core.dtype.wrapper import _BaseDType + from zarr.core.dtype.wrapper import TBaseDType _NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") @@ -23,7 +23,7 @@ class VariableLengthString(ZDType[np.dtypes.StringDType, str]): # type: ignore[ _zarr_v3_name = "numpy.variable_length_utf8" @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls() def to_dtype(self) -> np.dtypes.StringDType: @@ -83,7 +83,7 @@ class VariableLengthString(ZDType[np.dtypes.ObjectDType, str]): # type: ignore[ _zarr_v3_name = "numpy.variable_length_utf8" @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def 
_from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls() def to_dtype(self) -> np.dtypes.ObjectDType: diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 056836a105..f691bd88c8 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -25,7 +25,7 @@ endianness_from_numpy_str, endianness_to_numpy_str, ) -from zarr.core.dtype.wrapper import ZDType, _BaseDType +from zarr.core.dtype.wrapper import TBaseDType, ZDType if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat @@ -109,7 +109,7 @@ class TimeDTypeBase(ZDType[_BaseTimeDType_co, _BaseTimeScalar], HasEndianness): unit: DateTimeUnit @classmethod - def _from_dtype_unsafe(cls, dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: unit, interval = np.datetime_data(dtype.name) byteorder = cast("EndiannessNumpy", dtype.byteorder) return cls(unit=unit, interval=interval, endianness=endianness_from_numpy_str(byteorder)) # type: ignore[arg-type] diff --git a/src/zarr/core/dtype/registry.py b/src/zarr/core/dtype/registry.py index 4ad2158f96..ae5c3d426e 100644 --- a/src/zarr/core/dtype/registry.py +++ b/src/zarr/core/dtype/registry.py @@ -9,7 +9,7 @@ from importlib.metadata import EntryPoint from zarr.core.common import JSON, ZarrFormat - from zarr.core.dtype.wrapper import ZDType, _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType # This class is different from the other registry classes, which inherit from @@ -17,7 +17,7 @@ # have just 1 registry class in use. 
@dataclass(frozen=True, kw_only=True) class DataTypeRegistry: - contents: dict[str, type[ZDType[_BaseDType, _BaseScalar]]] = field( + contents: dict[str, type[ZDType[TBaseDType, TBaseScalar]]] = field( default_factory=dict, init=False ) lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) @@ -28,15 +28,15 @@ def lazy_load(self) -> None: self.lazy_load_list.clear() - def register(self: Self, key: str, cls: type[ZDType[_BaseDType, _BaseScalar]]) -> None: + def register(self: Self, key: str, cls: type[ZDType[TBaseDType, TBaseScalar]]) -> None: # don't register the same dtype twice if key not in self.contents or self.contents[key] != cls: self.contents[key] = cls - def get(self, key: str) -> type[ZDType[_BaseDType, _BaseScalar]]: + def get(self, key: str) -> type[ZDType[TBaseDType, TBaseScalar]]: return self.contents[key] - def match_dtype(self, dtype: _BaseDType) -> ZDType[_BaseDType, _BaseScalar]: + def match_dtype(self, dtype: TBaseDType) -> ZDType[TBaseDType, TBaseScalar]: self.lazy_load() for val in self.contents.values(): try: @@ -45,7 +45,7 @@ def match_dtype(self, dtype: _BaseDType) -> ZDType[_BaseDType, _BaseScalar]: pass raise ValueError(f"No data type wrapper found that matches dtype '{dtype}'") - def match_json(self, data: JSON, zarr_format: ZarrFormat) -> ZDType[_BaseDType, _BaseScalar]: + def match_json(self, data: JSON, zarr_format: ZarrFormat) -> ZDType[TBaseDType, TBaseScalar]: self.lazy_load() for val in self.contents.values(): try: diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index ba1b78f096..be51db3ae5 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -35,15 +35,15 @@ # This the upper bound for the scalar types we support. It's numpy scalars + str, # because the new variable-length string dtype in numpy does not have a corresponding scalar type -_BaseScalar = np.generic | str +TBaseScalar = np.generic | str # This is the bound for the dtypes that we support. 
If we support non-numpy dtypes, # then this bound will need to be widened. -_BaseDType = np.dtype[np.generic] +TBaseDType = np.dtype[np.generic] # These two type parameters are covariant because we want # x : ZDType[BaseDType, BaseScalar] = ZDType[SubDType, SubScalar] # to type check -TScalar_co = TypeVar("TScalar_co", bound=_BaseScalar, covariant=True) -TDType_co = TypeVar("TDType_co", bound=_BaseDType, covariant=True) +TScalar_co = TypeVar("TScalar_co", bound=TBaseScalar, covariant=True) +TDType_co = TypeVar("TDType_co", bound=TBaseDType, covariant=True) @dataclass(frozen=True, kw_only=True, slots=True) @@ -69,7 +69,7 @@ class ZDType(Generic[TDType_co, TScalar_co], ABC): _zarr_v3_name: ClassVar[str] @classmethod - def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[TDType_co]: + def check_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[TDType_co]: """ Check that a data type matches the dtype_cls class attribute. Used as a type guard. @@ -86,7 +86,7 @@ def check_dtype(cls: type[Self], dtype: _BaseDType) -> TypeGuard[TDType_co]: return type(dtype) is cls.dtype_cls @classmethod - def from_dtype(cls: type[Self], dtype: _BaseDType) -> Self: + def from_dtype(cls: type[Self], dtype: TBaseDType) -> Self: """ Wrap a dtype object. @@ -113,7 +113,7 @@ def from_dtype(cls: type[Self], dtype: _BaseDType) -> Self: @classmethod @abstractmethod - def _from_dtype_unsafe(cls: type[Self], dtype: _BaseDType) -> Self: + def _from_dtype_unsafe(cls: type[Self], dtype: TBaseDType) -> Self: """ Wrap a native dtype without checking. 
diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index cd252fa181..440f238ac0 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -8,7 +8,7 @@ from zarr.abc.metadata import Metadata from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.core.dtype.wrapper import TDType_co, TScalar_co, ZDType, _BaseDType, _BaseScalar +from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, TDType_co, TScalar_co, ZDType if TYPE_CHECKING: from typing import Literal, Self @@ -61,7 +61,7 @@ class ArrayV2MetadataDict(TypedDict): class ArrayV2Metadata(Metadata): shape: ChunkCoords chunks: ChunkCoords - dtype: ZDType[_BaseDType, _BaseScalar] + dtype: ZDType[TBaseDType, TBaseScalar] fill_value: int | float | str | bytes | None = 0 order: MemoryOrder = "C" filters: tuple[numcodecs.abc.Codec, ...] | None = None diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index bd8e8193cc..07856a3c7c 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -16,7 +16,7 @@ from zarr.core.buffer import Buffer, BufferPrototype from zarr.core.chunk_grids import ChunkGrid from zarr.core.common import JSON, ChunkCoords - from zarr.core.dtype.wrapper import _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar import json @@ -83,7 +83,7 @@ def validate_array_bytes_codec(codecs: tuple[Codec, ...]) -> ArrayBytesCodec: return abcs[0] -def validate_codecs(codecs: tuple[Codec, ...], dtype: ZDType[_BaseDType, _BaseScalar]) -> None: +def validate_codecs(codecs: tuple[Codec, ...], dtype: ZDType[TBaseDType, TBaseScalar]) -> None: """Check that the codecs are valid for the given dtype""" from zarr.codecs.sharding import ShardingCodec @@ -142,7 +142,7 @@ class ArrayV3MetadataDict(TypedDict): @dataclass(frozen=True, kw_only=True) class ArrayV3Metadata(Metadata): shape: ChunkCoords - data_type: ZDType[_BaseDType, _BaseScalar] + data_type: ZDType[TBaseDType, TBaseScalar] 
chunk_grid: ChunkGrid chunk_key_encoding: ChunkKeyEncoding fill_value: Any @@ -157,7 +157,7 @@ def __init__( self, *, shape: Iterable[int], - data_type: ZDType[_BaseDType, _BaseScalar], + data_type: ZDType[TBaseDType, TBaseScalar], chunk_grid: dict[str, JSON] | ChunkGrid, chunk_key_encoding: ChunkKeyEncodingLike, fill_value: object, @@ -223,7 +223,7 @@ def ndim(self) -> int: return len(self.shape) @property - def dtype(self) -> ZDType[_BaseDType, _BaseScalar]: + def dtype(self) -> ZDType[TBaseDType, TBaseScalar]: return self.data_type @property diff --git a/tests/package_with_entrypoint/__init__.py b/tests/package_with_entrypoint/__init__.py index 84df5f38d8..ef605be41a 100644 --- a/tests/package_with_entrypoint/__init__.py +++ b/tests/package_with_entrypoint/__init__.py @@ -10,7 +10,7 @@ from zarr.abc.codec import ArrayBytesCodec, CodecInput, CodecPipeline from zarr.codecs import BytesCodec from zarr.core.buffer import Buffer, NDBuffer -from zarr.core.common import BytesLike +from zarr.core.common import BytesLike, ZarrFormat from zarr.core.dtype.npy.bool import Bool @@ -83,5 +83,5 @@ def from_json(cls, data: Any, zarr_format: Literal[2, 3]) -> Self: return cls() raise ValueError - def to_json(self, zarr_format): + def to_json(self, zarr_format: ZarrFormat) -> str: return self._zarr_v3_name diff --git a/tests/test_dtype.py b/tests/test_dtype/test_dtype.py similarity index 58% rename from tests/test_dtype.py rename to tests/test_dtype/test_dtype.py index 2b520383b1..566a04b5fb 100644 --- a/tests/test_dtype.py +++ b/tests/test_dtype/test_dtype.py @@ -1,48 +1,42 @@ from __future__ import annotations -import os -import re -import sys from typing import TYPE_CHECKING, Any, get_args -import zarr -from zarr.core.config import config -from zarr.core.dtype.npy.bool import Bool -from zarr.core.dtype.npy.complex import Complex64, Complex128 -from zarr.core.dtype.npy.float import Float16, Float32, Float64 -from zarr.core.dtype.npy.int import Int8, Int16, Int32, Int64, UInt8, 
UInt16, UInt32, UInt64 -from zarr.core.dtype.npy.sized import FixedLengthAscii, FixedLengthBytes, FixedLengthUnicode -from zarr.core.dtype.npy.time import DateTime64 +from zarr.core.dtype import ( + DTYPE, + Bool, + Complex64, + Complex128, + DateTime64, + FixedLengthAscii, + FixedLengthBytes, + FixedLengthUnicode, + Float16, + Float32, + Float64, + Int8, + Int16, + Int32, + Int64, + Structured, + UInt8, + UInt16, + UInt32, + UInt64, + VariableLengthString, + ZDType, +) from .conftest import zdtype_examples if TYPE_CHECKING: - from collections.abc import Generator - from zarr.core.common import ZarrFormat - from zarr.core.dtype.wrapper import _BaseDType, _BaseScalar + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar import numpy as np import pytest -from zarr.core.dtype import ( - DTYPE, - VariableLengthString, - ZDType, - data_type_registry, - get_data_type_from_json, -) from zarr.core.dtype.common import DataTypeValidationError -from zarr.core.dtype.npy.sized import ( - Structured, -) -from zarr.core.dtype.registry import DataTypeRegistry - - -@pytest.fixture -def data_type_registry_fixture() -> DataTypeRegistry: - return DataTypeRegistry() - _NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") VLEN_STRING_DTYPE: np.dtypes.StringDType | np.dtypes.ObjectDType @@ -177,7 +171,7 @@ def test_default_value(wrapper: ZDType[Any, Any], expected_default: Any) -> None ], ) def test_to_json_value_v2( - wrapper: ZDType[_BaseDType, _BaseScalar], input_value: Any, expected_json: Any + wrapper: ZDType[TBaseDType, TBaseScalar], input_value: Any, expected_json: Any ) -> None: """ Test the to_json_value method for each dtype wrapper for zarr v2 @@ -213,7 +207,7 @@ def test_to_json_value_v2( ], ) def test_to_json_value_v3( - wrapper: ZDType[_BaseDType, _BaseScalar], input_value: Any, expected_json: Any + wrapper: ZDType[TBaseDType, TBaseScalar], input_value: Any, expected_json: Any ) -> None: """ Test the to_json_value method for each dtype wrapper for zarr v3 
@@ -246,132 +240,9 @@ def test_to_json_value_v3( ], ) def test_from_json_value( - wrapper: ZDType[_BaseDType, _BaseScalar], json_value: Any, expected_value: Any + wrapper: ZDType[TBaseDType, TBaseScalar], json_value: Any, expected_value: Any ) -> None: """ Test the from_json_value method for each dtype wrapper. """ assert wrapper.from_json_value(json_value, zarr_format=2) == expected_value - - -class TestRegistry: - @staticmethod - def test_register(data_type_registry_fixture: DataTypeRegistry) -> None: - """ - Test that registering a dtype in a data type registry works. - """ - data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) - assert data_type_registry_fixture.get(Bool._zarr_v3_name) == Bool - assert isinstance(data_type_registry_fixture.match_dtype(np.dtype("bool")), Bool) - - @staticmethod - def test_override(data_type_registry_fixture: DataTypeRegistry) -> None: - """ - Test that registering a new dtype with the same name works (overriding the previous one). - """ - data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) - - class NewBool(Bool): - def default_value(self) -> np.bool_: - return np.True_ - - data_type_registry_fixture.register(NewBool._zarr_v3_name, NewBool) - assert isinstance(data_type_registry_fixture.match_dtype(np.dtype("bool")), NewBool) - - @staticmethod - @pytest.mark.parametrize( - ("wrapper_cls", "dtype_str"), [(Bool, "bool"), (FixedLengthUnicode, "|U4")] - ) - def test_match_dtype( - data_type_registry_fixture: DataTypeRegistry, - wrapper_cls: type[ZDType[_BaseDType, _BaseScalar]], - dtype_str: str, - ) -> None: - """ - Test that match_dtype resolves a numpy dtype into an instance of the correspond wrapper for that dtype. 
- """ - data_type_registry_fixture.register(wrapper_cls._zarr_v3_name, wrapper_cls) - assert isinstance(data_type_registry_fixture.match_dtype(np.dtype(dtype_str)), wrapper_cls) - - @staticmethod - def test_unregistered_dtype(data_type_registry_fixture: DataTypeRegistry) -> None: - """ - Test that match_dtype raises an error if the dtype is not registered. - """ - outside_dtype = "int8" - with pytest.raises( - ValueError, match=f"No data type wrapper found that matches dtype '{outside_dtype}'" - ): - data_type_registry_fixture.match_dtype(np.dtype(outside_dtype)) - - with pytest.raises(KeyError): - data_type_registry_fixture.get(outside_dtype) - - @staticmethod - @pytest.mark.parametrize("zdtype", zdtype_examples) - def test_registered_dtypes( - zdtype: ZDType[_BaseDType, _BaseScalar], zarr_format: ZarrFormat - ) -> None: - """ - Test that the registered dtypes can be retrieved from the registry. - """ - - assert data_type_registry.match_dtype(zdtype.to_dtype()) == zdtype - assert ( - data_type_registry.match_json( - zdtype.to_json(zarr_format=zarr_format), zarr_format=zarr_format - ) - == zdtype - ) - - @staticmethod - @pytest.mark.parametrize("zdtype", zdtype_examples) - def test_match_dtype_unique( - zdtype: ZDType[Any, Any], - data_type_registry_fixture: DataTypeRegistry, - zarr_format: ZarrFormat, - ) -> None: - """ - Test that the match_dtype method uniquely specifies a registered data type. 
We create a local registry - that excludes the data type class being tested, and ensure that an instance of the wrapped data type - fails to match anything in the registry - """ - for _cls in get_args(DTYPE): - if _cls is not type(zdtype): - data_type_registry_fixture.register(_cls._zarr_v3_name, _cls) - - dtype_instance = zdtype.to_dtype() - - msg = f"No data type wrapper found that matches dtype '{dtype_instance}'" - with pytest.raises(ValueError, match=re.escape(msg)): - data_type_registry_fixture.match_dtype(dtype_instance) - - instance_dict = zdtype.to_json(zarr_format=zarr_format) - msg = f"No data type wrapper found that matches {instance_dict}" - with pytest.raises(ValueError, match=re.escape(msg)): - data_type_registry_fixture.match_json(instance_dict, zarr_format=zarr_format) - - -# this is copied from the registry tests -- we should deduplicate -here = os.path.abspath(os.path.dirname(__file__)) - - -@pytest.fixture -def set_path() -> Generator[None, None, None]: - sys.path.append(here) - zarr.registry._collect_entrypoints() - yield - sys.path.remove(here) - registries = zarr.registry._collect_entrypoints() - for registry in registries: - registry.lazy_load_list.clear() - config.reset() - - -@pytest.mark.usefixtures("set_path") -def test_entrypoint_codec(zarr_format: ZarrFormat) -> None: - from package_with_entrypoint import TestDataType - - instance = TestDataType() - dtype_json = instance.to_json(zarr_format=zarr_format) - assert get_data_type_from_json(dtype_json, zarr_format=zarr_format) == instance diff --git a/tests/test_dtype/test_npy/test_common.py b/tests/test_dtype/test_npy/test_common.py index d39d308112..f3082d0c3b 100644 --- a/tests/test_dtype/test_npy/test_common.py +++ b/tests/test_dtype/test_npy/test_common.py @@ -9,22 +9,21 @@ import numpy as np import pytest -from zarr.core.dtype.common import ENDIANNESS_STR, JSONFloatV2, SpecialFloatStrings +from zarr.core.dtype.common import Endianness, JSONFloat, SpecialFloats from 
zarr.core.dtype.npy.common import ( - NumpyEndiannessStr, + EndiannessNumpy, bytes_from_json, bytes_to_json, - check_json_bool, - check_json_complex_float_v2, - check_json_complex_float_v3, + check_json_float, check_json_float_v2, check_json_float_v3, check_json_int, - check_json_str, + complex_float_to_json, complex_float_to_json_v2, complex_float_to_json_v3, endianness_from_numpy_str, endianness_to_numpy_str, + float_from_json, float_from_json_v2, float_from_json_v3, float_to_json_v2, @@ -32,7 +31,7 @@ ) if TYPE_CHECKING: - from zarr.core.common import JSON, ZarrFormat + from zarr.core.common import ZarrFormat def nan_equal(a: object, b: object) -> bool: @@ -45,7 +44,7 @@ def nan_equal(a: object, b: object) -> bool: return a == b -json_float_v2_roundtrip_cases: tuple[tuple[JSONFloatV2, float | np.floating[Any]], ...] = ( +json_float_v2: list[tuple[JSONFloat, float | np.floating[Any]]] = [ ("Infinity", float("inf")), ("Infinity", np.inf), ("-Infinity", float("-inf")), @@ -53,9 +52,19 @@ def nan_equal(a: object, b: object) -> bool: ("NaN", float("nan")), ("NaN", np.nan), (1.0, 1.0), -) +] -json_float_v3_cases = json_float_v2_roundtrip_cases +# exactly the same as v2, for now, until we get support for the special NaN encoding defined in the +# v3 spec +json_float_v3: list[tuple[JSONFloat, float | np.floating[Any]]] = [ + ("Infinity", float("inf")), + ("Infinity", np.inf), + ("-Infinity", float("-inf")), + ("-Infinity", -np.inf), + ("NaN", float("nan")), + ("NaN", np.nan), + (1.0, 1.0), +] @pytest.mark.parametrize( @@ -67,10 +76,10 @@ def test_endianness_from_numpy_str(data: str, expected: str | None) -> None: Test that endianness_from_numpy_str correctly converts a numpy str literal to a human-readable literal value. 
This test also checks that an invalid string input raises a ``ValueError`` """ - if data in get_args(NumpyEndiannessStr): + if data in get_args(EndiannessNumpy): assert endianness_from_numpy_str(data) == expected # type: ignore[arg-type] else: - msg = f"Invalid endianness: {data!r}. Expected one of {get_args(NumpyEndiannessStr)}" + msg = f"Invalid endianness: {data!r}. Expected one of {get_args(EndiannessNumpy)}" with pytest.raises(ValueError, match=re.escape(msg)): endianness_from_numpy_str(data) # type: ignore[arg-type] @@ -84,23 +93,21 @@ def test_endianness_to_numpy_str(data: str | None, expected: str) -> None: Test that endianness_to_numpy_str correctly converts a human-readable literal value to a numpy str literal. This test also checks that an invalid string input raises a ``ValueError`` """ - if data in ENDIANNESS_STR: + if data in get_args(Endianness) + (None,): assert endianness_to_numpy_str(data) == expected # type: ignore[arg-type] else: - msg = f"Invalid endianness: {data!r}. Expected one of {ENDIANNESS_STR}" + msg = f"Invalid endianness: {data!r}. Expected one of {get_args(Endianness)}" with pytest.raises(ValueError, match=re.escape(msg)): endianness_to_numpy_str(data) # type: ignore[arg-type] -@pytest.mark.parametrize( - ("data", "expected"), json_float_v2_roundtrip_cases + (("SHOULD_ERR", ""),) -) -def test_float_from_json_v2(data: JSONFloatV2 | str, expected: float | str) -> None: +@pytest.mark.parametrize(("data", "expected"), json_float_v2 + [("SHOULD_ERR", "")]) +def test_float_from_json_v2(data: JSONFloat | str, expected: float | str) -> None: """ Test that float_from_json_v2 correctly converts a JSON string representation of a float to a float. 
This test also checks that an invalid string input raises a ``ValueError`` """ - if data != "SHOULD_ERR": + if data in get_args(SpecialFloats) or isinstance(data, float): assert nan_equal(float_from_json_v2(data), expected) # type: ignore[arg-type] else: msg = f"could not convert string to float: {data!r}" @@ -108,36 +115,37 @@ def test_float_from_json_v2(data: JSONFloatV2 | str, expected: float | str) -> N float_from_json_v2(data) # type: ignore[arg-type] -@pytest.mark.parametrize( - ("data", "expected"), json_float_v3_cases + (("SHOULD_ERR", ""), ("0x", "")) -) -def test_float_from_json_v3(data: JSONFloatV2 | str, expected: float | str) -> None: +@pytest.mark.parametrize(("data", "expected"), json_float_v3 + [("SHOULD_ERR", "")]) +def test_float_from_json_v3(data: JSONFloat | str, expected: float | str) -> None: """ Test that float_from_json_v3 correctly converts a JSON string representation of a float to a float. This test also checks that an invalid string input raises a ``ValueError`` """ - if data == "SHOULD_ERR": - msg = ( - f"Invalid float value: {data!r}. Expected a string starting with the hex prefix" - " '0x', or one of 'NaN', 'Infinity', or '-Infinity'." - ) + if data in get_args(SpecialFloats) or isinstance(data, float): + assert nan_equal(float_from_json_v3(data), expected) # type: ignore[arg-type] + else: + msg = f"could not convert string to float: {data!r}" with pytest.raises(ValueError, match=msg): - float_from_json_v3(data) - elif data == "0x": - msg = ( - f"Invalid hexadecimal float value: {data!r}. 
" - "Expected the '0x' prefix to be followed by 4, 8, or 16 numeral characters" - ) + float_from_json_v3(data) # type: ignore[arg-type] - with pytest.raises(ValueError, match=msg): - float_from_json_v3(data) + +@pytest.mark.parametrize(("data", "expected"), json_float_v2) +def test_float_from_json(data: JSONFloat, expected: float | str, zarr_format: ZarrFormat) -> None: + """ + Test that float_from_json_v3 correctly converts a JSON string representation of a float to a float. + This test also checks that an invalid string input raises a ``ValueError`` + """ + observed = float_from_json(data, zarr_format=zarr_format) + if zarr_format == 2: + expected = float_from_json_v2(data) else: - assert nan_equal(float_from_json_v3(data), expected) + expected = float_from_json_v3(data) + assert nan_equal(observed, expected) # note the order of parameters relative to the order of the parametrized variable. -@pytest.mark.parametrize(("expected", "data"), json_float_v2_roundtrip_cases) -def test_float_to_json_v2(data: float | np.floating[Any], expected: JSONFloatV2) -> None: +@pytest.mark.parametrize(("expected", "data"), json_float_v2) +def test_float_to_json_v2(data: float | np.floating[Any], expected: JSONFloat) -> None: """ Test that floats are JSON-encoded properly for zarr v2 """ @@ -146,8 +154,8 @@ def test_float_to_json_v2(data: float | np.floating[Any], expected: JSONFloatV2) # note the order of parameters relative to the order of the parametrized variable. 
-@pytest.mark.parametrize(("expected", "data"), json_float_v3_cases) -def test_float_to_json_v3(data: float | np.floating[Any], expected: JSONFloatV2) -> None: +@pytest.mark.parametrize(("expected", "data"), json_float_v3) +def test_float_to_json_v3(data: float | np.floating[Any], expected: JSONFloat) -> None: """ Test that floats are JSON-encoded properly for zarr v3 """ @@ -180,10 +188,8 @@ def test_bytes_to_json(zarr_format: ZarrFormat) -> None: # note the order of parameters relative to the order of the parametrized variable. -@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v2_roundtrip_cases) -def test_complex_to_json_v2( - float_data: float | np.floating[Any], json_expected: JSONFloatV2 -) -> None: +@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v2) +def test_complex_to_json_v2(float_data: float | np.floating[Any], json_expected: JSONFloat) -> None: """ Test that complex numbers are correctly converted to JSON in v2 format. @@ -198,10 +204,8 @@ def test_complex_to_json_v2( # note the order of parameters relative to the order of the parametrized variable. -@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v3_cases) -def test_complex_to_json_v3( - float_data: float | np.floating[Any], json_expected: JSONFloatV2 -) -> None: +@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v3) +def test_complex_to_json_v3(float_data: float | np.floating[Any], json_expected: JSONFloat) -> None: """ Test that complex numbers are correctly converted to JSON in v3 format. 
@@ -215,9 +219,9 @@ def test_complex_to_json_v3( assert complex_float_to_json_v3(cplx_npy) == (json_expected, json_expected) -@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v3_cases) +@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v3) def test_complex_float_to_json( - float_data: float | np.floating[Any], json_expected: JSONFloatV2, zarr_format: ZarrFormat + float_data: float | np.floating[Any], json_expected: JSONFloat, zarr_format: ZarrFormat ) -> None: """ Test that complex numbers are correctly converted to JSON in v2 or v3 formats, depending @@ -230,27 +234,18 @@ def test_complex_float_to_json( cplx = complex(float_data, float_data) cplx_npy = np.complex128(cplx) - if zarr_format == 2: - assert complex_float_to_json_v2(cplx) == (json_expected, json_expected) - assert complex_float_to_json_v2(cplx_npy) == ( - json_expected, - json_expected, - ) - elif zarr_format == 3: - assert complex_float_to_json_v3(cplx) == (json_expected, json_expected) - assert complex_float_to_json_v3(cplx_npy) == ( - json_expected, - json_expected, - ) - else: - raise ValueError("zarr_format must be 2 or 3") # pragma: no cover + assert complex_float_to_json(cplx, zarr_format=zarr_format) == (json_expected, json_expected) + assert complex_float_to_json(cplx_npy, zarr_format=zarr_format) == ( + json_expected, + json_expected, + ) -check_json_float_cases = get_args(SpecialFloatStrings) + (1.0, 2) +check_json_float_cases = get_args(SpecialFloats) + (1.0, 2) @pytest.mark.parametrize("data", check_json_float_cases) -def test_check_json_float_v2_valid(data: JSONFloatV2 | int) -> None: +def test_check_json_float_v2_valid(data: JSONFloat | int) -> None: assert check_json_float_v2(data) @@ -259,7 +254,7 @@ def test_check_json_float_v2_invalid() -> None: @pytest.mark.parametrize("data", check_json_float_cases) -def test_check_json_float_v3_valid(data: JSONFloatV2 | int) -> None: +def test_check_json_float_v3_valid(data: JSONFloat | int) -> None: assert 
check_json_float_v3(data) @@ -267,76 +262,16 @@ def test_check_json_float_v3_invalid() -> None: assert not check_json_float_v3("invalid") -check_json_complex_float_true_cases: tuple[list[JSONFloatV2], ...] = ( - [0.0, 1.0], - [0.0, 1.0], - [-1.0, "NaN"], - ["Infinity", 1.0], - ["Infinity", "NaN"], -) - -check_json_complex_float_false_cases: tuple[object, ...] = ( - 0.0, - "foo", - [0.0], - [1.0, 2.0, 3.0], - [1.0, "_infinity_"], - {"hello": 1.0}, -) - - -@pytest.mark.parametrize("data", check_json_complex_float_true_cases) -def test_check_json_complex_float_v2_true(data: JSON) -> None: - assert check_json_complex_float_v2(data) - - -@pytest.mark.parametrize("data", check_json_complex_float_false_cases) -def test_check_json_complex_float_v2_false(data: JSON) -> None: - assert not check_json_complex_float_v2(data) - - -@pytest.mark.parametrize("data", check_json_complex_float_true_cases) -def test_check_json_complex_float_v3_true(data: JSON) -> None: - assert check_json_complex_float_v3(data) - - -@pytest.mark.parametrize("data", check_json_complex_float_false_cases) -def test_check_json_complex_float_v3_false(data: JSON) -> None: - assert not check_json_complex_float_v3(data) - - -@pytest.mark.parametrize("data", check_json_complex_float_true_cases) -def test_check_json_complex_float_true(data: JSON, zarr_format: ZarrFormat) -> None: - if zarr_format == 2: - assert check_json_complex_float_v2(data) - elif zarr_format == 3: - assert check_json_complex_float_v3(data) - else: - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - -@pytest.mark.parametrize("data", check_json_complex_float_false_cases) -def test_check_json_complex_float_false(data: JSON, zarr_format: ZarrFormat) -> None: +@pytest.mark.parametrize("data", check_json_float_cases) +def test_check_json_float(data: JSONFloat | int, zarr_format: ZarrFormat) -> None: + observed = check_json_float(data, zarr_format=zarr_format) if zarr_format == 2: - assert not 
check_json_complex_float_v2(data) - elif zarr_format == 3: - assert not check_json_complex_float_v3(data) + expected = check_json_float_v2(data) else: - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + expected = check_json_float_v3(data) + assert observed == expected def test_check_json_int() -> None: assert check_json_int(0) assert not check_json_int(1.0) - - -def test_check_json_str() -> None: - assert check_json_str("0") - assert not check_json_str(1.0) - - -def test_check_json_bool() -> None: - assert check_json_bool(True) - assert check_json_bool(False) - assert not check_json_bool(1.0) - assert not check_json_bool("True") diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py index 95ede9e1d7..5e87945b3a 100644 --- a/tests/test_dtype_registry.py +++ b/tests/test_dtype_registry.py @@ -9,32 +9,26 @@ import pytest import zarr -from tests.conftest import skip_object_dtype from zarr.core.config import config from zarr.core.dtype import ( - AnyDType, + DTYPE, Bool, - DataTypeRegistry, - DateTime64, - FixedLengthUTF32, - Int8, - Int16, + FixedLengthUnicode, TBaseDType, TBaseScalar, - VariableLengthUTF8, ZDType, data_type_registry, get_data_type_from_json, - parse_data_type, ) +from zarr.core.dtype.registry import DataTypeRegistry + +from .conftest import zdtype_examples if TYPE_CHECKING: from collections.abc import Generator from zarr.core.common import ZarrFormat -from .test_dtype.conftest import zdtype_examples - @pytest.fixture def data_type_registry_fixture() -> DataTypeRegistry: @@ -59,7 +53,7 @@ def test_override(data_type_registry_fixture: DataTypeRegistry) -> None: data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) class NewBool(Bool): - def default_scalar(self) -> np.bool_: + def default_value(self) -> np.bool_: return np.True_ data_type_registry_fixture.register(NewBool._zarr_v3_name, NewBool) @@ -67,7 +61,7 @@ def default_scalar(self) -> np.bool_: @staticmethod @pytest.mark.parametrize( 
- ("wrapper_cls", "dtype_str"), [(Bool, "bool"), (FixedLengthUTF32, "|U4")] + ("wrapper_cls", "dtype_str"), [(Bool, "bool"), (FixedLengthUnicode, "|U4")] ) def test_match_dtype( data_type_registry_fixture: DataTypeRegistry, @@ -85,31 +79,25 @@ def test_unregistered_dtype(data_type_registry_fixture: DataTypeRegistry) -> Non """ Test that match_dtype raises an error if the dtype is not registered. """ - outside_dtype_name = "int8" - outside_dtype = np.dtype(outside_dtype_name) - msg = f"No Zarr data type found that matches dtype '{outside_dtype!r}'" - with pytest.raises(ValueError, match=re.escape(msg)): - data_type_registry_fixture.match_dtype(outside_dtype) + outside_dtype = "int8" + with pytest.raises( + ValueError, match=f"No data type wrapper found that matches dtype '{outside_dtype}'" + ): + data_type_registry_fixture.match_dtype(np.dtype(outside_dtype)) with pytest.raises(KeyError): - data_type_registry_fixture.get(outside_dtype_name) + data_type_registry_fixture.get(outside_dtype) @staticmethod - @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @pytest.mark.parametrize("zdtype", zdtype_examples) - def test_registered_dtypes_match_dtype(zdtype: ZDType[TBaseDType, TBaseScalar]) -> None: + def test_registered_dtypes( + zdtype: ZDType[TBaseDType, TBaseScalar], zarr_format: ZarrFormat + ) -> None: """ Test that the registered dtypes can be retrieved from the registry. 
""" - skip_object_dtype(zdtype) - assert data_type_registry.match_dtype(zdtype.to_native_dtype()) == zdtype - @staticmethod - @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") - @pytest.mark.parametrize("zdtype", zdtype_examples) - def test_registered_dtypes_match_json( - zdtype: ZDType[TBaseDType, TBaseScalar], zarr_format: ZarrFormat - ) -> None: + assert data_type_registry.match_dtype(zdtype.to_dtype()) == zdtype assert ( data_type_registry.match_json( zdtype.to_json(zarr_format=zarr_format), zarr_format=zarr_format @@ -118,7 +106,6 @@ def test_registered_dtypes_match_json( ) @staticmethod - @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @pytest.mark.parametrize("zdtype", zdtype_examples) def test_match_dtype_unique( zdtype: ZDType[Any, Any], @@ -130,19 +117,18 @@ def test_match_dtype_unique( that excludes the data type class being tested, and ensure that an instance of the wrapped data type fails to match anything in the registry """ - skip_object_dtype(zdtype) - for _cls in get_args(AnyDType): + for _cls in get_args(DTYPE): if _cls is not type(zdtype): data_type_registry_fixture.register(_cls._zarr_v3_name, _cls) - dtype_instance = zdtype.to_native_dtype() + dtype_instance = zdtype.to_dtype() - msg = f"No Zarr data type found that matches dtype '{dtype_instance!r}'" + msg = f"No data type wrapper found that matches dtype '{dtype_instance}'" with pytest.raises(ValueError, match=re.escape(msg)): data_type_registry_fixture.match_dtype(dtype_instance) instance_dict = zdtype.to_json(zarr_format=zarr_format) - msg = f"No Zarr data type found that matches {instance_dict!r}" + msg = f"No data type wrapper found that matches {instance_dict}" with pytest.raises(ValueError, match=re.escape(msg)): data_type_registry_fixture.match_json(instance_dict, zarr_format=zarr_format) @@ -167,35 +153,6 @@ def set_path() -> Generator[None, None, None]: def test_entrypoint_dtype(zarr_format: 
ZarrFormat) -> None: from package_with_entrypoint import TestDataType - data_type_registry._lazy_load() instance = TestDataType() dtype_json = instance.to_json(zarr_format=zarr_format) assert get_data_type_from_json(dtype_json, zarr_format=zarr_format) == instance - data_type_registry.unregister(TestDataType._zarr_v3_name) - - -@pytest.mark.parametrize( - ("dtype_params", "expected", "zarr_format"), - [ - ("str", VariableLengthUTF8(), 2), - ("str", VariableLengthUTF8(), 3), - ("int8", Int8(), 3), - (Int8(), Int8(), 3), - (">i2", Int16(endianness="big"), 2), - ("datetime64[10s]", DateTime64(unit="s", scale_factor=10), 2), - ( - {"name": "numpy.datetime64", "configuration": {"unit": "s", "scale_factor": 10}}, - DateTime64(unit="s", scale_factor=10), - 3, - ), - ], -) -def test_parse_data_type( - dtype_params: Any, expected: ZDType[Any, Any], zarr_format: ZarrFormat -) -> None: - """ - Test that parse_data_type accepts alternative representations of ZDType instances, and resolves - those inputs to the expected ZDType instance. 
- """ - observed = parse_data_type(dtype_params, zarr_format=zarr_format) - assert observed == expected From 4c67302e4672a80d63eb17aab674702547333faf Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 2 May 2025 18:47:18 +0200 Subject: [PATCH 073/129] finish common tests --- src/zarr/core/dtype/npy/common.py | 162 +++++++++++------------ tests/conftest.py | 20 --- tests/test_dtype/confttest.py | 22 +++ tests/test_dtype/test_npy/test_common.py | 87 ++++++++++-- 4 files changed, 179 insertions(+), 112 deletions(-) create mode 100644 tests/test_dtype/confttest.py diff --git a/src/zarr/core/dtype/npy/common.py b/src/zarr/core/dtype/npy/common.py index 8ef1286e6f..8033e48291 100644 --- a/src/zarr/core/dtype/npy/common.py +++ b/src/zarr/core/dtype/npy/common.py @@ -261,6 +261,29 @@ def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloat: return float_to_json_v2(data) +def float_to_json(data: float | np.floating[Any], *, zarr_format: ZarrFormat) -> JSONFloat: + """ + Convert a float to JSON, parametrized by the zarr format version. + + Parameters + ---------- + data : float | np.floating + The float value to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + JSONFloat + The JSON representation of the float. + """ + if zarr_format == 2: + return float_to_json_v2(data) + else: + return float_to_json_v3(data) + raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") + + def complex_float_to_json_v3( data: complex | np.complexfloating[Any, Any], ) -> tuple[JSONFloat, JSONFloat]: @@ -324,26 +347,60 @@ def complex_float_to_json( raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") -def float_to_json(data: float | np.floating[Any], *, zarr_format: ZarrFormat) -> JSONFloat: +def complex_float_from_json_v2(data: tuple[JSONFloat, JSONFloat]) -> complex: """ - Convert a float to JSON, parametrized by the zarr format version. + Convert a JSON complex float to a complex number (v2). 
Parameters ---------- - data : float | np.floating - The float value to convert. + data : tuple[JSONFloat, JSONFloat] + The JSON complex float to convert. + + Returns + ------- + np.complexfloating + The complex number. + """ + return complex(float_from_json_v2(data[0]), float_from_json_v2(data[1])) + + +def complex_float_from_json_v3(data: tuple[JSONFloat, JSONFloat]) -> complex: + """ + Convert a JSON complex float to a complex number (v3). + + Parameters + ---------- + data : tuple[JSONFloat, JSONFloat] + The JSON complex float to convert. + + Returns + ------- + np.complexfloating + The complex number. + """ + return complex(float_from_json_v3(data[0]), float_from_json_v3(data[1])) + + +def complex_float_from_json(data: tuple[JSONFloat, JSONFloat], zarr_format: ZarrFormat) -> complex: + """ + Convert a JSON complex float to a complex number based on zarr format. + + Parameters + ---------- + data : tuple[JSONFloat, JSONFloat] + The JSON complex float to convert. zarr_format : ZarrFormat The zarr format version. Returns ------- - JSONFloat - The JSON representation of the float. + np.complexfloating + The complex number. """ if zarr_format == 2: - return float_to_json_v2(data) + return complex_float_from_json_v2(data) else: - return float_to_json_v3(data) + return complex_float_from_json_v3(data) raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") @@ -366,9 +423,9 @@ def check_json_float_v2(data: JSON) -> TypeGuard[JSONFloat]: return isinstance(data, float | int) -def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: +def check_json_float_v3(data: JSON) -> TypeGuard[JSONFloat]: """ - Check if a JSON value represents a complex float, as per the behavior of zarr-python 2.x + Check if a JSON value represents a float (v3). 
Parameters ---------- @@ -378,20 +435,15 @@ def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFl Returns ------- Bool - True if the data is a complex float, False otherwise. + True if the data is a float, False otherwise. """ - return ( - not isinstance(data, str) - and isinstance(data, Sequence) - and len(data) == 2 - and check_json_float_v2(data[0]) - and check_json_float_v2(data[1]) - ) + # TODO: handle the special JSON serialization of different NaN values + return check_json_float_v2(data) -def check_json_float_v3(data: JSON) -> TypeGuard[JSONFloat]: +def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: """ - Check if a JSON value represents a float (v3). + Check if a JSON value represents a complex float, as per the behavior of zarr-python 2.x Parameters ---------- @@ -401,10 +453,15 @@ def check_json_float_v3(data: JSON) -> TypeGuard[JSONFloat]: Returns ------- Bool - True if the data is a float, False otherwise. + True if the data is a complex float, False otherwise. """ - # TODO: handle the special JSON serialization of different NaN values - return check_json_float_v2(data) + return ( + not isinstance(data, str) + and isinstance(data, Sequence) + and len(data) == 2 + and check_json_float_v2(data[0]) + and check_json_float_v2(data[1]) + ) def check_json_complex_float_v3(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: @@ -434,7 +491,7 @@ def check_json_complex_float( data: JSON, zarr_format: ZarrFormat ) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: """ - Check if a JSON value represents a complex float based on zarr format. + Check if a JSON value represents a complex float, given a zarr format. Parameters ---------- @@ -524,60 +581,3 @@ def check_json_bool(data: JSON) -> TypeGuard[bool]: True if the data is a boolean, False otherwise. 
""" return isinstance(data, bool) - - -def complex_float_from_json_v2(data: tuple[JSONFloat, JSONFloat]) -> complex: - """ - Convert a JSON complex float to a complex number (v2). - - Parameters - ---------- - data : tuple[JSONFloat, JSONFloat] - The JSON complex float to convert. - - Returns - ------- - np.complexfloating - The complex number. - """ - return complex(float_from_json_v2(data[0]), float_from_json_v2(data[1])) - - -def complex_float_from_json_v3(data: tuple[JSONFloat, JSONFloat]) -> complex: - """ - Convert a JSON complex float to a complex number (v3). - - Parameters - ---------- - data : tuple[JSONFloat, JSONFloat] - The JSON complex float to convert. - - Returns - ------- - np.complexfloating - The complex number. - """ - return complex(float_from_json_v3(data[0]), float_from_json_v3(data[1])) - - -def complex_float_from_json(data: tuple[JSONFloat, JSONFloat], zarr_format: ZarrFormat) -> complex: - """ - Convert a JSON complex float to a complex number based on zarr format. - - Parameters - ---------- - data : tuple[JSONFloat, JSONFloat] - The JSON complex float to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - np.complexfloating - The complex number. - """ - if zarr_format == 2: - return complex_float_from_json_v2(data) - else: - return complex_float_from_json_v3(data) - raise ValueError(f"Invalid zarr format: {zarr_format}. 
Expected 2 or 3.") diff --git a/tests/conftest.py b/tests/conftest.py index 78f44c3822..f690478f2e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -22,13 +22,8 @@ from zarr.core.common import JSON, parse_shapelike from zarr.core.config import config as zarr_config from zarr.core.dtype import ( - DateTime64, - Structured, - TimeDelta64, - data_type_registry, get_data_type_from_native_dtype, ) -from zarr.core.dtype.common import HasLength from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync @@ -44,7 +39,6 @@ from zarr.core.array import CompressorsLike, FiltersLike, SerializerLike, ShardsLike from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingLike from zarr.core.common import ChunkCoords, MemoryOrder, ShapeLike, ZarrFormat - from zarr.core.dtype.wrapper import ZDType async def parse_store( @@ -437,17 +431,3 @@ def meta_from_array( chunk_key_encoding=chunk_key_encoding, dimension_names=dimension_names, ) - - -# Generate a collection of zdtype instances for use in testing. -zdtype_examples: tuple[ZDType[Any, Any], ...] = () -for wrapper_cls in data_type_registry.contents.values(): - # The Structured dtype has to be constructed with some actual fields - if wrapper_cls is Structured: - zdtype_examples += (wrapper_cls.from_dtype(np.dtype([("a", np.float64), ("b", np.int8)])),) - elif issubclass(wrapper_cls, HasLength): - zdtype_examples += (wrapper_cls(length=1),) - elif issubclass(wrapper_cls, DateTime64 | TimeDelta64): - zdtype_examples += (wrapper_cls(unit="s", interval=10),) - else: - zdtype_examples += (wrapper_cls(),) diff --git a/tests/test_dtype/confttest.py b/tests/test_dtype/confttest.py new file mode 100644 index 0000000000..aba08a08c5 --- /dev/null +++ b/tests/test_dtype/confttest.py @@ -0,0 +1,22 @@ +# Generate a collection of zdtype instances for use in testing. 
+from typing import Any + +import numpy as np + +from zarr.core.dtype import data_type_registry +from zarr.core.dtype.common import HasLength +from zarr.core.dtype.npy.sized import Structured +from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 +from zarr.core.dtype.wrapper import ZDType + +zdtype_examples: tuple[ZDType[Any, Any], ...] = () +for wrapper_cls in data_type_registry.contents.values(): + # The Structured dtype has to be constructed with some actual fields + if wrapper_cls is Structured: + zdtype_examples += (wrapper_cls.from_dtype(np.dtype([("a", np.float64), ("b", np.int8)])),) + elif issubclass(wrapper_cls, HasLength): + zdtype_examples += (wrapper_cls(length=1),) + elif issubclass(wrapper_cls, DateTime64 | TimeDelta64): + zdtype_examples += (wrapper_cls(unit="s", interval=10),) + else: + zdtype_examples += (wrapper_cls(),) diff --git a/tests/test_dtype/test_npy/test_common.py b/tests/test_dtype/test_npy/test_common.py index f3082d0c3b..69beae38e3 100644 --- a/tests/test_dtype/test_npy/test_common.py +++ b/tests/test_dtype/test_npy/test_common.py @@ -14,10 +14,15 @@ EndiannessNumpy, bytes_from_json, bytes_to_json, + check_json_bool, + check_json_complex_float, + check_json_complex_float_v2, + check_json_complex_float_v3, check_json_float, check_json_float_v2, check_json_float_v3, check_json_int, + check_json_str, complex_float_to_json, complex_float_to_json_v2, complex_float_to_json_v3, @@ -31,7 +36,7 @@ ) if TYPE_CHECKING: - from zarr.core.common import ZarrFormat + from zarr.core.common import JSON, ZarrFormat def nan_equal(a: object, b: object) -> bool: @@ -44,7 +49,7 @@ def nan_equal(a: object, b: object) -> bool: return a == b -json_float_v2: list[tuple[JSONFloat, float | np.floating[Any]]] = [ +json_float_v2_cases: list[tuple[JSONFloat, float | np.floating[Any]]] = [ ("Infinity", float("inf")), ("Infinity", np.inf), ("-Infinity", float("-inf")), @@ -56,7 +61,7 @@ def nan_equal(a: object, b: object) -> bool: # exactly the same as v2, for 
now, until we get support for the special NaN encoding defined in the # v3 spec -json_float_v3: list[tuple[JSONFloat, float | np.floating[Any]]] = [ +json_float_v3_cases: list[tuple[JSONFloat, float | np.floating[Any]]] = [ ("Infinity", float("inf")), ("Infinity", np.inf), ("-Infinity", float("-inf")), @@ -101,7 +106,7 @@ def test_endianness_to_numpy_str(data: str | None, expected: str) -> None: endianness_to_numpy_str(data) # type: ignore[arg-type] -@pytest.mark.parametrize(("data", "expected"), json_float_v2 + [("SHOULD_ERR", "")]) +@pytest.mark.parametrize(("data", "expected"), json_float_v2_cases + [("SHOULD_ERR", "")]) def test_float_from_json_v2(data: JSONFloat | str, expected: float | str) -> None: """ Test that float_from_json_v2 correctly converts a JSON string representation of a float to a float. @@ -115,7 +120,7 @@ def test_float_from_json_v2(data: JSONFloat | str, expected: float | str) -> Non float_from_json_v2(data) # type: ignore[arg-type] -@pytest.mark.parametrize(("data", "expected"), json_float_v3 + [("SHOULD_ERR", "")]) +@pytest.mark.parametrize(("data", "expected"), json_float_v3_cases + [("SHOULD_ERR", "")]) def test_float_from_json_v3(data: JSONFloat | str, expected: float | str) -> None: """ Test that float_from_json_v3 correctly converts a JSON string representation of a float to a float. @@ -129,7 +134,7 @@ def test_float_from_json_v3(data: JSONFloat | str, expected: float | str) -> Non float_from_json_v3(data) # type: ignore[arg-type] -@pytest.mark.parametrize(("data", "expected"), json_float_v2) +@pytest.mark.parametrize(("data", "expected"), json_float_v2_cases) def test_float_from_json(data: JSONFloat, expected: float | str, zarr_format: ZarrFormat) -> None: """ Test that float_from_json_v3 correctly converts a JSON string representation of a float to a float. 
@@ -144,7 +149,7 @@ def test_float_from_json(data: JSONFloat, expected: float | str, zarr_format: Za # note the order of parameters relative to the order of the parametrized variable. -@pytest.mark.parametrize(("expected", "data"), json_float_v2) +@pytest.mark.parametrize(("expected", "data"), json_float_v2_cases) def test_float_to_json_v2(data: float | np.floating[Any], expected: JSONFloat) -> None: """ Test that floats are JSON-encoded properly for zarr v2 @@ -154,7 +159,7 @@ def test_float_to_json_v2(data: float | np.floating[Any], expected: JSONFloat) - # note the order of parameters relative to the order of the parametrized variable. -@pytest.mark.parametrize(("expected", "data"), json_float_v3) +@pytest.mark.parametrize(("expected", "data"), json_float_v3_cases) def test_float_to_json_v3(data: float | np.floating[Any], expected: JSONFloat) -> None: """ Test that floats are JSON-encoded properly for zarr v3 @@ -188,7 +193,7 @@ def test_bytes_to_json(zarr_format: ZarrFormat) -> None: # note the order of parameters relative to the order of the parametrized variable. -@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v2) +@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v2_cases) def test_complex_to_json_v2(float_data: float | np.floating[Any], json_expected: JSONFloat) -> None: """ Test that complex numbers are correctly converted to JSON in v2 format. @@ -204,7 +209,7 @@ def test_complex_to_json_v2(float_data: float | np.floating[Any], json_expected: # note the order of parameters relative to the order of the parametrized variable. -@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v3) +@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v3_cases) def test_complex_to_json_v3(float_data: float | np.floating[Any], json_expected: JSONFloat) -> None: """ Test that complex numbers are correctly converted to JSON in v3 format. 
@@ -219,7 +224,7 @@ def test_complex_to_json_v3(float_data: float | np.floating[Any], json_expected: assert complex_float_to_json_v3(cplx_npy) == (json_expected, json_expected) -@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v3) +@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v3_cases) def test_complex_float_to_json( float_data: float | np.floating[Any], json_expected: JSONFloat, zarr_format: ZarrFormat ) -> None: @@ -272,6 +277,66 @@ def test_check_json_float(data: JSONFloat | int, zarr_format: ZarrFormat) -> Non assert observed == expected +check_json_complex_float_true_cases = ( + [0.0, 1.0], + (0.0, 1.0), + [-1.0, "NaN"], + ["Infinity", 1.0], + ["Infinity", "NaN"], +) + +check_json_complex_float_false_cases = ( + 0.0, + "foo", + [0.0], + [1.0, 2.0, 3.0], + [1.0, "_infinity_"], + {"hello": 1.0}, +) + + +@pytest.mark.parametrize("data", check_json_complex_float_true_cases) +def test_check_json_complex_float_v2_true(data: JSON) -> None: + assert check_json_complex_float_v2(data) + + +@pytest.mark.parametrize("data", check_json_complex_float_false_cases) +def test_check_json_complex_float_v2_false(data: JSON) -> None: + assert not check_json_complex_float_v2(data) + + +@pytest.mark.parametrize("data", check_json_complex_float_true_cases) +def test_check_json_complex_float_v3_true(data: JSON) -> None: + assert check_json_complex_float_v3(data) + + +@pytest.mark.parametrize("data", check_json_complex_float_false_cases) +def test_check_json_complex_float_v3_false(data: JSON) -> None: + assert not check_json_complex_float_v3(data) + + +@pytest.mark.parametrize("data", check_json_complex_float_true_cases) +def test_check_json_complex_float_true(data: JSON, zarr_format: ZarrFormat) -> None: + assert check_json_complex_float(data, zarr_format=zarr_format) + + +@pytest.mark.parametrize("data", check_json_complex_float_false_cases) +def test_check_json_complex_float_false(data: JSON, zarr_format: ZarrFormat) -> None: + assert not 
check_json_complex_float(data, zarr_format=zarr_format) + + def test_check_json_int() -> None: assert check_json_int(0) assert not check_json_int(1.0) + + +def test_check_json_str() -> None: + assert check_json_str("0") + assert not check_json_str(1.0) + + +def test_check_json_bool() -> None: + assert check_json_bool(True) + assert check_json_bool(False) + assert not check_json_bool(1.0) + assert not check_json_bool("True") From 4140ca0c42d0d176dd493fb6b7a8003b338d4cb6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 7 May 2025 13:15:34 +0200 Subject: [PATCH 074/129] wip: test infrastructure for dtypes --- tests/test_array.py | 2 +- tests/test_dtype/conftest.py | 50 +---- tests/test_dtype/confttest.py | 22 -- tests/test_dtype/test_npy/test_common.py | 10 +- tests/test_dtype/test_npy/test_int.py | 267 +---------------------- tests/test_dtype/test_wrapper.py | 221 ++++++------------- tests/test_dtype_registry.py | 4 +- 7 files changed, 88 insertions(+), 488 deletions(-) delete mode 100644 tests/test_dtype/confttest.py diff --git a/tests/test_array.py b/tests/test_array.py index 4a99730f7c..d7462a4a15 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -57,7 +57,7 @@ from zarr.errors import ContainsArrayError, ContainsGroupError from zarr.storage import LocalStore, MemoryStore, StorePath -from .conftest import zdtype_examples +from .test_dtype.conftest import zdtype_examples if TYPE_CHECKING: from zarr.core.array_spec import ArrayConfigLike diff --git a/tests/test_dtype/conftest.py b/tests/test_dtype/conftest.py index 0650d143c6..6e171cb435 100644 --- a/tests/test_dtype/conftest.py +++ b/tests/test_dtype/conftest.py @@ -1,12 +1,11 @@ # Generate a collection of zdtype instances for use in testing. 
-import warnings from typing import Any import numpy as np from zarr.core.dtype import data_type_registry from zarr.core.dtype.common import HasLength -from zarr.core.dtype.npy.structured import Structured +from zarr.core.dtype.npy.sized import Structured from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 from zarr.core.dtype.wrapper import ZDType @@ -14,55 +13,16 @@ for wrapper_cls in data_type_registry.contents.values(): # The Structured dtype has to be constructed with some actual fields if wrapper_cls is Structured: - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - zdtype_examples += ( - wrapper_cls.from_native_dtype(np.dtype([("a", np.float64), ("b", np.int8)])), - ) + zdtype_examples += (wrapper_cls.from_dtype(np.dtype([("a", np.float64), ("b", np.int8)])),) elif issubclass(wrapper_cls, HasLength): zdtype_examples += (wrapper_cls(length=1),) elif issubclass(wrapper_cls, DateTime64 | TimeDelta64): - zdtype_examples += (wrapper_cls(unit="s", scale_factor=10),) + zdtype_examples += (wrapper_cls(unit="s", interval=10),) else: zdtype_examples += (wrapper_cls(),) -def pytest_generate_tests(metafunc: Any) -> None: - """ - This is a pytest hook to parametrize class-scoped fixtures. - - This hook allows us to define class-scoped fixtures as class attributes and then - generate the parametrize calls for pytest. This allows the fixtures to be - reused across multiple tests within the same class. - - For example, if you had a regular pytest class like this: - - class TestClass: - @pytest.mark.parametrize("param_a", [1, 2, 3]) - def test_method(self, param_a): - ... - - Child classes inheriting from ``TestClass`` would not be able to override the ``param_a`` fixture - - this implementation of ``pytest_generate_tests`` allows you to define class-scoped fixtures as - class attributes, which allows the following to work: - - class TestExample: - param_a = [1, 2, 3] - - def test_example(self, param_a): - ... 
- - # this class will have its test_example method parametrized with the values of TestB.param_a - class TestB(TestExample): - param_a = [1, 2, 100, 10] - - """ - # Iterate over all the fixtures defined in the class - # and parametrize them with the values defined in the class - # This allows us to define class-scoped fixtures as class attributes - # and then generate the parametrize calls for pytest +def pytest_generate_tests(metafunc): for fixture_name in metafunc.fixturenames: if hasattr(metafunc.cls, fixture_name): - params = getattr(metafunc.cls, fixture_name) - metafunc.parametrize(fixture_name, params, scope="class", ids=str) + metafunc.parametrize(fixture_name, getattr(metafunc.cls, fixture_name), scope="class") diff --git a/tests/test_dtype/confttest.py b/tests/test_dtype/confttest.py deleted file mode 100644 index aba08a08c5..0000000000 --- a/tests/test_dtype/confttest.py +++ /dev/null @@ -1,22 +0,0 @@ -# Generate a collection of zdtype instances for use in testing. -from typing import Any - -import numpy as np - -from zarr.core.dtype import data_type_registry -from zarr.core.dtype.common import HasLength -from zarr.core.dtype.npy.sized import Structured -from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 -from zarr.core.dtype.wrapper import ZDType - -zdtype_examples: tuple[ZDType[Any, Any], ...] 
= () -for wrapper_cls in data_type_registry.contents.values(): - # The Structured dtype has to be constructed with some actual fields - if wrapper_cls is Structured: - zdtype_examples += (wrapper_cls.from_dtype(np.dtype([("a", np.float64), ("b", np.int8)])),) - elif issubclass(wrapper_cls, HasLength): - zdtype_examples += (wrapper_cls(length=1),) - elif issubclass(wrapper_cls, DateTime64 | TimeDelta64): - zdtype_examples += (wrapper_cls(unit="s", interval=10),) - else: - zdtype_examples += (wrapper_cls(),) diff --git a/tests/test_dtype/test_npy/test_common.py b/tests/test_dtype/test_npy/test_common.py index 69beae38e3..69a14a92b0 100644 --- a/tests/test_dtype/test_npy/test_common.py +++ b/tests/test_dtype/test_npy/test_common.py @@ -61,15 +61,7 @@ def nan_equal(a: object, b: object) -> bool: # exactly the same as v2, for now, until we get support for the special NaN encoding defined in the # v3 spec -json_float_v3_cases: list[tuple[JSONFloat, float | np.floating[Any]]] = [ - ("Infinity", float("inf")), - ("Infinity", np.inf), - ("-Infinity", float("-inf")), - ("-Infinity", -np.inf), - ("NaN", float("nan")), - ("NaN", np.nan), - (1.0, 1.0), -] +json_float_v3_cases = json_float_v2_cases @pytest.mark.parametrize( diff --git a/tests/test_dtype/test_npy/test_int.py b/tests/test_dtype/test_npy/test_int.py index efc4fae496..a90af53c58 100644 --- a/tests/test_dtype/test_npy/test_int.py +++ b/tests/test_dtype/test_npy/test_int.py @@ -2,21 +2,20 @@ import numpy as np -from tests.test_dtype.test_wrapper import BaseTestZDType -from zarr.core.dtype.npy.int import Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 +from tests.test_dtype.test_wrapper import _TestZDType +from zarr.core.dtype.npy.int import Int8 -class TestInt8(BaseTestZDType): +class TestInt8(_TestZDType): test_cls = Int8 - scalar_type = np.int8 valid_dtype = (np.dtype(np.int8),) invalid_dtype = ( np.dtype(np.int16), np.dtype(np.uint16), np.dtype(np.float64), ) - valid_json_v2 = ({"name": "|i1", 
"object_codec_id": None},) - valid_json_v3 = ("int8",) + valid_json_v2 = ("|i1",) + valid_json_v3_cases = ("int8",) invalid_json_v2 = ( ">i1", "int8", @@ -28,256 +27,6 @@ class TestInt8(BaseTestZDType): {"name": "int8", "configuration": {"endianness": "little"}}, ) - scalar_v2_params = ((Int8(), 1), (Int8(), -1)) - scalar_v3_params = ((Int8(), 1), (Int8(), -1)) - cast_value_params = ( - (Int8(), 1, np.int8(1)), - (Int8(), -1, np.int8(-1)), - ) - invalid_scalar_params = ((Int8(), {"set!"}), (Int8(), ("tuple",))) - item_size_params = (Int8(),) - - -class TestInt16(BaseTestZDType): - test_cls = Int16 - scalar_type = np.int16 - valid_dtype = (np.dtype(">i2"), np.dtype("i2", "object_codec_id": None}, - {"name": "i4"), np.dtype("i4", "object_codec_id": None}, - {"name": "i8"), np.dtype("i8", "object_codec_id": None}, - {"name": "u2"), np.dtype("u2", "object_codec_id": None}, - {"name": "u4"), np.dtype("u4", "object_codec_id": None}, - {"name": "u8"), np.dtype("u8", "object_codec_id": None}, - {"name": " None: + assert self.test_cls().check_value(1) + assert not self.test_cls().check_value(["foo"]) diff --git a/tests/test_dtype/test_wrapper.py b/tests/test_dtype/test_wrapper.py index cc365e86d4..c6093ebb01 100644 --- a/tests/test_dtype/test_wrapper.py +++ b/tests/test_dtype/test_wrapper.py @@ -1,165 +1,86 @@ from __future__ import annotations -import re -from typing import TYPE_CHECKING, Any, ClassVar - -import pytest - -from zarr.core.dtype.common import DTypeSpec_V2, DTypeSpec_V3, HasItemSize - -if TYPE_CHECKING: - from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType - - -""" -class _TestZDTypeSchema: - # subclasses define the URL for the schema, if available - schema_url: ClassVar[str] = "" - - @pytest.fixture(scope="class") - def get_schema(self) -> object: - response = requests.get(self.schema_url) - response.raise_for_status() - return json_schema.loads(response.text) - - def test_schema(self, schema: json_schema.Schema) -> None: - assert 
schema.is_valid(self.test_cls.to_json(zarr_format=2)) -""" - - -class BaseTestZDType: - """ - A base class for testing ZDType subclasses. This class works in conjunction with the custom - pytest collection function ``pytest_generate_tests`` defined in conftest.py, which applies the - following procedure when generating tests: - - At test generation time, for each test fixture referenced by a method on this class - pytest will look for an attribute with the same name as that fixture. Pytest will assume that - this class attribute is a tuple of values to be used for generating a parametrized test fixture. - - This means that child classes can, by using different values for these class attributes, have - customized test parametrization. - - Attributes - ---------- - test_cls : type[ZDType[TBaseDType, TBaseScalar]] - The ZDType subclass being tested. - scalar_type : ClassVar[type[TBaseScalar]] - The expected scalar type for the ZDType. - valid_dtype : ClassVar[tuple[TBaseDType, ...]] - A tuple of valid numpy dtypes for the ZDType. - invalid_dtype : ClassVar[tuple[TBaseDType, ...]] - A tuple of invalid numpy dtypes for the ZDType. - valid_json_v2 : ClassVar[tuple[str | dict[str, object] | list[object], ...]] - A tuple of valid JSON representations for Zarr format version 2. - invalid_json_v2 : ClassVar[tuple[str | dict[str, object] | list[object], ...]] - A tuple of invalid JSON representations for Zarr format version 2. - valid_json_v3 : ClassVar[tuple[str | dict[str, object], ...]] - A tuple of valid JSON representations for Zarr format version 3. - invalid_json_v3 : ClassVar[tuple[str | dict[str, object], ...]] - A tuple of invalid JSON representations for Zarr format version 3. - cast_value_params : ClassVar[tuple[tuple[Any, Any, Any], ...]] - A tuple of (dtype, value, expected) tuples for testing ZDType.cast_value. 
- scalar_v2_params : ClassVar[tuple[Any, ...]] - A tuple of (dtype, scalar json) tuples for testing - ZDType.from_json_scalar / ZDType.to_json_scalar for zarr v2 - scalar_v3_params : ClassVar[tuple[Any, ...]] - A tuple of (dtype, scalar json) tuples for testing - ZDType.from_json_scalar / ZDType.to_json_scalar for zarr v3 - invalid_scalar_params : ClassVar[tuple[Any, ...]] - A tuple of (dtype, value) tuples, where each value is expected to fail ZDType.cast_value. - item_size_params : ClassVar[tuple[Any, ...]] - A tuple of (dtype, expected) tuples for testing ZDType.item_size - """ +from typing import Any, ClassVar +import hypothesis.strategies as st +import numpy as np +from hypothesis.extra import numpy as npst + +from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType + + +def all_dtypes() -> st.SearchStrategy[np.dtype[np.generic]]: + return ( + npst.boolean_dtypes() + | npst.integer_dtypes(endianness="=") + | npst.unsigned_integer_dtypes(endianness="=") + | npst.floating_dtypes(endianness="=") + | npst.complex_number_dtypes(endianness="=") + | npst.byte_string_dtypes(endianness="=") + | npst.unicode_string_dtypes(endianness="=") + | npst.datetime64_dtypes(endianness="=") + | npst.timedelta64_dtypes(endianness="=") + ) + + +def get_classvar_attributes(cls: type) -> dict[str, Any]: + classvar_attributes = {} + for name, annotation in cls.__annotations__.items(): + if getattr(annotation, "__origin__", None) is ClassVar: + classvar_attributes[name] = getattr(cls, name) + return classvar_attributes + + +class _TestZDType: test_cls: type[ZDType[TBaseDType, TBaseScalar]] - scalar_type: ClassVar[type[TBaseScalar]] + valid_dtype: ClassVar[tuple[TBaseDType, ...]] = () invalid_dtype: ClassVar[tuple[TBaseDType, ...]] = () - valid_json_v2: ClassVar[tuple[DTypeSpec_V2, ...]] = () - invalid_json_v2: ClassVar[tuple[str | dict[str, object] | list[object], ...]] = () + valid_json_v2: ClassVar[tuple[str | dict[str, Any], ...]] = () + invalid_json_v2: ClassVar[tuple[str 
| dict[str, Any], ...]] = () + + valid_json_v3: ClassVar[tuple[str | dict[str, Any], ...]] = () + invalid_json_v3: ClassVar[tuple[str | dict[str, Any], ...]] = () - valid_json_v3: ClassVar[tuple[DTypeSpec_V3, ...]] = () - invalid_json_v3: ClassVar[tuple[str | dict[str, object], ...]] = () + def test_check_dtype_valid(self, valid_dtype: Any) -> None: + assert self.test_cls.check_dtype(valid_dtype) - # for testing scalar round-trip serialization, we need a tuple of (data type json, scalar json) - # pairs. the first element of the pair is used to create a dtype instance, and the second - # element is the json serialization of the scalar that we want to round-trip. + def test_check_dtype_invalid(self, invalid_dtype: Any) -> None: + assert not self.test_cls.check_dtype(invalid_dtype) - scalar_v2_params: ClassVar[tuple[tuple[ZDType[Any, Any], Any], ...]] = () - scalar_v3_params: ClassVar[tuple[tuple[Any, Any], ...]] = () - cast_value_params: ClassVar[tuple[tuple[ZDType[Any, Any], Any, Any], ...]] = () - # Some data types, like bool and string, can consume any python object as a scalar. - # So we allow passing None in to this test to indicate that it should be skipped. - invalid_scalar_params: ClassVar[tuple[tuple[ZDType[Any, Any], Any], ...] | tuple[None]] = () - item_size_params: ClassVar[tuple[ZDType[Any, Any], ...]] = () + def test_from_dtype_roundtrip(self, valid_dtype: Any) -> None: + zdtype = self.test_cls.from_dtype(valid_dtype) + assert zdtype.to_dtype() == valid_dtype - def json_scalar_equals(self, scalar1: object, scalar2: object) -> bool: - # An equality check for json-encoded scalars. This defaults to regular equality, - # but some classes may need to override this for special cases - return scalar1 == scalar2 + """ @abc.abstractmethod + def test_cast_value(self, value: Any) -> None: + raise NotImplementedError - def scalar_equals(self, scalar1: object, scalar2: object) -> bool: - # An equality check for scalars. 
This defaults to regular equality, - # but some classes may need to override this for special cases - return scalar1 == scalar2 + @abc.abstractmethod + def test_check_value(self) -> None: + raise NotImplementedError - def test_check_dtype_valid(self, valid_dtype: TBaseDType) -> None: - assert self.test_cls._check_native_dtype(valid_dtype) + @abc.abstractmethod + def test_default_value(self) -> None: + raise NotImplementedError - def test_check_dtype_invalid(self, invalid_dtype: object) -> None: - assert not self.test_cls._check_native_dtype(invalid_dtype) # type: ignore[arg-type] + @abc.abstractmethod + def test_check_json(self, value: Any) -> None: + raise NotImplementedError - def test_from_dtype_roundtrip(self, valid_dtype: Any) -> None: - zdtype = self.test_cls.from_native_dtype(valid_dtype) - assert zdtype.to_native_dtype() == valid_dtype - - def test_from_json_roundtrip_v2(self, valid_json_v2: DTypeSpec_V2) -> None: - zdtype = self.test_cls.from_json(valid_json_v2, zarr_format=2) - assert zdtype.to_json(zarr_format=2) == valid_json_v2 - - @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") - def test_from_json_roundtrip_v3(self, valid_json_v3: DTypeSpec_V3) -> None: - zdtype = self.test_cls.from_json(valid_json_v3, zarr_format=3) - assert zdtype.to_json(zarr_format=3) == valid_json_v3 - - def test_scalar_roundtrip_v2(self, scalar_v2_params: tuple[ZDType[Any, Any], Any]) -> None: - zdtype, scalar_json = scalar_v2_params - scalar = zdtype.from_json_scalar(scalar_json, zarr_format=2) - assert self.json_scalar_equals(scalar_json, zdtype.to_json_scalar(scalar, zarr_format=2)) - - def test_scalar_roundtrip_v3(self, scalar_v3_params: tuple[ZDType[Any, Any], Any]) -> None: - zdtype, scalar_json = scalar_v3_params - scalar = zdtype.from_json_scalar(scalar_json, zarr_format=3) - assert self.json_scalar_equals(scalar_json, zdtype.to_json_scalar(scalar, zarr_format=3)) - - def test_cast_value(self, cast_value_params: 
tuple[ZDType[Any, Any], Any, Any]) -> None: - zdtype, value, expected = cast_value_params - observed = zdtype.cast_scalar(value) - assert self.scalar_equals(expected, observed) - # check that casting is idempotent - assert self.scalar_equals(zdtype.cast_scalar(observed), observed) - - def test_invalid_scalar( - self, invalid_scalar_params: tuple[ZDType[Any, Any], Any] | None - ) -> None: - if invalid_scalar_params is None: - pytest.skip(f"No test data provided for {self}.{__name__}") - zdtype, data = invalid_scalar_params - msg = ( - f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " - f"data type {zdtype}." - ) - with pytest.raises(TypeError, match=re.escape(msg)): - zdtype.cast_scalar(data) - - def test_item_size(self, item_size_params: ZDType[Any, Any]) -> None: - """ - Test that the item_size attribute matches the numpy dtype itemsize attribute, for dtypes - with a fixed scalar size. - """ - if isinstance(item_size_params, HasItemSize): - assert item_size_params.item_size == item_size_params.to_native_dtype().itemsize - else: - pytest.skip(f"Data type {item_size_params} does not implement HasItemSize") + @abc.abstractmethod + def test_from_json_roundtrip_v2(self, value: Any) -> None: + raise NotImplementedError + + @abc.abstractmethod + def test_from_json_roundtrip_v3(self, value: Any) -> None: + raise NotImplementedError + + @abc.abstractmethod + def test_from_json_value_roundtrip_v2(self, value: Any) -> None: + raise NotImplementedError + + @abc.abstractmethod + def test_from_json_value_roundtrip_v3(self, value: Any) -> None: + raise NotImplementedError """ diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py index 5e87945b3a..98380b86f7 100644 --- a/tests/test_dtype_registry.py +++ b/tests/test_dtype_registry.py @@ -22,13 +22,13 @@ ) from zarr.core.dtype.registry import DataTypeRegistry -from .conftest import zdtype_examples - if TYPE_CHECKING: from collections.abc import Generator from 
zarr.core.common import ZarrFormat +from .test_dtype.conftest import zdtype_examples + @pytest.fixture def data_type_registry_fixture() -> DataTypeRegistry: From b1aa6ae2d627b24f95ff761c1771dd732c676d63 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 7 May 2025 18:03:00 +0200 Subject: [PATCH 075/129] wip: use class-based tests for all dtypes --- tests/test_dtype/conftest.py | 32 ++++- tests/test_dtype/test_npy/test_bool.py | 22 +-- tests/test_dtype/test_npy/test_complex.py | 65 +-------- tests/test_dtype/test_npy/test_float.py | 117 ++-------------- tests/test_dtype/test_npy/test_int.py | 162 +++++++++++++++++++++- tests/test_dtype/test_npy/test_sized.py | 131 +++++++++++++++++ tests/test_dtype/test_npy/test_string.py | 124 ++--------------- tests/test_dtype/test_npy/test_time.py | 143 ++----------------- tests/test_dtype/test_wrapper.py | 24 ++-- 9 files changed, 380 insertions(+), 440 deletions(-) create mode 100644 tests/test_dtype/test_npy/test_sized.py diff --git a/tests/test_dtype/conftest.py b/tests/test_dtype/conftest.py index 6e171cb435..2b4bb0b685 100644 --- a/tests/test_dtype/conftest.py +++ b/tests/test_dtype/conftest.py @@ -22,7 +22,37 @@ zdtype_examples += (wrapper_cls(),) -def pytest_generate_tests(metafunc): +def pytest_generate_tests(metafunc: Any) -> None: + """ + pytest hook to parametrize class-scoped fixtures. + + This hook allows us to define class-scoped fixtures as class attributes and then + generate the parametrize calls for pytest. This allows the fixtures to be + reused across multiple tests within the same class. + + For example, if you had a regular pytest class like this: + + class TestClass: + @pytest.mark.parametrize("param_a", [1, 2, 3]) + def test_method(self, param_a): + ... 
+ + Child classes inheriting from ``TestClass`` would not be able to override the ``param_a`` fixture + + this implementation of ``pytest_generate_tests`` allows you to define class-scoped fixtures as + class attributes, which allows the following to work: + + class TestExample: + param_a = [1, 2, 3] + + def test_example(self, param_a): + ... + + # this class will have its test_example method parametrized with the values of TestB.param_a + class TestB(TestExample): + param_a = [1, 2, 100, 10] + + """ for fixture_name in metafunc.fixturenames: if hasattr(metafunc.cls, fixture_name): metafunc.parametrize(fixture_name, getattr(metafunc.cls, fixture_name), scope="class") diff --git a/tests/test_dtype/test_npy/test_bool.py b/tests/test_dtype/test_npy/test_bool.py index da30214b3b..e4e5dd541e 100644 --- a/tests/test_dtype/test_npy/test_bool.py +++ b/tests/test_dtype/test_npy/test_bool.py @@ -2,21 +2,20 @@ import numpy as np -from tests.test_dtype.test_wrapper import BaseTestZDType +from tests.test_dtype.test_wrapper import _TestZDType from zarr.core.dtype.npy.bool import Bool -class TestBool(BaseTestZDType): +class TestBool(_TestZDType): test_cls = Bool - valid_dtype = (np.dtype(np.bool_),) invalid_dtype = ( np.dtype(np.int8), np.dtype(np.float64), np.dtype(np.uint16), ) - valid_json_v2 = ({"name": "|b1", "object_codec_id": None},) - valid_json_v3 = ("bool",) + valid_json_v2 = Bool._zarr_v2_names + valid_json_v3_cases = (Bool._zarr_v3_name,) invalid_json_v2 = ( "|b1", "bool", @@ -27,16 +26,3 @@ class TestBool(BaseTestZDType): "|f8", {"name": "bool", "configuration": {"endianness": "little"}}, ) - - scalar_v2_params = ((Bool(), True), (Bool(), False)) - scalar_v3_params = ((Bool(), True), (Bool(), False)) - - cast_value_params = ( - (Bool(), "true", np.True_), - (Bool(), True, np.True_), - (Bool(), False, np.False_), - (Bool(), np.True_, np.True_), - (Bool(), np.False_, np.False_), - ) - invalid_scalar_params = (None,) - item_size_params = (Bool(),) diff --git 
a/tests/test_dtype/test_npy/test_complex.py b/tests/test_dtype/test_npy/test_complex.py index b4ce42be58..6621d625d9 100644 --- a/tests/test_dtype/test_npy/test_complex.py +++ b/tests/test_dtype/test_npy/test_complex.py @@ -1,21 +1,12 @@ from __future__ import annotations -import math - import numpy as np -from tests.test_dtype.test_wrapper import BaseTestZDType +from tests.test_dtype.test_wrapper import _TestZDType from zarr.core.dtype.npy.complex import Complex64, Complex128 -class _BaseTestFloat(BaseTestZDType): - def scalar_equals(self, scalar1: object, scalar2: object) -> bool: - if np.isnan(scalar1) and np.isnan(scalar2): # type: ignore[call-overload] - return True - return super().scalar_equals(scalar1, scalar2) - - -class TestComplex64(_BaseTestFloat): +class TestComplex64(_TestZDType): test_cls = Complex64 valid_dtype = (np.dtype(">c8"), np.dtype("c8", "object_codec_id": None}, - {"name": "c16"), np.dtype("c16", "object_codec_id": None}, - {"name": " bool: - if np.isnan(scalar1) and np.isnan(scalar2): # type: ignore[call-overload] - return True - return super().scalar_equals(scalar1, scalar2) - - hex_string_params: tuple[tuple[str, float], ...] 
= () - - def test_hex_encoding(self, hex_string_params: tuple[str, float]) -> None: - """ - Test that hexadecimal strings can be read as NaN values - """ - hex_string, expected = hex_string_params - zdtype = self.test_cls() - observed = zdtype.from_json_scalar(hex_string, zarr_format=3) - assert self.scalar_equals(observed, expected) - - -class TestFloat16(_BaseTestFloat): +class TestFloat16(_TestZDType): test_cls = Float16 valid_dtype = (np.dtype(">f2"), np.dtype("f2", "object_codec_id": None}, - {"name": "f4"), np.dtype("f4", "object_codec_id": None}, - {"name": "f8"), np.dtype("f8", "object_codec_id": None}, - {"name": "i1", "int8", @@ -27,6 +27,156 @@ class TestInt8(_TestZDType): {"name": "int8", "configuration": {"endianness": "little"}}, ) - def test_check_value(self) -> None: - assert self.test_cls().check_value(1) - assert not self.test_cls().check_value(["foo"]) + +class TestInt16(_TestZDType): + test_cls = Int16 + valid_dtype = (np.dtype(">i2"), np.dtype("i4"), np.dtype("i8"), np.dtype("u2"), np.dtype("u4"), np.dtype("u8"), np.dtype("U10"), np.dtype("U10", "i4"), ("field2", ">f8")], + [("field1", ">i8"), ("field2", ">i4")], + ) + valid_json_v3_cases = ( + { + "name": "structured", + "configuration": { + "fields": [ + ("field1", {"name": "int32", "configuration": {"endianness": "big"}}), + ("field2", {"name": "float64", "configuration": {"endianness": "big"}}), + ] + }, + }, + { + "name": "structured", + "configuration": { + "fields": [ + ("field1", {"name": "int64", "configuration": {"endianness": "big"}}), + ("field2", {"name": "int32", "configuration": {"endianness": "big"}}), + ] + }, + }, + ) + invalid_json_v2 = ( + [("field1", "|i1"), ("field2", "|f8")], + [("field1", "|S10"), ("field2", "|f8")], + ) + invalid_json_v3 = ( + { + "name": "structured", + "configuration": { + "fields": [ + ("field1", {"name": "int32", "configuration": {"endianness": "invalid"}}), + ("field2", {"name": "float64", "configuration": {"endianness": "big"}}), + ] + }, + }, + 
{"name": "invalid_name"}, + ) diff --git a/tests/test_dtype/test_npy/test_string.py b/tests/test_dtype/test_npy/test_string.py index 2cde6a1ac1..2f77379f01 100644 --- a/tests/test_dtype/test_npy/test_string.py +++ b/tests/test_dtype/test_npy/test_string.py @@ -1,61 +1,44 @@ from __future__ import annotations import numpy as np -import pytest -from tests.test_dtype.test_wrapper import BaseTestZDType -from zarr.core.dtype import FixedLengthUTF32 -from zarr.core.dtype.common import UnstableSpecificationWarning -from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING, VariableLengthUTF8 +from tests.test_dtype.test_wrapper import _TestZDType +from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING, VariableLengthString if _NUMPY_SUPPORTS_VLEN_STRING: - class TestVariableLengthString(BaseTestZDType): - test_cls = VariableLengthUTF8 # type: ignore[assignment] - valid_dtype = (np.dtypes.StringDType(),) # type: ignore[assignment] + class TestVariableLengthString(_TestZDType): + test_cls = VariableLengthString + valid_dtype = (np.dtypes.StringDType(),) invalid_dtype = ( np.dtype(np.int8), np.dtype(np.float64), np.dtype("|S10"), ) - valid_json_v2 = ({"name": "|O", "object_codec_id": "vlen-utf8"},) - valid_json_v3 = ("string",) + valid_json_v2 = ("|O",) + valid_json_v3_cases = ({"name": "numpy.variable_length_utf8"},) invalid_json_v2 = ( "|S10", "|f8", "invalid", ) invalid_json_v3 = ( - {"name": "variable_length_utf8", "configuration": {"invalid_key": "value"}}, + {"name": "numpy.variable_length_utf8", "configuration": {"invalid_key": "value"}}, {"name": "invalid_name"}, ) - scalar_v2_params = ((VariableLengthUTF8(), ""), (VariableLengthUTF8(), "hi")) - scalar_v3_params = ( - (VariableLengthUTF8(), ""), - (VariableLengthUTF8(), "hi"), - ) - - cast_value_params = ( - (VariableLengthUTF8(), "", np.str_("")), - (VariableLengthUTF8(), "hi", np.str_("hi")), - ) - # anything can become a string - invalid_scalar_params = (None,) - item_size_params = 
(VariableLengthUTF8(),) - else: - class TestVariableLengthString(BaseTestZDType): # type: ignore[no-redef] - test_cls = VariableLengthUTF8 # type: ignore[assignment] + class TestVariableLengthString(_TestZDType): + test_cls = VariableLengthString valid_dtype = (np.dtype("O"),) invalid_dtype = ( np.dtype(np.int8), np.dtype(np.float64), np.dtype("|S10"), ) - valid_json_v2 = ({"name": "|O", "object_codec_id": "vlen-utf8"},) - valid_json_v3 = ("string",) + valid_json_v2 = ("|O",) + valid_json_v3_cases = ({"name": "numpy.variable_length_utf8"},) invalid_json_v2 = ( "|S10", "|f8", @@ -65,86 +48,3 @@ class TestVariableLengthString(BaseTestZDType): # type: ignore[no-redef] {"name": "numpy.variable_length_utf8", "configuration": {"invalid_key": "value"}}, {"name": "invalid_name"}, ) - - scalar_v2_params = ((VariableLengthUTF8(), ""), (VariableLengthUTF8(), "hi")) - scalar_v3_params = ( - (VariableLengthUTF8(), ""), - (VariableLengthUTF8(), "hi"), - ) - - cast_value_params = ( - (VariableLengthUTF8(), "", np.str_("")), - (VariableLengthUTF8(), "hi", np.str_("hi")), - ) - # anything can become a string - invalid_scalar_params = (None,) - item_size_params = (VariableLengthUTF8(),) - - -class TestFixedLengthUTF32(BaseTestZDType): - test_cls = FixedLengthUTF32 - valid_dtype = (np.dtype(">U10"), np.dtype("U10", "object_codec_id": None}, - {"name": " None: - """ - Test that we get a warning when serializing a dtype without a zarr v3 spec to json - when zarr_format is 3 - """ - with pytest.raises(UnstableSpecificationWarning): - zdtype.to_json(zarr_format=3) - - -def test_invalid_size() -> None: - """ - Test that it's impossible to create a data type that has no length - """ - length = 0 - msg = f"length must be >= 1, got {length}." 
- with pytest.raises(ValueError, match=msg): - FixedLengthUTF32(length=length) diff --git a/tests/test_dtype/test_npy/test_time.py b/tests/test_dtype/test_npy/test_time.py index b94b600cbf..a5d2cce545 100644 --- a/tests/test_dtype/test_npy/test_time.py +++ b/tests/test_dtype/test_npy/test_time.py @@ -1,32 +1,12 @@ from __future__ import annotations -import re -from typing import get_args - import numpy as np -import pytest - -from tests.test_dtype.test_wrapper import BaseTestZDType -from zarr.core.dtype.npy.common import DateTimeUnit -from zarr.core.dtype.npy.time import DateTime64, TimeDelta64, datetime_from_int - - -class _TestTimeBase(BaseTestZDType): - def json_scalar_equals(self, scalar1: object, scalar2: object) -> bool: - # This method gets overridden here to support the equivalency between NaT and - # -9223372036854775808 fill values - nat_scalars = (-9223372036854775808, "NaT") - if scalar1 in nat_scalars and scalar2 in nat_scalars: - return True - return scalar1 == scalar2 - def scalar_equals(self, scalar1: object, scalar2: object) -> bool: - if np.isnan(scalar1) and np.isnan(scalar2): # type: ignore[call-overload] - return True - return super().scalar_equals(scalar1, scalar2) +from tests.test_dtype.test_wrapper import _TestZDType +from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 -class TestDateTime64(_TestTimeBase): +class TestDateTime64(_TestZDType): test_cls = DateTime64 valid_dtype = (np.dtype("datetime64[10ns]"), np.dtype("datetime64[us]"), np.dtype("datetime64")) invalid_dtype = ( @@ -34,15 +14,10 @@ class TestDateTime64(_TestTimeBase): np.dtype(np.float64), np.dtype("timedelta64[ns]"), ) - valid_json_v2 = ( - {"name": ">M8", "object_codec_id": None}, - {"name": ">M8[s]", "object_codec_id": None}, - {"name": "M8", ">M8[s]", "m8", "object_codec_id": None}, - {"name": ">m8[s]", "object_codec_id": None}, - {"name": " None: - """ - Test that an invalid unit raises a ValueError. 
- """ - unit = "invalid" - msg = f"unit must be one of ('Y', 'M', 'W', 'D', 'h', 'm', 's', 'ms', 'us', 'μs', 'ns', 'ps', 'fs', 'as', 'generic'), got {unit!r}." - with pytest.raises(ValueError, match=re.escape(msg)): - DateTime64(unit=unit) # type: ignore[arg-type] - with pytest.raises(ValueError, match=re.escape(msg)): - TimeDelta64(unit=unit) # type: ignore[arg-type] - - -def test_time_scale_factor_too_low() -> None: - """ - Test that an invalid unit raises a ValueError. - """ - scale_factor = 0 - msg = f"scale_factor must be > 0, got {scale_factor}." - with pytest.raises(ValueError, match=msg): - DateTime64(scale_factor=scale_factor) - with pytest.raises(ValueError, match=msg): - TimeDelta64(scale_factor=scale_factor) - - -def test_time_scale_factor_too_high() -> None: - """ - Test that an invalid unit raises a ValueError. - """ - scale_factor = 2**31 - msg = f"scale_factor must be < 2147483648, got {scale_factor}." - with pytest.raises(ValueError, match=msg): - DateTime64(scale_factor=scale_factor) - with pytest.raises(ValueError, match=msg): - TimeDelta64(scale_factor=scale_factor) - - -@pytest.mark.parametrize("unit", get_args(DateTimeUnit)) -@pytest.mark.parametrize("scale_factor", [1, 10]) -@pytest.mark.parametrize("value", [0, 1, 10]) -def test_datetime_from_int(unit: DateTimeUnit, scale_factor: int, value: int) -> None: - """ - Test datetime_from_int. 
- """ - expected = np.int64(value).view(f"datetime64[{scale_factor}{unit}]") - assert datetime_from_int(value, unit=unit, scale_factor=scale_factor) == expected diff --git a/tests/test_dtype/test_wrapper.py b/tests/test_dtype/test_wrapper.py index c6093ebb01..bbe74d9a0f 100644 --- a/tests/test_dtype/test_wrapper.py +++ b/tests/test_dtype/test_wrapper.py @@ -37,22 +37,30 @@ class _TestZDType: valid_dtype: ClassVar[tuple[TBaseDType, ...]] = () invalid_dtype: ClassVar[tuple[TBaseDType, ...]] = () - valid_json_v2: ClassVar[tuple[str | dict[str, Any], ...]] = () - invalid_json_v2: ClassVar[tuple[str | dict[str, Any], ...]] = () + valid_json_v2: ClassVar[tuple[str | dict[str, object] | list[object], ...]] = () + invalid_json_v2: ClassVar[tuple[str | dict[str, object] | list[object], ...]] = () - valid_json_v3: ClassVar[tuple[str | dict[str, Any], ...]] = () - invalid_json_v3: ClassVar[tuple[str | dict[str, Any], ...]] = () + valid_json_v3: ClassVar[tuple[str | dict[str, object], ...]] = () + invalid_json_v3: ClassVar[tuple[str | dict[str, object], ...]] = () - def test_check_dtype_valid(self, valid_dtype: Any) -> None: - assert self.test_cls.check_dtype(valid_dtype) + def test_check_dtype_valid(self, valid_dtype: object) -> None: + assert self.test_cls.check_dtype(valid_dtype) # type: ignore[arg-type] - def test_check_dtype_invalid(self, invalid_dtype: Any) -> None: - assert not self.test_cls.check_dtype(invalid_dtype) + def test_check_dtype_invalid(self, invalid_dtype: object) -> None: + assert not self.test_cls.check_dtype(invalid_dtype) # type: ignore[arg-type] def test_from_dtype_roundtrip(self, valid_dtype: Any) -> None: zdtype = self.test_cls.from_dtype(valid_dtype) assert zdtype.to_dtype() == valid_dtype + def test_from_json_roundtrip_v2(self, valid_json_v2: Any) -> None: + zdtype = self.test_cls.from_json(valid_json_v2, zarr_format=2) + assert zdtype.to_json(zarr_format=2) == valid_json_v2 + + def test_from_json_roundtrip_v3(self, valid_json_v3: Any) -> None: + 
zdtype = self.test_cls.from_json(valid_json_v3, zarr_format=3) + assert zdtype.to_json(zarr_format=3) == valid_json_v3 + """ @abc.abstractmethod def test_cast_value(self, value: Any) -> None: raise NotImplementedError From 813a3b9dbbfec0374f8ffb7a3233870107edf62d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 8 May 2025 17:14:43 +0200 Subject: [PATCH 076/129] fill out more tests, and adjust sized dtypes --- src/zarr/core/dtype/npy/sized.py | 49 +++++++++---------- src/zarr/core/dtype/npy/time.py | 38 +++++++++------ src/zarr/core/dtype/wrapper.py | 13 +++++ tests/test_dtype/conftest.py | 2 +- tests/test_dtype/test_npy/test_bool.py | 7 ++- tests/test_dtype/test_npy/test_complex.py | 22 +++++++-- tests/test_dtype/test_npy/test_float.py | 30 ++++++++++-- tests/test_dtype/test_npy/test_int.py | 56 +++++++++++++++------- tests/test_dtype/test_npy/test_sized.py | 56 +++++++++++++++++----- tests/test_dtype/test_npy/test_string.py | 4 +- tests/test_dtype/test_npy/test_time.py | 54 ++++++++++++++++++--- tests/test_dtype/test_wrapper.py | 58 ++++++++++++----------- 12 files changed, 275 insertions(+), 114 deletions(-) diff --git a/src/zarr/core/dtype/npy/sized.py b/src/zarr/core/dtype/npy/sized.py index d9524a4891..032a1ec5c0 100644 --- a/src/zarr/core/dtype/npy/sized.py +++ b/src/zarr/core/dtype/npy/sized.py @@ -23,11 +23,10 @@ class FixedLengthAscii(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength): dtype_cls = np.dtypes.BytesDType _zarr_v3_name = "numpy.fixed_length_ascii" - item_size_bits: ClassVar[int] = 8 @classmethod def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) + return cls(length=dtype.itemsize) def to_dtype(self) -> np.dtypes.BytesDType[int]: return self.dtype_cls(self.length) @@ -43,12 +42,10 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: elif zarr_format == 3: return ( isinstance(data, dict) - and "name" in data + and set(data.keys()) 
== {"name", "configuration"} and data["name"] == cls._zarr_v3_name - and "configuration" in data and isinstance(data["configuration"], dict) - and "length_bits" in data["configuration"] - and isinstance(data["configuration"]["length_bits"], int) + and "length_bytes" in data["configuration"] ) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -58,7 +55,7 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: elif zarr_format == 3: return { "name": self._zarr_v3_name, - "configuration": {"length_bits": self.length * self.item_size_bits}, + "configuration": {"length_bytes": self.length}, } raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -67,7 +64,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: - return cls(length=data["configuration"]["length_bits"] // cls.item_size_bits) # type: ignore[arg-type, index, call-overload, operator] + return cls(length=data["configuration"]["length_bytes"]) # type: ignore[arg-type, index, call-overload] raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.bytes_: @@ -94,12 +91,11 @@ class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength): # it cannot be used to create instances of the dtype # so we have to tell mypy to ignore this here dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] - _zarr_v3_name = "numpy.void" - item_size_bits: ClassVar[int] = 8 + _zarr_v3_name = "numpy.fixed_length_bytes" @classmethod def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - return cls(length=dtype.itemsize // (cls.item_size_bits // 8)) + return cls(length=dtype.itemsize) def to_dtype(self) -> np.dtypes.VoidDType[int]: # Numpy does not allow creating a void type @@ -114,9 +110,10 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> 
TypeGuard[JSON]: elif zarr_format == 3: return ( isinstance(data, dict) - and "name" in data - and isinstance(data["name"], str) - and (re.match(r"^r\d+$", data["name"]) is not None) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and isinstance(data["configuration"], dict) + and set(data["configuration"].keys()) == {"length_bytes"} ) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -124,7 +121,7 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: if zarr_format == 2: return self.to_dtype().str elif zarr_format == 3: - return {"name": f"r{self.length * self.item_size_bits}"} + return {"name": self._zarr_v3_name, "configuration": {"length_bytes": self.length}} raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod @@ -132,7 +129,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: - return cls(length=int(data["name"][1:]) // cls.item_size_bits) # type: ignore[arg-type, index, call-overload] + return cls(length=data["configuration"]["length_bytes"]) # type: ignore[arg-type, index, call-overload] raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod @@ -178,13 +175,13 @@ def _cast_value_unsafe(self, value: object) -> np.void: class FixedLengthUnicode(ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength): dtype_cls = np.dtypes.StrDType _zarr_v3_name = "numpy.fixed_length_ucs4" - item_size_bits: ClassVar[int] = 32 # UCS4 is 32 bits per code point + item_size_bytes: ClassVar[int] = 4 # UCS4 is 4 bytes per code point @classmethod def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls( - length=dtype.itemsize // (cls.item_size_bits // 8), + length=dtype.itemsize // (cls.item_size_bytes), 
endianness=endianness_from_numpy_str(byte_order), ) @@ -203,12 +200,12 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: elif zarr_format == 3: return ( isinstance(data, dict) - and "name" in data + and set(data.keys()) == {"name", "configuration"} and data["name"] == cls._zarr_v3_name and "configuration" in data and isinstance(data["configuration"], dict) - and "length_bits" in data["configuration"] - and isinstance(data["configuration"]["length_bits"], int) + and set(data["configuration"].keys()) == {"length_bytes"} + and isinstance(data["configuration"]["length_bytes"], int) ) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -218,7 +215,7 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: elif zarr_format == 3: return { "name": self._zarr_v3_name, - "configuration": {"length_bits": self.length * self.item_size_bits}, + "configuration": {"length_bytes": self.length * self.item_size_bytes}, } raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -227,7 +224,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: - return cls(length=data["configuration"]["length_bits"] // cls.item_size_bits) # type: ignore[arg-type, index, call-overload, operator] + return cls(length=data["configuration"]["length_bytes"] // cls.item_size_bytes) # type: ignore[arg-type, index, call-overload, operator] raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.str_: @@ -344,7 +341,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: for f_name, f_dtype in data ) ) - elif zarr_format == 3: # noqa: SIM102 + elif zarr_format == 3: if isinstance(data, dict) and "configuration" in data: config = data["configuration"] if isinstance(config, dict) and "fields" in config: @@ 
-354,6 +351,10 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: for f_name, f_dtype in meta_fields ) return cls(fields=fields) + else: + raise TypeError(f"Invalid type: {data}. Expected a dictionary.") + else: + raise TypeError(f"Invalid type: {data}. Expected a dictionary.") raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index f691bd88c8..b8fc85b297 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -105,20 +105,31 @@ class TimeDTypeBase(ZDType[_BaseTimeDType_co, _BaseTimeScalar], HasEndianness): # because the particular numpy dtype we are wrapping does not allow direct construction via # cls.dtype_cls() _numpy_name: ClassVar[_DTypeName] - interval: int + scale_factor: int unit: DateTimeUnit + def __post_init__(self) -> None: + if self.scale_factor < 1: + raise ValueError(f"scale_factor must be > 0, got {self.scale_factor}.") + if self.scale_factor >= 2**31: + raise ValueError(f"scale_factor must be < 2147483648, got {self.scale_factor}.") + if self.unit not in get_args(DateTimeUnit): + raise ValueError(f"unit must be one of {get_args(DateTimeUnit)}, got {self.unit!r}.") + @classmethod def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - unit, interval = np.datetime_data(dtype.name) + unit, scale_factor = np.datetime_data(dtype.name) + unit = cast("DateTimeUnit", unit) byteorder = cast("EndiannessNumpy", dtype.byteorder) - return cls(unit=unit, interval=interval, endianness=endianness_from_numpy_str(byteorder)) # type: ignore[arg-type] + return cls( + unit=unit, scale_factor=scale_factor, endianness=endianness_from_numpy_str(byteorder) + ) def to_dtype(self) -> _BaseTimeDType_co: # Numpy does not allow creating datetime64 or timedelta64 via # np.dtypes.{dtype_name}() # so we use np.dtype with a 
formatted string. - dtype_string = f"{self._numpy_name}[{self.interval}{self.unit}]" + dtype_string = f"{self._numpy_name}[{self.scale_factor}{self.unit}]" return np.dtype(dtype_string).newbyteorder(endianness_to_numpy_str(self.endianness)) # type: ignore[return-value] @classmethod @@ -127,8 +138,8 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: unit = data["configuration"]["unit"] # type: ignore[index, call-overload] - interval = data["configuration"]["interval"] # type: ignore[index, call-overload] - return cls(unit=unit, interval=interval) # type: ignore[arg-type] + scale_factor = data["configuration"]["scale_factor"] # type: ignore[index, call-overload] + return cls(unit=unit, scale_factor=scale_factor) # type: ignore[arg-type] raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json(self, zarr_format: ZarrFormat) -> JSON: @@ -137,7 +148,7 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: elif zarr_format == 3: return { "name": self._zarr_v3_name, - "configuration": {"unit": self.unit, "interval": self.interval}, + "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, } raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -166,7 +177,7 @@ class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], Has _zarr_v3_name = "numpy.timedelta64" _zarr_v2_names = (">m8", " np.timedelta64: @@ -174,7 +185,7 @@ def default_value(self) -> np.timedelta64: def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: if check_json_int(data): - return self.to_dtype().type(data, f"{self.interval}{self.unit}") + return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") raise TypeError(f"Invalid type: {data}. 
Expected an integer.") def _cast_value_unsafe(self, value: object) -> np.timedelta64: @@ -202,8 +213,7 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: and data["name"] == cls._zarr_v3_name and set(data.keys()) == {"name", "configuration"} and isinstance(data["configuration"], dict) - and set(data["configuration"].keys()) == {"unit", "interval"} - and data["configuration"]["unit"] in get_args(DateTimeUnit) + and set(data["configuration"].keys()) == {"unit", "scale_factor"} ) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -215,14 +225,14 @@ class DateTime64(TimeDTypeBase[np.dtypes.DateTime64DType, np.datetime64], HasEnd _zarr_v2_names = (">M8", " np.datetime64: return np.datetime64("NaT") def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: if check_json_int(data): - return self.to_dtype().type(data, f"{self.interval}{self.unit}") + return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") raise TypeError(f"Invalid type: {data}. 
Expected an integer.") def _cast_value_unsafe(self, value: object) -> np.datetime64: @@ -248,7 +258,7 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} and data["name"] == cls._zarr_v3_name - and set(data["configuration"].keys()) == {"unit", "interval"} + and set(data["configuration"].keys()) == {"unit", "scale_factor"} and data["configuration"]["unit"] in get_args(DateTimeUnit) ) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index be51db3ae5..0600fab80b 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -22,6 +22,7 @@ from __future__ import annotations +import warnings from abc import ABC, abstractmethod from dataclasses import dataclass from typing import TYPE_CHECKING, ClassVar, Generic, Self, TypeGuard, TypeVar @@ -329,3 +330,15 @@ def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScal The native scalar value. """ ... + + +def v3_unstable_dtype_warning(dtype: ZDType[TBaseDType, TBaseScalar]) -> None: + msg = ( + f"You are using a data type ({dtype}) that does not have a stable Zarr V3 specification." + "Be advised that arrays stored with this data type may be unreadable by other Zarr " + "libraries, and possibly future versions of Zarr-Python as well. " + "Use this data type at your own risk." + "See https://github.com/zarr-developers/zarr-extensions/tree/main/data-types for a list" + "of data types with a stable Zarr V3 specification." 
+ ) + warnings.warn(msg, category=FutureWarning, stacklevel=2) diff --git a/tests/test_dtype/conftest.py b/tests/test_dtype/conftest.py index 2b4bb0b685..d8ef17a039 100644 --- a/tests/test_dtype/conftest.py +++ b/tests/test_dtype/conftest.py @@ -17,7 +17,7 @@ elif issubclass(wrapper_cls, HasLength): zdtype_examples += (wrapper_cls(length=1),) elif issubclass(wrapper_cls, DateTime64 | TimeDelta64): - zdtype_examples += (wrapper_cls(unit="s", interval=10),) + zdtype_examples += (wrapper_cls(unit="s", scale_factor=10),) else: zdtype_examples += (wrapper_cls(),) diff --git a/tests/test_dtype/test_npy/test_bool.py b/tests/test_dtype/test_npy/test_bool.py index e4e5dd541e..1040683846 100644 --- a/tests/test_dtype/test_npy/test_bool.py +++ b/tests/test_dtype/test_npy/test_bool.py @@ -14,8 +14,8 @@ class TestBool(_TestZDType): np.dtype(np.float64), np.dtype(np.uint16), ) - valid_json_v2 = Bool._zarr_v2_names - valid_json_v3_cases = (Bool._zarr_v3_name,) + valid_json_v2 = ("|b1",) + valid_json_v3 = ("bool",) invalid_json_v2 = ( "|b1", "bool", @@ -26,3 +26,6 @@ class TestBool(_TestZDType): "|f8", {"name": "bool", "configuration": {"endianness": "little"}}, ) + + scalar_v2_params = (("|b1", True), ("|b1", False)) + scalar_v3_params = (("bool", True), ("bool", False)) diff --git a/tests/test_dtype/test_npy/test_complex.py b/tests/test_dtype/test_npy/test_complex.py index 6621d625d9..aac514028d 100644 --- a/tests/test_dtype/test_npy/test_complex.py +++ b/tests/test_dtype/test_npy/test_complex.py @@ -14,8 +14,8 @@ class TestComplex64(_TestZDType): np.dtype(np.float64), np.dtype(np.complex128), ) - valid_json_v2 = Complex64._zarr_v2_names - valid_json_v3_cases = (Complex64._zarr_v3_name,) + valid_json_v2 = (">c8", ">c8") + valid_json_v3 = ("complex64",) invalid_json_v2 = ( "|c8", "complex64", @@ -27,6 +27,13 @@ class TestComplex64(_TestZDType): {"name": "complex64", "configuration": {"endianness": "little"}}, ) + scalar_v2_params = ((">c8", (1.0, 1.0)), ("c8", (0, "NaN"))) + 
scalar_v3_params = ( + ("complex64", (1.0, 1.0)), + ("complex64", (-1.0, "Infinity")), + ("complex64", (0, "NaN")), + ) + class TestComplex128(_TestZDType): test_cls = Complex128 @@ -36,8 +43,8 @@ class TestComplex128(_TestZDType): np.dtype(np.float64), np.dtype(np.complex64), ) - valid_json_v2 = Complex128._zarr_v2_names - valid_json_v3_cases = (Complex128._zarr_v3_name,) + valid_json_v2 = (">c16", "c16", (1.0, 1.0)), ("c16", (0, "NaN"))) + scalar_v3_params = ( + ("complex128", (1.0, 1.0)), + ("complex128", (-1.0, "Infinity")), + ("complex128", (0, "NaN")), + ) diff --git a/tests/test_dtype/test_npy/test_float.py b/tests/test_dtype/test_npy/test_float.py index a9de0145c6..232ed1e32c 100644 --- a/tests/test_dtype/test_npy/test_float.py +++ b/tests/test_dtype/test_npy/test_float.py @@ -15,7 +15,7 @@ class TestFloat16(_TestZDType): np.dtype(np.float32), ) valid_json_v2 = Float16._zarr_v2_names - valid_json_v3_cases = (Float16._zarr_v3_name,) + valid_json_v3 = (Float16._zarr_v3_name,) invalid_json_v2 = ( "|f2", "float16", @@ -27,6 +27,14 @@ class TestFloat16(_TestZDType): {"name": "float16", "configuration": {"endianness": "little"}}, ) + scalar_v2_params = ((">f2", 1.0), ("f2", "Infinity")) + scalar_v3_params = ( + ("float16", 1.0), + ("float16", -1.0), + ("float16", "NaN"), + ("float16", "Infinity"), + ) + class TestFloat32(_TestZDType): test_cls = Float32 @@ -37,7 +45,7 @@ class TestFloat32(_TestZDType): np.dtype(np.float64), ) valid_json_v2 = Float32._zarr_v2_names - valid_json_v3_cases = (Float32._zarr_v3_name,) + valid_json_v3 = (Float32._zarr_v3_name,) invalid_json_v2 = ( "|f4", "float32", @@ -49,6 +57,14 @@ class TestFloat32(_TestZDType): {"name": "float32", "configuration": {"endianness": "little"}}, ) + scalar_v2_params = ((">f4", 1.0), ("f4", "Infinity")) + scalar_v3_params = ( + ("float32", 1.0), + ("float32", -1.0), + ("float32", "NaN"), + ("float32", "Infinity"), + ) + class TestFloat64(_TestZDType): test_cls = Float64 @@ -59,7 +75,7 @@ class 
TestFloat64(_TestZDType): np.dtype(np.float32), ) valid_json_v2 = Float64._zarr_v2_names - valid_json_v3_cases = (Float64._zarr_v3_name,) + valid_json_v3 = (Float64._zarr_v3_name,) invalid_json_v2 = ( "|f8", "float64", @@ -70,3 +86,11 @@ class TestFloat64(_TestZDType): "|i1", {"name": "float64", "configuration": {"endianness": "little"}}, ) + + scalar_v2_params = ((">f8", 1.0), ("f8", "Infinity")) + scalar_v3_params = ( + ("float64", 1.0), + ("float64", -1.0), + ("float64", "NaN"), + ("float64", "Infinity"), + ) diff --git a/tests/test_dtype/test_npy/test_int.py b/tests/test_dtype/test_npy/test_int.py index 2f149ff58f..99f698fc8e 100644 --- a/tests/test_dtype/test_npy/test_int.py +++ b/tests/test_dtype/test_npy/test_int.py @@ -14,8 +14,8 @@ class TestInt8(_TestZDType): np.dtype(np.uint16), np.dtype(np.float64), ) - valid_json_v2 = Int8._zarr_v2_names - valid_json_v3_cases = (Int8._zarr_v3_name,) + valid_json_v2 = ("|i1",) + valid_json_v3 = ("int8",) invalid_json_v2 = ( ">i1", "int8", @@ -27,6 +27,9 @@ class TestInt8(_TestZDType): {"name": "int8", "configuration": {"endianness": "little"}}, ) + scalar_v2_params = (("|i1", 1), ("|i1", -1)) + scalar_v3_params = (("int8", 1), ("int8", -1)) + class TestInt16(_TestZDType): test_cls = Int16 @@ -36,8 +39,8 @@ class TestInt16(_TestZDType): np.dtype(np.uint16), np.dtype(np.float64), ) - valid_json_v2 = Int16._zarr_v2_names - valid_json_v3_cases = (Int16._zarr_v3_name,) + valid_json_v2 = (">i2", "i2", -1)) + scalar_v3_params = (("int16", 1), ("int16", -1)) + class TestInt32(_TestZDType): test_cls = Int32 @@ -58,8 +64,8 @@ class TestInt32(_TestZDType): np.dtype(np.uint16), np.dtype(np.float64), ) - valid_json_v2 = Int32._zarr_v2_names - valid_json_v3_cases = (Int32._zarr_v3_name,) + valid_json_v2 = (">i4", "i4", -1)) + scalar_v3_params = (("int32", 1), ("int32", -1)) + class TestInt64(_TestZDType): test_cls = Int64 @@ -80,8 +89,8 @@ class TestInt64(_TestZDType): np.dtype(np.uint16), np.dtype(np.float64), ) - valid_json_v2 = 
Int64._zarr_v2_names - valid_json_v3_cases = (Int64._zarr_v3_name,) + valid_json_v2 = (">i8", "i8", -1)) + scalar_v3_params = (("int64", 1), ("int64", -1)) + class TestUInt8(_TestZDType): test_cls = UInt8 @@ -102,8 +114,8 @@ class TestUInt8(_TestZDType): np.dtype(np.int16), np.dtype(np.float64), ) - valid_json_v2 = UInt8._zarr_v2_names - valid_json_v3_cases = (UInt8._zarr_v3_name,) + valid_json_v2 = ("|u1",) + valid_json_v3 = ("uint8",) invalid_json_v2 = ( "|u1", "uint8", @@ -115,6 +127,9 @@ class TestUInt8(_TestZDType): {"name": "uint8", "configuration": {"endianness": "little"}}, ) + scalar_v2_params = (("|u1", 1), ("|u1", 0)) + scalar_v3_params = (("uint8", 1), ("uint8", 0)) + class TestUInt16(_TestZDType): test_cls = UInt16 @@ -124,8 +139,8 @@ class TestUInt16(_TestZDType): np.dtype(np.int16), np.dtype(np.float64), ) - valid_json_v2 = UInt16._zarr_v2_names - valid_json_v3_cases = (UInt16._zarr_v3_name,) + valid_json_v2 = (">u2", "u2", 0)) + scalar_v3_params = (("uint16", 1), ("uint16", 0)) + class TestUInt32(_TestZDType): test_cls = UInt32 @@ -146,8 +164,8 @@ class TestUInt32(_TestZDType): np.dtype(np.int16), np.dtype(np.float64), ) - valid_json_v2 = UInt32._zarr_v2_names - valid_json_v3_cases = (UInt32._zarr_v3_name,) + valid_json_v2 = (">u4", "u4", 0)) + scalar_v3_params = (("uint32", 1), ("uint32", 0)) + class TestUInt64(_TestZDType): test_cls = UInt64 @@ -168,8 +189,8 @@ class TestUInt64(_TestZDType): np.dtype(np.int16), np.dtype(np.float64), ) - valid_json_v2 = UInt64._zarr_v2_names - valid_json_v3_cases = (UInt64._zarr_v3_name,) + valid_json_v2 = (">u8", "u8", 0)) + scalar_v3_params = (("uint64", 1), ("uint64", 0)) diff --git a/tests/test_dtype/test_npy/test_sized.py b/tests/test_dtype/test_npy/test_sized.py index 887d734fd3..17f4b2af2d 100644 --- a/tests/test_dtype/test_npy/test_sized.py +++ b/tests/test_dtype/test_npy/test_sized.py @@ -20,9 +20,7 @@ class TestFixedLengthAscii(_TestZDType): np.dtype("|U10"), ) valid_json_v2 = ("|S0", "|S2", "|S4") - 
valid_json_v3_cases = ( - {"name": "numpy.fixed_length_ascii", "configuration": {"length_bits": 80}}, - ) + valid_json_v3 = ({"name": "numpy.fixed_length_ascii", "configuration": {"length_bytes": 10}},) invalid_json_v2 = ( "|S", "|U10", @@ -33,6 +31,13 @@ class TestFixedLengthAscii(_TestZDType): {"name": "numpy.fixed_length_ascii", "configuration": {"length_bits": "invalid"}}, ) + scalar_v2_params = (("|S0", ""), ("|S2", "YWI="), ("|S4", "YWJjZA==")) + scalar_v3_params = ( + ({"name": "numpy.fixed_length_ascii", "configuration": {"length_bytes": 0}}, ""), + ({"name": "numpy.fixed_length_ascii", "configuration": {"length_bytes": 16}}, "YWI="), + ({"name": "numpy.fixed_length_ascii", "configuration": {"length_bytes": 32}}, "YWJjZA=="), + ) + class TestFixedLengthBytes(_TestZDType): test_cls = FixedLengthBytes @@ -43,17 +48,28 @@ class TestFixedLengthBytes(_TestZDType): np.dtype("|S10"), ) valid_json_v2 = ("|V10",) - valid_json_v3_cases = ({"name": "r80"},) + valid_json_v3 = ( + {"name": "numpy.fixed_length_bytes", "configuration": {"length_bytes": 0}}, + {"name": "numpy.fixed_length_bytes", "configuration": {"length_bytes": 8}}, + ) + invalid_json_v2 = ( "|V", "|S10", "|f8", ) invalid_json_v3 = ( - {"name": "r0"}, + {"name": "r10"}, {"name": "r-80"}, ) + scalar_v2_params = (("|V0", ""), ("|V2", "YWI="), ("|V4", "YWJjZA==")) + scalar_v3_params = ( + ({"name": "numpy.fixed_length_bytes", "configuration": {"length_bytes": 2}}, ""), + ({"name": "numpy.fixed_length_bytes", "configuration": {"length_bytes": 2}}, "YWI="), + ({"name": "numpy.fixed_length_bytes", "configuration": {"length_bytes": 4}}, "YWJjZA=="), + ) + class TestFixedLengthUnicode(_TestZDType): test_cls = FixedLengthUnicode @@ -64,9 +80,7 @@ class TestFixedLengthUnicode(_TestZDType): np.dtype("|S10"), ) valid_json_v2 = (">U10", "U0", ""), ("i4"), ("field2", ">f8")], [("field1", ">i8"), ("field2", ">i4")], ) - valid_json_v3_cases = ( + valid_json_v3 = ( { "name": "structured", "configuration": { "fields": [ - 
("field1", {"name": "int32", "configuration": {"endianness": "big"}}), - ("field2", {"name": "float64", "configuration": {"endianness": "big"}}), + ("field1", "int32"), + ("field2", "float64"), ] }, }, @@ -107,8 +128,17 @@ class TestStructured(_TestZDType): "name": "structured", "configuration": { "fields": [ - ("field1", {"name": "int64", "configuration": {"endianness": "big"}}), - ("field2", {"name": "int32", "configuration": {"endianness": "big"}}), + ( + "field1", + { + "name": "numpy.datetime64", + "configuration": {"unit": "s", "scale_factor": 1}, + }, + ), + ( + "field2", + {"name": "numpy.fixed_length_ucs4", "configuration": {"length_bytes": 32}}, + ), ] }, }, diff --git a/tests/test_dtype/test_npy/test_string.py b/tests/test_dtype/test_npy/test_string.py index 2f77379f01..fbb0aaa86d 100644 --- a/tests/test_dtype/test_npy/test_string.py +++ b/tests/test_dtype/test_npy/test_string.py @@ -16,7 +16,7 @@ class TestVariableLengthString(_TestZDType): np.dtype("|S10"), ) valid_json_v2 = ("|O",) - valid_json_v3_cases = ({"name": "numpy.variable_length_utf8"},) + valid_json_v3 = ("numpy.variable_length_utf8",) invalid_json_v2 = ( "|S10", "|f8", @@ -38,7 +38,7 @@ class TestVariableLengthString(_TestZDType): np.dtype("|S10"), ) valid_json_v2 = ("|O",) - valid_json_v3_cases = ({"name": "numpy.variable_length_utf8"},) + valid_json_v3 = ("numpy.variable_length_utf8",) invalid_json_v2 = ( "|S10", "|f8", diff --git a/tests/test_dtype/test_npy/test_time.py b/tests/test_dtype/test_npy/test_time.py index a5d2cce545..2a8ff6ac98 100644 --- a/tests/test_dtype/test_npy/test_time.py +++ b/tests/test_dtype/test_npy/test_time.py @@ -1,6 +1,9 @@ from __future__ import annotations +import re + import numpy as np +import pytest from tests.test_dtype.test_wrapper import _TestZDType from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 @@ -15,9 +18,9 @@ class TestDateTime64(_TestZDType): np.dtype("timedelta64[ns]"), ) valid_json_v2 = (">M8", ">M8[s]", " None: + """ + Test that an 
+def test_time_scale_factor_too_low() -> None:
+    """
+    Test that a scale_factor that is too low raises a ValueError.
+    """
+    scale_factor = 0
+    msg = f"scale_factor must be > 0, got {scale_factor}."
+    with pytest.raises(ValueError, match=msg):
+        DateTime64(scale_factor=scale_factor)
+    with pytest.raises(ValueError, match=msg):
+        TimeDelta64(scale_factor=scale_factor)
+
+
+def test_time_scale_factor_too_high() -> None:
+    """
+    Test that a scale_factor that is too high raises a ValueError.
+    """
npst.unicode_string_dtypes(endianness="=") - | npst.datetime64_dtypes(endianness="=") - | npst.timedelta64_dtypes(endianness="=") - ) - - -def get_classvar_attributes(cls: type) -> dict[str, Any]: - classvar_attributes = {} - for name, annotation in cls.__annotations__.items(): - if getattr(annotation, "__origin__", None) is ClassVar: - classvar_attributes[name] = getattr(cls, name) - return classvar_attributes +if TYPE_CHECKING: + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType class _TestZDType: @@ -43,6 +18,13 @@ class _TestZDType: valid_json_v3: ClassVar[tuple[str | dict[str, object], ...]] = () invalid_json_v3: ClassVar[tuple[str | dict[str, object], ...]] = () + # for testing scalar round-trip serialization, we need a tuple of (data type json, scalar json) + # pairs. the first element of the pair is used to create a dtype instance, and the second + # element is the json serialization of the scalar that we want to round-trip. + + scalar_v2_params: ClassVar[tuple[tuple[Any, Any], ...]] = () + scalar_v3_params: ClassVar[tuple[tuple[Any, Any], ...]] = () + def test_check_dtype_valid(self, valid_dtype: object) -> None: assert self.test_cls.check_dtype(valid_dtype) # type: ignore[arg-type] @@ -61,6 +43,26 @@ def test_from_json_roundtrip_v3(self, valid_json_v3: Any) -> None: zdtype = self.test_cls.from_json(valid_json_v3, zarr_format=3) assert zdtype.to_json(zarr_format=3) == valid_json_v3 + def test_scalar_roundtrip_v2(self, scalar_v2_params: Any) -> None: + dtype_json, scalar_json = scalar_v2_params + zdtype = self.test_cls.from_json(dtype_json, zarr_format=2) + scalar = zdtype.from_json_value(scalar_json, zarr_format=2) + assert self._scalar_equals(scalar_json, zdtype.to_json_value(scalar, zarr_format=2)) + + def test_scalar_roundtrip_v3(self, scalar_v3_params: Any) -> None: + dtype_json, scalar_json = scalar_v3_params + zdtype = self.test_cls.from_json(dtype_json, zarr_format=3) + scalar = zdtype.from_json_value(scalar_json, zarr_format=3) + 
assert self._scalar_equals(scalar_json, zdtype.to_json_value(scalar, zarr_format=3)) + + @staticmethod + def _scalar_equals(a: object, b: object) -> bool: + """ + Compare two scalars for equality. Subclasses that test dtypes with scalars that don't allow + simple equality like nans should override this method. + """ + return a == b + """ @abc.abstractmethod def test_cast_value(self, value: Any) -> None: raise NotImplementedError From a832110883cca5af9da57e01a49f7bcdd45ff471 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 12 May 2025 13:37:52 +0200 Subject: [PATCH 077/129] wip: json schema test --- tests/test_dtype/conftest.py | 10 +++++- tests/test_dtype/test_wrapper.py | 61 ++++++++++---------------------- 2 files changed, 28 insertions(+), 43 deletions(-) diff --git a/tests/test_dtype/conftest.py b/tests/test_dtype/conftest.py index d8ef17a039..9c7825c0d1 100644 --- a/tests/test_dtype/conftest.py +++ b/tests/test_dtype/conftest.py @@ -53,6 +53,14 @@ class TestB(TestExample): param_a = [1, 2, 100, 10] """ + # Iterate over all the fixtures defined in the class + # and parametrize them with the values defined in the class + # This allows us to define class-scoped fixtures as class attributes + # and then generate the parametrize calls for pytest for fixture_name in metafunc.fixturenames: if hasattr(metafunc.cls, fixture_name): - metafunc.parametrize(fixture_name, getattr(metafunc.cls, fixture_name), scope="class") + params = getattr(metafunc.cls, fixture_name) + if len(params) == 0: + msg = f"{metafunc.cls}.{fixture_name} is empty. Please provide a non-empty sequence of values." 
+ raise ValueError(msg) + metafunc.parametrize(fixture_name, params, scope="class") diff --git a/tests/test_dtype/test_wrapper.py b/tests/test_dtype/test_wrapper.py index 49e05340e0..defd3fffc5 100644 --- a/tests/test_dtype/test_wrapper.py +++ b/tests/test_dtype/test_wrapper.py @@ -5,6 +5,23 @@ if TYPE_CHECKING: from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType +import pytest +import requests + + +class _TestZDTypeSchema: + # subclasses define the URL for the schema, if available + schema_url: ClassVar[str] = "" + + @pytest.fixture(scope="class") + def get_schema(self) -> object: + response = requests.get(self.schema_url) + response.raise_for_status() + return json_schema.loads(response.text) + + def test_schema(self, schema: json_schema.Schema) -> None: + assert schema.is_valid(self.test_cls.to_json(zarr_format=2)) + class _TestZDType: test_cls: type[ZDType[TBaseDType, TBaseScalar]] @@ -47,50 +64,10 @@ def test_scalar_roundtrip_v2(self, scalar_v2_params: Any) -> None: dtype_json, scalar_json = scalar_v2_params zdtype = self.test_cls.from_json(dtype_json, zarr_format=2) scalar = zdtype.from_json_value(scalar_json, zarr_format=2) - assert self._scalar_equals(scalar_json, zdtype.to_json_value(scalar, zarr_format=2)) + assert scalar_json == zdtype.to_json_value(scalar, zarr_format=2) def test_scalar_roundtrip_v3(self, scalar_v3_params: Any) -> None: dtype_json, scalar_json = scalar_v3_params zdtype = self.test_cls.from_json(dtype_json, zarr_format=3) scalar = zdtype.from_json_value(scalar_json, zarr_format=3) - assert self._scalar_equals(scalar_json, zdtype.to_json_value(scalar, zarr_format=3)) - - @staticmethod - def _scalar_equals(a: object, b: object) -> bool: - """ - Compare two scalars for equality. Subclasses that test dtypes with scalars that don't allow - simple equality like nans should override this method. 
- """ - return a == b - - """ @abc.abstractmethod - def test_cast_value(self, value: Any) -> None: - raise NotImplementedError - - @abc.abstractmethod - def test_check_value(self) -> None: - raise NotImplementedError - - @abc.abstractmethod - def test_default_value(self) -> None: - raise NotImplementedError - - @abc.abstractmethod - def test_check_json(self, value: Any) -> None: - raise NotImplementedError - - @abc.abstractmethod - def test_from_json_roundtrip_v2(self, value: Any) -> None: - raise NotImplementedError - - @abc.abstractmethod - def test_from_json_roundtrip_v3(self, value: Any) -> None: - raise NotImplementedError - - @abc.abstractmethod - def test_from_json_value_roundtrip_v2(self, value: Any) -> None: - raise NotImplementedError - - @abc.abstractmethod - def test_from_json_value_roundtrip_v3(self, value: Any) -> None: - raise NotImplementedError """ + assert scalar_json == zdtype.to_json_value(scalar, zarr_format=3) From 557ecdd4e5552a4d5ae7fd8d1524e42f53135ac1 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 13:04:20 +0200 Subject: [PATCH 078/129] add casting tests --- src/zarr/core/dtype/npy/bool.py | 5 +- src/zarr/core/dtype/npy/complex.py | 4 +- src/zarr/core/dtype/npy/float.py | 4 +- src/zarr/core/dtype/npy/int.py | 4 +- src/zarr/core/dtype/npy/sized.py | 37 ++-- src/zarr/core/dtype/npy/time.py | 16 +- src/zarr/core/dtype/wrapper.py | 8 +- tests/test_dtype/test_dtype.py | 248 ---------------------- tests/test_dtype/test_npy/test_bool.py | 13 +- tests/test_dtype/test_npy/test_complex.py | 49 ++++- tests/test_dtype/test_npy/test_float.py | 76 +++++-- tests/test_dtype/test_npy/test_int.py | 72 +++++-- tests/test_dtype/test_npy/test_sized.py | 79 +++++-- tests/test_dtype/test_npy/test_string.py | 30 ++- tests/test_dtype/test_npy/test_time.py | 63 +++++- tests/test_dtype/test_wrapper.py | 38 ++-- tests/test_properties.py | 10 +- 17 files changed, 385 insertions(+), 371 deletions(-) diff --git a/src/zarr/core/dtype/npy/bool.py 
b/src/zarr/core/dtype/npy/bool.py index 776acf4f8c..c80033c54e 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -101,14 +101,11 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: """ if check_json_bool(data): return self._cast_value_unsafe(data) - raise TypeError(f"Invalid type: {data}. Expected a boolean.") + raise TypeError(f"Invalid type: {data}. Expected a boolean.") # pragma: no cover def check_value(self, data: object) -> bool: # Anything can become a bool return True - def cast_value(self, value: object) -> np.bool_: - return self._cast_value_unsafe(value) - def _cast_value_unsafe(self, value: object) -> np.bool_: return np.bool_(value) diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index 6e19266660..fab4ca9893 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -84,9 +84,7 @@ def check_value(self, value: object) -> bool: return isinstance(value, ComplexLike) def _cast_value_unsafe(self, value: object) -> TComplexScalar_co: - if self.check_value(value): - return self.to_dtype().type(value) # type: ignore[arg-type, return-value] - raise TypeError(f"Invalid type: {value}. Expected a value castable to a complex scalar.") + return self.to_dtype().type(value) # type: ignore[arg-type, return-value] def default_value(self) -> TComplexScalar_co: """ diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index 15baaaadaa..bedd6a4751 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -76,9 +76,7 @@ def check_value(self, value: object) -> TypeGuard[FloatLike]: return isinstance(value, FloatLike) def _cast_value_unsafe(self, value: object) -> TFloatScalar_co: - if self.check_value(value): - return self.to_dtype().type(value) # type: ignore[return-value] - raise TypeError(f"Invalid type: {value}. 
Expected a value castable to a float.") + return self.to_dtype().type(value) # type: ignore[return-value, arg-type] def default_value(self) -> TFloatScalar_co: """ diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index 7da7245162..78d9499243 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -71,9 +71,7 @@ def check_value(self, value: object) -> TypeGuard[IntLike]: return isinstance(value, IntLike) def _cast_value_unsafe(self, value: object) -> TIntScalar_co: - if self.check_value(value): - return self.to_dtype().type(value) # type: ignore[return-value] - raise TypeError(f"Invalid type: {value}. Expected a value castable to an integer.") + return self.to_dtype().type(value) # type: ignore[return-value, arg-type] def default_value(self) -> TIntScalar_co: """ diff --git a/src/zarr/core/dtype/npy/sized.py b/src/zarr/core/dtype/npy/sized.py index 032a1ec5c0..281c634856 100644 --- a/src/zarr/core/dtype/npy/sized.py +++ b/src/zarr/core/dtype/npy/sized.py @@ -76,7 +76,7 @@ def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: if check_json_str(data): return self.to_dtype().type(base64.standard_b64decode(data.encode("ascii"))) - raise TypeError(f"Invalid type: {data}. Expected a string.") + raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover def check_value(self, data: object) -> bool: return isinstance(data, np.bytes_ | str | bytes) @@ -162,7 +162,7 @@ def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: if check_json_str(data): return self.to_dtype().type(base64.standard_b64decode(data)) - raise DataTypeValidationError(f"Invalid type: {data}. Expected a string.") + raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover def check_value(self, data: object) -> bool: return isinstance(data, np.bytes_ | str | bytes | np.void) @@ -234,9 +234,9 @@ def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: return str(data) def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. Expected a string.") - return self.to_dtype().type(data) + if check_json_str(data): + return self.to_dtype().type(data) + raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover def check_value(self, data: object) -> bool: return isinstance(data, str | np.str_ | bytes) @@ -332,6 +332,7 @@ def check_json( def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: from zarr.core.dtype import get_data_type_from_json + # This is a horrible mess, because this data type is recursive if cls.check_json(data, zarr_format=zarr_format): if zarr_format == 2: # structured dtypes are constructed directly from a list of lists @@ -352,9 +353,13 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: ) return cls(fields=fields) else: - raise TypeError(f"Invalid type: {data}. Expected a dictionary.") + raise TypeError( + f"Invalid type: {data}. Expected a dictionary." + ) # pragma: no cover else: - raise TypeError(f"Invalid type: {data}. Expected a dictionary.") + raise TypeError( + f"Invalid type: {data}. Expected a dictionary." + ) # pragma: no cover raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") @@ -368,16 +373,12 @@ def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: return bytes_to_json(self.cast_value(data).tobytes(), zarr_format) def check_value(self, data: object) -> bool: - # not sure which values we should accept for structured dtypes. 
- try: - np.array([data], dtype=self.to_dtype()) - return True # noqa: TRY300 - except ValueError: - return False + # TODO: implement something here! + return True def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. Expected a string.") - as_bytes = bytes_from_json(data, zarr_format=zarr_format) - dtype = self.to_dtype() - return cast("np.void", np.array([as_bytes], dtype=dtype.str).view(dtype)[0]) + if check_json_str(data): + as_bytes = bytes_from_json(data, zarr_format=zarr_format) + dtype = self.to_dtype() + return cast("np.void", np.array([as_bytes]).view(dtype)[0]) + raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index b8fc85b297..bbdd41d13f 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -33,7 +33,7 @@ _DTypeName = Literal["datetime64", "timedelta64"] -def datetime_from_int(data: int, *, unit: DateTimeUnit, interval: int) -> np.datetime64: +def datetime_from_int(data: int, *, unit: DateTimeUnit, scale_factor: int) -> np.datetime64: """ Convert an integer to a datetime64. @@ -43,15 +43,15 @@ def datetime_from_int(data: int, *, unit: DateTimeUnit, interval: int) -> np.dat The integer to convert. unit : DateTimeUnit The unit of the datetime64. - interval : int - The interval of the datetime64. + scale_factor : int + The scale factor of the datetime64. Returns ------- np.datetime64 The datetime64 value. 
""" - dtype_name = f"datetime64[{interval}{unit}]" + dtype_name = f"datetime64[{scale_factor}{unit}]" return cast("np.datetime64", np.int64(data).view(dtype_name)) @@ -184,9 +184,9 @@ def default_value(self) -> np.timedelta64: return np.timedelta64("NaT") def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: - if check_json_int(data): + if check_json_int(data) or data == "NaT": return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") - raise TypeError(f"Invalid type: {data}. Expected an integer.") + raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover def _cast_value_unsafe(self, value: object) -> np.timedelta64: return self.to_dtype().type(value) # type: ignore[arg-type] @@ -231,9 +231,9 @@ def default_value(self) -> np.datetime64: return np.datetime64("NaT") def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: - if check_json_int(data): + if check_json_int(data) or data == "NaT": return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") - raise TypeError(f"Invalid type: {data}. Expected an integer.") + raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover def _cast_value_unsafe(self, value: object) -> np.datetime64: return self.to_dtype().type(value) # type: ignore[no-any-return, call-overload] diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 0600fab80b..199cbda5d8 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -159,7 +159,13 @@ def cast_value(self, data: object) -> TScalar_co: """ if self.check_value(data): return self._cast_value_unsafe(data) - raise TypeError(f"Invalid value: {data}") + msg = ( + f"The value {data} failed a type check." + f"It cannot be safely cast to a scalar compatible with {self.dtype_cls}." + f"Consult the documentation for {self} to determine the possible values that can" + "be cast to scalars of the wrapped data type." 
+ ) + raise TypeError(msg) @abstractmethod def check_value(self, data: object) -> bool: diff --git a/tests/test_dtype/test_dtype.py b/tests/test_dtype/test_dtype.py index 566a04b5fb..e69de29bb2 100644 --- a/tests/test_dtype/test_dtype.py +++ b/tests/test_dtype/test_dtype.py @@ -1,248 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Any, get_args - -from zarr.core.dtype import ( - DTYPE, - Bool, - Complex64, - Complex128, - DateTime64, - FixedLengthAscii, - FixedLengthBytes, - FixedLengthUnicode, - Float16, - Float32, - Float64, - Int8, - Int16, - Int32, - Int64, - Structured, - UInt8, - UInt16, - UInt32, - UInt64, - VariableLengthString, - ZDType, -) - -from .conftest import zdtype_examples - -if TYPE_CHECKING: - from zarr.core.common import ZarrFormat - from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar - -import numpy as np -import pytest - -from zarr.core.dtype.common import DataTypeValidationError - -_NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") -VLEN_STRING_DTYPE: np.dtypes.StringDType | np.dtypes.ObjectDType -if _NUMPY_SUPPORTS_VLEN_STRING: - VLEN_STRING_DTYPE = np.dtypes.StringDType() - VLEN_STRING_CODE = "T" -else: - VLEN_STRING_DTYPE = np.dtypes.ObjectDType() - VLEN_STRING_CODE = "O" - - -def test_zdtype_examples() -> None: - """ - Test that all the elements of the exported union type DTYPE have an example in the variable - zdtype_examples, which we use for testing. - - If this test fails, that means that either there is a data type that does not have an example, - or there is a data type that is missing from the DTYPE union type. 
- """ - assert set(map(type, zdtype_examples)) == set(get_args(DTYPE)) - - -@pytest.mark.parametrize( - ("wrapper_cls", "np_dtype"), - [ - (Bool, "bool"), - (Int8, "int8"), - (Int16, "int16"), - (Int32, "int32"), - (Int64, "int64"), - (UInt8, "uint8"), - (UInt16, "uint16"), - (UInt32, "uint32"), - (UInt64, "uint64"), - (Float32, "float32"), - (Float64, "float64"), - (Complex64, "complex64"), - (Complex128, "complex128"), - (FixedLengthUnicode, "U"), - (FixedLengthAscii, "S"), - (FixedLengthBytes, "V"), - (VariableLengthString, VLEN_STRING_CODE), - (Structured, np.dtype([("a", np.float64), ("b", np.int8)])), - (DateTime64, "datetime64[s]"), - ], -) -def test_wrap(wrapper_cls: type[ZDType[Any, Any]], np_dtype: np.dtype[np.generic] | str) -> None: - """ - Test that the wrapper class has the correct dtype class bound to the dtype_cls variable - Test that the ``wrap`` method produces an instance of the wrapper class - Test that the ``unwrap`` method returns the original dtype - """ - dt = np.dtype(np_dtype) - assert wrapper_cls.dtype_cls is type(dt) - wrapped = wrapper_cls.from_dtype(dt) - - with pytest.raises(DataTypeValidationError, match="Invalid dtype"): - wrapper_cls.from_dtype("not a dtype") # type: ignore[arg-type] - assert isinstance(wrapped, wrapper_cls) - assert wrapped.to_dtype() == dt - - -@pytest.mark.parametrize("zdtype", zdtype_examples) -def test_to_json_roundtrip(zdtype: ZDType[Any, Any], zarr_format: ZarrFormat) -> None: - """ - Test that a zdtype instance can round-trip through its JSON form - """ - as_dict = zdtype.to_json(zarr_format=zarr_format) - assert zdtype.from_json(as_dict, zarr_format=zarr_format) == zdtype - - -@pytest.mark.parametrize( - ("wrapper", "expected_default"), - [ - (Bool(), np.False_), - (Int8(), np.int8(0)), - (UInt8(), np.uint8(0)), - (Int16(), np.int16(0)), - (UInt16(), np.uint16(0)), - (Int32(), np.int32(0)), - (UInt32(), np.uint32(0)), - (Int64(), np.int64(0)), - (UInt64(), np.uint64(0)), - (Float16(), np.float16(0)), - 
(Float32(), np.float32(0)), - (Float64(), np.float64(0)), - (Complex64(), np.complex64(0)), - (Complex128(), np.complex128(0)), - (FixedLengthAscii(length=3), np.bytes_(b"")), - (FixedLengthBytes(length=3), np.void(b"\x00\x00\x00")), - (FixedLengthUnicode(length=3), np.str_("")), - ( - Structured(fields=(("a", Float64()), ("b", Int8()))), - np.array([0], dtype=[("a", np.float64), ("b", np.int8)])[0], - ), - (VariableLengthString(), ""), - (DateTime64(unit="s"), np.datetime64("NaT")), - ], -) -def test_default_value(wrapper: ZDType[Any, Any], expected_default: Any) -> None: - """ - Test that the default_value method is correctly set for each dtype wrapper. - """ - if isinstance(wrapper, DateTime64): - assert np.isnan(wrapper.default_value()) - else: - assert wrapper.default_value() == expected_default - - -@pytest.mark.parametrize( - ("wrapper", "input_value", "expected_json"), - [ - (Bool(), np.bool_(True), True), - (Int8(), np.int8(42), 42), - (UInt8(), np.uint8(42), 42), - (Int16(), np.int16(42), 42), - (UInt16(), np.uint16(42), 42), - (Int32(), np.int32(42), 42), - (UInt32(), np.uint32(42), 42), - (Int64(), np.int64(42), 42), - (UInt64(), np.uint64(42), 42), - (Float16(), np.float16(42.0), 42.0), - (Float32(), np.float32(42.0), 42.0), - (Float64(), np.float64(42.0), 42.0), - (Complex64(), np.complex64(42.0 + 1.0j), (42.0, 1.0)), - (Complex128(), np.complex128(42.0 + 1.0j), (42.0, 1.0)), - (FixedLengthAscii(length=4), np.bytes_(b"test"), "dGVzdA=="), - (FixedLengthBytes(length=4), np.void(b"test"), "dGVzdA=="), - (FixedLengthUnicode(length=4), np.str_("test"), "test"), - (VariableLengthString(), "test", "test"), - (DateTime64(unit="s"), np.datetime64("2021-01-01T00:00:00", "s"), 1609459200), - ], -) -def test_to_json_value_v2( - wrapper: ZDType[TBaseDType, TBaseScalar], input_value: Any, expected_json: Any -) -> None: - """ - Test the to_json_value method for each dtype wrapper for zarr v2 - """ - assert wrapper.to_json_value(input_value, zarr_format=2) == 
expected_json - - -# NOTE! This test is currently a direct copy of the v2 version. When or if we change JSON serialization -# in a v3-specific manner, this test must be changed. -# TODO: Apply zarr-v3-specific changes to this test as needed -@pytest.mark.parametrize( - ("wrapper", "input_value", "expected_json"), - [ - (Bool(), np.bool_(True), True), - (Int8(), np.int8(42), 42), - (UInt8(), np.uint8(42), 42), - (Int16(), np.int16(42), 42), - (UInt16(), np.uint16(42), 42), - (Int32(), np.int32(42), 42), - (UInt32(), np.uint32(42), 42), - (Int64(), np.int64(42), 42), - (UInt64(), np.uint64(42), 42), - (Float16(), np.float16(42.0), 42.0), - (Float32(), np.float32(42.0), 42.0), - (Float64(), np.float64(42.0), 42.0), - (Complex64(), np.complex64(42.0 + 1.0j), (42.0, 1.0)), - (Complex128(), np.complex128(42.0 + 1.0j), (42.0, 1.0)), - (FixedLengthAscii(length=4), np.bytes_(b"test"), "dGVzdA=="), - (FixedLengthBytes(length=4), np.void(b"test"), "dGVzdA=="), - (FixedLengthUnicode(length=4), np.str_("test"), "test"), - (VariableLengthString(), "test", "test"), - (DateTime64(unit="s"), np.datetime64("2021-01-01T00:00:00", "s"), 1609459200), - ], -) -def test_to_json_value_v3( - wrapper: ZDType[TBaseDType, TBaseScalar], input_value: Any, expected_json: Any -) -> None: - """ - Test the to_json_value method for each dtype wrapper for zarr v3 - """ - assert wrapper.to_json_value(input_value, zarr_format=3) == expected_json - - -@pytest.mark.parametrize( - ("wrapper", "json_value", "expected_value"), - [ - (Bool(), True, np.bool_(True)), - (Int8(), 42, np.int8(42)), - (UInt8(), 42, np.uint8(42)), - (Int16(), 42, np.int16(42)), - (UInt16(), 42, np.uint16(42)), - (Int32(), 42, np.int32(42)), - (UInt32(), 42, np.uint32(42)), - (Int64(), 42, np.int64(42)), - (UInt64(), 42, np.uint64(42)), - (Float16(), 42.0, np.float16(42.0)), - (Float32(), 42.0, np.float32(42.0)), - (Float64(), 42.0, np.float64(42.0)), - (Complex64(), (42.0, 1.0), np.complex64(42.0 + 1.0j)), - (Complex128(), (42.0, 
1.0), np.complex128(42.0 + 1.0j)), - (FixedLengthAscii(length=4), "dGVzdA==", np.bytes_(b"test")), - (FixedLengthBytes(length=4), "dGVzdA==", np.void(b"test")), - (FixedLengthUnicode(length=4), "test", np.str_("test")), - (VariableLengthString(), "test", "test"), - (DateTime64(unit="s"), 1609459200, np.datetime64("2021-01-01T00:00:00", "s")), - ], -) -def test_from_json_value( - wrapper: ZDType[TBaseDType, TBaseScalar], json_value: Any, expected_value: Any -) -> None: - """ - Test the from_json_value method for each dtype wrapper. - """ - assert wrapper.from_json_value(json_value, zarr_format=2) == expected_value diff --git a/tests/test_dtype/test_npy/test_bool.py b/tests/test_dtype/test_npy/test_bool.py index 1040683846..086a2cfee8 100644 --- a/tests/test_dtype/test_npy/test_bool.py +++ b/tests/test_dtype/test_npy/test_bool.py @@ -8,6 +8,7 @@ class TestBool(_TestZDType): test_cls = Bool + valid_dtype = (np.dtype(np.bool_),) invalid_dtype = ( np.dtype(np.int8), @@ -27,5 +28,13 @@ class TestBool(_TestZDType): {"name": "bool", "configuration": {"endianness": "little"}}, ) - scalar_v2_params = (("|b1", True), ("|b1", False)) - scalar_v3_params = (("bool", True), ("bool", False)) + scalar_v2_params = ((Bool(), True), (Bool(), False)) + scalar_v3_params = ((Bool(), True), (Bool(), False)) + + cast_value_params = ( + (Bool(), "true", np.True_), + (Bool(), True, np.True_), + (Bool(), False, np.False_), + (Bool(), np.True_, np.True_), + (Bool(), np.False_, np.False_), + ) diff --git a/tests/test_dtype/test_npy/test_complex.py b/tests/test_dtype/test_npy/test_complex.py index aac514028d..b24bc4d7c8 100644 --- a/tests/test_dtype/test_npy/test_complex.py +++ b/tests/test_dtype/test_npy/test_complex.py @@ -1,12 +1,21 @@ from __future__ import annotations +import math + import numpy as np from tests.test_dtype.test_wrapper import _TestZDType from zarr.core.dtype.npy.complex import Complex64, Complex128 -class TestComplex64(_TestZDType): +class _BaseTestFloat(_TestZDType): + def 
scalar_equals(self, scalar1: object, scalar2: object) -> bool: + if np.isnan(scalar1) and np.isnan(scalar2): # type: ignore[call-overload] + return True + return super().scalar_equals(scalar1, scalar2) + + +class TestComplex64(_BaseTestFloat): test_cls = Complex64 valid_dtype = (np.dtype(">c8"), np.dtype("c8", ">c8") + valid_json_v2 = (">c8", "c8", (1.0, 1.0)), ("c8", (0, "NaN"))) + scalar_v2_params = ( + (Complex64(), (1.0, 1.0)), + (Complex64(), (-1.0, "Infinity")), + (Complex64(), (0, "NaN")), + ) scalar_v3_params = ( - ("complex64", (1.0, 1.0)), - ("complex64", (-1.0, "Infinity")), - ("complex64", (0, "NaN")), + (Complex64(), (1.0, 1.0)), + (Complex64(), (-1.0, "Infinity")), + (Complex64(), (0, "NaN")), + ) + cast_value_params = ( + (Complex64(), complex(1.0, 1.0), np.complex64(complex(1.0, 1.0))), + (Complex64(), complex(-1.0, math.inf), np.complex64(complex(-1.0, math.inf))), + (Complex64(), complex(0, math.nan), np.complex64(complex(0, math.nan))), ) -class TestComplex128(_TestZDType): +class TestComplex128(_BaseTestFloat): test_cls = Complex128 valid_dtype = (np.dtype(">c16"), np.dtype("c16", (1.0, 1.0)), ("c16", (0, "NaN"))) + scalar_v2_params = ( + (Complex128(), (1.0, 1.0)), + (Complex128(), (-1.0, "Infinity")), + (Complex128(), (0, "NaN")), + ) scalar_v3_params = ( - ("complex128", (1.0, 1.0)), - ("complex128", (-1.0, "Infinity")), - ("complex128", (0, "NaN")), + (Complex128(), (1.0, 1.0)), + (Complex128(), (-1.0, "Infinity")), + (Complex128(), (0, "NaN")), + ) + cast_value_params = ( + (Complex128(), complex(1.0, 1.0), np.complex128(complex(1.0, 1.0))), + (Complex128(), complex(-1.0, math.inf), np.complex128(complex(-1.0, math.inf))), + (Complex128(), complex(0, math.nan), np.complex128(complex(0, math.nan))), ) diff --git a/tests/test_dtype/test_npy/test_float.py b/tests/test_dtype/test_npy/test_float.py index 232ed1e32c..5981d09514 100644 --- a/tests/test_dtype/test_npy/test_float.py +++ b/tests/test_dtype/test_npy/test_float.py @@ -6,7 +6,14 @@ from 
zarr.core.dtype.npy.float import Float16, Float32, Float64 -class TestFloat16(_TestZDType): +class _BaseTestFloat(_TestZDType): + def scalar_equals(self, scalar1: object, scalar2: object) -> bool: + if np.isnan(scalar1) and np.isnan(scalar2): # type: ignore[call-overload] + return True + return super().scalar_equals(scalar1, scalar2) + + +class TestFloat16(_BaseTestFloat): test_cls = Float16 valid_dtype = (np.dtype(">f2"), np.dtype("f2", 1.0), ("f2", "Infinity")) + scalar_v2_params = ( + (Float16(), 1.0), + (Float16(), -1.0), + (Float16(), "NaN"), + (Float16(), "Infinity"), + ) scalar_v3_params = ( - ("float16", 1.0), - ("float16", -1.0), - ("float16", "NaN"), - ("float16", "Infinity"), + (Float16(), 1.0), + (Float16(), -1.0), + (Float16(), "NaN"), + (Float16(), "Infinity"), + ) + cast_value_params = ( + (Float16(), 1.0, np.float16(1.0)), + (Float16(), -1.0, np.float16(-1.0)), + (Float16(), "NaN", np.float16("NaN")), ) -class TestFloat32(_TestZDType): +class TestFloat32(_BaseTestFloat): test_cls = Float32 + scalar_type = np.float32 valid_dtype = (np.dtype(">f4"), np.dtype("f4", 1.0), ("f4", "Infinity")) + scalar_v2_params = ( + (Float32(), 1.0), + (Float32(), -1.0), + (Float32(), "NaN"), + (Float32(), "Infinity"), + ) scalar_v3_params = ( - ("float32", 1.0), - ("float32", -1.0), - ("float32", "NaN"), - ("float32", "Infinity"), + (Float32(), 1.0), + (Float32(), -1.0), + (Float32(), "NaN"), + (Float32(), "Infinity"), + ) + + cast_value_params = ( + (Float32(), 1.0, np.float32(1.0)), + (Float32(), -1.0, np.float32(-1.0)), + (Float32(), "NaN", np.float32("NaN")), ) -class TestFloat64(_TestZDType): +class TestFloat64(_BaseTestFloat): test_cls = Float64 valid_dtype = (np.dtype(">f8"), np.dtype("f8", 1.0), ("f8", "Infinity")) + scalar_v2_params = ( + (Float64(), 1.0), + (Float64(), -1.0), + (Float64(), "NaN"), + (Float64(), "Infinity"), + ) scalar_v3_params = ( - ("float64", 1.0), - ("float64", -1.0), - ("float64", "NaN"), - ("float64", "Infinity"), + (Float64(), 1.0), + 
(Float64(), -1.0), + (Float64(), "NaN"), + (Float64(), "Infinity"), + ) + + cast_value_params = ( + (Float64(), 1.0, np.float64(1.0)), + (Float64(), -1.0, np.float64(-1.0)), + (Float64(), "NaN", np.float64("NaN")), ) diff --git a/tests/test_dtype/test_npy/test_int.py b/tests/test_dtype/test_npy/test_int.py index 99f698fc8e..637b594e1b 100644 --- a/tests/test_dtype/test_npy/test_int.py +++ b/tests/test_dtype/test_npy/test_int.py @@ -8,6 +8,7 @@ class TestInt8(_TestZDType): test_cls = Int8 + scalar_type = np.int8 valid_dtype = (np.dtype(np.int8),) invalid_dtype = ( np.dtype(np.int16), @@ -27,12 +28,17 @@ class TestInt8(_TestZDType): {"name": "int8", "configuration": {"endianness": "little"}}, ) - scalar_v2_params = (("|i1", 1), ("|i1", -1)) - scalar_v3_params = (("int8", 1), ("int8", -1)) + scalar_v2_params = ((Int8(), 1), (Int8(), -1)) + scalar_v3_params = ((Int8(), 1), (Int8(), -1)) + cast_value_params = ( + (Int8(), 1, np.int8(1)), + (Int8(), -1, np.int8(-1)), + ) class TestInt16(_TestZDType): test_cls = Int16 + scalar_type = np.int16 valid_dtype = (np.dtype(">i2"), np.dtype("i2", -1)) - scalar_v3_params = (("int16", 1), ("int16", -1)) + scalar_v2_params = ((Int16(), 1), (Int16(), -1)) + scalar_v3_params = ((Int16(), 1), (Int16(), -1)) + cast_value_params = ( + (Int16(), 1, np.int16(1)), + (Int16(), -1, np.int16(-1)), + ) class TestInt32(_TestZDType): test_cls = Int32 + scalar_type = np.int32 valid_dtype = (np.dtype(">i4"), np.dtype("i4", -1)) - scalar_v3_params = (("int32", 1), ("int32", -1)) + scalar_v2_params = ((Int32(), 1), (Int32(), -1)) + scalar_v3_params = ((Int32(), 1), (Int32(), -1)) + cast_value_params = ( + (Int32(), 1, np.int32(1)), + (Int32(), -1, np.int32(-1)), + ) class TestInt64(_TestZDType): test_cls = Int64 + scalar_type = np.int64 valid_dtype = (np.dtype(">i8"), np.dtype("i8", -1)) - scalar_v3_params = (("int64", 1), ("int64", -1)) + scalar_v2_params = ((Int64(), 1), (Int64(), -1)) + scalar_v3_params = ((Int64(), 1), (Int64(), -1)) + 
cast_value_params = ( + (Int64(), 1, np.int64(1)), + (Int64(), -1, np.int64(-1)), + ) class TestUInt8(_TestZDType): test_cls = UInt8 + scalar_type = np.uint8 valid_dtype = (np.dtype(np.uint8),) invalid_dtype = ( np.dtype(np.int8), @@ -127,12 +148,17 @@ class TestUInt8(_TestZDType): {"name": "uint8", "configuration": {"endianness": "little"}}, ) - scalar_v2_params = (("|u1", 1), ("|u1", 0)) - scalar_v3_params = (("uint8", 1), ("uint8", 0)) + scalar_v2_params = ((UInt8(), 1), (UInt8(), 0)) + scalar_v3_params = ((UInt8(), 1), (UInt8(), 0)) + cast_value_params = ( + (UInt8(), 1, np.uint8(1)), + (UInt8(), 0, np.uint8(0)), + ) class TestUInt16(_TestZDType): test_cls = UInt16 + scalar_type = np.uint16 valid_dtype = (np.dtype(">u2"), np.dtype("u2", 0)) - scalar_v3_params = (("uint16", 1), ("uint16", 0)) + scalar_v2_params = ((UInt16(), 1), (UInt16(), 0)) + scalar_v3_params = ((UInt16(), 1), (UInt16(), 0)) + cast_value_params = ( + (UInt16(), 1, np.uint16(1)), + (UInt16(), 0, np.uint16(0)), + ) class TestUInt32(_TestZDType): test_cls = UInt32 + scalar_type = np.uint32 valid_dtype = (np.dtype(">u4"), np.dtype("u4", 0)) - scalar_v3_params = (("uint32", 1), ("uint32", 0)) + scalar_v2_params = ((UInt32(), 1), (UInt32(), 0)) + scalar_v3_params = ((UInt32(), 1), (UInt32(), 0)) + cast_value_params = ( + (UInt32(), 1, np.uint32(1)), + (UInt32(), 0, np.uint32(0)), + ) class TestUInt64(_TestZDType): test_cls = UInt64 + scalar_type = np.uint64 valid_dtype = (np.dtype(">u8"), np.dtype("u8", 0)) - scalar_v3_params = (("uint64", 1), ("uint64", 0)) + scalar_v2_params = ((UInt64(), 1), (UInt64(), 0)) + scalar_v3_params = ((UInt64(), 1), (UInt64(), 0)) + cast_value_params = ( + (UInt64(), 1, np.uint64(1)), + (UInt64(), 0, np.uint64(0)), + ) diff --git a/tests/test_dtype/test_npy/test_sized.py b/tests/test_dtype/test_npy/test_sized.py index 17f4b2af2d..2ded5bbb7c 100644 --- a/tests/test_dtype/test_npy/test_sized.py +++ b/tests/test_dtype/test_npy/test_sized.py @@ -1,8 +1,12 @@ from 
__future__ import annotations +from typing import Any + import numpy as np from tests.test_dtype.test_wrapper import _TestZDType +from zarr.core.dtype.npy.float import Float16, Float64 +from zarr.core.dtype.npy.int import Int32, Int64 from zarr.core.dtype.npy.sized import ( FixedLengthAscii, FixedLengthBytes, @@ -31,11 +35,20 @@ class TestFixedLengthAscii(_TestZDType): {"name": "numpy.fixed_length_ascii", "configuration": {"length_bits": "invalid"}}, ) - scalar_v2_params = (("|S0", ""), ("|S2", "YWI="), ("|S4", "YWJjZA==")) + scalar_v2_params = ( + (FixedLengthAscii(length=0), ""), + (FixedLengthAscii(length=2), "YWI="), + (FixedLengthAscii(length=4), "YWJjZA=="), + ) scalar_v3_params = ( - ({"name": "numpy.fixed_length_ascii", "configuration": {"length_bytes": 0}}, ""), - ({"name": "numpy.fixed_length_ascii", "configuration": {"length_bytes": 16}}, "YWI="), - ({"name": "numpy.fixed_length_ascii", "configuration": {"length_bytes": 32}}, "YWJjZA=="), + (FixedLengthAscii(length=0), ""), + (FixedLengthAscii(length=2), "YWI="), + (FixedLengthAscii(length=4), "YWJjZA=="), + ) + cast_value_params = ( + (FixedLengthAscii(length=0), "", np.bytes_("")), + (FixedLengthAscii(length=2), "ab", np.bytes_("ab")), + (FixedLengthAscii(length=4), "abcd", np.bytes_("abcd")), ) @@ -63,11 +76,20 @@ class TestFixedLengthBytes(_TestZDType): {"name": "r-80"}, ) - scalar_v2_params = (("|V0", ""), ("|V2", "YWI="), ("|V4", "YWJjZA==")) + scalar_v2_params = ( + (FixedLengthBytes(length=0), ""), + (FixedLengthBytes(length=2), "YWI="), + (FixedLengthBytes(length=4), "YWJjZA=="), + ) scalar_v3_params = ( - ({"name": "numpy.fixed_length_bytes", "configuration": {"length_bytes": 2}}, ""), - ({"name": "numpy.fixed_length_bytes", "configuration": {"length_bytes": 2}}, "YWI="), - ({"name": "numpy.fixed_length_bytes", "configuration": {"length_bytes": 4}}, "YWJjZA=="), + (FixedLengthBytes(length=0), ""), + (FixedLengthBytes(length=2), "YWI="), + (FixedLengthBytes(length=4), "YWJjZA=="), + ) + 
cast_value_params = ( + (FixedLengthBytes(length=0), b"", np.void(b"")), + (FixedLengthBytes(length=2), b"ab", np.void(b"ab")), + (FixedLengthBytes(length=4), b"abcd", np.void(b"abcd")), ) @@ -91,11 +113,17 @@ class TestFixedLengthUnicode(_TestZDType): {"name": "numpy.fixed_length_ucs4", "configuration": {"length_bits": "invalid"}}, ) - scalar_v2_params = ((">U0", ""), (" bool: + if hasattr(scalar1, "shape") and hasattr(scalar2, "shape"): + return np.array_equal(scalar1, scalar2) + return super().scalar_equals(scalar1, scalar2) diff --git a/tests/test_dtype/test_npy/test_string.py b/tests/test_dtype/test_npy/test_string.py index fbb0aaa86d..c87f538be5 100644 --- a/tests/test_dtype/test_npy/test_string.py +++ b/tests/test_dtype/test_npy/test_string.py @@ -8,8 +8,8 @@ if _NUMPY_SUPPORTS_VLEN_STRING: class TestVariableLengthString(_TestZDType): - test_cls = VariableLengthString - valid_dtype = (np.dtypes.StringDType(),) + test_cls = VariableLengthString # type: ignore[assignment] + valid_dtype = (np.dtypes.StringDType(),) # type: ignore[assignment] invalid_dtype = ( np.dtype(np.int8), np.dtype(np.float64), @@ -27,10 +27,21 @@ class TestVariableLengthString(_TestZDType): {"name": "invalid_name"}, ) + scalar_v2_params = ((VariableLengthString(), ""), (VariableLengthString(), "hi")) + scalar_v3_params = ( + (VariableLengthString(), ""), + (VariableLengthString(), "hi"), + ) + + cast_value_params = ( + (VariableLengthString(), "", np.str_("")), + (VariableLengthString(), "hi", np.str_("hi")), + ) + else: - class TestVariableLengthString(_TestZDType): - test_cls = VariableLengthString + class TestVariableLengthString(_TestZDType): # type: ignore[no-redef] + test_cls = VariableLengthString # type: ignore[assignment] valid_dtype = (np.dtype("O"),) invalid_dtype = ( np.dtype(np.int8), @@ -48,3 +59,14 @@ class TestVariableLengthString(_TestZDType): {"name": "numpy.variable_length_utf8", "configuration": {"invalid_key": "value"}}, {"name": "invalid_name"}, ) + + 
scalar_v2_params = ((VariableLengthString(), ""), (VariableLengthString(), "hi")) + scalar_v3_params = ( + (VariableLengthString(), ""), + (VariableLengthString(), "hi"), + ) + + cast_value_params = ( + (VariableLengthString(), "", np.str_("")), + (VariableLengthString(), "hi", np.str_("hi")), + ) diff --git a/tests/test_dtype/test_npy/test_time.py b/tests/test_dtype/test_npy/test_time.py index 2a8ff6ac98..f8f8b5ae47 100644 --- a/tests/test_dtype/test_npy/test_time.py +++ b/tests/test_dtype/test_npy/test_time.py @@ -1,15 +1,32 @@ from __future__ import annotations import re +from typing import get_args import numpy as np import pytest from tests.test_dtype.test_wrapper import _TestZDType -from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 +from zarr.core.dtype.npy.common import DateTimeUnit +from zarr.core.dtype.npy.time import DateTime64, TimeDelta64, datetime_from_int -class TestDateTime64(_TestZDType): +class _TestTimeBase(_TestZDType): + def json_scalar_equals(self, scalar1: object, scalar2: object) -> bool: + # This method gets overridden here to support the equivalency between NaT and + # -9223372036854775808 fill values + nat_scalars = (-9223372036854775808, "NaT") + if scalar1 in nat_scalars and scalar2 in nat_scalars: + return True + return scalar1 == scalar2 + + def scalar_equals(self, scalar1: object, scalar2: object) -> bool: + if np.isnan(scalar1) and np.isnan(scalar2): # type: ignore[call-overload] + return True + return super().scalar_equals(scalar1, scalar2) + + +class TestDateTime64(_TestTimeBase): test_cls = DateTime64 valid_dtype = (np.dtype("datetime64[10ns]"), np.dtype("datetime64[us]"), np.dtype("datetime64")) invalid_dtype = ( @@ -32,8 +49,23 @@ class TestDateTime64(_TestZDType): {"name": "datetime64", "configuration": {"unit": 123}}, ) + scalar_v2_params = ( + (DateTime64(unit="ns", scale_factor=1), 1), + (DateTime64(unit="ns", scale_factor=1), "NaT"), + ) + scalar_v3_params = ( + (DateTime64(unit="ns", scale_factor=1), 1), + 
(DateTime64(unit="ns", scale_factor=1), "NaT"), + ) -class TestTimeDelta64(_TestZDType): + cast_value_params = ( + (DateTime64(unit="Y", scale_factor=1), "1", np.datetime64("1", "Y")), + (DateTime64(unit="s", scale_factor=1), "2005-02-25", np.datetime64("2005-02-25", "s")), + (DateTime64(unit="ns", scale_factor=1), "NaT", np.datetime64("NaT")), + ) + + +class TestTimeDelta64(_TestTimeBase): test_cls = TimeDelta64 valid_dtype = (np.dtype("timedelta64[ns]"), np.dtype("timedelta64[us]")) invalid_dtype = ( @@ -57,6 +89,20 @@ class TestTimeDelta64(_TestZDType): {"name": "timedelta64", "configuration": {"unit": 123}}, ) + scalar_v2_params = ( + (TimeDelta64(unit="ns", scale_factor=1), 1), + (TimeDelta64(unit="ns", scale_factor=1), "NaT"), + ) + scalar_v3_params = ( + (TimeDelta64(unit="ns", scale_factor=1), 1), + (TimeDelta64(unit="ns", scale_factor=1), "NaT"), + ) + + cast_value_params = ( + (TimeDelta64(unit="ns", scale_factor=1), "1", np.timedelta64(1, "ns")), + (TimeDelta64(unit="ns", scale_factor=1), "NaT", np.timedelta64("NaT")), + ) + def test_time_invalid_unit() -> None: """ @@ -92,3 +138,14 @@ def test_time_scale_factor_too_high() -> None: DateTime64(scale_factor=scale_factor) with pytest.raises(ValueError, match=msg): TimeDelta64(scale_factor=scale_factor) + + +@pytest.mark.parametrize("unit", get_args(DateTimeUnit)) +@pytest.mark.parametrize("scale_factor", [1, 10]) +@pytest.mark.parametrize("value", [0, 1, 10]) +def test_datetime_from_int(unit: DateTimeUnit, scale_factor: int, value: int) -> None: + """ + Test datetime_from_int. 
+ """ + expected = np.int64(value).view(f"datetime64[{scale_factor}{unit}]") + assert datetime_from_int(value, unit=unit, scale_factor=scale_factor) == expected diff --git a/tests/test_dtype/test_wrapper.py b/tests/test_dtype/test_wrapper.py index defd3fffc5..ddf43524e0 100644 --- a/tests/test_dtype/test_wrapper.py +++ b/tests/test_dtype/test_wrapper.py @@ -5,10 +5,8 @@ if TYPE_CHECKING: from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType -import pytest -import requests - +""" class _TestZDTypeSchema: # subclasses define the URL for the schema, if available schema_url: ClassVar[str] = "" @@ -21,11 +19,12 @@ def get_schema(self) -> object: def test_schema(self, schema: json_schema.Schema) -> None: assert schema.is_valid(self.test_cls.to_json(zarr_format=2)) +""" class _TestZDType: test_cls: type[ZDType[TBaseDType, TBaseScalar]] - + scalar_type: ClassVar[type[TBaseScalar]] valid_dtype: ClassVar[tuple[TBaseDType, ...]] = () invalid_dtype: ClassVar[tuple[TBaseDType, ...]] = () @@ -42,6 +41,18 @@ class _TestZDType: scalar_v2_params: ClassVar[tuple[tuple[Any, Any], ...]] = () scalar_v3_params: ClassVar[tuple[tuple[Any, Any], ...]] = () + cast_value_params: ClassVar[tuple[tuple[Any, Any, Any], ...]] + + def json_scalar_equals(self, scalar1: object, scalar2: object) -> bool: + # An equality check for json-encoded scalars. This defaults to regular equality, + # but some classes may need to override this for special cases + return scalar1 == scalar2 + + def scalar_equals(self, scalar1: object, scalar2: object) -> bool: + # An equality check for scalars. 
This defaults to regular equality, + # but some classes may need to override this for special cases + return scalar1 == scalar2 + def test_check_dtype_valid(self, valid_dtype: object) -> None: assert self.test_cls.check_dtype(valid_dtype) # type: ignore[arg-type] @@ -60,14 +71,17 @@ def test_from_json_roundtrip_v3(self, valid_json_v3: Any) -> None: zdtype = self.test_cls.from_json(valid_json_v3, zarr_format=3) assert zdtype.to_json(zarr_format=3) == valid_json_v3 - def test_scalar_roundtrip_v2(self, scalar_v2_params: Any) -> None: - dtype_json, scalar_json = scalar_v2_params - zdtype = self.test_cls.from_json(dtype_json, zarr_format=2) + def test_scalar_roundtrip_v2(self, scalar_v2_params: tuple[Any, Any]) -> None: + zdtype, scalar_json = scalar_v2_params scalar = zdtype.from_json_value(scalar_json, zarr_format=2) - assert scalar_json == zdtype.to_json_value(scalar, zarr_format=2) + assert self.json_scalar_equals(scalar_json, zdtype.to_json_value(scalar, zarr_format=2)) - def test_scalar_roundtrip_v3(self, scalar_v3_params: Any) -> None: - dtype_json, scalar_json = scalar_v3_params - zdtype = self.test_cls.from_json(dtype_json, zarr_format=3) + def test_scalar_roundtrip_v3(self, scalar_v3_params: tuple[Any, Any]) -> None: + zdtype, scalar_json = scalar_v3_params scalar = zdtype.from_json_value(scalar_json, zarr_format=3) - assert scalar_json == zdtype.to_json_value(scalar, zarr_format=3) + assert self.json_scalar_equals(scalar_json, zdtype.to_json_value(scalar, zarr_format=3)) + + def test_cast_value(self, cast_value_params: tuple[Any, Any, Any]) -> None: + zdtype, value, expected = cast_value_params + observed = zdtype.cast_value(value) + assert self.scalar_equals(expected, observed) diff --git a/tests/test_properties.py b/tests/test_properties.py index 31fa17ce93..abfa0d599e 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -75,11 +75,11 @@ def deep_equal(a: Any, b: Any) -> bool: return a == b 
-@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") -@given(data=st.data()) -def test_array_roundtrip(data: st.DataObject) -> None: - nparray = data.draw(numpy_arrays()) - zarray = data.draw(arrays(arrays=st.just(nparray))) +@settings(deadline=300) +@given(data=st.data(), zarr_format=zarr_formats) +def test_array_roundtrip(data: st.DataObject, zarr_format: int) -> None: + nparray = data.draw(numpy_arrays(zarr_formats=st.just(zarr_format))) + zarray = data.draw(arrays(arrays=st.just(nparray), zarr_formats=st.just(zarr_format))) assert_array_equal(nparray, zarray[:]) From 3484a1c34b4e3cdc974560098ea00306d9638f9d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 13:11:53 +0200 Subject: [PATCH 079/129] use relative link for changes --- changes/2874.feature.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changes/2874.feature.rst b/changes/2874.feature.rst index 26eda3a257..d0adcd6533 100644 --- a/changes/2874.feature.rst +++ b/changes/2874.feature.rst @@ -1,2 +1,2 @@ Adds zarr-specific data type classes. This replaces the direct use of numpy data types for zarr -v2 and a fixed set of string enums for zarr v3. For more on this new feature, see the `documentation `_ \ No newline at end of file +v2 and a fixed set of string enums for zarr v3. For more on this new feature, see the `documentation documentation `_ \ No newline at end of file From b58346a29dfdd35f94545f1d3e02cfa7df8fe838 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 13:12:23 +0200 Subject: [PATCH 080/129] typo --- changes/2874.feature.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changes/2874.feature.rst b/changes/2874.feature.rst index d0adcd6533..50634e5395 100644 --- a/changes/2874.feature.rst +++ b/changes/2874.feature.rst @@ -1,2 +1,2 @@ Adds zarr-specific data type classes. 
This replaces the direct use of numpy data types for zarr -v2 and a fixed set of string enums for zarr v3. For more on this new feature, see the `documentation documentation `_ \ No newline at end of file +v2 and a fixed set of string enums for zarr v3. For more on this new feature, see the `documentation `_ \ No newline at end of file From aa156f22c79149a6bfacf1f9de13568d57e140a0 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 13:24:08 +0200 Subject: [PATCH 081/129] make bytes codec dtype logic a bit more literate --- src/zarr/codecs/bytes.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 6c28bfe543..a87df060e7 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -58,7 +58,10 @@ def to_dict(self) -> dict[str, JSON]: return {"name": "bytes", "configuration": {"endian": self.endian.value}} def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: - if array_spec.dtype.to_dtype().itemsize == 1: + # Note: this check is numpy-dtype-specific + # For single-byte (e.g., uint8) or 0-byte (e.g., S0) dtypes, + # endianness does not apply. 
+ if array_spec.dtype.to_dtype().itemsize < 2: if self.endian is not None: return replace(self, endian=None) elif self.endian is None: @@ -77,7 +80,8 @@ async def _decode_single( endian_str = cast( "Endianness | None", self.endian.value if self.endian is not None else None ) - dtype = chunk_spec.dtype.to_dtype().newbyteorder(endianness_to_numpy_str(endian_str)) + new_byte_order = endianness_to_numpy_str(endian_str) + dtype = chunk_spec.dtype.to_dtype().newbyteorder(new_byte_order) as_array_like = chunk_bytes.as_array_like() if isinstance(as_array_like, NDArrayLike): From 5c51c529d88e72966e3683fb45be23c1938558b2 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 13:25:05 +0200 Subject: [PATCH 082/129] increase deadline to 500ms --- tests/test_properties.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_properties.py b/tests/test_properties.py index abfa0d599e..5677cafc48 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -75,7 +75,7 @@ def deep_equal(a: Any, b: Any) -> bool: return a == b -@settings(deadline=300) +@settings(deadline=500) @given(data=st.data(), zarr_format=zarr_formats) def test_array_roundtrip(data: st.DataObject, zarr_format: int) -> None: nparray = data.draw(numpy_arrays(zarr_formats=st.just(zarr_format))) From 0a2b567ebb698df3aac60fe6bc4d8e82ff051cbd Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 13:40:20 +0200 Subject: [PATCH 083/129] fewer commented sections of problematic lru_store_cache section of the sharding codecs --- src/zarr/codecs/sharding.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index b3ff0953d4..8282af70d9 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -357,10 +357,13 @@ def __init__( object.__setattr__(self, "index_location", index_location_parsed) # Use instance-local lru_cache to avoid memory leaks - # TODO: 
fix these when we don't get hashability errors for certain numpy dtypes + + # numpy void scalars are not hashable, which means an array spec with a fill value that is + # a numpy void scalar will break the lru_cache. This is commented for now but should be + # fixed # object.__setattr__(self, "_get_chunk_spec", lru_cache()(self._get_chunk_spec)) - # object.__setattr__(self, "_get_index_chunk_spec", lru_cache()(self._get_index_chunk_spec)) - # object.__setattr__(self, "_get_chunks_per_shard", lru_cache()(self._get_chunks_per_shard)) + object.__setattr__(self, "_get_index_chunk_spec", lru_cache()(self._get_index_chunk_spec)) + object.__setattr__(self, "_get_chunks_per_shard", lru_cache()(self._get_chunks_per_shard)) # todo: typedict return type def __getstate__(self) -> dict[str, Any]: From 4b2b6ec1dbdf8bfaa724ae00026ae31a346cd9d3 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 13:56:49 +0200 Subject: [PATCH 084/129] add link to gh issue about lru_cache for sharding codec --- src/zarr/codecs/sharding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 8282af70d9..914236d700 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -360,7 +360,7 @@ def __init__( # numpy void scalars are not hashable, which means an array spec with a fill value that is # a numpy void scalar will break the lru_cache. This is commented for now but should be - # fixed + # fixed. 
See https://github.com/zarr-developers/zarr-python/issues/3054 # object.__setattr__(self, "_get_chunk_spec", lru_cache()(self._get_chunk_spec)) object.__setattr__(self, "_get_index_chunk_spec", lru_cache()(self._get_index_chunk_spec)) object.__setattr__(self, "_get_chunks_per_shard", lru_cache()(self._get_chunks_per_shard)) From b737e6749f7a9ed340e5ceec6b7fee984f46f611 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 14:28:22 +0200 Subject: [PATCH 085/129] attempt to speed up hypothesis tests by reducing max array size --- tests/test_properties.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_properties.py b/tests/test_properties.py index 5677cafc48..d8f70e63d7 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -75,7 +75,6 @@ def deep_equal(a: Any, b: Any) -> bool: return a == b -@settings(deadline=500) @given(data=st.data(), zarr_format=zarr_formats) def test_array_roundtrip(data: st.DataObject, zarr_format: int) -> None: nparray = data.draw(numpy_arrays(zarr_formats=st.just(zarr_format))) From d4615e002e2d5a3a49014a289318d95543e21b67 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 15:57:30 +0200 Subject: [PATCH 086/129] clean up docs --- docs/user-guide/data_types.rst | 116 +++++++++++++++++++++++---------- 1 file changed, 80 insertions(+), 36 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index a281b349de..81a09a6485 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -5,8 +5,10 @@ Zarr's data type model ---------------------- Every Zarr array has a "data type", which defines the meaning and physical layout of the -array's elements. Zarr is heavily influenced by `NumPy `_, and -Zarr-Python supports creating arrays with Numpy data types:: +array's elements. As Zarr Python is tightly integrated with `NumPy `_, +it's easy to create arrays with NumPy data types: + +.. 
code-block:: python >>> import zarr >>> import numpy as np @@ -14,58 +16,103 @@ Zarr-Python supports creating arrays with Numpy data types:: >>> z -Unlike Numpy arrays, Zarr arrays are designed to be persisted to storage and read by Zarr implementations in different programming languages. -This means Zarr data types must be interpreted correctly when clients read an array. So each Zarr data type defines a procedure for -encoding/decoding that data type to/from Zarr array metadata, and also encoding/decoding **instances** of that data type to/from -array metadata. These serialization procedures depend on the Zarr format. +Unlike NumPy arrays, Zarr arrays are designed to accessed by Zarr +implementations in different programming languages. This means Zarr data types must be interpreted +correctly when clients read an array. Each Zarr data type defines procedures for +encoding and decoding both the data type itself, and scalars from that data type to and from Zarr array metadata. And these serialization procedures +depend on the Zarr format. Data types in Zarr version 2 ----------------------------- -Version 2 of the Zarr format defined its data types relative to `Numpy's data types `_, and added a few non-Numpy data types as well. -Thus the JSON identifier for a Numpy-compatible data type is just the Numpy ``str`` attribute of that dtype:: +Version 2 of the Zarr format defined its data types relative to +`NumPy's data types `_, +and added a few non-NumPy data types as well. Thus the JSON identifier for a NumPy-compatible data +type is just the NumPy ``str`` attribute of that data type: + +.. 
code-block:: python - >>> import zarr - >>> import numpy as np - >>> import json - >>> store = {} - >>> np_dtype = np.dtype('int64') - >>> z = zarr.create_array(store=store, shape=(1,), dtype=np_dtype, zarr_format=2) - >>> dtype_meta = json.loads(store['.zarray'].to_bytes())["dtype"] - >>> assert dtype_meta == np_dtype.str # True - >>> dtype_meta - '>> import zarr + >>> import numpy as np + >>> import json + >>> + >>> store = {} + >>> np_dtype = np.dtype('int64') + >>> z = zarr.create_array(store=store, shape=(1,), dtype=np_dtype, zarr_format=2) + >>> dtype_meta = json.loads(store['.zarray'].to_bytes())["dtype"] + >>> dtype_meta + '>> assert dtype_meta == np_dtype.str .. note:: - The ``<`` character in the data type metadata encodes the `endianness `_, or "byte order", of the data type. Following Numpy's example, - in Zarr version 2 each data type has an endianness where applicable. However, Zarr version 3 data types do not store endianness information. + The ``<`` character in the data type metadata encodes the + `endianness `_, + or "byte order", of the data type. Following NumPy's example, + in Zarr version 2 each data type has an endianness where applicable. + However, Zarr version 3 data types do not store endianness information. + +In addition to defining a representation of the data type itself (which in the example above was +just a simple string ``"i2`` **or** ``M[10s]"`` in + Zarr V2. This is more compact, but can be harder to parse. + +For more about data types in Zarr V3, see the +`V3 specification `_. 
+ +Data types in Zarr Python ------------------------- -The two Zarr formats that Zarr-Python supports specify data types in two different ways: -data types in Zarr version 2 are encoded as Numpy-compatible strings, while data types in Zarr version +The two Zarr formats that Zarr Python supports specify data types in two different ways: +data types in Zarr version 2 are encoded as NumPy-compatible strings, while data types in Zarr version 3 are encoded as either strings or ``JSON`` objects, and the Zarr V3 data types don't have any associated endianness information, unlike Zarr V2 data types. -To abstract over these syntactical and semantic differences, Zarr-Python uses a class called `ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_ to wrap native data types (e.g., Numpy data types) and provide Zarr V2 and Zarr V3 compatibility routines. -Each data type supported by Zarr-Python is modeled by a subclass of ``ZDType``, which provides an API for the following operations: +To abstract over these syntactical and semantic differences, Zarr Python uses a class called +`ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_ provide Zarr V2 and Zarr V3 compatibility +routines for ""native" data types. In this context, a "native" data type is a Python class, +typically defined in another library, that models an array's data type. For example, ``np.uint8`` is a native +data type defined in NumPy, which Zarr Python wraps with a ``ZDType`` instance called +`UInt8 <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_. + +Each data type supported by Zarr Python is modeled by ``ZDType`` subclass, which provides an +API for the following operations: - Wrapping / unwrapping a native data type - Encoding / decoding a data type to / from Zarr V2 and Zarr V3 array metadata. @@ -104,7 +151,4 @@ Example Usage Custom Data Types ~~~~~~~~~~~~~~~~~ -Users can define custom data types by subclassing `ZDType` and implementing the required methods. 
-Once defined, the custom data type can be registered with Zarr-Python to enable seamless integration with the library. - \ No newline at end of file From aafb348a9e1b2a352f67c5c160cde148d8fd9b9a Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 16:01:52 +0200 Subject: [PATCH 087/129] remove placeholder --- docs/user-guide/data_types.rst | 5 ----- 1 file changed, 5 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 81a09a6485..ff43dd8d19 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -147,8 +147,3 @@ Example Usage # Deserialize a scalar value scalar_value = int8.from_json_value(42, zarr_format=3) assert scalar_value == np.int8(42) - -Custom Data Types -~~~~~~~~~~~~~~~~~ - - \ No newline at end of file From 3ba3c2290ccf3f3865e1a957214a07c858c048b9 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 16:36:07 +0200 Subject: [PATCH 088/129] make final example section doctested and more readable --- docs/user-guide/data_types.rst | 58 +++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index ff43dd8d19..777a69816e 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -9,7 +9,6 @@ array's elements. As Zarr Python is tightly integrated with `NumPy >> import zarr >>> import numpy as np >>> z = zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) @@ -122,28 +121,51 @@ API for the following operations: Example Usage ~~~~~~~~~~~~~ +Create a ``ZDType`` from a native data type: + +.. code-block:: python + + >>> from zarr.core.dtype import Int8 + >>> import numpy as np + >>> int8 = Int8.from_dtype(np.dtype('int8')) + +Convert back to native data type: + +.. 
code-block:: python + + >>> native_dtype = int8.to_dtype() + >>> assert native_dtype == np.dtype('int8') + +Get the default scalar value for the data type: + .. code-block:: python - from zarr.core.dtype.wrapper import Int8 + >>> default_value = int8.default_value() + >>> assert default_value == np.int8(0) - # Create a ZDType instance from a native dtype - int8 = Int8.from_dtype(np.dtype('int8')) - # Convert back to native dtype - native_dtype = int8.to_dtype() - assert native_dtype == np.dtype('int8') +Serialize to JSON for Zarr V2 and V3 - # Get the default value - default_value = int8.default_value() - assert default_value == np.int8(0) +.. code-block:: python - # Serialize to JSON - json_representation = int8.to_json(zarr_format=3) + >>> json_v2 = int8.to_json(zarr_format=2) + >>> json_v2 + '|i1' + >>> json_v3 = int8.to_json(zarr_format=3) + >>> json_v3 + 'int8' - # Serialize a scalar value - json_value = int8.to_json_value(42, zarr_format=3) - assert json_value == 42 +Serialize a scalar value to JSON: + +.. code-block:: python + + >>> json_value = int8.to_json_value(42, zarr_format=3) + >>> json_value + 42 + +Deserialize a scalar value from JSON: + +.. 
code-block:: python - # Deserialize a scalar value - scalar_value = int8.from_json_value(42, zarr_format=3) - assert scalar_value == np.int8(42) + >>> scalar_value = int8.from_json_value(42, zarr_format=3) + >>> assert scalar_value == np.int8(42) From d5154c01bb0f9556d2cff705f81483323ea61bf8 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 16:40:57 +0200 Subject: [PATCH 089/129] revert change to auto chunking --- docs/user-guide/performance.rst | 2 +- src/zarr/core/chunk_grids.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/user-guide/performance.rst b/docs/user-guide/performance.rst index 9f2e730785..4bcffc15ff 100644 --- a/docs/user-guide/performance.rst +++ b/docs/user-guide/performance.rst @@ -52,7 +52,7 @@ a chunk shape is based on simple heuristics and may be far from optimal. E.g.:: >>> z4 = zarr.create_array(store={}, shape=(10000, 10000), chunks='auto', dtype='int32') >>> z4.chunks - (313, 625) + (625, 625) If you know you are always going to be loading the entire array into memory, you can turn off chunks by providing ``chunks`` equal to ``shape``, in which case there diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 74bf9b6ba8..6701aca182 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -63,7 +63,7 @@ def _guess_chunks( """ if isinstance(shape, int): shape = (shape,) - typesize = max(typesize, 8) + typesize = max(typesize, 1) ndims = len(shape) # require chunks to have non-zero length for all dimensions chunks = np.maximum(np.array(shape, dtype="=f8"), 1) From d936c0e88257f64dcef420601fa2c569d692f417 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 16:46:21 +0200 Subject: [PATCH 090/129] revert quotation of literal type --- src/zarr/core/array_spec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index 279bf6edf0..5d4321da82 100644 --- 
a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -63,7 +63,7 @@ def from_dict(cls, data: ArrayConfigParams) -> Self: """ kwargs_out: ArrayConfigParams = {} for f in fields(ArrayConfig): - field_name = cast("Literal['order', 'write_empty_chunks']", f.name) + field_name = cast(Literal["order", "write_empty_chunks"], f.name) if field_name not in data: kwargs_out[field_name] = zarr_config.get(f"array.{field_name}") else: From 906caf74abda7be83f22258d820c0de8d9f1a3fa Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 18:17:55 +0200 Subject: [PATCH 091/129] lint --- src/zarr/core/dtype/__init__.py | 52 ++++++++++++++++++++++++--------- src/zarr/core/dtype/npy/time.py | 8 ++--- tests/test_config.py | 6 +++- tests/test_dtype/conftest.py | 2 +- tests/test_dtype/test_dtype.py | 0 tests/test_dtype_registry.py | 35 ++++++++++++++++++++-- 6 files changed, 81 insertions(+), 22 deletions(-) delete mode 100644 tests/test_dtype/test_dtype.py diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 1a18849a13..a8cdfc0cbc 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, TypeAlias, get_args +from typing import TYPE_CHECKING, TypeAlias from zarr.core.dtype.common import DataTypeValidationError from zarr.core.dtype.npy.bool import Bool @@ -30,8 +30,10 @@ from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType __all__ = [ + "Bool", "Complex64", "Complex128", + "DataTypeRegistry", "DataTypeValidationError", "DateTime64", "FixedLengthAscii", @@ -45,6 +47,8 @@ "Int32", "Int64", "Structured", + "TBaseDType", + "TBaseScalar", "TimeDelta64", "TimeDelta64", "UInt8", @@ -59,25 +63,47 @@ data_type_registry = DataTypeRegistry() -INTEGER_DTYPE = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 -FLOAT_DTYPE = Float16 | Float32 | Float64 -COMPLEX_DTYPE = Complex64 | Complex128 
-STRING_DTYPE = FixedLengthUnicode | VariableLengthString | FixedLengthAscii -DTYPE = ( +IntegerDType = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 +INTEGER_DTYPE = Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 + +FloatDType = Float16 | Float32 | Float64 +FLOAT_DTYPE = Float16, Float32, Float64 + +ComplexFloatDType = Complex64 | Complex128 +COMPLEX_FLOAT_DTYPE = Complex64, Complex128 + +StringDType = FixedLengthUnicode | VariableLengthString | FixedLengthAscii +STRING_DTYPE = FixedLengthUnicode, VariableLengthString, FixedLengthAscii + +TimeDType = DateTime64 | TimeDelta64 +TIME_DTYPE = DateTime64, TimeDelta64 + +AnyDType = ( Bool - | INTEGER_DTYPE - | FLOAT_DTYPE - | COMPLEX_DTYPE - | STRING_DTYPE + | IntegerDType + | FloatDType + | ComplexFloatDType + | StringDType | FixedLengthBytes | Structured - | DateTime64 - | TimeDelta64 + | TimeDType +) +# mypy has trouble inferring the type of variablelengthstring dtype, because its class definition +# depends on the installed numpy version. That's why the type: ignore statement is needed here. +ANY_DTYPE: tuple[type[ZDType[TBaseDType, TBaseScalar]], ...] = ( # type: ignore[assignment] + Bool, + *INTEGER_DTYPE, + *FLOAT_DTYPE, + *COMPLEX_FLOAT_DTYPE, + *STRING_DTYPE, + FixedLengthBytes, + Structured, + *TIME_DTYPE, ) ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | dict[str, JSON] -for dtype in get_args(DTYPE): +for dtype in ANY_DTYPE: data_type_registry.register(dtype._zarr_v3_name, dtype) diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index bbdd41d13f..ea44d76b56 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -173,7 +173,7 @@ class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], Has unit for ``TimeDelta64`` is optional. 
""" - dtype_cls = np.dtypes.TimeDelta64DType + dtype_cls = np.dtypes.TimeDelta64DType # type: ignore[assignment] _zarr_v3_name = "numpy.timedelta64" _zarr_v2_names = (">m8", " np.timedelta64: def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: if check_json_int(data) or data == "NaT": - return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") + return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[arg-type] raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover def _cast_value_unsafe(self, value: object) -> np.timedelta64: @@ -220,7 +220,7 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: @dataclass(frozen=True, kw_only=True, slots=True) class DateTime64(TimeDTypeBase[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): - dtype_cls = np.dtypes.DateTime64DType + dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] _zarr_v3_name = "numpy.datetime64" _zarr_v2_names = (">M8", " np.datetime64: def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: if check_json_int(data) or data == "NaT": - return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") + return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[arg-type] raise TypeError(f"Invalid type: {data}. 
Expected an integer.") # pragma: no cover def _cast_value_unsafe(self, value: object) -> np.datetime64: diff --git a/tests/test_config.py b/tests/test_config.py index e9b3921339..7878ebde58 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,6 @@ import os from collections.abc import Iterable -from typing import Any +from typing import TYPE_CHECKING, Any from unittest import mock from unittest.mock import Mock @@ -44,6 +44,9 @@ TestNDArrayLike, ) +if TYPE_CHECKING: + from zarr.core.dtype.wrapper import ZDType + def test_config_defaults_set() -> None: # regression test for available defaults @@ -323,6 +326,7 @@ async def test_default_codecs(dtype_category: str) -> None: """ Test that the default compressors are sensitive to the current setting of the config. """ + zdtype: ZDType[Any, Any] if dtype_category == "variable-length-string": zdtype = VariableLengthString() else: diff --git a/tests/test_dtype/conftest.py b/tests/test_dtype/conftest.py index 9c7825c0d1..bf58a17556 100644 --- a/tests/test_dtype/conftest.py +++ b/tests/test_dtype/conftest.py @@ -24,7 +24,7 @@ def pytest_generate_tests(metafunc: Any) -> None: """ - pytest hook to parametrize class-scoped fixtures. + This is a pytest hook to parametrize class-scoped fixtures. This hook allows us to define class-scoped fixtures as class attributes and then generate the parametrize calls for pytest. 
This allows the fixtures to be diff --git a/tests/test_dtype/test_dtype.py b/tests/test_dtype/test_dtype.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py index 98380b86f7..aaca2f0862 100644 --- a/tests/test_dtype_registry.py +++ b/tests/test_dtype_registry.py @@ -11,16 +11,20 @@ import zarr from zarr.core.config import config from zarr.core.dtype import ( - DTYPE, + AnyDType, Bool, + DataTypeRegistry, + DateTime64, FixedLengthUnicode, + Int8, + Int16, TBaseDType, TBaseScalar, ZDType, data_type_registry, get_data_type_from_json, + parse_data_type, ) -from zarr.core.dtype.registry import DataTypeRegistry if TYPE_CHECKING: from collections.abc import Generator @@ -117,7 +121,7 @@ def test_match_dtype_unique( that excludes the data type class being tested, and ensure that an instance of the wrapped data type fails to match anything in the registry """ - for _cls in get_args(DTYPE): + for _cls in get_args(AnyDType): if _cls is not type(zdtype): data_type_registry_fixture.register(_cls._zarr_v3_name, _cls) @@ -156,3 +160,28 @@ def test_entrypoint_dtype(zarr_format: ZarrFormat) -> None: instance = TestDataType() dtype_json = instance.to_json(zarr_format=zarr_format) assert get_data_type_from_json(dtype_json, zarr_format=zarr_format) == instance + + +@pytest.mark.parametrize( + ("dtype_params", "expected", "zarr_format"), + [ + ("int8", Int8(), 3), + (Int8(), Int8(), 3), + (">i2", Int16(endianness="big"), 2), + ("datetime64[10s]", DateTime64(unit="s", scale_factor=10), 2), + ( + {"name": "numpy.datetime64", "configuration": {"unit": "s", "scale_factor": 10}}, + DateTime64(unit="s", scale_factor=10), + 3, + ), + ], +) +def test_parse_data_type( + dtype_params: Any, expected: ZDType[Any, Any], zarr_format: ZarrFormat +) -> None: + """ + Test that parse_data_type accepts alternative representations of ZDType instances, and resolves + those inputs to the expected ZDType instance. 
+ """ + observed = parse_data_type(dtype_params, zarr_format=zarr_format) + assert observed == expected From 6d34f7ebf890a7a6dfaa6501286ed2769b177d97 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 18:34:14 +0200 Subject: [PATCH 092/129] fix broken code block --- docs/user-guide/data_types.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 777a69816e..a4d8314a5e 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -9,6 +9,7 @@ array's elements. As Zarr Python is tightly integrated with `NumPy >> import zarr >>> import numpy as np >>> z = zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) From ef1c722d46f23e3af026f647d5e3f23b609786d6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 18:57:02 +0200 Subject: [PATCH 093/129] specialize test to handle stringdtype changes coming in numpy 2.3 --- tests/test_array.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/tests/test_array.py b/tests/test_array.py index d7462a4a15..7ed716996e 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -49,6 +49,7 @@ from zarr.core.dtype.npy.sized import ( Structured, ) +from zarr.core.dtype.npy.string import VariableLengthString from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 from zarr.core.dtype.wrapper import ZDType from zarr.core.group import AsyncGroup @@ -1018,14 +1019,26 @@ def test_dtype_forms(dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFor # Structured dtypes do not have a numpy string representation that uniquely identifies them if not isinstance(dtype, Structured): - c = zarr.create_array( - store, - name="c", - shape=(5,), - chunks=(5,), - dtype=dtype.to_dtype().str, - zarr_format=zarr_format, - ) + if isinstance(dtype, VariableLengthString): + # in numpy 2.3, StringDType().str becomes the string 'StringDType()' which numpy + # does 
not accept as a string representation of the dtype. + c = zarr.create_array( + store, + name="c", + shape=(5,), + chunks=(5,), + dtype=dtype.to_dtype().char, + zarr_format=zarr_format, + ) + else: + c = zarr.create_array( + store, + name="c", + shape=(5,), + chunks=(5,), + dtype=dtype.to_dtype().str, + zarr_format=zarr_format, + ) assert a.dtype == c.dtype @staticmethod From b4f2a59acf8288504dc1901f37ffca2666bbc39b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 13 May 2025 21:56:57 +0200 Subject: [PATCH 094/129] add docstring to _TestZDType class --- tests/test_dtype/test_wrapper.py | 34 ++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/test_dtype/test_wrapper.py b/tests/test_dtype/test_wrapper.py index ddf43524e0..608e272690 100644 --- a/tests/test_dtype/test_wrapper.py +++ b/tests/test_dtype/test_wrapper.py @@ -23,6 +23,40 @@ def test_schema(self, schema: json_schema.Schema) -> None: class _TestZDType: + """ + A base class for testing ZDType subclasses. This class works in conjunction with the custom + pytest collection function ``pytest_generate_tests`` defined in conftest.py, which applies the + following procedure when generating tests: + + At test generation time, for each test fixture referenced by a method on this class + pytest will look for an attribute with the same name as that fixture. Pytest will assume that + this class attribute is a tuple of values to be used for generating a parametrized test fixture. + + This means that child classes can, by using different values for these class attributes, have + customized test parametrization. + + Attributes + ---------- + test_cls : type[ZDType[TBaseDType, TBaseScalar]] + The ZDType subclass being tested. + scalar_type : ClassVar[type[TBaseScalar]] + The expected scalar type for the ZDType. + valid_dtype : ClassVar[tuple[TBaseDType, ...]] + A tuple of valid numpy dtypes for the ZDType. 
+ invalid_dtype : ClassVar[tuple[TBaseDType, ...]] + A tuple of invalid numpy dtypes for the ZDType. + valid_json_v2 : ClassVar[tuple[str | dict[str, object] | list[object], ...]] + A tuple of valid JSON representations for Zarr format version 2. + invalid_json_v2 : ClassVar[tuple[str | dict[str, object] | list[object], ...]] + A tuple of invalid JSON representations for Zarr format version 2. + valid_json_v3 : ClassVar[tuple[str | dict[str, object], ...]] + A tuple of valid JSON representations for Zarr format version 3. + invalid_json_v3 : ClassVar[tuple[str | dict[str, object], ...]] + A tuple of invalid JSON representations for Zarr format version 3. + cast_value_params : ClassVar[tuple[tuple[Any, Any, Any], ...]] + A tuple of (dtype, value, expected) tuples for testing ZDType.cast_value. + """ + test_cls: type[ZDType[TBaseDType, TBaseScalar]] scalar_type: ClassVar[type[TBaseScalar]] valid_dtype: ClassVar[tuple[TBaseDType, ...]] = () From c5caccabd2dd238dca9bde81170a3f5c8cfa3eaa Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 15 May 2025 12:52:58 +0200 Subject: [PATCH 095/129] type hints --- src/zarr/core/dtype/__init__.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index a8cdfc0cbc..b973691f0f 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, TypeAlias +from typing import TYPE_CHECKING, Final, TypeAlias from zarr.core.dtype.common import DataTypeValidationError from zarr.core.dtype.npy.bool import Bool @@ -64,19 +64,19 @@ data_type_registry = DataTypeRegistry() IntegerDType = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 -INTEGER_DTYPE = Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 +INTEGER_DTYPE: Final = Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 FloatDType = Float16 | Float32 | 
Float64 -FLOAT_DTYPE = Float16, Float32, Float64 +FLOAT_DTYPE: Final = Float16, Float32, Float64 ComplexFloatDType = Complex64 | Complex128 -COMPLEX_FLOAT_DTYPE = Complex64, Complex128 +COMPLEX_FLOAT_DTYPE: Final = Complex64, Complex128 StringDType = FixedLengthUnicode | VariableLengthString | FixedLengthAscii -STRING_DTYPE = FixedLengthUnicode, VariableLengthString, FixedLengthAscii +STRING_DTYPE: Final = FixedLengthUnicode, VariableLengthString, FixedLengthAscii TimeDType = DateTime64 | TimeDelta64 -TIME_DTYPE = DateTime64, TimeDelta64 +TIME_DTYPE: Final = DateTime64, TimeDelta64 AnyDType = ( Bool @@ -90,7 +90,7 @@ ) # mypy has trouble inferring the type of variablelengthstring dtype, because its class definition # depends on the installed numpy version. That's why the type: ignore statement is needed here. -ANY_DTYPE: tuple[type[ZDType[TBaseDType, TBaseScalar]], ...] = ( # type: ignore[assignment] +ANY_DTYPE: Final = ( Bool, *INTEGER_DTYPE, *FLOAT_DTYPE, @@ -101,10 +101,12 @@ *TIME_DTYPE, ) -ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | dict[str, JSON] +# This type models inputs that can be coerced to a ZDType +ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | dict[str, JSON] | str for dtype in ANY_DTYPE: - data_type_registry.register(dtype._zarr_v3_name, dtype) + # mypy does not know that all the elements of ANY_DTYPE are subclasses of ZDType + data_type_registry.register(dtype._zarr_v3_name, dtype) # type: ignore[arg-type] # TODO: find a better name for this function From 20e45a22b3468783cf5e0f1a072f57a8ff75bfd9 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 16 May 2025 12:08:18 +0200 Subject: [PATCH 096/129] add numcodecs protocol --- src/zarr/abc/bikeshed.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 src/zarr/abc/bikeshed.py diff --git a/src/zarr/abc/bikeshed.py b/src/zarr/abc/bikeshed.py new file mode 100644 index 0000000000..3d01c234dd --- 
/dev/null +++ b/src/zarr/abc/bikeshed.py @@ -0,0 +1,29 @@ +from collections.abc import Mapping +from typing import Generic, Self, TypeVar + +import numpy as np +from typing_extensions import Buffer, Protocol, runtime_checkable + +BufferOrNDArray = Buffer | np.ndarray[tuple[int, ...], np.dtype[np.generic]] +BaseConfig = Mapping[str, object] +TNCodecConfig = TypeVar("TNCodecConfig", bound=BaseConfig) + + +@runtime_checkable +class Numcodec(Protocol, Generic[TNCodecConfig]): + """ + This protocol models the numcodecs.abc.Codec interface. + """ + + codec_id: str + + def encode(self, buf: BufferOrNDArray) -> BufferOrNDArray: ... + + def decode( + self, buf: BufferOrNDArray, out: BufferOrNDArray | None = None + ) -> BufferOrNDArray: ... + + def get_config(self) -> TNCodecConfig: ... + + @classmethod + def from_config(cls, config: TNCodecConfig) -> Self: ... From 165d1069f18908a70efccd39cb48fb22afdc4883 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 16 May 2025 12:21:33 +0200 Subject: [PATCH 097/129] expand changelog --- changes/2874.feature.rst | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/changes/2874.feature.rst b/changes/2874.feature.rst index 50634e5395..4c50532ae0 100644 --- a/changes/2874.feature.rst +++ b/changes/2874.feature.rst @@ -1,2 +1,9 @@ -Adds zarr-specific data type classes. This replaces the direct use of numpy data types for zarr -v2 and a fixed set of string enums for zarr v3. For more on this new feature, see the `documentation `_ \ No newline at end of file +Adds zarr-specific data type classes. This replaces the internal use of numpy data types for zarr +v2 and a fixed set of string enums for zarr v3. This change is largely internal, but it does +change the type of the ``dtype`` and ``data_type`` fields on the ``ArrayV2Metadata`` and +``ArrayV3Metadata`` classes. 
It also changes the JSON metadata representation of the +variable-length string data type, but the old metadata representation can still be +used when reading arrays. The logic for automatically choosing the chunk encoding for a given data +type has also changed, and this necessitated changes to the ``config`` API. + +For more on this new feature, see the `documentation `_ \ No newline at end of file From e1c7fbcf381937a4af8286c079a67ed9cddad152 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 16 May 2025 18:50:33 +0200 Subject: [PATCH 098/129] tweak docstring --- src/zarr/core/dtype/wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 199cbda5d8..3a56a85788 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -55,7 +55,7 @@ class ZDType(Generic[TDType_co, TScalar_co], ABC): Attributes ---------- dtype_cls : ClassVar[type[TDType]] - The numpy dtype class. This is a class variable. Instances of this class cannot set it. + The wrapped dtype class. This is a class variable. Instances of this class cannot set it. _zarr_v3_name : ClassVar[str] The name given to the wrapped data type by a zarr v3 data type specification. 
Note that this is not necessarily the same name that will appear in metadata documents, as some data types From 01017766d881d91c2a05d271888fcbcd4e53dc0f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 19 May 2025 12:36:55 +0200 Subject: [PATCH 099/129] support v3 nan strings in JSON for float dtypes --- src/zarr/core/dtype/common.py | 8 +- src/zarr/core/dtype/npy/common.py | 198 +++++------------------ src/zarr/core/dtype/npy/complex.py | 33 ++-- src/zarr/core/dtype/npy/float.py | 49 ++++-- src/zarr/core/metadata/v3.py | 6 +- tests/test_dtype/test_npy/test_common.py | 108 ++++++------- tests/test_dtype/test_npy/test_float.py | 15 ++ tests/test_metadata/test_v3.py | 11 +- 8 files changed, 180 insertions(+), 248 deletions(-) diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 4249c57b1f..ecc475192c 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -1,11 +1,13 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Literal +from typing import Final, Literal Endianness = Literal["little", "big"] -SpecialFloats = Literal["NaN", "Infinity", "-Infinity"] -JSONFloat = float | SpecialFloats +SpecialFloatStrings = Literal["NaN", "Infinity", "-Infinity"] +SPECIAL_FLOAT_STRINGS: Final = ("NaN", "Infinity", "-Infinity") +JSONFloatV2 = float | SpecialFloatStrings +JSONFloatV3 = float | SpecialFloatStrings | str class DataTypeValidationError(ValueError): ... 
diff --git a/src/zarr/core/dtype/npy/common.py b/src/zarr/core/dtype/npy/common.py index 8033e48291..2481dcb150 100644 --- a/src/zarr/core/dtype/npy/common.py +++ b/src/zarr/core/dtype/npy/common.py @@ -1,6 +1,7 @@ from __future__ import annotations import base64 +import struct import sys from collections.abc import Sequence from typing import ( @@ -18,7 +19,7 @@ import numpy as np -from zarr.core.dtype.common import Endianness, JSONFloat +from zarr.core.dtype.common import SPECIAL_FLOAT_STRINGS, Endianness, JSONFloatV2, JSONFloatV3 if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat @@ -112,7 +113,7 @@ def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: ) -def float_from_json_v2(data: JSONFloat) -> float: +def float_from_json_v2(data: JSONFloatV2) -> float: """ Convert a JSON float to a float (Zarr v2). @@ -137,7 +138,7 @@ def float_from_json_v2(data: JSONFloat) -> float: return float(data) -def float_from_json_v3(data: JSONFloat) -> float: +def float_from_json_v3(data: JSONFloatV3) -> float: """ Convert a JSON float to a float (v3). @@ -150,31 +151,35 @@ def float_from_json_v3(data: JSONFloat) -> float: ------- float The float value. - """ - # todo: support the v3-specific NaN handling - return float_from_json_v2(data) - -def float_from_json(data: JSONFloat, *, zarr_format: ZarrFormat) -> float: - """ - Convert a JSON float to a float based on zarr format. - - Parameters - ---------- - data : JSONFloat - The JSON float to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - float - The float value. - """ - if zarr_format == 2: - return float_from_json_v2(data) - else: - return float_from_json_v3(data) + Notes + ----- + Zarr V3 allows floats to be stored as hex strings. To quote the spec: + "...for float32, "NaN" is equivalent to "0x7fc00000". + This representation is the only way to specify a NaN value other than the specific NaN value + denoted by "NaN"." 
+ """ + + if isinstance(data, str): + if data in SPECIAL_FLOAT_STRINGS: + return float_from_json_v2(data) # type: ignore[arg-type] + if not data.startswith("0x"): + msg = ( + f"Invalid float value: {data!r}. Expected a string starting with the hex prefix" + " '0x', or one of 'NaN', 'Infinity', or '-Infinity'." + ) + raise ValueError(msg) + if len(data[2:]) == 4: + dtype_code = ">e" + elif len(data[2:]) == 8: + dtype_code = ">f" + elif len(data[2:]) == 16: + dtype_code = ">d" + else: + msg = f"Invalid float value: {data!r}. Expected a string of length 4, 8, or 16." + raise ValueError(msg) + return float(struct.unpack(dtype_code, bytes.fromhex(data[2:]))[0]) + return float_from_json_v2(data) def bytes_from_json(data: str, *, zarr_format: ZarrFormat) -> bytes: @@ -221,7 +226,7 @@ def bytes_to_json(data: bytes, zarr_format: ZarrFormat) -> str: return base64.b64encode(data).decode("ascii") -def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloat: +def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloatV2: """ Convert a float to JSON (v2). @@ -242,7 +247,7 @@ def float_to_json_v2(data: float | np.floating[Any]) -> JSONFloat: return float(data) -def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloat: +def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloatV3: """ Convert a float to JSON (v3). @@ -261,32 +266,9 @@ def float_to_json_v3(data: float | np.floating[Any]) -> JSONFloat: return float_to_json_v2(data) -def float_to_json(data: float | np.floating[Any], *, zarr_format: ZarrFormat) -> JSONFloat: - """ - Convert a float to JSON, parametrized by the zarr format version. - - Parameters - ---------- - data : float | np.floating - The float value to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - JSONFloat - The JSON representation of the float. 
- """ - if zarr_format == 2: - return float_to_json_v2(data) - else: - return float_to_json_v3(data) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") - - def complex_float_to_json_v3( data: complex | np.complexfloating[Any, Any], -) -> tuple[JSONFloat, JSONFloat]: +) -> tuple[JSONFloatV3, JSONFloatV3]: """ Convert a complex number to JSON as defined by the Zarr V3 spec. @@ -305,7 +287,7 @@ def complex_float_to_json_v3( def complex_float_to_json_v2( data: complex | np.complexfloating[Any, Any], -) -> tuple[JSONFloat, JSONFloat]: +) -> tuple[JSONFloatV2, JSONFloatV2]: """ Convert a complex number to JSON as defined by the Zarr V2 spec. @@ -322,32 +304,7 @@ def complex_float_to_json_v2( return float_to_json_v2(data.real), float_to_json_v2(data.imag) -def complex_float_to_json( - data: complex | np.complexfloating[Any, Any], *, zarr_format: ZarrFormat -) -> tuple[JSONFloat, JSONFloat]: - """ - Convert a complex number to JSON, parametrized by the zarr format version. - - Parameters - ---------- - data : complex | np.complexfloating - The complex value to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - tuple[JSONFloat, JSONFloat] or JSONFloat - The JSON representation of the complex number. - """ - if zarr_format == 2: - return complex_float_to_json_v2(data) - else: - return complex_float_to_json_v3(data) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") - - -def complex_float_from_json_v2(data: tuple[JSONFloat, JSONFloat]) -> complex: +def complex_float_from_json_v2(data: tuple[JSONFloatV2, JSONFloatV2]) -> complex: """ Convert a JSON complex float to a complex number (v2). 
@@ -364,7 +321,7 @@ def complex_float_from_json_v2(data: tuple[JSONFloat, JSONFloat]) -> complex: return complex(float_from_json_v2(data[0]), float_from_json_v2(data[1])) -def complex_float_from_json_v3(data: tuple[JSONFloat, JSONFloat]) -> complex: +def complex_float_from_json_v3(data: tuple[JSONFloatV3, JSONFloatV3]) -> complex: """ Convert a JSON complex float to a complex number (v3). @@ -381,30 +338,7 @@ def complex_float_from_json_v3(data: tuple[JSONFloat, JSONFloat]) -> complex: return complex(float_from_json_v3(data[0]), float_from_json_v3(data[1])) -def complex_float_from_json(data: tuple[JSONFloat, JSONFloat], zarr_format: ZarrFormat) -> complex: - """ - Convert a JSON complex float to a complex number based on zarr format. - - Parameters - ---------- - data : tuple[JSONFloat, JSONFloat] - The JSON complex float to convert. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - np.complexfloating - The complex number. - """ - if zarr_format == 2: - return complex_float_from_json_v2(data) - else: - return complex_float_from_json_v3(data) - raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.") - - -def check_json_float_v2(data: JSON) -> TypeGuard[JSONFloat]: +def check_json_float_v2(data: JSON) -> TypeGuard[JSONFloatV2]: """ Check if a JSON value represents a float (v2). @@ -423,7 +357,7 @@ def check_json_float_v2(data: JSON) -> TypeGuard[JSONFloat]: return isinstance(data, float | int) -def check_json_float_v3(data: JSON) -> TypeGuard[JSONFloat]: +def check_json_float_v3(data: JSON) -> TypeGuard[JSONFloatV3]: """ Check if a JSON value represents a float (v3). @@ -437,11 +371,10 @@ def check_json_float_v3(data: JSON) -> TypeGuard[JSONFloat]: Bool True if the data is a float, False otherwise. 
""" - # TODO: handle the special JSON serialization of different NaN values - return check_json_float_v2(data) + return check_json_float_v2(data) or (isinstance(data, str) and data.startswith("0x")) -def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: +def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloatV2, JSONFloatV2]]: """ Check if a JSON value represents a complex float, as per the behavior of zarr-python 2.x @@ -464,7 +397,7 @@ def check_json_complex_float_v2(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFl ) -def check_json_complex_float_v3(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: +def check_json_complex_float_v3(data: JSON) -> TypeGuard[tuple[JSONFloatV3, JSONFloatV3]]: """ Check if a JSON value represents a complex float, as per the zarr v3 spec @@ -487,51 +420,6 @@ def check_json_complex_float_v3(data: JSON) -> TypeGuard[tuple[JSONFloat, JSONFl ) -def check_json_complex_float( - data: JSON, zarr_format: ZarrFormat -) -> TypeGuard[tuple[JSONFloat, JSONFloat]]: - """ - Check if a JSON value represents a complex float, given a zarr format. - - Parameters - ---------- - data : JSON - The JSON value to check. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - Bool - True if the data represents a complex float, False otherwise. - """ - if zarr_format == 2: - return check_json_complex_float_v2(data) - return check_json_complex_float_v3(data) - - -def check_json_float(data: JSON, zarr_format: ZarrFormat) -> TypeGuard[float]: - """ - Check if a JSON value represents a float based on zarr format. - - Parameters - ---------- - data : JSON - The JSON value to check. - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - Bool - True if the data is a float, False otherwise. 
- """ - if zarr_format == 2: - return check_json_float_v2(data) - else: - return check_json_float_v3(data) - - def check_json_int(data: JSON) -> TypeGuard[int]: """ Check if a JSON value is an integer. diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index fab4ca9893..3e5f640946 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -15,9 +15,12 @@ ComplexLike, TComplexDType_co, TComplexScalar_co, - check_json_complex_float, - complex_float_from_json, - complex_float_to_json, + check_json_complex_float_v2, + check_json_complex_float_v3, + complex_float_from_json_v2, + complex_float_from_json_v3, + complex_float_to_json_v2, + complex_float_to_json_v3, endianness_from_numpy_str, endianness_to_numpy_str, ) @@ -113,11 +116,19 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TComplexSca TScalar_co The numpy float. """ - if check_json_complex_float(data, zarr_format=zarr_format): - return self._cast_value_unsafe(complex_float_from_json(data, zarr_format=zarr_format)) - raise TypeError( - f"Invalid type: {data}. Expected a float or a special string encoding of a float." - ) + if zarr_format == 2: + if check_json_complex_float_v2(data): + return self._cast_value_unsafe(complex_float_from_json_v2(data)) + raise TypeError( + f"Invalid type: {data}. Expected a float or a special string encoding of a float." + ) + elif zarr_format == 3: + if check_json_complex_float_v3(data): + return self._cast_value_unsafe(complex_float_from_json_v3(data)) + raise TypeError( + f"Invalid type: {data}. Expected a float or a special string encoding of a float." 
+ ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json_value(self, data: object, zarr_format: ZarrFormat) -> JSON: """ @@ -136,7 +147,11 @@ def to_json_value(self, data: object, zarr_format: ZarrFormat) -> JSON: The JSON-serializable form of the complex number, which is a list of two floats, each of which is encoding according to a zarr-format-specific encoding. """ - return complex_float_to_json(self.cast_value(data), zarr_format=zarr_format) + if zarr_format == 2: + return complex_float_to_json_v2(self.cast_value(data)) + elif zarr_format == 3: + return complex_float_to_json_v3(self.cast_value(data)) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @dataclass(frozen=True, kw_only=True) diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index bedd6a4751..e4d6e42ef3 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -10,11 +10,14 @@ FloatLike, TFloatDType_co, TFloatScalar_co, - check_json_float, + check_json_float_v2, + check_json_float_v3, endianness_from_numpy_str, endianness_to_numpy_str, - float_from_json, - float_to_json, + float_from_json_v2, + float_from_json_v3, + float_to_json_v2, + float_to_json_v3, ) from zarr.core.dtype.wrapper import TBaseDType, ZDType @@ -72,11 +75,11 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: return data == cls._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def check_value(self, value: object) -> TypeGuard[FloatLike]: - return isinstance(value, FloatLike) + def check_value(self, data: object) -> TypeGuard[FloatLike]: + return isinstance(data, FloatLike) - def _cast_value_unsafe(self, value: object) -> TFloatScalar_co: - return self.to_dtype().type(value) # type: ignore[return-value, arg-type] + def _cast_value_unsafe(self, data: object) -> TFloatScalar_co: + return self.to_dtype().type(data) 
# type: ignore[return-value, arg-type] def default_value(self) -> TFloatScalar_co: """ @@ -105,13 +108,24 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScala TScalar_co The numpy float. """ - if check_json_float(data, zarr_format=zarr_format): - return self._cast_value_unsafe(float_from_json(data, zarr_format=zarr_format)) - raise TypeError( - f"Invalid type: {data}. Expected a float or a special string encoding of a float." - ) - - def to_json_value(self, data: object, zarr_format: ZarrFormat) -> float | str: + if zarr_format == 2: + if check_json_float_v2(data): + return self._cast_value_unsafe(float_from_json_v2(data)) + else: + raise TypeError( + f"Invalid type: {data}. Expected a float or a special string encoding of a float." + ) + elif zarr_format == 3: + if check_json_float_v3(data): + return self._cast_value_unsafe(float_from_json_v3(data)) + else: + raise TypeError( + f"Invalid type: {data}. Expected a float or a special string encoding of a float." + ) + else: + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> float | str: """ Convert an object to a JSON-serializable float. @@ -128,7 +142,12 @@ def to_json_value(self, data: object, zarr_format: ZarrFormat) -> float | str: The JSON-serializable form of the float, which is potentially a number or a string. See the zarr specifications for details on the JSON encoding for floats. 
""" - return float_to_json(self._cast_value_unsafe(data), zarr_format=zarr_format) + if zarr_format == 2: + return float_to_json_v2(self._cast_value_unsafe(data)) + elif zarr_format == 3: + return float_to_json_v3(self._cast_value_unsafe(data)) + else: + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @dataclass(frozen=True, kw_only=True) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 07856a3c7c..1c62e4b41c 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -295,7 +295,11 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: data_type = get_data_type_from_json(data_type_json, zarr_format=3) # check that the fill value is consistent with the data type - fill_value_parsed = data_type.from_json_value(_data.pop("fill_value"), zarr_format=3) + try: + fill = _data.pop("fill_value") + fill_value_parsed = data_type.from_json_value(fill, zarr_format=3) + except ValueError as e: + raise TypeError(f"Invalid fill_value: {fill!r}") from e # dimension_names key is optional, normalize missing to `None` _data["dimension_names"] = _data.pop("dimension_names", None) diff --git a/tests/test_dtype/test_npy/test_common.py b/tests/test_dtype/test_npy/test_common.py index 69a14a92b0..258ab48fe1 100644 --- a/tests/test_dtype/test_npy/test_common.py +++ b/tests/test_dtype/test_npy/test_common.py @@ -9,26 +9,22 @@ import numpy as np import pytest -from zarr.core.dtype.common import Endianness, JSONFloat, SpecialFloats +from zarr.core.dtype.common import Endianness, JSONFloatV2, SpecialFloatStrings from zarr.core.dtype.npy.common import ( EndiannessNumpy, bytes_from_json, bytes_to_json, check_json_bool, - check_json_complex_float, check_json_complex_float_v2, check_json_complex_float_v3, - check_json_float, check_json_float_v2, check_json_float_v3, check_json_int, check_json_str, - complex_float_to_json, complex_float_to_json_v2, complex_float_to_json_v3, endianness_from_numpy_str, 
endianness_to_numpy_str, - float_from_json, float_from_json_v2, float_from_json_v3, float_to_json_v2, @@ -49,7 +45,7 @@ def nan_equal(a: object, b: object) -> bool: return a == b -json_float_v2_cases: list[tuple[JSONFloat, float | np.floating[Any]]] = [ +json_float_v2_cases: list[tuple[JSONFloatV2, float | np.floating[Any]]] = [ ("Infinity", float("inf")), ("Infinity", np.inf), ("-Infinity", float("-inf")), @@ -99,12 +95,12 @@ def test_endianness_to_numpy_str(data: str | None, expected: str) -> None: @pytest.mark.parametrize(("data", "expected"), json_float_v2_cases + [("SHOULD_ERR", "")]) -def test_float_from_json_v2(data: JSONFloat | str, expected: float | str) -> None: +def test_float_from_json_v2(data: JSONFloatV2 | str, expected: float | str) -> None: """ Test that float_from_json_v2 correctly converts a JSON string representation of a float to a float. This test also checks that an invalid string input raises a ``ValueError`` """ - if data in get_args(SpecialFloats) or isinstance(data, float): + if data in get_args(SpecialFloatStrings) or isinstance(data, float): assert nan_equal(float_from_json_v2(data), expected) # type: ignore[arg-type] else: msg = f"could not convert string to float: {data!r}" @@ -113,36 +109,25 @@ def test_float_from_json_v2(data: JSONFloat | str, expected: float | str) -> Non @pytest.mark.parametrize(("data", "expected"), json_float_v3_cases + [("SHOULD_ERR", "")]) -def test_float_from_json_v3(data: JSONFloat | str, expected: float | str) -> None: +def test_float_from_json_v3(data: JSONFloatV2 | str, expected: float | str) -> None: """ Test that float_from_json_v3 correctly converts a JSON string representation of a float to a float. 
This test also checks that an invalid string input raises a ``ValueError`` """ - if data in get_args(SpecialFloats) or isinstance(data, float): - assert nan_equal(float_from_json_v3(data), expected) # type: ignore[arg-type] + if data in get_args(SpecialFloatStrings) or isinstance(data, float): + assert nan_equal(float_from_json_v3(data), expected) else: - msg = f"could not convert string to float: {data!r}" + msg = ( + f"Invalid float value: {data!r}. Expected a string starting with the hex prefix" + " '0x', or one of 'NaN', 'Infinity', or '-Infinity'." + ) with pytest.raises(ValueError, match=msg): - float_from_json_v3(data) # type: ignore[arg-type] - - -@pytest.mark.parametrize(("data", "expected"), json_float_v2_cases) -def test_float_from_json(data: JSONFloat, expected: float | str, zarr_format: ZarrFormat) -> None: - """ - Test that float_from_json_v3 correctly converts a JSON string representation of a float to a float. - This test also checks that an invalid string input raises a ``ValueError`` - """ - observed = float_from_json(data, zarr_format=zarr_format) - if zarr_format == 2: - expected = float_from_json_v2(data) - else: - expected = float_from_json_v3(data) - assert nan_equal(observed, expected) + float_from_json_v3(data) # note the order of parameters relative to the order of the parametrized variable. @pytest.mark.parametrize(("expected", "data"), json_float_v2_cases) -def test_float_to_json_v2(data: float | np.floating[Any], expected: JSONFloat) -> None: +def test_float_to_json_v2(data: float | np.floating[Any], expected: JSONFloatV2) -> None: """ Test that floats are JSON-encoded properly for zarr v2 """ @@ -152,7 +137,7 @@ def test_float_to_json_v2(data: float | np.floating[Any], expected: JSONFloat) - # note the order of parameters relative to the order of the parametrized variable. 
@pytest.mark.parametrize(("expected", "data"), json_float_v3_cases) -def test_float_to_json_v3(data: float | np.floating[Any], expected: JSONFloat) -> None: +def test_float_to_json_v3(data: float | np.floating[Any], expected: JSONFloatV2) -> None: """ Test that floats are JSON-encoded properly for zarr v3 """ @@ -186,7 +171,9 @@ def test_bytes_to_json(zarr_format: ZarrFormat) -> None: # note the order of parameters relative to the order of the parametrized variable. @pytest.mark.parametrize(("json_expected", "float_data"), json_float_v2_cases) -def test_complex_to_json_v2(float_data: float | np.floating[Any], json_expected: JSONFloat) -> None: +def test_complex_to_json_v2( + float_data: float | np.floating[Any], json_expected: JSONFloatV2 +) -> None: """ Test that complex numbers are correctly converted to JSON in v2 format. @@ -202,7 +189,9 @@ def test_complex_to_json_v2(float_data: float | np.floating[Any], json_expected: # note the order of parameters relative to the order of the parametrized variable. @pytest.mark.parametrize(("json_expected", "float_data"), json_float_v3_cases) -def test_complex_to_json_v3(float_data: float | np.floating[Any], json_expected: JSONFloat) -> None: +def test_complex_to_json_v3( + float_data: float | np.floating[Any], json_expected: JSONFloatV2 +) -> None: """ Test that complex numbers are correctly converted to JSON in v3 format. 
@@ -218,7 +207,7 @@ def test_complex_to_json_v3(float_data: float | np.floating[Any], json_expected: @pytest.mark.parametrize(("json_expected", "float_data"), json_float_v3_cases) def test_complex_float_to_json( - float_data: float | np.floating[Any], json_expected: JSONFloat, zarr_format: ZarrFormat + float_data: float | np.floating[Any], json_expected: JSONFloatV2, zarr_format: ZarrFormat ) -> None: """ Test that complex numbers are correctly converted to JSON in v2 or v3 formats, depending @@ -231,18 +220,27 @@ def test_complex_float_to_json( cplx = complex(float_data, float_data) cplx_npy = np.complex128(cplx) - assert complex_float_to_json(cplx, zarr_format=zarr_format) == (json_expected, json_expected) - assert complex_float_to_json(cplx_npy, zarr_format=zarr_format) == ( - json_expected, - json_expected, - ) + if zarr_format == 2: + assert complex_float_to_json_v2(cplx) == (json_expected, json_expected) + assert complex_float_to_json_v2(cplx_npy) == ( + json_expected, + json_expected, + ) + elif zarr_format == 3: + assert complex_float_to_json_v3(cplx) == (json_expected, json_expected) + assert complex_float_to_json_v3(cplx_npy) == ( + json_expected, + json_expected, + ) + else: + raise ValueError("zarr_format must be 2 or 3") # pragma: no cover -check_json_float_cases = get_args(SpecialFloats) + (1.0, 2) +check_json_float_cases = get_args(SpecialFloatStrings) + (1.0, 2) @pytest.mark.parametrize("data", check_json_float_cases) -def test_check_json_float_v2_valid(data: JSONFloat | int) -> None: +def test_check_json_float_v2_valid(data: JSONFloatV2 | int) -> None: assert check_json_float_v2(data) @@ -251,7 +249,7 @@ def test_check_json_float_v2_invalid() -> None: @pytest.mark.parametrize("data", check_json_float_cases) -def test_check_json_float_v3_valid(data: JSONFloat | int) -> None: +def test_check_json_float_v3_valid(data: JSONFloatV2 | int) -> None: assert check_json_float_v3(data) @@ -259,25 +257,15 @@ def test_check_json_float_v3_invalid() -> None: 
assert not check_json_float_v3("invalid") -@pytest.mark.parametrize("data", check_json_float_cases) -def test_check_json_float(data: JSONFloat | int, zarr_format: ZarrFormat) -> None: - observed = check_json_float(data, zarr_format=zarr_format) - if zarr_format == 2: - expected = check_json_float_v2(data) - else: - expected = check_json_float_v3(data) - assert observed == expected - - -check_json_complex_float_true_cases = ( +check_json_complex_float_true_cases: tuple[list[JSONFloatV2], ...] = ( + [0.0, 1.0], [0.0, 1.0], - (0.0, 1.0), [-1.0, "NaN"], ["Infinity", 1.0], ["Infinity", "NaN"], ) -check_json_complex_float_false_cases = ( +check_json_complex_float_false_cases: tuple[object, ...] = ( 0.0, "foo", [0.0], @@ -309,12 +297,22 @@ def test_check_json_complex_float_v3_false(data: JSON) -> None: @pytest.mark.parametrize("data", check_json_complex_float_true_cases) def test_check_json_complex_float_true(data: JSON, zarr_format: ZarrFormat) -> None: - assert check_json_complex_float(data, zarr_format=zarr_format) + if zarr_format == 2: + assert check_json_complex_float_v2(data) + elif zarr_format == 3: + assert check_json_complex_float_v3(data) + else: + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @pytest.mark.parametrize("data", check_json_complex_float_false_cases) def test_check_json_complex_float_false(data: JSON, zarr_format: ZarrFormat) -> None: - assert not check_json_complex_float(data, zarr_format=zarr_format) + if zarr_format == 2: + assert not check_json_complex_float_v2(data) + elif zarr_format == 3: + assert not check_json_complex_float_v3(data) + else: + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def test_check_json_int() -> None: diff --git a/tests/test_dtype/test_npy/test_float.py b/tests/test_dtype/test_npy/test_float.py index 5981d09514..ba43b6bcf6 100644 --- a/tests/test_dtype/test_npy/test_float.py +++ b/tests/test_dtype/test_npy/test_float.py @@ -12,6 +12,15 @@ 
def scalar_equals(self, scalar1: object, scalar2: object) -> bool: return True return super().scalar_equals(scalar1, scalar2) + hex_nan_params: tuple[str, ...] = () + + def test_hex_nan(self, hex_nan_params: str) -> None: + """ + Test that hexadecimal strings can be read as NaN values + """ + zdtype = self.test_cls() + assert np.isnan(zdtype.from_json_value(hex_nan_params, zarr_format=3)) + class TestFloat16(_BaseTestFloat): test_cls = Float16 @@ -52,6 +61,8 @@ class TestFloat16(_BaseTestFloat): (Float16(), "NaN", np.float16("NaN")), ) + hex_nan_params = ("0x7fc0", "0x7fc1") + class TestFloat32(_BaseTestFloat): test_cls = Float32 @@ -94,6 +105,8 @@ class TestFloat32(_BaseTestFloat): (Float32(), "NaN", np.float32("NaN")), ) + hex_nan_params = ("0x7fc00000", "0x7fc00001") + class TestFloat64(_BaseTestFloat): test_cls = Float64 @@ -134,3 +147,5 @@ class TestFloat64(_BaseTestFloat): (Float64(), -1.0, np.float64(-1.0)), (Float64(), "NaN", np.float64("NaN")), ) + + hex_nan_params = ("0x7ff8000000000000", "0x7ff8000000000001") diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index d70095c045..9eb4d5ba1d 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -12,7 +12,6 @@ from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.core.config import config from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.core.dtype.npy.common import check_json_complex_float from zarr.core.dtype.npy.time import DateTime64 from zarr.core.group import GroupMetadata, parse_node_type from zarr.core.metadata.v3 import ( @@ -28,7 +27,7 @@ from typing import Any from zarr.abc.codec import Codec - from zarr.core.common import JSON, ZarrFormat + from zarr.core.common import JSON from zarr.core.metadata.v3 import ( @@ -137,14 +136,6 @@ def test_jsonify_fill_value_complex(fill_value: Any, dtype_str: str) -> None: assert dtype.to_json_value(observed, zarr_format=zarr_format) == 
tuple(fill_value) -@pytest.mark.parametrize("data", [[1.0, 0.0, 3.0], [0, 1, 3], [1]]) -def test_complex_to_json_invalid(data: object, zarr_format: ZarrFormat) -> None: - assert not check_json_complex_float(data, zarr_format=zarr_format) - # match = f"Invalid type: {data}. Expected a sequence of two numbers." - # with pytest.raises(TypeError, match=re.escape(match)): - # complex_float_from_json(data=data, zarr_format=3) - - @pytest.mark.parametrize("fill_value", [{"foo": 10}]) @pytest.mark.parametrize("dtype_str", [*int_dtypes, *float_dtypes, *complex_dtypes]) def test_parse_fill_value_invalid_type(fill_value: Any, dtype_str: str) -> None: From 1f0912860f1769a609958d8b7fdcd5f325f65ed6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 21 May 2025 17:21:28 +0200 Subject: [PATCH 100/129] revert removal of metadata chunk grid attribute --- src/zarr/core/array.py | 58 +++++++++++------------------------- src/zarr/core/metadata/v2.py | 8 ++++- src/zarr/core/metadata/v3.py | 14 +++++++++ tests/test_array.py | 2 +- tests/test_group.py | 2 +- 5 files changed, 41 insertions(+), 43 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 6861111bab..7f8f3b637d 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -5,7 +5,6 @@ from asyncio import gather from collections.abc import Iterable from dataclasses import dataclass, field, replace -from functools import cached_property from itertools import starmap from logging import getLogger from typing import ( @@ -31,7 +30,7 @@ from zarr.codecs._v2 import V2Codec from zarr.codecs.bytes import BytesCodec from zarr.core._info import ArrayInfo -from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, ArraySpec, parse_array_config +from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, parse_array_config from zarr.core.attributes import Attributes from zarr.core.buffer import ( BufferPrototype, @@ -41,7 +40,7 @@ default_buffer_prototype, ) from zarr.core.buffer.cpu import 
buffer_prototype as cpu_buffer_prototype -from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid, _auto_partition, normalize_chunks +from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition, normalize_chunks from zarr.core.chunk_key_encodings import ( ChunkKeyEncoding, ChunkKeyEncodingLike, @@ -988,13 +987,6 @@ def chunks(self) -> ChunkCoords: """ return self.metadata.chunks - @cached_property - def chunk_grid(self) -> ChunkGrid: - if self.metadata.zarr_format == 2: - return RegularChunkGrid(chunk_shape=self.chunks) - else: - return self.metadata.chunk_grid - @property def shards(self) -> ChunkCoords | None: """Returns the shard shape of the Array. @@ -1318,20 +1310,6 @@ def nbytes(self) -> int: """ return self.size * self.dtype.itemsize - def get_chunk_spec( - self, _chunk_coords: ChunkCoords, array_config: ArrayConfig, prototype: BufferPrototype - ) -> ArraySpec: - assert isinstance(self.chunk_grid, RegularChunkGrid), ( - "Currently, only regular chunk grid is supported" - ) - return ArraySpec( - shape=self.chunk_grid.chunk_shape, - dtype=self._zdtype, - fill_value=self.metadata.fill_value, - config=array_config, - prototype=prototype, - ) - async def _get_selection( self, indexer: Indexer, @@ -1371,7 +1349,7 @@ async def _get_selection( [ ( self.store_path / self.metadata.encode_chunk_key(chunk_coords), - self.get_chunk_spec(chunk_coords, _config, prototype=prototype), + self.metadata.get_chunk_spec(chunk_coords, _config, prototype=prototype), chunk_selection, out_selection, is_complete_chunk, @@ -1426,7 +1404,7 @@ async def getitem( indexer = BasicIndexer( selection, shape=self.metadata.shape, - chunk_grid=self.chunk_grid, + chunk_grid=self.metadata.chunk_grid, ) return await self._get_selection(indexer, prototype=prototype) @@ -1501,7 +1479,7 @@ async def _set_selection( [ ( self.store_path / self.metadata.encode_chunk_key(chunk_coords), - self.get_chunk_spec(chunk_coords, _config, prototype), + self.metadata.get_chunk_spec(chunk_coords, 
_config, prototype), chunk_selection, out_selection, is_complete_chunk, @@ -1556,7 +1534,7 @@ async def setitem( indexer = BasicIndexer( selection, shape=self.metadata.shape, - chunk_grid=self.chunk_grid, + chunk_grid=self.metadata.chunk_grid, ) return await self._set_selection(indexer, value, prototype=prototype) @@ -1593,8 +1571,8 @@ async def resize(self, new_shape: ShapeLike, delete_outside_chunks: bool = True) if delete_outside_chunks: # Remove all chunks outside of the new shape - old_chunk_coords = set(self.chunk_grid.all_chunk_coords(self.metadata.shape)) - new_chunk_coords = set(self.chunk_grid.all_chunk_coords(new_shape)) + old_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(self.metadata.shape)) + new_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(new_shape)) async def _delete_key(key: str) -> None: await (self.store_path / key).delete() @@ -2722,7 +2700,7 @@ def get_basic_selection( prototype = default_buffer_prototype() return sync( self._async_array._get_selection( - BasicIndexer(selection, self.shape, self._async_array.chunk_grid), + BasicIndexer(selection, self.shape, self.metadata.chunk_grid), out=out, fields=fields, prototype=prototype, @@ -2821,7 +2799,7 @@ def set_basic_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = BasicIndexer(selection, self.shape, self._async_array.chunk_grid) + indexer = BasicIndexer(selection, self.shape, self.metadata.chunk_grid) sync(self._async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) def get_orthogonal_selection( @@ -2941,7 +2919,7 @@ def get_orthogonal_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = OrthogonalIndexer(selection, self.shape, self._async_array.chunk_grid) + indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) return sync( self._async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3053,7 +3031,7 @@ def 
set_orthogonal_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = OrthogonalIndexer(selection, self.shape, self._async_array.chunk_grid) + indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) return sync( self._async_array._set_selection(indexer, value, fields=fields, prototype=prototype) ) @@ -3133,7 +3111,7 @@ def get_mask_selection( if prototype is None: prototype = default_buffer_prototype() - indexer = MaskIndexer(mask, self.shape, self._async_array.chunk_grid) + indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) return sync( self._async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3215,7 +3193,7 @@ def set_mask_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = MaskIndexer(mask, self.shape, self._async_array.chunk_grid) + indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) sync(self._async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) def get_coordinate_selection( @@ -3295,7 +3273,7 @@ def get_coordinate_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = CoordinateIndexer(selection, self.shape, self._async_array.chunk_grid) + indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) out_array = sync( self._async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3380,7 +3358,7 @@ def set_coordinate_selection( if prototype is None: prototype = default_buffer_prototype() # setup indexer - indexer = CoordinateIndexer(selection, self.shape, self._async_array.chunk_grid) + indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) # handle value - need ndarray-like flatten value if not is_scalar(value, self.dtype): @@ -3495,7 +3473,7 @@ def get_block_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = BlockIndexer(selection, 
self.shape, self._async_array.chunk_grid) + indexer = BlockIndexer(selection, self.shape, self.metadata.chunk_grid) return sync( self._async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3588,7 +3566,7 @@ def set_block_selection( """ if prototype is None: prototype = default_buffer_prototype() - indexer = BlockIndexer(selection, self.shape, self._async_array.chunk_grid) + indexer = BlockIndexer(selection, self.shape, self.metadata.chunk_grid) sync(self._async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) @property diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 440f238ac0..3acb6ea472 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -2,11 +2,13 @@ import warnings from collections.abc import Iterable, Sequence -from typing import TYPE_CHECKING, Any, TypedDict +from functools import cached_property +from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict import numcodecs.abc from zarr.abc.metadata import Metadata +from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, TDType_co, TScalar_co, ZDType @@ -115,6 +117,10 @@ def __init__( def ndim(self) -> int: return len(self.shape) + @cached_property + def chunk_grid(self) -> RegularChunkGrid: + return RegularChunkGrid(chunk_shape=self.chunks) + @property def shards(self) -> ChunkCoords | None: return None diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 1c62e4b41c..606d373cba 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -269,6 +269,20 @@ def inner_codecs(self) -> tuple[Codec, ...]: return self.codecs[0].codecs return self.codecs + def get_chunk_spec( + self, _chunk_coords: ChunkCoords, array_config: ArrayConfig, prototype: BufferPrototype + ) -> ArraySpec: + assert isinstance(self.chunk_grid, RegularChunkGrid), 
( + "Currently, only regular chunk grid is supported" + ) + return ArraySpec( + shape=self.chunk_grid.chunk_shape, + dtype=self.dtype, + fill_value=self.fill_value, + config=array_config, + prototype=prototype, + ) + def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: return self.chunk_key_encoding.encode_chunk_key(chunk_coords) diff --git a/tests/test_array.py b/tests/test_array.py index 7ed716996e..b977156bac 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1360,7 +1360,7 @@ async def test_with_data(impl: Literal["sync", "async"], store: Store) -> None: elif impl == "async": arr = await create_array(store, name=name, data=data, zarr_format=3) stored = await arr._get_selection( - BasicIndexer(..., shape=arr.shape, chunk_grid=arr.chunk_grid), + BasicIndexer(..., shape=arr.shape, chunk_grid=arr.metadata.chunk_grid), prototype=default_buffer_prototype(), ) else: diff --git a/tests/test_group.py b/tests/test_group.py index e7723e185a..ac1afb539b 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -1010,7 +1010,7 @@ async def test_asyncgroup_create_array( assert subnode.dtype == dtype # todo: fix the type annotation of array.metadata.chunk_grid so that we get some autocomplete # here. 
- assert subnode.chunk_grid.chunk_shape == chunk_shape + assert subnode.metadata.chunk_grid.chunk_shape == chunk_shape assert subnode.metadata.zarr_format == zarr_format From cc6d74153990cf99cdaeb7c67089ad189ce5608e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 22 May 2025 13:23:15 +0200 Subject: [PATCH 101/129] use none to denote default fill value; remove old structured tests; use cast_value where appropriate --- src/zarr/api/synchronous.py | 2 +- src/zarr/core/array.py | 3 +- src/zarr/core/dtype/npy/sized.py | 23 +++++++--- src/zarr/core/dtype/wrapper.py | 6 +-- src/zarr/core/metadata/v2.py | 75 +++++--------------------------- src/zarr/core/metadata/v3.py | 2 +- tests/test_v2.py | 9 ++-- 7 files changed, 38 insertions(+), 82 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 4ce02e7b6d..e25835900d 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -602,7 +602,7 @@ def create( chunks: ChunkCoords | int | bool | None = None, dtype: ZDTypeLike | None = None, compressor: CompressorLike = "auto", - fill_value: Any | None = DEFAULT_FILL_VALUE, # TODO: need type + fill_value: Any | None = None, # TODO: need type order: MemoryOrder | None = None, store: str | StoreLike | None = None, synchronizer: Any | None = None, diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 7f8f3b637d..4ec722c9ef 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -812,7 +812,8 @@ def _create_metadata_v2( ) -> ArrayV2Metadata: if dimension_separator is None: dimension_separator = "." 
- + if fill_value is None: + fill_value = dtype.default_value() # type: ignore[assignment] return ArrayV2Metadata( shape=shape, dtype=dtype, diff --git a/src/zarr/core/dtype/npy/sized.py b/src/zarr/core/dtype/npy/sized.py index 281c634856..7ca507b84e 100644 --- a/src/zarr/core/dtype/npy/sized.py +++ b/src/zarr/core/dtype/npy/sized.py @@ -79,7 +79,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover def check_value(self, data: object) -> bool: - return isinstance(data, np.bytes_ | str | bytes) + # this is generous for backwards compatibility + return isinstance(data, np.bytes_ | str | bytes | int) def _cast_value_unsafe(self, value: object) -> np.bytes_: return self.to_dtype().type(value) @@ -168,7 +169,11 @@ def check_value(self, data: object) -> bool: return isinstance(data, np.bytes_ | str | bytes | np.void) def _cast_value_unsafe(self, value: object) -> np.void: - return self.to_dtype().type(value) # type: ignore[call-overload, no-any-return] + native_dtype = self.to_dtype() + # Without the second argument, numpy will return a void scalar for dtype V1. + # The second argument ensures that, if native_dtype is something like V10, + # the result will actually be a V10 scalar. + return native_dtype.type(value, native_dtype) @dataclass(frozen=True, kw_only=True) @@ -239,7 +244,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover def check_value(self, data: object) -> bool: - return isinstance(data, str | np.str_ | bytes) + # this is generous for backwards compatibility + return isinstance(data, str | np.str_ | bytes | int) def _cast_value_unsafe(self, value: object) -> np.str_: return self.to_dtype().type(value) @@ -254,8 +260,15 @@ class Structured(ZDType[np.dtypes.VoidDType[int], np.void]): def default_value(self) -> np.void: return self._cast_value_unsafe(0) - def _cast_value_unsafe(self, value: object) -> np.void: - return cast("np.void", np.array([value], dtype=self.to_dtype())[0]) + def _cast_value_unsafe(self, data: object) -> np.void: + na_dtype = self.to_dtype() + if isinstance(data, bytes): + res = np.frombuffer(data, dtype=na_dtype)[0] + elif isinstance(data, list | tuple): + res = np.array([tuple(data)], dtype=na_dtype)[0] + else: + res = np.array([data], dtype=na_dtype)[0] + return cast("np.void", res) @classmethod def check_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 3a56a85788..c8e060e764 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -160,9 +160,9 @@ def cast_value(self, data: object) -> TScalar_co: if self.check_value(data): return self._cast_value_unsafe(data) msg = ( - f"The value {data} failed a type check." - f"It cannot be safely cast to a scalar compatible with {self.dtype_cls}." - f"Consult the documentation for {self} to determine the possible values that can" + f"The value {data} failed a type check. " + f"It cannot be safely cast to a scalar compatible with {self.dtype_cls}. " + f"Consult the documentation for {self} to determine the possible values that can " "be cast to scalars of the wrapped data type." 
) raise TypeError(msg) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 3acb6ea472..e82c768b90 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -64,7 +64,7 @@ class ArrayV2Metadata(Metadata): shape: ChunkCoords chunks: ChunkCoords dtype: ZDType[TBaseDType, TBaseScalar] - fill_value: int | float | str | bytes | None = 0 + fill_value: int | float | str | bytes | None = None order: MemoryOrder = "C" filters: tuple[numcodecs.abc.Codec, ...] | None = None dimension_separator: Literal[".", "/"] = "." @@ -97,7 +97,11 @@ def __init__( order_parsed = parse_indexing_order(order) dimension_separator_parsed = parse_separator(dimension_separator) filters_parsed = parse_filters(filters) - fill_value_parsed = parse_fill_value(fill_value, dtype=dtype.to_dtype()) + fill_value_parsed: TBaseScalar | None + if fill_value is not None: + fill_value_parsed = dtype.cast_value(fill_value) + else: + fill_value_parsed = fill_value attributes_parsed = parse_attributes(attributes) object.__setattr__(self, "shape", shape_parsed) @@ -146,11 +150,10 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: _ = parse_zarr_format(_data.pop("zarr_format")) dtype = get_data_type_from_native_dtype(_data["dtype"]) _data["dtype"] = dtype - if dtype.to_dtype().kind in "SV": - fill_value_encoded = _data.get("fill_value") - if fill_value_encoded is not None: - fill_value = base64.standard_b64decode(fill_value_encoded) - _data["fill_value"] = fill_value + fill_value_encoded = _data.get("fill_value") + if fill_value_encoded is not None: + fill_value = dtype.from_json_value(fill_value_encoded, zarr_format=2) + _data["fill_value"] = fill_value # zarr v2 allowed arbitrary keys here. 
# We don't want the ArrayV2Metadata constructor to fail just because someone put an @@ -293,61 +296,3 @@ def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata: ) raise ValueError(msg) return data - - -def _parse_structured_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any: - """Handle structured dtype/fill value pairs""" - try: - if isinstance(fill_value, list): - return np.array([tuple(fill_value)], dtype=dtype)[0] - elif isinstance(fill_value, tuple): - return np.array([fill_value], dtype=dtype)[0] - elif isinstance(fill_value, bytes): - return np.frombuffer(fill_value, dtype=dtype)[0] - elif isinstance(fill_value, str): - decoded = base64.standard_b64decode(fill_value) - return np.frombuffer(decoded, dtype=dtype)[0] - else: - return np.array(fill_value, dtype=dtype)[()] - except Exception as e: - raise ValueError(f"Fill_value {fill_value} is not valid for dtype {dtype}.") from e - - -def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any: - """ - Inspect a sequence of codecs / filters for an "object codec", i.e. a codec - that can serialize object arrays to contiguous bytes. Zarr python - maintains a hard-coded set of object codec ids. If any element from the input - has an id that matches one of the hard-coded object codec ids, that id - is returned immediately. 
- """ - - if fill_value is None or dtype.hasobject: - # no fill value - pass - elif not isinstance(fill_value, np.void) and fill_value == 0: - # this should be compatible across numpy versions for any array type, including - # structured arrays - fill_value = np.zeros((), dtype=dtype)[()] - - elif dtype.kind == "U": - # special case unicode because of encoding issues on Windows if passed through numpy - # https://github.com/alimanfoo/zarr/pull/172#issuecomment-343782713 - - if not isinstance(fill_value, str): - raise ValueError( - f"fill_value {fill_value!r} is not valid for dtype {dtype}; must be a unicode string" - ) - else: - try: - if isinstance(fill_value, bytes) and dtype.kind == "V": - # special case for numpy 1.14 compatibility - fill_value = np.array(fill_value, dtype=dtype.str).view(dtype)[()] - else: - fill_value = np.array(fill_value, dtype=dtype)[()] - - except Exception as e: - msg = f"Fill_value {fill_value} is not valid for dtype {dtype}." - raise ValueError(msg) from e - - return fill_value diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 606d373cba..80ed722836 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -175,7 +175,7 @@ def __init__( chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) # Note: relying on a type method is numpy-specific - fill_value_parsed = data_type.to_dtype().type(fill_value) + fill_value_parsed = data_type.cast_value(fill_value) attributes_parsed = parse_attributes(attributes) codecs_parsed_partial = parse_codecs(codecs) storage_transformers_parsed = parse_storage_transformers(storage_transformers) diff --git a/tests/test_v2.py b/tests/test_v2.py index f71ba84f01..ceea5c9539 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -16,10 +16,7 @@ from zarr import config from zarr.abc.store import Store from zarr.core.buffer.core import default_buffer_prototype -from 
zarr.core.dtype import FixedLengthUTF32, Structured, VariableLengthUTF8 -from zarr.core.dtype.npy.bytes import NullTerminatedBytes -from zarr.core.dtype.wrapper import ZDType -from zarr.core.group import Group +from zarr.core.dtype.npy.sized import Structured from zarr.core.sync import sync from zarr.storage import MemoryStore, StorePath @@ -282,8 +279,8 @@ def test_structured_dtype_roundtrip(fill_value: float | bytes, tmp_path: Path) - def test_parse_structured_fill_value_valid( fill_value: Any, dtype: np.dtype[Any], expected_result: Any ) -> None: - zdtype = Structured.from_native_dtype(dtype) - result = zdtype.cast_scalar(fill_value) + zdtype = Structured.from_dtype(dtype) + result = zdtype.cast_value(fill_value) assert result.dtype == expected_result.dtype assert result == expected_result if isinstance(expected_result, np.void): From b12e30cf1ae606db8212f1d1e2935c0943e74b03 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 22 May 2025 14:51:01 +0200 Subject: [PATCH 102/129] add item size abstraction --- src/zarr/codecs/blosc.py | 8 ++-- src/zarr/codecs/bytes.py | 6 +-- src/zarr/core/array.py | 5 ++- src/zarr/core/dtype/common.py | 13 +++++++ src/zarr/core/dtype/npy/bool.py | 13 +++++-- src/zarr/core/dtype/npy/complex.py | 22 +++++++---- src/zarr/core/dtype/npy/float.py | 16 +++++++- src/zarr/core/dtype/npy/int.py | 46 +++++++++++++++++++---- src/zarr/core/dtype/npy/sized.py | 45 +++++++++++++++------- src/zarr/core/dtype/npy/string.py | 8 ++-- src/zarr/core/dtype/npy/time.py | 16 +++++--- tests/conftest.py | 3 +- tests/test_dtype/test_npy/test_bool.py | 1 + tests/test_dtype/test_npy/test_complex.py | 3 ++ tests/test_dtype/test_npy/test_float.py | 21 ++++++++--- tests/test_dtype/test_npy/test_int.py | 9 +++++ tests/test_dtype/test_npy/test_sized.py | 20 ++++++++++ tests/test_dtype/test_npy/test_string.py | 3 ++ tests/test_dtype/test_npy/test_time.py | 2 + tests/test_dtype/test_wrapper.py | 16 +++++++- 20 files changed, 216 insertions(+), 60 
deletions(-) diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 66e2dbbc34..1c5e52e9a4 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -138,14 +138,16 @@ def to_dict(self) -> dict[str, JSON]: } def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: - dtype = array_spec.dtype.to_dtype() + item_size = 1 + if isinstance(array_spec.dtype, HasItemSize): + item_size = array_spec.dtype.item_size new_codec = self if new_codec.typesize is None: - new_codec = replace(new_codec, typesize=dtype.itemsize) + new_codec = replace(new_codec, typesize=item_size) if new_codec.shuffle is None: new_codec = replace( new_codec, - shuffle=(BloscShuffle.bitshuffle if dtype.itemsize == 1 else BloscShuffle.shuffle), + shuffle=(BloscShuffle.bitshuffle if item_size == 1 else BloscShuffle.shuffle), ) return new_codec diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index a87df060e7..5db39796e4 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -10,6 +10,7 @@ from zarr.abc.codec import ArrayBytesCodec from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer from zarr.core.common import JSON, parse_enum, parse_named_configuration +from zarr.core.dtype.common import HasEndianness from zarr.core.dtype.npy.common import endianness_to_numpy_str from zarr.registry import register_codec @@ -58,10 +59,7 @@ def to_dict(self) -> dict[str, JSON]: return {"name": "bytes", "configuration": {"endian": self.endian.value}} def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: - # Note: this check is numpy-dtype-specific - # For single-byte (e.g., uint8) or 0-byte (e.g., S0) dtypes, - # endianness does not apply. 
- if array_spec.dtype.to_dtype().itemsize < 2: + if not isinstance(array_spec.dtype, HasEndianness): if self.endian is not None: return replace(self, endian=None) elif self.endian is None: diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 4ec722c9ef..44b40b3044 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -71,6 +71,7 @@ ZDTypeLike, parse_data_type, ) +from zarr.core.dtype.common import HasItemSize from zarr.core.indexing import ( BasicIndexer, BasicSelection, @@ -622,9 +623,9 @@ async def _create( if isinstance(dtype_parsed, HasItemSize): item_size = dtype_parsed.item_size if chunks: - _chunks = normalize_chunks(chunks, shape, dtype_parsed.to_dtype().itemsize) + _chunks = normalize_chunks(chunks, shape, item_size) else: - _chunks = normalize_chunks(chunk_shape, shape, dtype_parsed.to_dtype().itemsize) + _chunks = normalize_chunks(chunk_shape, shape, item_size) config_parsed = parse_array_config(config) result: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index ecc475192c..d4aded658d 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -30,3 +30,16 @@ class HasEndianness: """ endianness: Endianness | None = "little" + + +@dataclass(frozen=True) +class HasItemSize: + """ + A mix-in class for data types with an item size attribute. + This mix-in bears a property ``item_size``, which denotes the size of each element of the data + type, in bytes. 
+ """ + + @property + def item_size(self) -> int: + raise NotImplementedError diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index c80033c54e..d46758f789 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -4,12 +4,13 @@ import numpy as np from zarr.core.common import JSON, ZarrFormat +from zarr.core.dtype.common import HasItemSize from zarr.core.dtype.npy.common import check_json_bool from zarr.core.dtype.wrapper import TBaseDType, ZDType @dataclass(frozen=True, kw_only=True, slots=True) -class Bool(ZDType[np.dtypes.BoolDType, np.bool_]): +class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): """ Wrapper for numpy boolean dtype. @@ -65,7 +66,7 @@ def default_value(self) -> np.bool_: """ return np.False_ - def to_json_value(self, data: object, zarr_format: ZarrFormat) -> bool: + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> bool: """ Convert a scalar to a python bool. @@ -107,5 +108,9 @@ def check_value(self, data: object) -> bool: # Anything can become a bool return True - def _cast_value_unsafe(self, value: object) -> np.bool_: - return np.bool_(value) + def _cast_value_unsafe(self, data: object) -> np.bool_: + return np.bool_(data) + + @property + def item_size(self) -> int: + return 1 diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index 3e5f640946..ee52dd0577 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -10,7 +10,7 @@ import numpy as np from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import HasEndianness +from zarr.core.dtype.common import HasEndianness, HasItemSize from zarr.core.dtype.npy.common import ( ComplexLike, TComplexDType_co, @@ -31,7 +31,7 @@ @dataclass(frozen=True) -class BaseComplex(ZDType[TComplexDType_co, TComplexScalar_co], HasEndianness): +class BaseComplex(ZDType[TComplexDType_co, TComplexScalar_co], HasEndianness, HasItemSize): # This 
attribute holds the possible zarr v2 JSON names for the data type _zarr_v2_names: ClassVar[tuple[str, ...]] @@ -83,11 +83,11 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: return data == cls._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def check_value(self, value: object) -> bool: - return isinstance(value, ComplexLike) + def check_value(self, data: object) -> bool: + return isinstance(data, ComplexLike) - def _cast_value_unsafe(self, value: object) -> TComplexScalar_co: - return self.to_dtype().type(value) # type: ignore[arg-type, return-value] + def _cast_value_unsafe(self, data: object) -> TComplexScalar_co: + return self.to_dtype().type(data) # type: ignore[arg-type, return-value] def default_value(self) -> TComplexScalar_co: """ @@ -130,7 +130,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TComplexSca ) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def to_json_value(self, data: object, zarr_format: ZarrFormat) -> JSON: + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> JSON: """ Convert an object to a JSON-serializable float. 
@@ -160,9 +160,17 @@ class Complex64(BaseComplex[np.dtypes.Complex64DType, np.complex64]): _zarr_v3_name = "complex64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c8", " int: + return 8 + @dataclass(frozen=True, kw_only=True) class Complex128(BaseComplex[np.dtypes.Complex128DType, np.complex128], HasEndianness): dtype_cls = np.dtypes.Complex128DType _zarr_v3_name = "complex128" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c16", " int: + return 16 diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index e4d6e42ef3..28f3ced63e 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -4,7 +4,7 @@ import numpy as np from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import HasEndianness +from zarr.core.dtype.common import HasEndianness, HasItemSize from zarr.core.dtype.npy.common import ( EndiannessNumpy, FloatLike, @@ -23,7 +23,7 @@ @dataclass(frozen=True) -class BaseFloat(ZDType[TFloatDType_co, TFloatScalar_co], HasEndianness): +class BaseFloat(ZDType[TFloatDType_co, TFloatScalar_co], HasEndianness, HasItemSize): # This attribute holds the possible zarr v2 JSON names for the data type _zarr_v2_names: ClassVar[tuple[str, ...]] @@ -156,6 +156,10 @@ class Float16(BaseFloat[np.dtypes.Float16DType, np.float16]): _zarr_v3_name = "float16" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f2", " int: + return 2 + @dataclass(frozen=True, kw_only=True) class Float32(BaseFloat[np.dtypes.Float32DType, np.float32]): @@ -163,9 +167,17 @@ class Float32(BaseFloat[np.dtypes.Float32DType, np.float32]): _zarr_v3_name = "float32" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f4", " int: + return 4 + @dataclass(frozen=True, kw_only=True) class Float64(BaseFloat[np.dtypes.Float64DType, np.float64]): dtype_cls = np.dtypes.Float64DType _zarr_v3_name = "float64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f8", " int: + return 8 diff --git a/src/zarr/core/dtype/npy/int.py 
b/src/zarr/core/dtype/npy/int.py index 78d9499243..db5869b202 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -4,7 +4,7 @@ import numpy as np from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import HasEndianness +from zarr.core.dtype.common import HasEndianness, HasItemSize from zarr.core.dtype.npy.common import ( EndiannessNumpy, check_json_int, @@ -32,7 +32,7 @@ @dataclass(frozen=True) -class BaseInt(ZDType[TIntDType_co, TIntScalar_co]): +class BaseInt(ZDType[TIntDType_co, TIntScalar_co], HasItemSize): # This attribute holds the possible zarr v2 JSON names for the data type _zarr_v2_names: ClassVar[tuple[str, ...]] @@ -67,11 +67,11 @@ def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: return data == cls._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def check_value(self, value: object) -> TypeGuard[IntLike]: - return isinstance(value, IntLike) + def check_value(self, data: object) -> TypeGuard[IntLike]: + return isinstance(data, IntLike) - def _cast_value_unsafe(self, value: object) -> TIntScalar_co: - return self.to_dtype().type(value) # type: ignore[return-value, arg-type] + def _cast_value_unsafe(self, data: object) -> TIntScalar_co: + return self.to_dtype().type(data) # type: ignore[return-value, arg-type] def default_value(self) -> TIntScalar_co: """ @@ -104,7 +104,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar_ return self._cast_value_unsafe(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") - def to_json_value(self, data: object, zarr_format: ZarrFormat) -> int: + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> int: """ Convert an object to JSON-serializable scalar. 
@@ -140,6 +140,10 @@ def to_dtype(self: Self) -> np.dtypes.Int8DType: def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() + @property + def item_size(self) -> int: + return 1 + @dataclass(frozen=True, kw_only=True) class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): @@ -158,6 +162,10 @@ def to_dtype(self: Self) -> np.dtypes.UInt8DType: def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() + @property + def item_size(self) -> int: + return 1 + @dataclass(frozen=True, kw_only=True) class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): @@ -183,6 +191,10 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + @property + def item_size(self) -> int: + return 2 + @dataclass(frozen=True, kw_only=True) class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): @@ -207,6 +219,10 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + @property + def item_size(self) -> int: + return 2 + @dataclass(frozen=True, kw_only=True) class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): @@ -243,6 +259,10 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + @property + def item_size(self) -> int: + return 4 + @dataclass(frozen=True, kw_only=True) class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): @@ -267,6 +287,10 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + @property + def item_size(self) -> int: + return 4 + @dataclass(frozen=True, kw_only=True) class 
Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): @@ -291,6 +315,10 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + @property + def item_size(self) -> int: + return 8 + @dataclass(frozen=True, kw_only=True) class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): @@ -314,3 +342,7 @@ def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: elif zarr_format == 3: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @property + def item_size(self) -> int: + return 8 diff --git a/src/zarr/core/dtype/npy/sized.py b/src/zarr/core/dtype/npy/sized.py index 7ca507b84e..2b2ed2ac70 100644 --- a/src/zarr/core/dtype/npy/sized.py +++ b/src/zarr/core/dtype/npy/sized.py @@ -7,7 +7,7 @@ import numpy as np from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import DataTypeValidationError, HasEndianness, HasLength +from zarr.core.dtype.common import DataTypeValidationError, HasEndianness, HasItemSize, HasLength from zarr.core.dtype.npy.common import ( EndiannessNumpy, bytes_from_json, @@ -20,7 +20,7 @@ @dataclass(frozen=True, kw_only=True) -class FixedLengthAscii(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength): +class FixedLengthAscii(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize): dtype_cls = np.dtypes.BytesDType _zarr_v3_name = "numpy.fixed_length_ascii" @@ -85,9 +85,13 @@ def check_value(self, data: object) -> bool: def _cast_value_unsafe(self, value: object) -> np.bytes_: return self.to_dtype().type(value) + @property + def item_size(self) -> int: + return self.length + @dataclass(frozen=True, kw_only=True) -class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength): +class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize): # np.dtypes.VoidDType is 
specified in an odd way in numpy # it cannot be used to create instances of the dtype # so we have to tell mypy to ignore this here @@ -168,25 +172,31 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: def check_value(self, data: object) -> bool: return isinstance(data, np.bytes_ | str | bytes | np.void) - def _cast_value_unsafe(self, value: object) -> np.void: + def _cast_value_unsafe(self, data: object) -> np.void: native_dtype = self.to_dtype() # Without the second argument, numpy will return a void scalar for dtype V1. # The second argument ensures that, if native_dtype is something like V10, # the result will actually be a V10 scalar. - return native_dtype.type(value, native_dtype) + return native_dtype.type(data, native_dtype) + + @property + def item_size(self) -> int: + return self.length @dataclass(frozen=True, kw_only=True) -class FixedLengthUnicode(ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength): +class FixedLengthUnicode( + ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength, HasItemSize +): dtype_cls = np.dtypes.StrDType _zarr_v3_name = "numpy.fixed_length_ucs4" - item_size_bytes: ClassVar[int] = 4 # UCS4 is 4 bytes per code point + code_point_bytes: ClassVar[int] = 4 # UCS4 is 4 bytes per code point @classmethod def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls( - length=dtype.itemsize // (cls.item_size_bytes), + length=dtype.itemsize // (cls.code_point_bytes), endianness=endianness_from_numpy_str(byte_order), ) @@ -220,7 +230,7 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: elif zarr_format == 3: return { "name": self._zarr_v3_name, - "configuration": {"length_bytes": self.length * self.item_size_bytes}, + "configuration": {"length_bytes": self.length * self.code_point_bytes}, } raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -229,7 +239,7 @@ def _from_json_unsafe(cls, 
data: JSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: - return cls(length=data["configuration"]["length_bytes"] // cls.item_size_bytes) # type: ignore[arg-type, index, call-overload, operator] + return cls(length=data["configuration"]["length_bytes"] // cls.code_point_bytes) # type: ignore[arg-type, index, call-overload, operator] raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_value(self) -> np.str_: @@ -247,12 +257,16 @@ def check_value(self, data: object) -> bool: # this is generous for backwards compatibility return isinstance(data, str | np.str_ | bytes | int) - def _cast_value_unsafe(self, value: object) -> np.str_: - return self.to_dtype().type(value) + def _cast_value_unsafe(self, data: object) -> np.str_: + return self.to_dtype().type(data) + + @property + def item_size(self) -> int: + return self.length * self.code_point_bytes @dataclass(frozen=True, kw_only=True) -class Structured(ZDType[np.dtypes.VoidDType[int], np.void]): +class Structured(ZDType[np.dtypes.VoidDType[int], np.void], HasItemSize): dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] _zarr_v3_name = "structured" fields: tuple[tuple[str, ZDType[TBaseDType, TBaseScalar]], ...] @@ -395,3 +409,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: dtype = self.to_dtype() return cast("np.void", np.array([as_bytes]).view(dtype)[0]) raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover + + @property + def item_size(self) -> int: + # Lets have numpy do the arithmetic here + return self.to_dtype().itemsize diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index 3849fd05ce..d5a4f9be08 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -72,8 +72,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: def check_value(self, data: object) -> bool: return isinstance(data, str) - def _cast_value_unsafe(self, value: object) -> str: - return str(value) + def _cast_value_unsafe(self, data: object) -> str: + return str(data) else: # Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead. @@ -130,5 +130,5 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: def check_value(self, data: object) -> bool: return isinstance(data, str) - def _cast_value_unsafe(self, value: object) -> str: - return str(value) + def _cast_value_unsafe(self, data: object) -> str: + return str(data) diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index ea44d76b56..61786351f8 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -17,7 +17,7 @@ import numpy as np -from zarr.core.dtype.common import HasEndianness +from zarr.core.dtype.common import HasEndianness, HasItemSize from zarr.core.dtype.npy.common import ( DateTimeUnit, EndiannessNumpy, @@ -99,7 +99,7 @@ class TimeConfig(TypedDict): @dataclass(frozen=True, kw_only=True, slots=True) -class TimeDTypeBase(ZDType[_BaseTimeDType_co, _BaseTimeScalar], HasEndianness): +class TimeDTypeBase(ZDType[_BaseTimeDType_co, _BaseTimeScalar], HasEndianness, HasItemSize): _zarr_v2_names: ClassVar[tuple[str, ...]] # this attribute exists so that we can programmatically create a numpy dtype instance # because the particular numpy dtype we are wrapping does not allow direct construction via @@ 
-163,6 +163,10 @@ def check_value(self, data: object) -> bool: except ValueError: return False + @property + def item_size(self) -> int: + return 8 + @dataclass(frozen=True, kw_only=True, slots=True) class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], HasEndianness): @@ -188,8 +192,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelt return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[arg-type] raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover - def _cast_value_unsafe(self, value: object) -> np.timedelta64: - return self.to_dtype().type(value) # type: ignore[arg-type] + def _cast_value_unsafe(self, data: object) -> np.timedelta64: + return self.to_dtype().type(data) # type: ignore[arg-type] @classmethod def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: @@ -235,8 +239,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[arg-type] raise TypeError(f"Invalid type: {data}. 
Expected an integer.") # pragma: no cover - def _cast_value_unsafe(self, value: object) -> np.datetime64: - return self.to_dtype().type(value) # type: ignore[no-any-return, call-overload] + def _cast_value_unsafe(self, data: object) -> np.datetime64: + return self.to_dtype().type(data) # type: ignore[no-any-return, call-overload] @classmethod def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: diff --git a/tests/conftest.py b/tests/conftest.py index f690478f2e..9b0ae02756 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -24,6 +24,7 @@ from zarr.core.dtype import ( get_data_type_from_native_dtype, ) +from zarr.core.dtype.common import HasItemSize from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync @@ -290,7 +291,7 @@ def create_array_metadata( array_shape=shape_parsed, shard_shape=shards, chunk_shape=chunks, - item_size=dtype_parsed.to_dtype().itemsize, + item_size=item_size, ) if order is None: diff --git a/tests/test_dtype/test_npy/test_bool.py b/tests/test_dtype/test_npy/test_bool.py index 086a2cfee8..1adae57f02 100644 --- a/tests/test_dtype/test_npy/test_bool.py +++ b/tests/test_dtype/test_npy/test_bool.py @@ -38,3 +38,4 @@ class TestBool(_TestZDType): (Bool(), np.True_, np.True_), (Bool(), np.False_, np.False_), ) + item_size_params = (Bool(),) diff --git a/tests/test_dtype/test_npy/test_complex.py b/tests/test_dtype/test_npy/test_complex.py index b24bc4d7c8..45a3a1480e 100644 --- a/tests/test_dtype/test_npy/test_complex.py +++ b/tests/test_dtype/test_npy/test_complex.py @@ -52,6 +52,8 @@ class TestComplex64(_BaseTestFloat): (Complex64(), complex(0, math.nan), np.complex64(complex(0, math.nan))), ) + item_size_params = (Complex64(),) + class TestComplex128(_BaseTestFloat): test_cls = Complex128 @@ -89,3 +91,4 @@ class TestComplex128(_BaseTestFloat): (Complex128(), complex(-1.0, math.inf), np.complex128(complex(-1.0, math.inf))), (Complex128(), 
complex(0, math.nan), np.complex128(complex(0, math.nan))), ) + item_size_params = (Complex128(),) diff --git a/tests/test_dtype/test_npy/test_float.py b/tests/test_dtype/test_npy/test_float.py index ba43b6bcf6..daa9bafac0 100644 --- a/tests/test_dtype/test_npy/test_float.py +++ b/tests/test_dtype/test_npy/test_float.py @@ -12,14 +12,16 @@ def scalar_equals(self, scalar1: object, scalar2: object) -> bool: return True return super().scalar_equals(scalar1, scalar2) - hex_nan_params: tuple[str, ...] = () + hex_string_params: tuple[tuple[str, float], ...] = () - def test_hex_nan(self, hex_nan_params: str) -> None: + def test_hex_encoding(self, hex_string_params: tuple[str, float]) -> None: """ Test that hexadecimal strings can be read as NaN values """ + hex_string, expected = hex_string_params zdtype = self.test_cls() - assert np.isnan(zdtype.from_json_value(hex_nan_params, zarr_format=3)) + observed = zdtype.from_json_value(hex_string, zarr_format=3) + assert self.scalar_equals(observed, expected) class TestFloat16(_BaseTestFloat): @@ -61,7 +63,8 @@ class TestFloat16(_BaseTestFloat): (Float16(), "NaN", np.float16("NaN")), ) - hex_nan_params = ("0x7fc0", "0x7fc1") + hex_string_params = (("0x7fc0", np.nan), ("0x7fc1", np.nan), ("0x3c00", 1.0)) + item_size_params = (Float16(),) class TestFloat32(_BaseTestFloat): @@ -105,7 +108,8 @@ class TestFloat32(_BaseTestFloat): (Float32(), "NaN", np.float32("NaN")), ) - hex_nan_params = ("0x7fc00000", "0x7fc00001") + hex_string_params = (("0x7fc00000", np.nan), ("0x7fc00001", np.nan), ("0x3f800000", 1.0)) + item_size_params = (Float32(),) class TestFloat64(_BaseTestFloat): @@ -148,4 +152,9 @@ class TestFloat64(_BaseTestFloat): (Float64(), "NaN", np.float64("NaN")), ) - hex_nan_params = ("0x7ff8000000000000", "0x7ff8000000000001") + hex_string_params = ( + ("0x7ff8000000000000", np.nan), + ("0x7ff8000000000001", np.nan), + ("0x3ff0000000000000", 1.0), + ) + item_size_params = (Float64(),) diff --git 
a/tests/test_dtype/test_npy/test_int.py b/tests/test_dtype/test_npy/test_int.py index 637b594e1b..5b0180af3b 100644 --- a/tests/test_dtype/test_npy/test_int.py +++ b/tests/test_dtype/test_npy/test_int.py @@ -34,6 +34,7 @@ class TestInt8(_TestZDType): (Int8(), 1, np.int8(1)), (Int8(), -1, np.int8(-1)), ) + item_size_params = (Int8(),) class TestInt16(_TestZDType): @@ -65,6 +66,8 @@ class TestInt16(_TestZDType): (Int16(), -1, np.int16(-1)), ) + item_size_params = (Int16(),) + class TestInt32(_TestZDType): test_cls = Int32 @@ -94,6 +97,7 @@ class TestInt32(_TestZDType): (Int32(), 1, np.int32(1)), (Int32(), -1, np.int32(-1)), ) + item_size_params = (Int32(),) class TestInt64(_TestZDType): @@ -124,6 +128,7 @@ class TestInt64(_TestZDType): (Int64(), 1, np.int64(1)), (Int64(), -1, np.int64(-1)), ) + item_size_params = (Int64(),) class TestUInt8(_TestZDType): @@ -154,6 +159,7 @@ class TestUInt8(_TestZDType): (UInt8(), 1, np.uint8(1)), (UInt8(), 0, np.uint8(0)), ) + item_size_params = (UInt8(),) class TestUInt16(_TestZDType): @@ -184,6 +190,7 @@ class TestUInt16(_TestZDType): (UInt16(), 1, np.uint16(1)), (UInt16(), 0, np.uint16(0)), ) + item_size_params = (UInt16(),) class TestUInt32(_TestZDType): @@ -214,6 +221,7 @@ class TestUInt32(_TestZDType): (UInt32(), 1, np.uint32(1)), (UInt32(), 0, np.uint32(0)), ) + item_size_params = (UInt32(),) class TestUInt64(_TestZDType): @@ -244,3 +252,4 @@ class TestUInt64(_TestZDType): (UInt64(), 1, np.uint64(1)), (UInt64(), 0, np.uint64(0)), ) + item_size_params = (UInt64(),) diff --git a/tests/test_dtype/test_npy/test_sized.py b/tests/test_dtype/test_npy/test_sized.py index 2ded5bbb7c..202bb0d04e 100644 --- a/tests/test_dtype/test_npy/test_sized.py +++ b/tests/test_dtype/test_npy/test_sized.py @@ -50,6 +50,11 @@ class TestFixedLengthAscii(_TestZDType): (FixedLengthAscii(length=2), "ab", np.bytes_("ab")), (FixedLengthAscii(length=4), "abcd", np.bytes_("abcd")), ) + item_size_params = ( + FixedLengthAscii(length=0), + 
FixedLengthAscii(length=4), + FixedLengthAscii(length=10), + ) class TestFixedLengthBytes(_TestZDType): @@ -91,6 +96,11 @@ class TestFixedLengthBytes(_TestZDType): (FixedLengthBytes(length=2), b"ab", np.void(b"ab")), (FixedLengthBytes(length=4), b"abcd", np.void(b"abcd")), ) + item_size_params = ( + FixedLengthBytes(length=0), + FixedLengthBytes(length=4), + FixedLengthBytes(length=10), + ) class TestFixedLengthUnicode(_TestZDType): @@ -125,6 +135,11 @@ class TestFixedLengthUnicode(_TestZDType): (FixedLengthUnicode(length=2), "hi", np.str_("hi")), (FixedLengthUnicode(length=4), "hihi", np.str_("hihi")), ) + item_size_params = ( + FixedLengthUnicode(length=0), + FixedLengthUnicode(length=4), + FixedLengthUnicode(length=10), + ) class TestStructured(_TestZDType): @@ -214,3 +229,8 @@ def scalar_equals(self, scalar1: Any, scalar2: Any) -> bool: if hasattr(scalar1, "shape") and hasattr(scalar2, "shape"): return np.array_equal(scalar1, scalar2) return super().scalar_equals(scalar1, scalar2) + + item_size_params = ( + Structured(fields=(("field1", Int32()), ("field2", Float64()))), + Structured(fields=(("field1", Int64()), ("field2", Int32()))), + ) diff --git a/tests/test_dtype/test_npy/test_string.py b/tests/test_dtype/test_npy/test_string.py index c87f538be5..1046afcac0 100644 --- a/tests/test_dtype/test_npy/test_string.py +++ b/tests/test_dtype/test_npy/test_string.py @@ -37,6 +37,7 @@ class TestVariableLengthString(_TestZDType): (VariableLengthString(), "", np.str_("")), (VariableLengthString(), "hi", np.str_("hi")), ) + item_size_params = (VariableLengthString(),) else: @@ -70,3 +71,5 @@ class TestVariableLengthString(_TestZDType): # type: ignore[no-redef] (VariableLengthString(), "", np.str_("")), (VariableLengthString(), "hi", np.str_("hi")), ) + + item_size_params = (VariableLengthString(),) diff --git a/tests/test_dtype/test_npy/test_time.py b/tests/test_dtype/test_npy/test_time.py index f8f8b5ae47..90c573007f 100644 --- a/tests/test_dtype/test_npy/test_time.py 
+++ b/tests/test_dtype/test_npy/test_time.py @@ -63,6 +63,7 @@ class TestDateTime64(_TestTimeBase): (DateTime64(unit="s", scale_factor=1), "2005-02-25", np.datetime64("2005-02-25", "s")), (DateTime64(unit="ns", scale_factor=1), "NaT", np.datetime64("NaT")), ) + item_size_params = (DateTime64(unit="ns", scale_factor=1),) class TestTimeDelta64(_TestTimeBase): @@ -102,6 +103,7 @@ class TestTimeDelta64(_TestTimeBase): (TimeDelta64(unit="ns", scale_factor=1), "1", np.timedelta64(1, "ns")), (TimeDelta64(unit="ns", scale_factor=1), "NaT", np.timedelta64("NaT")), ) + item_size_params = (TimeDelta64(unit="ns", scale_factor=1),) def test_time_invalid_unit() -> None: diff --git a/tests/test_dtype/test_wrapper.py b/tests/test_dtype/test_wrapper.py index 608e272690..302a419c0f 100644 --- a/tests/test_dtype/test_wrapper.py +++ b/tests/test_dtype/test_wrapper.py @@ -2,6 +2,10 @@ from typing import TYPE_CHECKING, Any, ClassVar +import pytest + +from zarr.core.dtype.common import HasItemSize + if TYPE_CHECKING: from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType @@ -74,8 +78,8 @@ class _TestZDType: scalar_v2_params: ClassVar[tuple[tuple[Any, Any], ...]] = () scalar_v3_params: ClassVar[tuple[tuple[Any, Any], ...]] = () - cast_value_params: ClassVar[tuple[tuple[Any, Any, Any], ...]] + item_size_params: ClassVar[tuple[ZDType[Any, Any], ...]] def json_scalar_equals(self, scalar1: object, scalar2: object) -> bool: # An equality check for json-encoded scalars. This defaults to regular equality, @@ -119,3 +123,13 @@ def test_cast_value(self, cast_value_params: tuple[Any, Any, Any]) -> None: zdtype, value, expected = cast_value_params observed = zdtype.cast_value(value) assert self.scalar_equals(expected, observed) + + def test_item_size(self, item_size_params: ZDType[Any, Any]) -> None: + """ + Test that the item_size attribute matches the numpy dtype itemsize attribute, for dtypes + with a fixed scalar size. 
+ """ + if isinstance(item_size_params, HasItemSize): + assert item_size_params.item_size == item_size_params.to_dtype().itemsize + else: + pytest.skip(f"Dtype {item_size_params} does not implement HasItemSize") From deb30682911b30a485f08da1daf9927c3ebe73b6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 22 May 2025 21:31:09 +0200 Subject: [PATCH 103/129] rename fixed-length string dtypes, and be strict about the numpy object dtype (i.e., refuse to match it) --- src/zarr/api/asynchronous.py | 12 ++--- src/zarr/core/dtype/__init__.py | 12 ++--- src/zarr/core/dtype/npy/sized.py | 8 ++-- src/zarr/core/dtype/registry.py | 13 ++++++ src/zarr/core/metadata/dtype.py | 0 tests/conftest.py | 10 ++++ tests/test_array.py | 23 +++++---- tests/test_dtype/test_npy/test_sized.py | 62 ++++++++++++------------- tests/test_dtype_registry.py | 8 ++-- tests/test_v2.py | 18 +++++-- 10 files changed, 101 insertions(+), 65 deletions(-) delete mode 100644 src/zarr/core/metadata/dtype.py diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index d83e51f954..2a17e425c8 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -28,7 +28,7 @@ _default_zarr_format, _warn_write_empty_chunks_kwarg, ) -from zarr.core.dtype import get_data_type_from_native_dtype +from zarr.core.dtype import ZDTypeLike, get_data_type_from_native_dtype, parse_data_type from zarr.core.group import ( AsyncGroup, ConsolidatedMetadata, @@ -857,7 +857,7 @@ async def open_group( async def create( shape: ChunkCoords | int, *, # Note: this is a change from v2 - chunks: ChunkCoords | int | bool | None = None, + chunks: ChunkCoords | int | None = None, # TODO: v2 allowed chunks=True dtype: ZDTypeLike | None = None, compressor: CompressorLike = "auto", fill_value: Any | None = DEFAULT_FILL_VALUE, @@ -1005,12 +1005,12 @@ async def create( _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format) or _default_zarr_format() ) - dtype_wrapped = 
get_data_type_from_native_dtype(dtype) + zdtype = parse_data_type(dtype, zarr_format=zarr_format) if zarr_format == 2: if chunks is None: chunks = shape - default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype_wrapped) - if filters is None: + default_filters, default_compressor = _get_default_chunk_encoding_v2(zdtype) + if not filters: filters = default_filters # type: ignore[assignment] if compressor is None: compressor = default_compressor @@ -1060,7 +1060,7 @@ async def create( store_path, shape=shape, chunks=chunks, - dtype=dtype_wrapped, + dtype=zdtype, compressor=compressor, fill_value=fill_value, overwrite=overwrite, diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index b973691f0f..5d51db92db 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -8,9 +8,9 @@ from zarr.core.dtype.npy.float import Float16, Float32, Float64 from zarr.core.dtype.npy.int import Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 from zarr.core.dtype.npy.sized import ( - FixedLengthAscii, + FixedLengthASCII, FixedLengthBytes, - FixedLengthUnicode, + FixedLengthUTF32, Structured, ) from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 @@ -36,9 +36,9 @@ "DataTypeRegistry", "DataTypeValidationError", "DateTime64", - "FixedLengthAscii", + "FixedLengthASCII", "FixedLengthBytes", - "FixedLengthUnicode", + "FixedLengthUTF32", "Float16", "Float32", "Float64", @@ -72,8 +72,8 @@ ComplexFloatDType = Complex64 | Complex128 COMPLEX_FLOAT_DTYPE: Final = Complex64, Complex128 -StringDType = FixedLengthUnicode | VariableLengthString | FixedLengthAscii -STRING_DTYPE: Final = FixedLengthUnicode, VariableLengthString, FixedLengthAscii +StringDType = FixedLengthUTF32 | VariableLengthString | FixedLengthASCII +STRING_DTYPE: Final = FixedLengthUTF32, VariableLengthString, FixedLengthASCII TimeDType = DateTime64 | TimeDelta64 TIME_DTYPE: Final = DateTime64, TimeDelta64 diff --git 
a/src/zarr/core/dtype/npy/sized.py b/src/zarr/core/dtype/npy/sized.py index 2b2ed2ac70..bf54638890 100644 --- a/src/zarr/core/dtype/npy/sized.py +++ b/src/zarr/core/dtype/npy/sized.py @@ -20,7 +20,7 @@ @dataclass(frozen=True, kw_only=True) -class FixedLengthAscii(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize): +class FixedLengthASCII(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize): dtype_cls = np.dtypes.BytesDType _zarr_v3_name = "numpy.fixed_length_ascii" @@ -185,12 +185,12 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) -class FixedLengthUnicode( +class FixedLengthUTF32( ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength, HasItemSize ): dtype_cls = np.dtypes.StrDType - _zarr_v3_name = "numpy.fixed_length_ucs4" - code_point_bytes: ClassVar[int] = 4 # UCS4 is 4 bytes per code point + _zarr_v3_name = "numpy.fixed_length_utf32" + code_point_bytes: ClassVar[int] = 4 # utf32 is 4 bytes per code point @classmethod def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: diff --git a/src/zarr/core/dtype/registry.py b/src/zarr/core/dtype/registry.py index ae5c3d426e..047f908ac6 100644 --- a/src/zarr/core/dtype/registry.py +++ b/src/zarr/core/dtype/registry.py @@ -3,6 +3,8 @@ from dataclasses import dataclass, field from typing import TYPE_CHECKING, Self +import numpy as np + from zarr.core.dtype.common import DataTypeValidationError if TYPE_CHECKING: @@ -38,6 +40,17 @@ def get(self, key: str) -> type[ZDType[TBaseDType, TBaseScalar]]: def match_dtype(self, dtype: TBaseDType) -> ZDType[TBaseDType, TBaseScalar]: self.lazy_load() + if dtype == np.dtype("O"): + msg = ( + "Data type resolution failed. " + 'Attempted to resolve a zarr data type from a numpy "Object" data type, which is ' + 'ambiguous, as multiple zarr data types can be represented by the numpy "Object" ' + "data type. " + "In this case you should construct your array by providing a specific Zarr data " + 'type. 
For a list of Zarr data types that are compatible with the numpy "Object"' + "data type, see xxxxxxxxxxx" + ) + raise ValueError(msg) for val in self.contents.values(): try: return val.from_dtype(dtype) diff --git a/src/zarr/core/metadata/dtype.py b/src/zarr/core/metadata/dtype.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/conftest.py b/tests/conftest.py index 9b0ae02756..1abfb24076 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -40,6 +40,7 @@ from zarr.core.array import CompressorsLike, FiltersLike, SerializerLike, ShardsLike from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingLike from zarr.core.common import ChunkCoords, MemoryOrder, ShapeLike, ZarrFormat + from zarr.core.dtype.wrapper import ZDType async def parse_store( @@ -432,3 +433,12 @@ def meta_from_array( chunk_key_encoding=chunk_key_encoding, dimension_names=dimension_names, ) + + +def skip_object_dtype(dtype: ZDType[Any, Any]) -> None: + if dtype.dtype_cls is type(np.dtype("O")): + msg = ( + f"{dtype} uses the numpy object data type, which is not a valid target for data " + "type resolution" + ) + pytest.skip(msg) diff --git a/tests/test_array.py b/tests/test_array.py index b977156bac..bc4d0a9071 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -44,8 +44,8 @@ from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.dtype.common import Endianness from zarr.core.dtype.npy.common import endianness_from_numpy_str -from zarr.core.dtype.npy.float import Float64 -from zarr.core.dtype.npy.int import Int16 +from zarr.core.dtype.npy.float import Float32, Float64 +from zarr.core.dtype.npy.int import Int16, UInt8 from zarr.core.dtype.npy.sized import ( Structured, ) @@ -1004,9 +1004,11 @@ def test_dtype_forms(dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFor """ Test that the same array is produced from a ZDType instance, a numpy dtype, or a numpy string """ + skip_object_dtype(dtype) a = zarr.create_array( 
store, name="a", shape=(5,), chunks=(5,), dtype=dtype, zarr_format=zarr_format ) + b = zarr.create_array( store, name="b", @@ -1049,12 +1051,13 @@ def test_dtype_roundtrip( """ Test that creating an array, then opening it, gets the same array. """ + skip_object_dtype(dtype) a = zarr.create_array(store, shape=(5,), chunks=(5,), dtype=dtype, zarr_format=zarr_format) b = zarr.open_array(store) assert a.dtype == b.dtype @staticmethod - @pytest.mark.parametrize("dtype", ["uint8", "float32", "str", "U3", "S4", "V1"]) + @pytest.mark.parametrize("dtype", ["uint8", "float32", "U3", "S4", "V1"]) @pytest.mark.parametrize( "compressors", [ @@ -1239,7 +1242,7 @@ async def test_invalid_v3_arguments( zarr.create(store=store, dtype="uint8", shape=(10,), zarr_format=3, **kwargs) @staticmethod - @pytest.mark.parametrize("dtype", ["uint8", "float32", "str", "U10", "S10", ">M8[10s]"]) + @pytest.mark.parametrize("dtype", ["uint8", "float32"]) @pytest.mark.parametrize( "compressors", [ @@ -1281,17 +1284,17 @@ async def test_v2_chunk_encoding( assert arr.filters == filters_expected @staticmethod - @pytest.mark.parametrize("dtype_str", ["uint8", "float32", "str"]) + @pytest.mark.parametrize("dtype", [UInt8(), Float32(), VariableLengthString()]) async def test_default_filters_compressors( - store: MemoryStore, dtype_str: str, zarr_format: ZarrFormat + store: MemoryStore, dtype: UInt8 | Float32 | VariableLengthString, zarr_format: ZarrFormat ) -> None: """ Test that the default ``filters`` and ``compressors`` are used when ``create_array`` is invoked with ``filters`` and ``compressors`` unspecified. 
""" - zdtype = get_data_type_from_native_dtype(dtype_str) + arr = await create_array( store=store, - dtype=dtype_str, + dtype=dtype, shape=(10,), zarr_format=zarr_format, ) @@ -1303,14 +1306,14 @@ async def test_default_filters_compressors( compressors=sig.parameters["compressors"].default, filters=sig.parameters["filters"].default, serializer=sig.parameters["serializer"].default, - dtype=zdtype, + dtype=dtype, ) elif zarr_format == 2: default_filters, default_compressors = _parse_chunk_encoding_v2( compressor=sig.parameters["compressors"].default, filters=sig.parameters["filters"].default, - dtype=zdtype, + dtype=dtype, ) if default_filters is None: expected_filters = () diff --git a/tests/test_dtype/test_npy/test_sized.py b/tests/test_dtype/test_npy/test_sized.py index 202bb0d04e..8bc83f2f73 100644 --- a/tests/test_dtype/test_npy/test_sized.py +++ b/tests/test_dtype/test_npy/test_sized.py @@ -8,15 +8,15 @@ from zarr.core.dtype.npy.float import Float16, Float64 from zarr.core.dtype.npy.int import Int32, Int64 from zarr.core.dtype.npy.sized import ( - FixedLengthAscii, + FixedLengthASCII, FixedLengthBytes, - FixedLengthUnicode, + FixedLengthUTF32, Structured, ) class TestFixedLengthAscii(_TestZDType): - test_cls = FixedLengthAscii + test_cls = FixedLengthASCII valid_dtype = (np.dtype("|S10"), np.dtype("|S4")) invalid_dtype = ( np.dtype(np.int8), @@ -36,24 +36,24 @@ class TestFixedLengthAscii(_TestZDType): ) scalar_v2_params = ( - (FixedLengthAscii(length=0), ""), - (FixedLengthAscii(length=2), "YWI="), - (FixedLengthAscii(length=4), "YWJjZA=="), + (FixedLengthASCII(length=0), ""), + (FixedLengthASCII(length=2), "YWI="), + (FixedLengthASCII(length=4), "YWJjZA=="), ) scalar_v3_params = ( - (FixedLengthAscii(length=0), ""), - (FixedLengthAscii(length=2), "YWI="), - (FixedLengthAscii(length=4), "YWJjZA=="), + (FixedLengthASCII(length=0), ""), + (FixedLengthASCII(length=2), "YWI="), + (FixedLengthASCII(length=4), "YWJjZA=="), ) cast_value_params = ( - 
(FixedLengthAscii(length=0), "", np.bytes_("")), - (FixedLengthAscii(length=2), "ab", np.bytes_("ab")), - (FixedLengthAscii(length=4), "abcd", np.bytes_("abcd")), + (FixedLengthASCII(length=0), "", np.bytes_("")), + (FixedLengthASCII(length=2), "ab", np.bytes_("ab")), + (FixedLengthASCII(length=4), "abcd", np.bytes_("abcd")), ) item_size_params = ( - FixedLengthAscii(length=0), - FixedLengthAscii(length=4), - FixedLengthAscii(length=10), + FixedLengthASCII(length=0), + FixedLengthASCII(length=4), + FixedLengthASCII(length=10), ) @@ -103,8 +103,8 @@ class TestFixedLengthBytes(_TestZDType): ) -class TestFixedLengthUnicode(_TestZDType): - test_cls = FixedLengthUnicode +class TestFixedLengthUTF32(_TestZDType): + test_cls = FixedLengthUTF32 valid_dtype = (np.dtype(">U10"), np.dtype("U10", " np.bool_: @staticmethod @pytest.mark.parametrize( - ("wrapper_cls", "dtype_str"), [(Bool, "bool"), (FixedLengthUnicode, "|U4")] + ("wrapper_cls", "dtype_str"), [(Bool, "bool"), (FixedLengthUTF32, "|U4")] ) def test_match_dtype( data_type_registry_fixture: DataTypeRegistry, @@ -100,7 +101,7 @@ def test_registered_dtypes( """ Test that the registered dtypes can be retrieved from the registry. 
""" - + skip_object_dtype(zdtype) assert data_type_registry.match_dtype(zdtype.to_dtype()) == zdtype assert ( data_type_registry.match_json( @@ -121,6 +122,7 @@ def test_match_dtype_unique( that excludes the data type class being tested, and ensure that an instance of the wrapped data type fails to match anything in the registry """ + skip_object_dtype(zdtype) for _cls in get_args(AnyDType): if _cls is not type(zdtype): data_type_registry_fixture.register(_cls._zarr_v3_name, _cls) diff --git a/tests/test_v2.py b/tests/test_v2.py index ceea5c9539..5b7f2b5a1b 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -16,7 +16,9 @@ from zarr import config from zarr.abc.store import Store from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.dtype.npy.sized import Structured +from zarr.core.dtype.npy.sized import FixedLengthASCII, FixedLengthUTF32, Structured +from zarr.core.dtype.npy.string import VariableLengthString +from zarr.core.dtype.wrapper import ZDType from zarr.core.sync import sync from zarr.storage import MemoryStore, StorePath @@ -104,10 +106,16 @@ async def test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_js np.testing.assert_equal(data, np.full((3,), b"X", dtype=dtype)) -@pytest.mark.parametrize(("dtype", "value"), [("|S1", b"Y"), ("|U1", "Y"), (str, "Y")]) -def test_v2_encode_decode_with_data(dtype, value): - dtype, value = dtype, value - expected = np.full((3,), value, dtype=dtype) +@pytest.mark.parametrize( + ("dtype", "value"), + [ + (FixedLengthASCII(length=1), b"Y"), + (FixedLengthUTF32(length=1), "Y"), + (VariableLengthString(), "Y"), + ], +) +def test_v2_encode_decode_with_data(dtype: ZDType[Any, Any], value: str): + expected = np.full((3,), value, dtype=dtype.to_dtype()) a = zarr.create( shape=(3,), zarr_format=2, From 2b725eeffb863d7d8d2acabf40bb28230f38147b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 22 May 2025 21:46:41 +0200 Subject: [PATCH 104/129] remove vestigial use of 
to_dtype().itemsize() --- src/zarr/core/array.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 44b40b3044..90b5fe0e0b 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4221,7 +4221,7 @@ async def init_array( from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation - dtype_wrapped = parse_data_type(dtype, zarr_format=zarr_format) + zdtype = parse_data_type(dtype, zarr_format=zarr_format) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format @@ -4243,7 +4243,7 @@ async def init_array( array_shape=shape_parsed, shard_shape=shards, chunk_shape=chunks, - item_size=dtype_wrapped.to_dtype().itemsize, + item_size=item_size, ) chunks_out: tuple[int, ...] meta: ArrayV2Metadata | ArrayV3Metadata @@ -4259,7 +4259,7 @@ async def init_array( raise ValueError("Zarr format 2 arrays do not support `serializer`.") filters_parsed, compressor_parsed = _parse_chunk_encoding_v2( - compressor=compressors, filters=filters, dtype=dtype_wrapped + compressor=compressors, filters=filters, dtype=zdtype ) if dimension_names is not None: raise ValueError("Zarr format 2 arrays do not support dimension names.") @@ -4270,7 +4270,7 @@ async def init_array( meta = AsyncArray._create_metadata_v2( shape=shape_parsed, - dtype=dtype_wrapped, + dtype=zdtype, chunks=chunk_shape_parsed, dimension_separator=chunk_key_encoding_parsed.separator, fill_value=fill_value, @@ -4284,7 +4284,7 @@ async def init_array( compressors=compressors, filters=filters, serializer=serializer, - dtype=dtype_wrapped, + dtype=zdtype, ) sub_codecs = cast("tuple[Codec, ...]", (*array_array, array_bytes, *bytes_bytes)) codecs_out: tuple[Codec, ...] 
@@ -4299,7 +4299,7 @@ async def init_array( ) sharding_codec.validate( shape=chunk_shape_parsed, - dtype=dtype_wrapped, + dtype=zdtype, chunk_grid=RegularChunkGrid(chunk_shape=shard_shape_parsed), ) codecs_out = (sharding_codec,) @@ -4313,7 +4313,7 @@ async def init_array( meta = AsyncArray._create_metadata_v3( shape=shape_parsed, - dtype=dtype_wrapped, + dtype=zdtype, fill_value=fill_value, chunk_shape=chunks_out, chunk_key_encoding=chunk_key_encoding_parsed, From 03259c6977c55ae5cecc68e4ddfbf4ad9eab2d4e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 22 May 2025 21:49:12 +0200 Subject: [PATCH 105/129] remove another vestigial use of to_dtype().itemsize() --- src/zarr/core/array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 90b5fe0e0b..f05d8e9cad 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -71,7 +71,7 @@ ZDTypeLike, parse_data_type, ) -from zarr.core.dtype.common import HasItemSize +from zarr.core.dtype.common import HasEndianness, HasItemSize from zarr.core.indexing import ( BasicIndexer, BasicSelection, @@ -4792,7 +4792,7 @@ def _parse_chunk_encoding_v3( # TODO: ensure that the serializer is compatible with the ndarray produced by the # array-array codecs. For example, if a sequence of array-array codecs produces an # array with a single-byte data type, then the serializer should not specify endiannesss. 
- if isinstance(out_array_bytes, BytesCodec) and dtype.to_dtype().itemsize == 1: + if isinstance(out_array_bytes, BytesCodec) and not isinstance(dtype, HasEndianness): # The default endianness in the bytescodec might not be None, so we need to replace it out_array_bytes = replace(out_array_bytes, endian=None) return out_array_array, out_array_bytes, out_bytes_bytes From 9a87b3ddfcbdfaf1ef76d809067511ed21349014 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 23 May 2025 10:49:19 +0200 Subject: [PATCH 106/129] emit warning about unstable dtype when serializing Structured dtype to JSON --- src/zarr/core/dtype/common.py | 20 ++++++++++++++++++++ src/zarr/core/dtype/npy/sized.py | 9 ++++++++- src/zarr/core/dtype/wrapper.py | 13 ------------- tests/test_array.py | 3 +++ tests/test_dtype/conftest.py | 7 ++++++- tests/test_dtype/test_wrapper.py | 1 + tests/test_dtype_registry.py | 2 ++ 7 files changed, 40 insertions(+), 15 deletions(-) diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index d4aded658d..5eeff2af5b 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -1,5 +1,6 @@ from __future__ import annotations +import warnings from dataclasses import dataclass from typing import Final, Literal @@ -43,3 +44,22 @@ class HasItemSize: @property def item_size(self) -> int: raise NotImplementedError + + +class UnstableSpecificationWarning(FutureWarning): ... + + +def v3_unstable_dtype_warning(dtype: object) -> None: + """ + Emit this warning when a data type does not have a stable zarr v3 spec + """ + msg = ( + f"The data type ({dtype}) does not have a Zarr V3 specification. " + "That means that the representation of data saved with this data type may change without " + "warning in a future version of Zarr Python. " + "Arrays stored with this data type may be unreadable by other Zarr libraries " + "Use this data type at your own risk! 
" + "Check https://github.com/zarr-developers/zarr-extensions/tree/main/data-types for the " + "status of data type specifications for Zarr V3." + ) + warnings.warn(msg, category=UnstableSpecificationWarning, stacklevel=2) diff --git a/src/zarr/core/dtype/npy/sized.py b/src/zarr/core/dtype/npy/sized.py index bf54638890..1014ba6f79 100644 --- a/src/zarr/core/dtype/npy/sized.py +++ b/src/zarr/core/dtype/npy/sized.py @@ -7,7 +7,13 @@ import numpy as np from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import DataTypeValidationError, HasEndianness, HasItemSize, HasLength +from zarr.core.dtype.common import ( + DataTypeValidationError, + HasEndianness, + HasItemSize, + HasLength, + v3_unstable_dtype_warning, +) from zarr.core.dtype.npy.common import ( EndiannessNumpy, bytes_from_json, @@ -325,6 +331,7 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: if zarr_format == 2: return fields elif zarr_format == 3: + v3_unstable_dtype_warning(self) base_dict = {"name": self._zarr_v3_name} base_dict["configuration"] = {"fields": fields} # type: ignore[assignment] return cast("JSON", base_dict) diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index c8e060e764..1a9d9b1e21 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -22,7 +22,6 @@ from __future__ import annotations -import warnings from abc import ABC, abstractmethod from dataclasses import dataclass from typing import TYPE_CHECKING, ClassVar, Generic, Self, TypeGuard, TypeVar @@ -336,15 +335,3 @@ def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScal The native scalar value. """ ... - - -def v3_unstable_dtype_warning(dtype: ZDType[TBaseDType, TBaseScalar]) -> None: - msg = ( - f"You are using a data type ({dtype}) that does not have a stable Zarr V3 specification." 
- "Be advised that arrays stored with this data type may be unreadable by other Zarr " - "libraries, and possibly future versions of Zarr-Python as well. " - "Use this data type at your own risk." - "See https://github.com/zarr-developers/zarr-extensions/tree/main/data-types for a list" - "of data types with a stable Zarr V3 specification." - ) - warnings.warn(msg, category=FutureWarning, stacklevel=2) diff --git a/tests/test_array.py b/tests/test_array.py index bc4d0a9071..be416d6d8f 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -988,6 +988,7 @@ def test_chunks_and_shards(store: Store) -> None: @staticmethod @pytest.mark.parametrize("dtype", zdtype_examples) + @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") def test_default_fill_value(dtype: ZDType[Any, Any], store: Store) -> None: """ Test that the fill value of an array is set to the default value for the dtype object @@ -999,6 +1000,7 @@ def test_default_fill_value(dtype: ZDType[Any, Any], store: Store) -> None: assert a.fill_value == dtype.default_value() @staticmethod + @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @pytest.mark.parametrize("dtype", zdtype_examples) def test_dtype_forms(dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFormat) -> None: """ @@ -1044,6 +1046,7 @@ def test_dtype_forms(dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFor assert a.dtype == c.dtype @staticmethod + @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @pytest.mark.parametrize("dtype", zdtype_examples) def test_dtype_roundtrip( dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFormat diff --git a/tests/test_dtype/conftest.py b/tests/test_dtype/conftest.py index bf58a17556..2b21a57365 100644 --- a/tests/test_dtype/conftest.py +++ b/tests/test_dtype/conftest.py @@ -1,4 +1,5 @@ # Generate a collection of zdtype instances for use in testing. 
+import warnings from typing import Any import numpy as np @@ -13,7 +14,11 @@ for wrapper_cls in data_type_registry.contents.values(): # The Structured dtype has to be constructed with some actual fields if wrapper_cls is Structured: - zdtype_examples += (wrapper_cls.from_dtype(np.dtype([("a", np.float64), ("b", np.int8)])),) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + zdtype_examples += ( + wrapper_cls.from_dtype(np.dtype([("a", np.float64), ("b", np.int8)])), + ) elif issubclass(wrapper_cls, HasLength): zdtype_examples += (wrapper_cls(length=1),) elif issubclass(wrapper_cls, DateTime64 | TimeDelta64): diff --git a/tests/test_dtype/test_wrapper.py b/tests/test_dtype/test_wrapper.py index 302a419c0f..a33e443c76 100644 --- a/tests/test_dtype/test_wrapper.py +++ b/tests/test_dtype/test_wrapper.py @@ -105,6 +105,7 @@ def test_from_json_roundtrip_v2(self, valid_json_v2: Any) -> None: zdtype = self.test_cls.from_json(valid_json_v2, zarr_format=2) assert zdtype.to_json(zarr_format=2) == valid_json_v2 + @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") def test_from_json_roundtrip_v3(self, valid_json_v3: Any) -> None: zdtype = self.test_cls.from_json(valid_json_v3, zarr_format=3) assert zdtype.to_json(zarr_format=3) == valid_json_v3 diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py index 35c704673d..0c650e5c29 100644 --- a/tests/test_dtype_registry.py +++ b/tests/test_dtype_registry.py @@ -94,6 +94,7 @@ def test_unregistered_dtype(data_type_registry_fixture: DataTypeRegistry) -> Non data_type_registry_fixture.get(outside_dtype) @staticmethod + @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @pytest.mark.parametrize("zdtype", zdtype_examples) def test_registered_dtypes( zdtype: ZDType[TBaseDType, TBaseScalar], zarr_format: ZarrFormat @@ -111,6 +112,7 @@ def test_registered_dtypes( ) @staticmethod + 
@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @pytest.mark.parametrize("zdtype", zdtype_examples) def test_match_dtype_unique( zdtype: ZDType[Any, Any], From de76df057df30812e533942bbfa63343db55bbdf Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sat, 24 May 2025 14:10:09 +0200 Subject: [PATCH 107/129] put string dtypes in the strings module --- src/zarr/core/dtype/__init__.py | 4 +- src/zarr/core/dtype/npy/sized.py | 158 +---------------------- src/zarr/core/dtype/npy/string.py | 164 +++++++++++++++++++++++- tests/test_dtype/test_npy/test_sized.py | 8 +- tests/test_v2.py | 3 +- 5 files changed, 171 insertions(+), 166 deletions(-) diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 5d51db92db..9c672fd986 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -8,9 +8,7 @@ from zarr.core.dtype.npy.float import Float16, Float32, Float64 from zarr.core.dtype.npy.int import Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 from zarr.core.dtype.npy.sized import ( - FixedLengthASCII, FixedLengthBytes, - FixedLengthUTF32, Structured, ) from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 @@ -24,6 +22,8 @@ from zarr.core.common import JSON from zarr.core.dtype.npy.string import ( _NUMPY_SUPPORTS_VLEN_STRING, + FixedLengthASCII, + FixedLengthUTF32, VariableLengthString, ) from zarr.core.dtype.registry import DataTypeRegistry diff --git a/src/zarr/core/dtype/npy/sized.py b/src/zarr/core/dtype/npy/sized.py index 1014ba6f79..eb2b39ad9a 100644 --- a/src/zarr/core/dtype/npy/sized.py +++ b/src/zarr/core/dtype/npy/sized.py @@ -2,100 +2,25 @@ import re from collections.abc import Sequence from dataclasses import dataclass -from typing import Any, ClassVar, Self, TypeGuard, cast +from typing import Any, Self, TypeGuard, cast import numpy as np from zarr.core.common import JSON, ZarrFormat from zarr.core.dtype.common import ( DataTypeValidationError, - 
HasEndianness, HasItemSize, HasLength, v3_unstable_dtype_warning, ) from zarr.core.dtype.npy.common import ( - EndiannessNumpy, bytes_from_json, bytes_to_json, check_json_str, - endianness_from_numpy_str, - endianness_to_numpy_str, ) from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType -@dataclass(frozen=True, kw_only=True) -class FixedLengthASCII(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize): - dtype_cls = np.dtypes.BytesDType - _zarr_v3_name = "numpy.fixed_length_ascii" - - @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - return cls(length=dtype.itemsize) - - def to_dtype(self) -> np.dtypes.BytesDType[int]: - return self.dtype_cls(self.length) - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - """ - Check that the input is a valid JSON representation of a numpy S dtype. - """ - if zarr_format == 2: - # match |S1, |S2, etc - return isinstance(data, str) and re.match(r"^\|S\d+$", data) is not None - elif zarr_format == 3: - return ( - isinstance(data, dict) - and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name - and isinstance(data["configuration"], dict) - and "length_bytes" in data["configuration"] - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def to_json(self, zarr_format: ZarrFormat) -> JSON: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return { - "name": self._zarr_v3_name, - "configuration": {"length_bytes": self.length}, - } - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls(length=data["configuration"]["length_bytes"]) # type: ignore[arg-type, index, call-overload] - raise 
ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def default_value(self) -> np.bytes_: - return np.bytes_(b"") - - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(data).decode("ascii") # type: ignore[arg-type] - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: - if check_json_str(data): - return self.to_dtype().type(base64.standard_b64decode(data.encode("ascii"))) - raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover - - def check_value(self, data: object) -> bool: - # this is generous for backwards compatibility - return isinstance(data, np.bytes_ | str | bytes | int) - - def _cast_value_unsafe(self, value: object) -> np.bytes_: - return self.to_dtype().type(value) - - @property - def item_size(self) -> int: - return self.length - - @dataclass(frozen=True, kw_only=True) class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize): # np.dtypes.VoidDType is specified in an odd way in numpy @@ -190,87 +115,6 @@ def item_size(self) -> int: return self.length -@dataclass(frozen=True, kw_only=True) -class FixedLengthUTF32( - ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength, HasItemSize -): - dtype_cls = np.dtypes.StrDType - _zarr_v3_name = "numpy.fixed_length_utf32" - code_point_bytes: ClassVar[int] = 4 # utf32 is 4 bytes per code point - - @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls( - length=dtype.itemsize // (cls.code_point_bytes), - endianness=endianness_from_numpy_str(byte_order), - ) - - def to_dtype(self) -> np.dtypes.StrDType[int]: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls(self.length).newbyteorder(byte_order) - - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - """ - Check that the 
input is a valid JSON representation of a numpy S dtype. - """ - if zarr_format == 2: - # match >U1, <]U\d+$", data) is not None - elif zarr_format == 3: - return ( - isinstance(data, dict) - and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name - and "configuration" in data - and isinstance(data["configuration"], dict) - and set(data["configuration"].keys()) == {"length_bytes"} - and isinstance(data["configuration"]["length_bytes"], int) - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def to_json(self, zarr_format: ZarrFormat) -> JSON: - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return { - "name": self._zarr_v3_name, - "configuration": {"length_bytes": self.length * self.code_point_bytes}, - } - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls(length=data["configuration"]["length_bytes"] // cls.code_point_bytes) # type: ignore[arg-type, index, call-overload, operator] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def default_value(self) -> np.str_: - return np.str_("") - - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: - return str(data) - - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: - if check_json_str(data): - return self.to_dtype().type(data) - raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover - - def check_value(self, data: object) -> bool: - # this is generous for backwards compatibility - return isinstance(data, str | np.str_ | bytes | int) - - def _cast_value_unsafe(self, data: object) -> np.str_: - return self.to_dtype().type(data) - - @property - def item_size(self) -> int: - return self.length * self.code_point_bytes - - @dataclass(frozen=True, kw_only=True) class Structured(ZDType[np.dtypes.VoidDType[int], np.void], HasItemSize): dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index d5a4f9be08..f65db5a984 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -1,11 +1,19 @@ from __future__ import annotations +import base64 +import re from dataclasses import dataclass -from typing import TYPE_CHECKING, Self, TypeGuard +from typing import TYPE_CHECKING, ClassVar, Self, TypeGuard, cast import numpy as np -from zarr.core.dtype.npy.common import check_json_str +from zarr.core.dtype.common import HasEndianness, HasItemSize, HasLength +from zarr.core.dtype.npy.common import ( + EndiannessNumpy, + check_json_str, + endianness_from_numpy_str, + endianness_to_numpy_str, +) from zarr.core.dtype.wrapper import ZDType if TYPE_CHECKING: @@ -15,6 +23,158 @@ _NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") +@dataclass(frozen=True, kw_only=True) +class FixedLengthASCII(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize): + dtype_cls = np.dtypes.BytesDType + _zarr_v3_name = "numpy.fixed_length_ascii" + + @classmethod + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + return cls(length=dtype.itemsize) + + def to_dtype(self) -> np.dtypes.BytesDType[int]: + return self.dtype_cls(self.length) + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of a numpy S 
dtype. + """ + if zarr_format == 2: + # match |S1, |S2, etc + return isinstance(data, str) and re.match(r"^\|S\d+$", data) is not None + elif zarr_format == 3: + return ( + isinstance(data, dict) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and isinstance(data["configuration"], dict) + and "length_bytes" in data["configuration"] + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return { + "name": self._zarr_v3_name, + "configuration": {"length_bytes": self.length}, + } + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(length=data["configuration"]["length_bytes"]) # type: ignore[arg-type, index, call-overload] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def default_value(self) -> np.bytes_: + return np.bytes_(b"") + + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return base64.standard_b64encode(data).decode("ascii") # type: ignore[arg-type] + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: + if check_json_str(data): + return self.to_dtype().type(base64.standard_b64decode(data.encode("ascii"))) + raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover + + def check_value(self, data: object) -> bool: + # this is generous for backwards compatibility + return isinstance(data, np.bytes_ | str | bytes | int) + + def _cast_value_unsafe(self, value: object) -> np.bytes_: + return self.to_dtype().type(value) + + @property + def item_size(self) -> int: + return self.length + + +@dataclass(frozen=True, kw_only=True) +class FixedLengthUTF32( + ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength, HasItemSize +): + dtype_cls = np.dtypes.StrDType + _zarr_v3_name = "numpy.fixed_length_utf32" + code_point_bytes: ClassVar[int] = 4 # utf32 is 4 bytes per code point + + @classmethod + def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + byte_order = cast("EndiannessNumpy", dtype.byteorder) + return cls( + length=dtype.itemsize // (cls.code_point_bytes), + endianness=endianness_from_numpy_str(byte_order), + ) + + def to_dtype(self) -> np.dtypes.StrDType[int]: + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls(self.length).newbyteorder(byte_order) + + @classmethod + def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + """ + Check that the input is a valid JSON representation of a numpy S dtype. 
+ """ + if zarr_format == 2: + # match >U1, <]U\d+$", data) is not None + elif zarr_format == 3: + return ( + isinstance(data, dict) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and "configuration" in data + and isinstance(data["configuration"], dict) + and set(data["configuration"].keys()) == {"length_bytes"} + and isinstance(data["configuration"]["length_bytes"], int) + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def to_json(self, zarr_format: ZarrFormat) -> JSON: + if zarr_format == 2: + return self.to_dtype().str + elif zarr_format == 3: + return { + "name": self._zarr_v3_name, + "configuration": {"length_bytes": self.length * self.code_point_bytes}, + } + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + elif zarr_format == 3: + return cls(length=data["configuration"]["length_bytes"] // cls.code_point_bytes) # type: ignore[arg-type, index, call-overload, operator] + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def default_value(self) -> np.str_: + return np.str_("") + + def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + return str(data) + + def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: + if check_json_str(data): + return self.to_dtype().type(data) + raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover + + def check_value(self, data: object) -> bool: + # this is generous for backwards compatibility + return isinstance(data, str | np.str_ | bytes | int) + + def _cast_value_unsafe(self, data: object) -> np.str_: + return self.to_dtype().type(data) + + @property + def item_size(self) -> int: + return self.length * self.code_point_bytes + + if _NUMPY_SUPPORTS_VLEN_STRING: @dataclass(frozen=True, kw_only=True) diff --git a/tests/test_dtype/test_npy/test_sized.py b/tests/test_dtype/test_npy/test_sized.py index 8bc83f2f73..c0e8f137d4 100644 --- a/tests/test_dtype/test_npy/test_sized.py +++ b/tests/test_dtype/test_npy/test_sized.py @@ -5,12 +5,14 @@ import numpy as np from tests.test_dtype.test_wrapper import _TestZDType -from zarr.core.dtype.npy.float import Float16, Float64 -from zarr.core.dtype.npy.int import Int32, Int64 -from zarr.core.dtype.npy.sized import ( +from zarr.core.dtype import ( FixedLengthASCII, FixedLengthBytes, FixedLengthUTF32, + Float16, + Float64, + Int32, + Int64, Structured, ) diff --git a/tests/test_v2.py b/tests/test_v2.py index 5b7f2b5a1b..ca727c9b10 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -16,8 +16,7 @@ from zarr import config from zarr.abc.store import Store from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.dtype.npy.sized import FixedLengthASCII, FixedLengthUTF32, Structured -from zarr.core.dtype.npy.string import VariableLengthString +from zarr.core.dtype import FixedLengthASCII, FixedLengthUTF32, Structured, VariableLengthString from zarr.core.dtype.wrapper import ZDType from zarr.core.sync import sync from zarr.storage import MemoryStore, StorePath From b4f106377622f7172e185e3f788720b88c0f207d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sat, 24 May 2025 22:56:35 +0200 Subject: [PATCH 108/129] make tests isomorphic to source code --- tests/test_dtype/test_npy/test_sized.py | 83 ------------------------ tests/test_dtype/test_npy/test_string.py | 82 
+++++++++++++++++++++++ tests/test_dtype/test_wrapper.py | 2 +- 3 files changed, 83 insertions(+), 84 deletions(-) diff --git a/tests/test_dtype/test_npy/test_sized.py b/tests/test_dtype/test_npy/test_sized.py index c0e8f137d4..eaaa915f59 100644 --- a/tests/test_dtype/test_npy/test_sized.py +++ b/tests/test_dtype/test_npy/test_sized.py @@ -6,9 +6,7 @@ from tests.test_dtype.test_wrapper import _TestZDType from zarr.core.dtype import ( - FixedLengthASCII, FixedLengthBytes, - FixedLengthUTF32, Float16, Float64, Int32, @@ -17,48 +15,6 @@ ) -class TestFixedLengthAscii(_TestZDType): - test_cls = FixedLengthASCII - valid_dtype = (np.dtype("|S10"), np.dtype("|S4")) - invalid_dtype = ( - np.dtype(np.int8), - np.dtype(np.float64), - np.dtype("|U10"), - ) - valid_json_v2 = ("|S0", "|S2", "|S4") - valid_json_v3 = ({"name": "numpy.fixed_length_ascii", "configuration": {"length_bytes": 10}},) - invalid_json_v2 = ( - "|S", - "|U10", - "|f8", - ) - invalid_json_v3 = ( - {"name": "numpy.fixed_length_ascii", "configuration": {"length_bits": 0}}, - {"name": "numpy.fixed_length_ascii", "configuration": {"length_bits": "invalid"}}, - ) - - scalar_v2_params = ( - (FixedLengthASCII(length=0), ""), - (FixedLengthASCII(length=2), "YWI="), - (FixedLengthASCII(length=4), "YWJjZA=="), - ) - scalar_v3_params = ( - (FixedLengthASCII(length=0), ""), - (FixedLengthASCII(length=2), "YWI="), - (FixedLengthASCII(length=4), "YWJjZA=="), - ) - cast_value_params = ( - (FixedLengthASCII(length=0), "", np.bytes_("")), - (FixedLengthASCII(length=2), "ab", np.bytes_("ab")), - (FixedLengthASCII(length=4), "abcd", np.bytes_("abcd")), - ) - item_size_params = ( - FixedLengthASCII(length=0), - FixedLengthASCII(length=4), - FixedLengthASCII(length=10), - ) - - class TestFixedLengthBytes(_TestZDType): test_cls = FixedLengthBytes valid_dtype = (np.dtype("|V10"),) @@ -105,45 +61,6 @@ class TestFixedLengthBytes(_TestZDType): ) -class TestFixedLengthUTF32(_TestZDType): - test_cls = FixedLengthUTF32 - valid_dtype = 
(np.dtype(">U10"), np.dtype("U10", "U10"), np.dtype("U10", " bool: return scalar1 == scalar2 def test_check_dtype_valid(self, valid_dtype: object) -> None: - assert self.test_cls.check_dtype(valid_dtype) # type: ignore[arg-type] + assert self.test_cls.check_dtype(valid_dtype) def test_check_dtype_invalid(self, invalid_dtype: object) -> None: assert not self.test_cls.check_dtype(invalid_dtype) # type: ignore[arg-type] From 7b6c78c6aeb5b815bfcd51d4b28c24afa49ad79a Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 25 May 2025 12:19:20 +0200 Subject: [PATCH 109/129] remove old string logic --- src/zarr/codecs/vlen_utf8.py | 2 +- src/zarr/core/strings.py | 89 ---------------------------------- tests/test_codecs/test_vlen.py | 2 +- tests/test_metadata/test_v3.py | 2 +- tests/test_strings.py | 37 -------------- 5 files changed, 3 insertions(+), 129 deletions(-) delete mode 100644 src/zarr/core/strings.py delete mode 100644 tests/test_strings.py diff --git a/src/zarr/codecs/vlen_utf8.py b/src/zarr/codecs/vlen_utf8.py index bad51f33ce..45dccf01fe 100644 --- a/src/zarr/codecs/vlen_utf8.py +++ b/src/zarr/codecs/vlen_utf8.py @@ -50,7 +50,7 @@ async def _decode_single( decoded = _vlen_utf8_codec.decode(raw_bytes) assert decoded.dtype == np.object_ decoded.shape = chunk_spec.shape - as_string_dtype = decoded.astype(chunk_spec.dtype.to_native_dtype(), copy=False) + as_string_dtype = decoded.astype(chunk_spec.dtype.to_dtype(), copy=False) return chunk_spec.prototype.nd_buffer.from_numpy_array(as_string_dtype) async def _encode_single( diff --git a/src/zarr/core/strings.py b/src/zarr/core/strings.py deleted file mode 100644 index 15c30b6f9b..0000000000 --- a/src/zarr/core/strings.py +++ /dev/null @@ -1,89 +0,0 @@ -"""This module contains utilities for working with string arrays across -different versions of Numpy. 
-""" - -from __future__ import annotations - -from typing import Any, Union, cast -from warnings import warn - -import numpy as np - -# _STRING_DTYPE is the in-memory datatype that will be used for V3 string arrays -# when reading data back from Zarr. -# Any valid string-like datatype should be fine for *setting* data. - -VLenStringType = Union["np.dtypes.StringDType", "np.dtypes.ObjectDType"] -_VLEN_STRING_DTYPE: VLenStringType -_NUMPY_SUPPORTS_VLEN_STRING: bool - - -def cast_array( - data: np.ndarray[Any, np.dtype[Any]], -) -> np.ndarray[Any, VLenStringType]: - raise NotImplementedError - - -try: - # this new vlen string dtype was added in NumPy 2.0 - _VLEN_STRING_DTYPE = np.dtypes.StringDType() - _NUMPY_SUPPORTS_VLEN_STRING = True - - def cast_array( - data: np.ndarray[Any, np.dtype[Any]], - ) -> np.ndarray[Any, VLenStringType]: - out = data.astype(_VLEN_STRING_DTYPE, copy=False) - return cast(np.ndarray[Any, np.dtypes.StringDType], out) - -except AttributeError: - # if not available, we fall back on an object array of strings, as in Zarr < 3 - _VLEN_STRING_DTYPE = np.dtypes.ObjectDType() - _NUMPY_SUPPORTS_VLEN_STRING = False - - def cast_array( - data: np.ndarray[Any, np.dtype[Any]], - ) -> np.ndarray[Any, VLenStringType]: - out = data.astype(_VLEN_STRING_DTYPE, copy=False) - return cast(np.ndarray[Any, np.dtypes.ObjectDType], out) - - -def cast_to_string_dtype( - data: np.ndarray[Any, np.dtype[Any]], safe: bool = False -) -> np.ndarray[Any, VLenStringType]: - """Take any data and attempt to cast to to our preferred string dtype. - - data : np.ndarray - The data to cast - - safe : bool - If True, do not issue a warning if the data is cast from object to string dtype. - - """ - if np.issubdtype(data.dtype, np.str_): - # legacy fixed-width string type (e.g. 
"= 2.", - stacklevel=2, - ) - return cast_array(data) - raise ValueError(f"Cannot cast dtype {data.dtype} to string dtype") diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index 5879782354..6fe1863464 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -9,8 +9,8 @@ from zarr.abc.store import Store from zarr.codecs import ZstdCodec from zarr.core.dtype import get_data_type_from_native_dtype +from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING from zarr.core.metadata.v3 import ArrayV3Metadata -from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING from zarr.storage import StorePath numpy_str_dtypes: list[type | str | None] = [None, str, "str", np.dtypes.StrDType, "S", "U"] diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 9eb4d5ba1d..0f88f52c66 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -12,6 +12,7 @@ from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.core.config import config from zarr.core.dtype import get_data_type_from_native_dtype +from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING from zarr.core.dtype.npy.time import DateTime64 from zarr.core.group import GroupMetadata, parse_node_type from zarr.core.metadata.v3 import ( @@ -19,7 +20,6 @@ parse_dimension_names, parse_zarr_format, ) -from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING from zarr.errors import MetadataValidationError if TYPE_CHECKING: diff --git a/tests/test_strings.py b/tests/test_strings.py deleted file mode 100644 index 963f2e305e..0000000000 --- a/tests/test_strings.py +++ /dev/null @@ -1,37 +0,0 @@ -"""Tests for the strings module.""" - -import numpy as np -import pytest - -from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING, _VLEN_STRING_DTYPE, cast_to_string_dtype - - -def test_string_defaults() -> None: - if _NUMPY_SUPPORTS_VLEN_STRING: - assert _VLEN_STRING_DTYPE == 
np.dtypes.StringDType() - else: - assert _VLEN_STRING_DTYPE == np.dtypes.ObjectDType() - - -def test_cast_to_string_dtype() -> None: - d1 = np.array(["a", "b", "c"]) - assert d1.dtype == np.dtype(" Date: Mon, 26 May 2025 17:27:41 +0200 Subject: [PATCH 110/129] use scale_factor and unit in cast_value for datetime --- src/zarr/core/dtype/npy/time.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 61786351f8..1c0e0d715c 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -240,7 +240,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover def _cast_value_unsafe(self, data: object) -> np.datetime64: - return self.to_dtype().type(data) # type: ignore[no-any-return, call-overload] + return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[no-any-return, call-overload] @classmethod def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: From e0b5a6495c4b361d3b84b001f0d749947fb117a2 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 27 May 2025 10:46:10 +0200 Subject: [PATCH 111/129] add regression testing against v2.18 --- tests/test_regression/test_regression.py | 125 +++++++++++++++++++++++ tests/test_regression/v2.18.py | 81 +++++++++++++++ 2 files changed, 206 insertions(+) create mode 100644 tests/test_regression/test_regression.py create mode 100644 tests/test_regression/v2.18.py diff --git a/tests/test_regression/test_regression.py b/tests/test_regression/test_regression.py new file mode 100644 index 0000000000..362e8d75b4 --- /dev/null +++ b/tests/test_regression/test_regression.py @@ -0,0 +1,125 @@ +import subprocess +from dataclasses import asdict, dataclass +from itertools import product +from pathlib import Path + +import numcodecs +import numpy as np +import pytest +from 
numcodecs import LZ4, LZMA, Blosc, GZip, VLenUTF8, Zstd + +import zarr +from zarr.core.array import Array +from zarr.core.dtype.npy.string import VariableLengthString +from zarr.core.metadata.v2 import ArrayV2Metadata +from zarr.storage import LocalStore + + +def runner_installed() -> bool: + try: + subprocess.check_output(["uv", "--version"]) + return True + except FileNotFoundError: + return False + + +def array_metadata_equals(a: ArrayV2Metadata, b: ArrayV2Metadata) -> bool: + dict_a, dict_b = asdict(a), asdict(b) + fill_value_a, fill_value_b = dict_a.pop("fill_value"), dict_b.pop("fill_value") + if ( + isinstance(fill_value_a, float) + and isinstance(fill_value_b, float) + and np.isnan(fill_value_a) + and np.isnan(fill_value_b) + ): + return dict_a == dict_b + else: + return fill_value_a == fill_value_b and dict_a == dict_b + + +@dataclass(kw_only=True) +class ArrayParams: + values: np.ndarray[tuple[int], np.dtype[np.generic]] + fill_value: np.generic | str + compressor: numcodecs.abc.Codec + + +basic_codecs = GZip(), Blosc(), LZ4(), LZMA(), Zstd() +basic_dtypes = "|b", ">i2", ">i4", ">f4", ">f8", "c8", "c16", "M8[10us]", "m8[4ps]" +string_dtypes = ">S1", "U4" + +basic_array_cases = [ + ArrayParams(values=np.arange(4, dtype=dtype), fill_value=1, compressor=codec) + for codec, dtype in product(basic_codecs, basic_dtypes) +] +datetime_array_cases = [ + ArrayParams(values=np.ones((4,), dtype=dtype), fill_value=1, compressor=codec) + for codec, dtype in product(basic_codecs, datetime_dtypes) +] +string_array_cases = [ + ArrayParams( + values=np.array(["aaaa", "bbbb", "ccccc", "dddd"], dtype=dtype), + fill_value="foo", + compressor=codec, + ) + for codec, dtype in product(basic_codecs, string_dtypes) +] +vlen_string_cases = [ + ArrayParams( + values=np.array(["a", "bb", "ccc", "dddd"], dtype="O"), + fill_value="1", + compressor=VLenUTF8(), + ) +] +array_cases = basic_array_cases + datetime_array_cases + string_array_cases + vlen_string_cases + + +@pytest.fixture 
+def source_array(tmp_path: Path, request: pytest.FixtureRequest) -> Array: + dest = tmp_path / "in" + store = LocalStore(dest) + array_params: ArrayParams = request.param + compressor = array_params.compressor + if array_params.values.dtype == np.dtype("|O"): + dtype = VariableLengthString() + else: + dtype = array_params.values.dtype + z = zarr.create_array( + store, + shape=array_params.values.shape, + dtype=dtype, + chunks=array_params.values.shape, + compressors=compressor, + fill_value=array_params.fill_value, + order="C", + filters=None, + chunk_key_encoding={"name": "v2", "configuration": {"separator": "/"}}, + write_data=True, + zarr_format=2, + ) + z[:] = array_params.values + return z + + +@pytest.mark.skipif(not runner_installed(), reason="no python script runner installed") +@pytest.mark.parametrize( + "source_array", array_cases, indirect=True, ids=tuple(map(str, array_cases)) +) +def test_roundtrip(source_array: Array, tmp_path: Path) -> None: + out_path = tmp_path / "out" + copy_op = subprocess.run( + [ + "uv", + "run", + Path(__file__).resolve().parent / "v2.18.py", + str(source_array.store).removeprefix("file://"), + str(out_path), + ], + capture_output=True, + text=True, + ) + assert copy_op.returncode == 0 + out_array = zarr.open_array(store=out_path, mode="r", zarr_format=2) + assert array_metadata_equals(source_array.metadata, out_array.metadata) + assert np.array_equal(source_array[:], out_array[:]) diff --git a/tests/test_regression/v2.18.py b/tests/test_regression/v2.18.py new file mode 100644 index 0000000000..39e1c5210c --- /dev/null +++ b/tests/test_regression/v2.18.py @@ -0,0 +1,81 @@ +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "zarr==2.18", +# "numcodecs==0.15" +# ] +# /// + +import argparse + +import zarr +from zarr._storage.store import BaseStore + + +def copy_group( + *, node: zarr.hierarchy.Group, store: zarr.storage.BaseStore, path: str, overwrite: bool +) -> zarr.hierarchy.Group: + result = 
zarr.group(store=store, path=path, overwrite=overwrite) + result.attrs.put(node.attrs.asdict()) + for key, child in node.items(): + child_path = f"{path}/{key}" + if isinstance(child, zarr.hierarchy.Group): + copy_group(node=child, store=store, path=child_path, overwrite=overwrite) + elif isinstance(child, zarr.core.Array): + copy_array(node=child, store=store, overwrite=overwrite, path=child_path) + return result + + +def copy_array( + *, node: zarr.core.Array, store: BaseStore, path: str, overwrite: bool +) -> zarr.core.Array: + result = zarr.create( + shape=node.shape, + dtype=node.dtype, + fill_value=node.fill_value, + chunks=node.chunks, + compressor=node.compressor, + filters=node.filters, + order=node.order, + dimension_separator=node._dimension_separator, + store=store, + path=path, + overwrite=overwrite, + ) + result.attrs.put(node.attrs.asdict()) + result[:] = node[:] + return result + + +def copy_node( + node: zarr.hierarchy.Group | zarr.core.Array, store: BaseStore, path: str, overwrite: bool +) -> zarr.hierarchy.Group | zarr.core.Array: + if isinstance(node, zarr.hierarchy.Group): + return copy_group(node=node, store=store, path=path, overwrite=overwrite) + elif isinstance(node, zarr.core.Array): + return copy_array(node=node, store=store, path=path, overwrite=overwrite) + else: + raise TypeError(f"Unexpected node type: {type(node)}") # pragma: no cover + + +def cli() -> None: + parser = argparse.ArgumentParser( + description="Copy a zarr hierarchy from one location to another" + ) + parser.add_argument("source", type=str, help="Path to the source zarr hierarchy") + parser.add_argument("destination", type=str, help="Path to the destination zarr hierarchy") + args = parser.parse_args() + + src, dst = args.source, args.destination + root_src = zarr.open(src, mode="r") + result = copy_node(node=root_src, store=zarr.NestedDirectoryStore(dst), path="", overwrite=True) + + print(f"successfully created {result} at {dst}") + + +def main() -> None: + cli() + + 
+if __name__ == "__main__": + main() From 6437c8d1a19c4988307fcdd9c4cf32dfbc69716e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 27 May 2025 11:33:05 +0200 Subject: [PATCH 112/129] truncate U and S scalars in _cast_value_unsafe --- src/zarr/core/dtype/npy/string.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index f65db5a984..b5b86ca387 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -86,8 +86,15 @@ def check_value(self, data: object) -> bool: # this is generous for backwards compatibility return isinstance(data, np.bytes_ | str | bytes | int) - def _cast_value_unsafe(self, value: object) -> np.bytes_: - return self.to_dtype().type(value) + def _cast_value_unsafe(self, data: object) -> np.bytes_: + # We explicitly truncate the result because of the following numpy behavior: + # >>> x = np.dtype('S3').type('hello world') + # >>> x + # np.bytes_(b'hello world') + # >>> x.dtype + # dtype('S11') + + return self.to_dtype().type(data[: self.length]) # type: ignore[index] @property def item_size(self) -> int: @@ -168,7 +175,14 @@ def check_value(self, data: object) -> bool: return isinstance(data, str | np.str_ | bytes | int) def _cast_value_unsafe(self, data: object) -> np.str_: - return self.to_dtype().type(data) + # We explicitly truncate the result because of the following numpy behavior: + # >>> x = np.dtype('U3').type('hello world') + # >>> x + # np.str_('hello world') + # >>> x.dtype + # dtype('U11') + + return self.to_dtype().type(data[: self.length]) # type: ignore[index] @property def item_size(self) -> int: From d9ab8dad5c290787ed017fc5a6323cbc99328cd6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 27 May 2025 11:58:34 +0200 Subject: [PATCH 113/129] docstrings and simplification for regression tests --- tests/test_regression/test_regression.py | 26 +++++++----------------- 1 file 
changed, 7 insertions(+), 19 deletions(-) diff --git a/tests/test_regression/test_regression.py b/tests/test_regression/test_regression.py index 362e8d75b4..688c5ff89d 100644 --- a/tests/test_regression/test_regression.py +++ b/tests/test_regression/test_regression.py @@ -1,5 +1,5 @@ import subprocess -from dataclasses import asdict, dataclass +from dataclasses import dataclass from itertools import product from pathlib import Path @@ -11,36 +11,24 @@ import zarr from zarr.core.array import Array from zarr.core.dtype.npy.string import VariableLengthString -from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.storage import LocalStore def runner_installed() -> bool: + """ + Check if a PEP-723 compliant python script runner is installed. + """ try: subprocess.check_output(["uv", "--version"]) - return True + return True # noqa: TRY300 except FileNotFoundError: return False -def array_metadata_equals(a: ArrayV2Metadata, b: ArrayV2Metadata) -> bool: - dict_a, dict_b = asdict(a), asdict(b) - fill_value_a, fill_value_b = dict_a.pop("fill_value"), dict_b.pop("fill_value") - if ( - isinstance(fill_value_a, float) - and isinstance(fill_value_b, float) - and np.isnan(fill_value_a) - and np.isnan(fill_value_b) - ): - return dict_a == dict_b - else: - return fill_value_a == fill_value_b and dict_a == dict_b - - @dataclass(kw_only=True) class ArrayParams: values: np.ndarray[tuple[int], np.dtype[np.generic]] - fill_value: np.generic | str + fill_value: np.generic | str | int compressor: numcodecs.abc.Codec @@ -121,5 +109,5 @@ def test_roundtrip(source_array: Array, tmp_path: Path) -> None: ) assert copy_op.returncode == 0 out_array = zarr.open_array(store=out_path, mode="r", zarr_format=2) - assert array_metadata_equals(source_array.metadata, out_array.metadata) + assert source_array.metadata.to_dict() == out_array.metadata.to_dict() assert np.array_equal(source_array[:], out_array[:]) From 3302161f64f1329bae115d39eb3f83daba2951b3 Mon Sep 17 00:00:00 2001 From: Davis Vann 
Bennett Date: Tue, 27 May 2025 12:29:34 +0200 Subject: [PATCH 114/129] changes necessary for linting with regression tests --- pyproject.toml | 1 + src/zarr/core/dtype/wrapper.py | 1 + tests/test_dtype/test_wrapper.py | 2 +- tests/test_regression/test_regression.py | 20 ++++-- tests/test_regression/v2.18.py | 81 ------------------------ 5 files changed, 19 insertions(+), 86 deletions(-) delete mode 100644 tests/test_regression/v2.18.py diff --git a/pyproject.toml b/pyproject.toml index 0b7cb9f856..64cc4f7e1c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -386,6 +386,7 @@ module = [ "tests.test_indexing", "tests.test_properties", "tests.test_sync", + "tests.test_v2", "tests.test_regression.scripts.*" ] ignore_errors = true diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 1a9d9b1e21..bd9686afc1 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -39,6 +39,7 @@ # This is the bound for the dtypes that we support. If we support non-numpy dtypes, # then this bound will need to be widened. 
TBaseDType = np.dtype[np.generic] + # These two type parameters are covariant because we want # x : ZDType[BaseDType, BaseScalar] = ZDType[SubDType, SubScalar] # to type check diff --git a/tests/test_dtype/test_wrapper.py b/tests/test_dtype/test_wrapper.py index a61fc1a9cd..9a5e3ee56d 100644 --- a/tests/test_dtype/test_wrapper.py +++ b/tests/test_dtype/test_wrapper.py @@ -91,7 +91,7 @@ def scalar_equals(self, scalar1: object, scalar2: object) -> bool: # but some classes may need to override this for special cases return scalar1 == scalar2 - def test_check_dtype_valid(self, valid_dtype: object) -> None: + def test_check_dtype_valid(self, valid_dtype: TBaseDType) -> None: assert self.test_cls.check_dtype(valid_dtype) def test_check_dtype_invalid(self, invalid_dtype: object) -> None: diff --git a/tests/test_regression/test_regression.py b/tests/test_regression/test_regression.py index 688c5ff89d..61ff8ebfa9 100644 --- a/tests/test_regression/test_regression.py +++ b/tests/test_regression/test_regression.py @@ -2,6 +2,7 @@ from dataclasses import dataclass from itertools import product from pathlib import Path +from typing import TYPE_CHECKING import numcodecs import numpy as np @@ -10,9 +11,13 @@ import zarr from zarr.core.array import Array +from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding from zarr.core.dtype.npy.string import VariableLengthString from zarr.storage import LocalStore +if TYPE_CHECKING: + from zarr.core.dtype import ZDTypeLike + def runner_installed() -> bool: """ @@ -69,8 +74,10 @@ def source_array(tmp_path: Path, request: pytest.FixtureRequest) -> Array: store = LocalStore(dest) array_params: ArrayParams = request.param compressor = array_params.compressor + chunk_key_encoding = V2ChunkKeyEncoding(separator="/") + dtype: ZDTypeLike if array_params.values.dtype == np.dtype("|O"): - dtype = VariableLengthString() + dtype = VariableLengthString() # type: ignore[assignment] else: dtype = array_params.values.dtype z = zarr.create_array( @@ 
-82,7 +89,7 @@ def source_array(tmp_path: Path, request: pytest.FixtureRequest) -> Array: fill_value=array_params.fill_value, order="C", filters=None, - chunk_key_encoding={"name": "v2", "configuration": {"separator": "/"}}, + chunk_key_encoding=chunk_key_encoding, write_data=True, zarr_format=2, ) @@ -90,17 +97,22 @@ def source_array(tmp_path: Path, request: pytest.FixtureRequest) -> Array: return z +# TODO: make this dynamic based on the installed scripts +script_paths = [Path(__file__).resolve().parent / "scripts" / "v2.18.py"] + + @pytest.mark.skipif(not runner_installed(), reason="no python script runner installed") @pytest.mark.parametrize( "source_array", array_cases, indirect=True, ids=tuple(map(str, array_cases)) ) -def test_roundtrip(source_array: Array, tmp_path: Path) -> None: +@pytest.mark.parametrize("script_path", script_paths) +def test_roundtrip(source_array: Array, tmp_path: Path, script_path: Path) -> None: out_path = tmp_path / "out" copy_op = subprocess.run( [ "uv", "run", - Path(__file__).resolve().parent / "v2.18.py", + script_path, str(source_array.store).removeprefix("file://"), str(out_path), ], diff --git a/tests/test_regression/v2.18.py b/tests/test_regression/v2.18.py deleted file mode 100644 index 39e1c5210c..0000000000 --- a/tests/test_regression/v2.18.py +++ /dev/null @@ -1,81 +0,0 @@ -# /// script -# requires-python = ">=3.11" -# dependencies = [ -# "zarr==2.18", -# "numcodecs==0.15" -# ] -# /// - -import argparse - -import zarr -from zarr._storage.store import BaseStore - - -def copy_group( - *, node: zarr.hierarchy.Group, store: zarr.storage.BaseStore, path: str, overwrite: bool -) -> zarr.hierarchy.Group: - result = zarr.group(store=store, path=path, overwrite=overwrite) - result.attrs.put(node.attrs.asdict()) - for key, child in node.items(): - child_path = f"{path}/{key}" - if isinstance(child, zarr.hierarchy.Group): - copy_group(node=child, store=store, path=child_path, overwrite=overwrite) - elif isinstance(child, 
zarr.core.Array): - copy_array(node=child, store=store, overwrite=overwrite, path=child_path) - return result - - -def copy_array( - *, node: zarr.core.Array, store: BaseStore, path: str, overwrite: bool -) -> zarr.core.Array: - result = zarr.create( - shape=node.shape, - dtype=node.dtype, - fill_value=node.fill_value, - chunks=node.chunks, - compressor=node.compressor, - filters=node.filters, - order=node.order, - dimension_separator=node._dimension_separator, - store=store, - path=path, - overwrite=overwrite, - ) - result.attrs.put(node.attrs.asdict()) - result[:] = node[:] - return result - - -def copy_node( - node: zarr.hierarchy.Group | zarr.core.Array, store: BaseStore, path: str, overwrite: bool -) -> zarr.hierarchy.Group | zarr.core.Array: - if isinstance(node, zarr.hierarchy.Group): - return copy_group(node=node, store=store, path=path, overwrite=overwrite) - elif isinstance(node, zarr.core.Array): - return copy_array(node=node, store=store, path=path, overwrite=overwrite) - else: - raise TypeError(f"Unexpected node type: {type(node)}") # pragma: no cover - - -def cli() -> None: - parser = argparse.ArgumentParser( - description="Copy a zarr hierarchy from one location to another" - ) - parser.add_argument("source", type=str, help="Path to the source zarr hierarchy") - parser.add_argument("destination", type=str, help="Path to the destination zarr hierarchy") - args = parser.parse_args() - - src, dst = args.source, args.destination - root_src = zarr.open(src, mode="r") - result = copy_node(node=root_src, store=zarr.NestedDirectoryStore(dst), path="", overwrite=True) - - print(f"successfully created {result} at {dst}") - - -def main() -> None: - cli() - - -if __name__ == "__main__": - main() From 4a301d9c88f05379f75c2325412b9ca77e7adea8 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 29 May 2025 12:40:03 +0200 Subject: [PATCH 115/129] improve method names, refactor type hints with typeddictionaries, fix registry load frequency, add 
object_codec_id for v2 json deserialization --- docs/user-guide/arrays.rst | 4 +- docs/user-guide/data_types.rst | 10 +- src/zarr/codecs/_v2.py | 6 +- src/zarr/codecs/bytes.py | 2 +- src/zarr/codecs/sharding.py | 4 +- src/zarr/codecs/vlen_utf8.py | 2 +- src/zarr/core/array.py | 6 +- src/zarr/core/codec_pipeline.py | 8 +- src/zarr/core/common.py | 16 +- src/zarr/core/dtype/__init__.py | 52 +-- src/zarr/core/dtype/common.py | 24 +- src/zarr/core/dtype/npy/bool.py | 50 +-- src/zarr/core/dtype/npy/common.py | 5 +- src/zarr/core/dtype/npy/complex.py | 53 ++-- src/zarr/core/dtype/npy/float.py | 48 +-- src/zarr/core/dtype/npy/int.py | 368 +++++++++++++++++----- src/zarr/core/dtype/npy/sized.py | 220 +++++++------ src/zarr/core/dtype/npy/string.py | 258 +++++++++------ src/zarr/core/dtype/npy/time.py | 174 +++++----- src/zarr/core/dtype/registry.py | 49 ++- src/zarr/core/dtype/wrapper.py | 151 ++++++--- src/zarr/core/metadata/v2.py | 33 +- src/zarr/core/metadata/v3.py | 10 +- tests/package_with_entrypoint/__init__.py | 12 +- tests/test_array.py | 18 +- tests/test_dtype/conftest.py | 2 +- tests/test_dtype/test_npy/test_bool.py | 6 +- tests/test_dtype/test_npy/test_common.py | 36 ++- tests/test_dtype/test_npy/test_complex.py | 8 +- tests/test_dtype/test_npy/test_float.py | 18 +- tests/test_dtype/test_npy/test_int.py | 34 +- tests/test_dtype/test_npy/test_sized.py | 18 +- tests/test_dtype/test_npy/test_string.py | 36 ++- tests/test_dtype/test_npy/test_time.py | 18 +- tests/test_dtype/test_wrapper.py | 48 +-- tests/test_dtype_registry.py | 47 ++- tests/test_group.py | 5 +- tests/test_metadata/test_consolidated.py | 2 +- tests/test_metadata/test_v3.py | 14 +- tests/test_properties.py | 2 +- tests/test_regression/test_regression.py | 6 +- tests/test_v2.py | 6 +- 42 files changed, 1195 insertions(+), 694 deletions(-) diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index e4b253d812..8264b3d489 100644 --- a/docs/user-guide/arrays.rst +++ 
b/docs/user-guide/arrays.rst @@ -210,8 +210,8 @@ prints additional diagnostics, e.g.:: Serializer : BytesCodec(endian=) Compressors : (BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0),) No. bytes : 400000000 (381.5M) - No. bytes stored : 3558573 (3.4M) - Storage ratio : 112.4 + No. bytes stored : 9696520 + Storage ratio : 41.3 Chunks Initialized : 100 .. note:: diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index a4d8314a5e..c101ae50fc 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -128,20 +128,20 @@ Create a ``ZDType`` from a native data type: >>> from zarr.core.dtype import Int8 >>> import numpy as np - >>> int8 = Int8.from_dtype(np.dtype('int8')) + >>> int8 = Int8.from_native_dtype(np.dtype('int8')) Convert back to native data type: .. code-block:: python - >>> native_dtype = int8.to_dtype() + >>> native_dtype = int8.to_native_dtype() >>> assert native_dtype == np.dtype('int8') Get the default scalar value for the data type: .. code-block:: python - >>> default_value = int8.default_value() + >>> default_value = int8.default_scalar() >>> assert default_value == np.int8(0) @@ -160,7 +160,7 @@ Serialize a scalar value to JSON: .. code-block:: python - >>> json_value = int8.to_json_value(42, zarr_format=3) + >>> json_value = int8.to_json_scalar(42, zarr_format=3) >>> json_value 42 @@ -168,5 +168,5 @@ Deserialize a scalar value from JSON: .. 
code-block:: python - >>> scalar_value = int8.from_json_value(42, zarr_format=3) + >>> scalar_value = int8.from_json_scalar(42, zarr_format=3) >>> assert scalar_value == np.int8(42) diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index c03e3c55fb..08853f27f1 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -48,7 +48,7 @@ async def _decode_single( # segfaults and other bad things happening if chunk_spec.dtype.dtype_cls is not np.dtypes.ObjectDType: try: - chunk = chunk.view(chunk_spec.dtype.to_dtype()) + chunk = chunk.view(chunk_spec.dtype.to_native_dtype()) except TypeError: # this will happen if the dtype of the chunk # does not match the dtype of the array spec i.g. if @@ -56,7 +56,7 @@ async def _decode_single( # is an object array. In this case, we need to convert the object # array to the correct dtype. - chunk = np.array(chunk).astype(chunk_spec.dtype.to_dtype()) + chunk = np.array(chunk).astype(chunk_spec.dtype.to_native_dtype()) elif chunk.dtype != object: # If we end up here, someone must have hacked around with the filters. 
@@ -80,7 +80,7 @@ async def _encode_single( chunk = chunk_array.as_ndarray_like() # ensure contiguous and correct order - chunk = chunk.astype(chunk_spec.dtype.to_dtype(), order=chunk_spec.order, copy=False) + chunk = chunk.astype(chunk_spec.dtype.to_native_dtype(), order=chunk_spec.order, copy=False) # apply filters if self.filters: diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 5db39796e4..6ef0fef60b 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -79,7 +79,7 @@ async def _decode_single( "Endianness | None", self.endian.value if self.endian is not None else None ) new_byte_order = endianness_to_numpy_str(endian_str) - dtype = chunk_spec.dtype.to_dtype().newbyteorder(new_byte_order) + dtype = chunk_spec.dtype.to_native_dtype().newbyteorder(new_byte_order) as_array_like = chunk_bytes.as_array_like() if isinstance(as_array_like, NDArrayLike): diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 914236d700..cd8676b4d1 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -452,7 +452,7 @@ async def _decode_single( # setup output array out = chunk_spec.prototype.nd_buffer.create( shape=shard_shape, - dtype=shard_spec.dtype.to_dtype(), + dtype=shard_spec.dtype.to_native_dtype(), order=shard_spec.order, fill_value=0, ) @@ -499,7 +499,7 @@ async def _decode_partial_single( # setup output array out = shard_spec.prototype.nd_buffer.create( shape=indexer.shape, - dtype=shard_spec.dtype.to_dtype(), + dtype=shard_spec.dtype.to_native_dtype(), order=shard_spec.order, fill_value=0, ) diff --git a/src/zarr/codecs/vlen_utf8.py b/src/zarr/codecs/vlen_utf8.py index 45dccf01fe..bad51f33ce 100644 --- a/src/zarr/codecs/vlen_utf8.py +++ b/src/zarr/codecs/vlen_utf8.py @@ -50,7 +50,7 @@ async def _decode_single( decoded = _vlen_utf8_codec.decode(raw_bytes) assert decoded.dtype == np.object_ decoded.shape = chunk_spec.shape - as_string_dtype = decoded.astype(chunk_spec.dtype.to_dtype(), 
copy=False) + as_string_dtype = decoded.astype(chunk_spec.dtype.to_native_dtype(), copy=False) return chunk_spec.prototype.nd_buffer.from_numpy_array(as_string_dtype) async def _encode_single( diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index f05d8e9cad..4e71ee34fc 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -732,7 +732,7 @@ def _create_metadata_v3( if fill_value is None: # v3 spec will not allow a null fill value - fill_value_parsed = dtype.default_value() + fill_value_parsed = dtype.default_scalar() else: fill_value_parsed = fill_value @@ -814,7 +814,7 @@ def _create_metadata_v2( if dimension_separator is None: dimension_separator = "." if fill_value is None: - fill_value = dtype.default_value() # type: ignore[assignment] + fill_value = dtype.default_scalar() # type: ignore[assignment] return ArrayV2Metadata( shape=shape, dtype=dtype, @@ -1091,7 +1091,7 @@ def dtype(self) -> TBaseDType: np.dtype Data type of the array """ - return self._zdtype.to_dtype() + return self._zdtype.to_native_dtype() @property def order(self) -> MemoryOrder: diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 3d00fe5467..23c27e40c6 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -62,7 +62,7 @@ def fill_value_or_default(chunk_spec: ArraySpec) -> Any: # validated when decoding the metadata, but we support reading # Zarr V2 data and need to support the case where fill_value # is None. 
- return chunk_spec.dtype.default_value() + return chunk_spec.dtype.default_scalar() else: return fill_value @@ -296,7 +296,9 @@ def _merge_chunk_array( is_complete_chunk: bool, drop_axes: tuple[int, ...], ) -> NDBuffer: - if chunk_selection == () or is_scalar(value.as_ndarray_like(), chunk_spec.dtype.to_dtype()): + if chunk_selection == () or is_scalar( + value.as_ndarray_like(), chunk_spec.dtype.to_native_dtype() + ): chunk_value = value else: chunk_value = value[out_selection] @@ -317,7 +319,7 @@ def _merge_chunk_array( if existing_chunk_array is None: chunk_array = chunk_spec.prototype.nd_buffer.create( shape=chunk_spec.shape, - dtype=chunk_spec.dtype.to_dtype(), + dtype=chunk_spec.dtype.to_native_dtype(), order=chunk_spec.order, fill_value=fill_value_or_default(chunk_spec), ) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 6fc46f6b06..6d99f1e937 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -10,7 +10,6 @@ from typing import ( TYPE_CHECKING, Any, - Final, Generic, Literal, TypedDict, @@ -48,19 +47,8 @@ class NamedConfig(TypedDict, Generic[TName, TConfig]): - """ - A typed dictionary representing an object with a name and configuration, where the configuration - is a mapping of string keys to values, e.g. another typed dictionary or a JSON object. - - This class is generic with two type parameters: the type of the name (``TName``) and the type of - the configuration (``TConfig``). 
- """ - - name: ReadOnly[TName] - """The name of the object.""" - - configuration: ReadOnly[TConfig] - """The configuration of the object.""" + name: TName + configuration: TConfig def product(tup: ChunkCoords) -> int: diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 9c672fd986..a8bfe2b5c4 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -16,12 +16,13 @@ if TYPE_CHECKING: from zarr.core.common import ZarrFormat +from collections.abc import Mapping + import numpy as np import numpy.typing as npt from zarr.core.common import JSON from zarr.core.dtype.npy.string import ( - _NUMPY_SUPPORTS_VLEN_STRING, FixedLengthASCII, FixedLengthUTF32, VariableLengthString, @@ -102,7 +103,7 @@ ) # This type models inputs that can be coerced to a ZDType -ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | dict[str, JSON] | str +ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | Mapping[str, JSON] | str for dtype in ANY_DTYPE: # mypy does not know that all the elements of ANY_DTYPE are subclasses of ZDType @@ -114,42 +115,41 @@ def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[TBaseDType, """ Get a data type wrapper (an instance of ``ZDType``) from a native data type, e.g. a numpy dtype. """ - data_type_registry.lazy_load() if not isinstance(dtype, np.dtype): - # TODO: This check has a lot of assumptions in it! Chiefly, we assume that the - # numpy object dtype contains variable length strings, which is not in general true - # When / if zarr python supports ragged arrays, for example, this check will fail! 
- if dtype in (str, "str", "|T16", "O", "|O", np.dtypes.ObjectDType()): - if _NUMPY_SUPPORTS_VLEN_STRING: - na_dtype = np.dtype("T") - else: - na_dtype = np.dtype("O") - elif isinstance(dtype, list): + na_dtype: np.dtype[np.generic] + if isinstance(dtype, list): # this is a valid _VoidDTypeLike check na_dtype = np.dtype([tuple(d) for d in dtype]) else: na_dtype = np.dtype(dtype) else: na_dtype = dtype - return data_type_registry.match_dtype(na_dtype) + return data_type_registry.match_dtype(dtype=na_dtype) + + +def get_data_type_from_json_v3( + dtype_spec: JSON, +) -> ZDType[TBaseDType, TBaseScalar]: + return data_type_registry.match_json_v3(dtype_spec) -def get_data_type_from_json( - dtype: JSON, zarr_format: ZarrFormat +def get_data_type_from_json_v2( + dtype_spec: JSON, *, object_codec_id: str | None = None ) -> ZDType[TBaseDType, TBaseScalar]: - return data_type_registry.match_json(dtype, zarr_format=zarr_format) + return data_type_registry.match_json_v2(dtype_spec, object_codec_id=object_codec_id) -def parse_data_type(dtype: ZDTypeLike, zarr_format: ZarrFormat) -> ZDType[TBaseDType, TBaseScalar]: +def parse_data_type( + dtype_spec: ZDTypeLike, *, zarr_format: ZarrFormat, object_codec_id: str | None = None +) -> ZDType[TBaseDType, TBaseScalar]: """ Interpret the input as a ZDType instance. 
""" - if isinstance(dtype, ZDType): - return dtype - elif isinstance(dtype, dict): - # This branch assumes that the data type has been specified in the JSON form - # but it's also possible for numpy data types to be specified as dictionaries, which will - # cause an error in the `get_data_type_from_json`, but that's ok for now - return get_data_type_from_json(dtype, zarr_format=zarr_format) # type: ignore[arg-type] - else: - return get_data_type_from_native_dtype(dtype) + if isinstance(dtype_spec, ZDType): + return dtype_spec + # dict and zarr_format 3 means that we have a JSON object representation of the dtype + if zarr_format == 3 and isinstance(dtype_spec, Mapping): + return get_data_type_from_json_v3(dtype_spec) # type: ignore[arg-type] + # otherwise, we have either a numpy dtype string, or a zarr v3 dtype string, and in either case + # we can create a numpy dtype from it, and do the dtype inference from that + return get_data_type_from_native_dtype(dtype_spec) # type: ignore[arg-type] diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 5eeff2af5b..bbdc06c50d 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -2,7 +2,7 @@ import warnings from dataclasses import dataclass -from typing import Final, Literal +from typing import ClassVar, Final, Literal Endianness = Literal["little", "big"] SpecialFloatStrings = Literal["NaN", "Infinity", "-Infinity"] @@ -46,6 +46,28 @@ def item_size(self) -> int: raise NotImplementedError +@dataclass(frozen=True) +class HasObjectCodec: + """ + A mix-in class for data types that require an object codec id. + This class bears the property ``object_codec_id``, which is the string name of an object + codec that is required to encode and decode the data type. + + In zarr-python 2.x certain data types like variable-length strings or variable-length arrays + used the catch-all numpy "object" data type for their in-memory representation. 
But these data + types cannot be stored as numpy object data types, because the object data type does not define + a fixed memory layout. So these data types required a special codec, called an "object codec", + that effectively defined a compact representation for the data type, which was used to encode + and decode the data type. + + Zarr-python 2.x would not allow the creation of arrays with the "object" data type if an object + codec was not specified, and thus the name of the object codec is effectively part of the data + type model. + """ + + object_codec_id: ClassVar[str] + + class UnstableSpecificationWarning(FutureWarning): ... diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index d46758f789..b1800127e8 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -1,12 +1,12 @@ from dataclasses import dataclass -from typing import ClassVar, Literal, Self, TypeGuard +from typing import ClassVar, Literal, Self, TypeGuard, overload import numpy as np from zarr.core.common import JSON, ZarrFormat from zarr.core.dtype.common import HasItemSize from zarr.core.dtype.npy.common import check_json_bool -from zarr.core.dtype.wrapper import TBaseDType, ZDType +from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType @dataclass(frozen=True, kw_only=True, slots=True) @@ -22,40 +22,50 @@ class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): The numpy dtype class. 
""" - _zarr_v3_name = "bool" + _zarr_v3_name: ClassVar[Literal["bool"]] = "bool" _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|b1",) dtype_cls = np.dtypes.BoolDType @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls() - def to_dtype(self: Self) -> np.dtypes.BoolDType: + def to_native_dtype(self: Self) -> np.dtypes.BoolDType: return self.dtype_cls() @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[Literal["bool", "|b1"]]: + def check_json_v2( + cls, data: JSON, *, object_codec_id: str | None = None + ) -> TypeGuard[Literal["|b1"]]: """ Check that the input is a valid JSON representation of a bool. """ - if zarr_format == 2: - return data in cls._zarr_v2_names - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + return data in cls._zarr_v2_names + + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[Literal["bool"]]: + return data == cls._zarr_v3_name + + @overload + def to_json(self, zarr_format: Literal[2]) -> Literal["|b1"]: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> Literal["bool"]: ... - def to_json(self, zarr_format: ZarrFormat) -> str: + def to_json(self, zarr_format: ZarrFormat) -> Literal["|b1", "bool"]: if zarr_format == 2: - return self.to_dtype().str + return self.to_native_dtype().str elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: return cls() - def default_value(self) -> np.bool_: + def default_scalar(self) -> np.bool_: """ Get the default value for the boolean dtype. 
@@ -66,7 +76,7 @@ def default_value(self) -> np.bool_: """ return np.False_ - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> bool: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> bool: """ Convert a scalar to a python bool. @@ -84,7 +94,7 @@ def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> bool: """ return bool(data) - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: """ Read a JSON-serializable value as a numpy boolean scalar. @@ -101,14 +111,14 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: The numpy boolean scalar. """ if check_json_bool(data): - return self._cast_value_unsafe(data) + return self._cast_scalar_unchecked(data) raise TypeError(f"Invalid type: {data}. Expected a boolean.") # pragma: no cover - def check_value(self, data: object) -> bool: + def check_scalar(self, data: object) -> bool: # Anything can become a bool return True - def _cast_value_unsafe(self, data: object) -> np.bool_: + def _cast_scalar_unchecked(self, data: object) -> np.bool_: return np.bool_(data) @property diff --git a/src/zarr/core/dtype/npy/common.py b/src/zarr/core/dtype/npy/common.py index 2481dcb150..03dc194a7a 100644 --- a/src/zarr/core/dtype/npy/common.py +++ b/src/zarr/core/dtype/npy/common.py @@ -176,7 +176,10 @@ def float_from_json_v3(data: JSONFloatV3) -> float: elif len(data[2:]) == 16: dtype_code = ">d" else: - msg = f"Invalid float value: {data!r}. Expected a string of length 4, 8, or 16." + msg = ( + f"Invalid hexadecimal float value: {data!r}. 
" + "Expected the '0x' prefix to be followed by 4, 8, or 16 numeral characters" + ) raise ValueError(msg) return float(struct.unpack(dtype_code, bytes.fromhex(data[2:]))[0]) return float_from_json_v2(data) diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index ee52dd0577..f7db6fe94d 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -2,6 +2,7 @@ from typing import ( TYPE_CHECKING, ClassVar, + Literal, Self, TypeGuard, cast, @@ -24,7 +25,7 @@ endianness_from_numpy_str, endianness_to_numpy_str, ) -from zarr.core.dtype.wrapper import TBaseDType, ZDType +from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType if TYPE_CHECKING: from zarr.core.dtype.npy.common import EndiannessNumpy @@ -36,11 +37,11 @@ class BaseComplex(ZDType[TComplexDType_co, TComplexScalar_co], HasEndianness, Ha _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) - def to_dtype(self) -> TComplexDType_co: + def to_native_dtype(self) -> TComplexDType_co: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] @@ -59,37 +60,39 @@ def to_json(self, zarr_format: ZarrFormat) -> str: The JSON-serializable representation of the wrapped data type """ if zarr_format == 2: - return self.to_dtype().str + return self.to_native_dtype().str elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: - return 
cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: """ Check that the input is a valid JSON representation of this data type. """ - if zarr_format == 2: - return data in cls._zarr_v2_names - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + return data in cls._zarr_v2_names + + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[str]: + return data == cls._zarr_v3_name - def check_value(self, data: object) -> bool: + def check_scalar(self, data: object) -> bool: return isinstance(data, ComplexLike) - def _cast_value_unsafe(self, data: object) -> TComplexScalar_co: - return self.to_dtype().type(data) # type: ignore[arg-type, return-value] + def _cast_scalar_unchecked(self, data: object) -> TComplexScalar_co: + return self.to_native_dtype().type(data) # type: ignore[arg-type, return-value] - def default_value(self) -> TComplexScalar_co: + def default_scalar(self) -> TComplexScalar_co: """ Get the default value, which is 0 cast to this dtype @@ -98,9 +101,9 @@ def default_value(self) -> TComplexScalar_co: Int scalar The default value. """ - return self._cast_value_unsafe(0) + return self._cast_scalar_unchecked(0) - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TComplexScalar_co: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TComplexScalar_co: """ Read a JSON-serializable value as a numpy float. 
@@ -118,19 +121,19 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TComplexSca """ if zarr_format == 2: if check_json_complex_float_v2(data): - return self._cast_value_unsafe(complex_float_from_json_v2(data)) + return self._cast_scalar_unchecked(complex_float_from_json_v2(data)) raise TypeError( f"Invalid type: {data}. Expected a float or a special string encoding of a float." ) elif zarr_format == 3: if check_json_complex_float_v3(data): - return self._cast_value_unsafe(complex_float_from_json_v3(data)) + return self._cast_scalar_unchecked(complex_float_from_json_v3(data)) raise TypeError( f"Invalid type: {data}. Expected a float or a special string encoding of a float." ) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> JSON: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> JSON: """ Convert an object to a JSON-serializable float. @@ -148,16 +151,16 @@ def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> JSON: each of which is encoding according to a zarr-format-specific encoding. 
""" if zarr_format == 2: - return complex_float_to_json_v2(self.cast_value(data)) + return complex_float_to_json_v2(self.cast_scalar(data)) elif zarr_format == 3: - return complex_float_to_json_v3(self.cast_value(data)) + return complex_float_to_json_v3(self.cast_scalar(data)) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @dataclass(frozen=True, kw_only=True) class Complex64(BaseComplex[np.dtypes.Complex64DType, np.complex64]): dtype_cls = np.dtypes.Complex64DType - _zarr_v3_name = "complex64" + _zarr_v3_name: ClassVar[Literal["complex64"]] = "complex64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c8", " int: @dataclass(frozen=True, kw_only=True) class Complex128(BaseComplex[np.dtypes.Complex128DType, np.complex128], HasEndianness): dtype_cls = np.dtypes.Complex128DType - _zarr_v3_name = "complex128" + _zarr_v3_name: ClassVar[Literal["complex128"]] = "complex128" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">c16", " Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) - def to_dtype(self) -> TFloatDType_co: + def to_native_dtype(self) -> TFloatDType_co: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] @@ -51,37 +51,39 @@ def to_json(self, zarr_format: ZarrFormat) -> str: The JSON-serializable representation of the wrapped data type """ if zarr_format == 2: - return self.to_dtype().str + return self.to_native_dtype().str elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # 
type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: """ Check that the input is a valid JSON representation of this data type. """ - if zarr_format == 2: - return data in cls._zarr_v2_names - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + return data in cls._zarr_v2_names + + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[str]: + return data == cls._zarr_v3_name - def check_value(self, data: object) -> TypeGuard[FloatLike]: + def check_scalar(self, data: object) -> TypeGuard[FloatLike]: return isinstance(data, FloatLike) - def _cast_value_unsafe(self, data: object) -> TFloatScalar_co: - return self.to_dtype().type(data) # type: ignore[return-value, arg-type] + def _cast_scalar_unchecked(self, data: object) -> TFloatScalar_co: + return self.to_native_dtype().type(data) # type: ignore[return-value, arg-type] - def default_value(self) -> TFloatScalar_co: + def default_scalar(self) -> TFloatScalar_co: """ Get the default value, which is 0 cast to this dtype @@ -90,9 +92,9 @@ def default_value(self) -> TFloatScalar_co: Int scalar The default value. """ - return self._cast_value_unsafe(0) + return self._cast_scalar_unchecked(0) - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScalar_co: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScalar_co: """ Read a JSON-serializable value as a numpy float. 
@@ -110,14 +112,14 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScala """ if zarr_format == 2: if check_json_float_v2(data): - return self._cast_value_unsafe(float_from_json_v2(data)) + return self._cast_scalar_unchecked(float_from_json_v2(data)) else: raise TypeError( f"Invalid type: {data}. Expected a float or a special string encoding of a float." ) elif zarr_format == 3: if check_json_float_v3(data): - return self._cast_value_unsafe(float_from_json_v3(data)) + return self._cast_scalar_unchecked(float_from_json_v3(data)) else: raise TypeError( f"Invalid type: {data}. Expected a float or a special string encoding of a float." @@ -125,7 +127,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScala else: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> float | str: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> float | str: """ Convert an object to a JSON-serializable float. @@ -143,9 +145,9 @@ def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> float | str See the zarr specifications for details on the JSON encoding for floats. 
""" if zarr_format == 2: - return float_to_json_v2(self._cast_value_unsafe(data)) + return float_to_json_v2(self._cast_scalar_unchecked(data)) elif zarr_format == 3: - return float_to_json_v3(self._cast_value_unsafe(data)) + return float_to_json_v3(self._cast_scalar_unchecked(data)) else: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index db5869b202..92705917f9 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -1,5 +1,15 @@ from dataclasses import dataclass -from typing import ClassVar, Self, SupportsIndex, SupportsInt, TypeGuard, TypeVar, cast +from typing import ( + ClassVar, + Literal, + Self, + SupportsIndex, + SupportsInt, + TypeGuard, + TypeVar, + cast, + overload, +) import numpy as np @@ -11,7 +21,7 @@ endianness_from_numpy_str, endianness_to_numpy_str, ) -from zarr.core.dtype.wrapper import TBaseDType, ZDType +from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType _NumpyIntDType = ( np.dtypes.Int8DType @@ -36,44 +46,24 @@ class BaseInt(ZDType[TIntDType_co, TIntScalar_co], HasItemSize): # This attribute holds the possible zarr v2 JSON names for the data type _zarr_v2_names: ClassVar[tuple[str, ...]] - def to_json(self, zarr_format: ZarrFormat) -> str: - """ - Convert the wrapped data type to a JSON-serializable form. - - Parameters - ---------- - zarr_format : ZarrFormat - The zarr format version. 
- - Returns - ------- - str - The JSON-serializable representation of the wrapped data type - """ - if zarr_format == 2: - return self.to_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: """ Check that the input is a valid JSON representation of this data type. """ - if zarr_format == 2: - return data in cls._zarr_v2_names - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + return data in cls._zarr_v2_names - def check_value(self, data: object) -> TypeGuard[IntLike]: + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[str]: + return data == cls._zarr_v3_name + + def check_scalar(self, data: object) -> TypeGuard[IntLike]: return isinstance(data, IntLike) - def _cast_value_unsafe(self, data: object) -> TIntScalar_co: - return self.to_dtype().type(data) # type: ignore[return-value, arg-type] + def _cast_scalar_unchecked(self, data: object) -> TIntScalar_co: + return self.to_native_dtype().type(data) # type: ignore[return-value, arg-type] - def default_value(self) -> TIntScalar_co: + def default_scalar(self) -> TIntScalar_co: """ Get the default value, which is 0 cast to this dtype @@ -82,9 +72,9 @@ def default_value(self) -> TIntScalar_co: Int scalar The default value. """ - return self._cast_value_unsafe(0) + return self._cast_scalar_unchecked(0) - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar_co: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar_co: """ Read a JSON-serializable value as a numpy int scalar. 
@@ -101,10 +91,10 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar_ The numpy scalar. """ if check_json_int(data): - return self._cast_value_unsafe(data) + return self._cast_scalar_unchecked(data) raise TypeError(f"Invalid type: {data}. Expected an integer.") - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> int: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: """ Convert an object to JSON-serializable scalar. @@ -120,24 +110,52 @@ def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> int: int The JSON-serializable form of the scalar. """ - return int(self.cast_value(data)) + return int(self.cast_scalar(data)) @dataclass(frozen=True, kw_only=True) class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): dtype_cls = np.dtypes.Int8DType - _zarr_v3_name = "int8" + _zarr_v3_name: ClassVar[Literal["int8"]] = "int8" _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|i1",) + @overload + def to_json(self, zarr_format: Literal[2]) -> Literal["|i1"]: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> Literal["int8"]: ... + + def to_json(self, zarr_format: ZarrFormat) -> Literal["int8", "|i1"]: + """ + Convert the wrapped data type to a JSON-serializable form. + + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. 
+ + Returns + ------- + str + The JSON-serializable representation of the wrapped data type + """ + if zarr_format == 2: + return self.to_native_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls() - def to_dtype(self: Self) -> np.dtypes.Int8DType: + def to_native_dtype(self: Self) -> np.dtypes.Int8DType: return self.dtype_cls() @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: return cls() @property @@ -148,18 +166,46 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): dtype_cls = np.dtypes.UInt8DType - _zarr_v3_name = "uint8" + _zarr_v3_name: ClassVar[Literal["uint8"]] = "uint8" _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|u1",) + @overload + def to_json(self, zarr_format: Literal[2]) -> Literal["|u1"]: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> Literal["uint8"]: ... + + def to_json(self, zarr_format: ZarrFormat) -> Literal["uint8", "|u1"]: + """ + Convert the wrapped data type to a JSON-serializable form. + + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. 
+ + Returns + ------- + str + The JSON-serializable representation of the wrapped data type + """ + if zarr_format == 2: + return self.to_native_dtype().str + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls() - def to_dtype(self: Self) -> np.dtypes.UInt8DType: + def to_native_dtype(self: Self) -> np.dtypes.UInt8DType: return self.dtype_cls() @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: return cls() @property @@ -170,23 +216,51 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): dtype_cls = np.dtypes.Int16DType - _zarr_v3_name = "int16" + _zarr_v3_name: ClassVar[Literal["int16"]] = "int16" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i2", " Literal[">i2", " Literal["int16"]: ... 
+ + def to_json(self, zarr_format: ZarrFormat) -> Literal["int16", ">i2", " Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) - def to_dtype(self) -> np.dtypes.Int16DType: + def to_native_dtype(self) -> np.dtypes.Int16DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: # This ensures that we get the endianness correct without annoying string parsing - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -199,22 +273,50 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): dtype_cls = np.dtypes.UInt16DType - _zarr_v3_name = "uint16" + _zarr_v3_name: ClassVar[Literal["uint16"]] = "uint16" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u2", " Literal[">u2", " Literal["uint16"]: ... 
+ + def to_json(self, zarr_format: ZarrFormat) -> Literal["uint16", ">u2", " Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) - def to_dtype(self) -> np.dtypes.UInt16DType: + def to_native_dtype(self) -> np.dtypes.UInt16DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -227,34 +329,64 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): dtype_cls = np.dtypes.Int32DType - _zarr_v3_name = "int32" + _zarr_v3_name: ClassVar[Literal["int32"]] = "int32" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i4", " Literal[">i4", " Literal["int32"]: ... + + def to_json(self, zarr_format: ZarrFormat) -> Literal["int32", ">i4", " Self: + def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self: # We override the base implementation to address a windows-specific, pre-numpy 2 issue where # ``np.dtype('i')`` is an instance of ``np.dtypes.IntDType`` that acts like `int32` instead of ``np.dtype('int32')`` # In this case, ``type(np.dtype('i')) == np.dtypes.Int32DType`` will evaluate to ``True``, # despite the two classes being different. 
Thus we will create an instance of `cls` with the # latter dtype, after pulling in the byte order of the input if dtype == np.dtypes.Int32DType(): - return cls._from_dtype_unsafe(np.dtypes.Int32DType().newbyteorder(dtype.byteorder)) + return cls._from_native_dtype_unsafe( + np.dtypes.Int32DType().newbyteorder(dtype.byteorder) + ) else: - return super().from_dtype(dtype) + return super().from_native_dtype(dtype) @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) - def to_dtype(self) -> np.dtypes.Int32DType: + def to_native_dtype(self) -> np.dtypes.Int32DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -267,22 +399,48 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): dtype_cls = np.dtypes.UInt32DType - _zarr_v3_name = "uint32" + _zarr_v3_name: ClassVar[Literal["uint32"]] = "uint32" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u4", " Literal[">u4", " Literal["uint32"]: ... 
+ def to_json(self, zarr_format: ZarrFormat) -> Literal["uint32", ">u4", " Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) - def to_dtype(self) -> np.dtypes.UInt32DType: + def to_native_dtype(self) -> np.dtypes.UInt32DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -295,22 +453,48 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): dtype_cls = np.dtypes.Int64DType - _zarr_v3_name = "int64" + _zarr_v3_name: ClassVar[Literal["int64"]] = "int64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i8", " Literal[">i8", " Literal["int64"]: ... 
+ def to_json(self, zarr_format: ZarrFormat) -> Literal["int64", ">i8", " Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) - def to_dtype(self) -> np.dtypes.Int64DType: + def to_native_dtype(self) -> np.dtypes.Int64DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -323,22 +507,50 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): dtype_cls = np.dtypes.UInt64DType - _zarr_v3_name = "uint64" + _zarr_v3_name: ClassVar[Literal["uint64"]] = "uint64" _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u8", " Literal[">u8", " Literal["uint64"]: ... 
+ + def to_json(self, zarr_format: ZarrFormat) -> Literal["uint64", ">u8", " Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls(endianness=endianness_from_numpy_str(byte_order)) - def to_dtype(self) -> np.dtypes.UInt64DType: + def to_native_dtype(self) -> np.dtypes.UInt64DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: return cls() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover diff --git a/src/zarr/core/dtype/npy/sized.py b/src/zarr/core/dtype/npy/sized.py index eb2b39ad9a..69d6145ad4 100644 --- a/src/zarr/core/dtype/npy/sized.py +++ b/src/zarr/core/dtype/npy/sized.py @@ -2,11 +2,11 @@ import re from collections.abc import Sequence from dataclasses import dataclass -from typing import Any, Self, TypeGuard, cast +from typing import Any, ClassVar, Literal, Self, TypedDict, TypeGuard, cast, overload import numpy as np -from zarr.core.common import JSON, ZarrFormat +from zarr.core.common import JSON, NamedConfig, ZarrFormat from zarr.core.dtype.common import ( DataTypeValidationError, HasItemSize, @@ -18,7 +18,14 @@ bytes_to_json, check_json_str, ) -from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType +from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, TBaseScalar, ZDType + + +class FixedLengthBytesConfig(TypedDict): + length_bytes: int + + +FixedLengthBytesJSONV3 = NamedConfig[Literal["fixed_length_bytes"], FixedLengthBytesConfig] @dataclass(frozen=True, kw_only=True) @@ 
-27,49 +34,59 @@ class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength, Has # it cannot be used to create instances of the dtype # so we have to tell mypy to ignore this here dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] - _zarr_v3_name = "numpy.fixed_length_bytes" + _zarr_v3_name: ClassVar[Literal["fixed_length_bytes"]] = "fixed_length_bytes" @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls(length=dtype.itemsize) - def to_dtype(self) -> np.dtypes.VoidDType[int]: + def to_native_dtype(self) -> np.dtypes.VoidDType[int]: # Numpy does not allow creating a void type # by invoking np.dtypes.VoidDType directly return cast("np.dtypes.VoidDType[int]", np.dtype(f"V{self.length}")) @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - if zarr_format == 2: - # Check that the dtype is |V1, |V2, ... - return isinstance(data, str) and re.match(r"^\|V\d+$", data) is not None - elif zarr_format == 3: - return ( - isinstance(data, dict) - and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name - and isinstance(data["configuration"], dict) - and set(data["configuration"].keys()) == {"length_bytes"} - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + # Check that the dtype is |V1, |V2, ... 
+ return isinstance(data, str) and re.match(r"^\|V\d+$", data) is not None - def to_json(self, zarr_format: ZarrFormat) -> JSON: + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[FixedLengthBytesJSONV3]: + return ( + isinstance(data, dict) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and isinstance(data["configuration"], dict) + and set(data["configuration"].keys()) == {"length_bytes"} + ) + + @overload + def to_json(self, zarr_format: Literal[2]) -> str: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> FixedLengthBytesJSONV3: ... + + def to_json(self, zarr_format: ZarrFormat) -> str | FixedLengthBytesJSONV3: if zarr_format == 2: - return self.to_dtype().str + return self.to_native_dtype().str elif zarr_format == 3: return {"name": self._zarr_v3_name, "configuration": {"length_bytes": self.length}} raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: - return cls(length=data["configuration"]["length_bytes"]) # type: ignore[arg-type, index, call-overload] + return cls(length=data["configuration"]["length_bytes"]) # type: ignore[index, call-overload] raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def check_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[Any]]: + def check_native_dtype( + cls: type[Self], dtype: TBaseDType + ) -> TypeGuard[np.dtypes.VoidDType[Any]]: """ Numpy void dtype comes in two forms: * If the ``fields`` attribute is ``None``, then the dtype represents N raw bytes. 
@@ -89,22 +106,22 @@ def check_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidD """ return cls.dtype_cls is type(dtype) and dtype.fields is None # type: ignore[has-type] - def default_value(self) -> np.void: - return self.to_dtype().type(("\x00" * self.length).encode("ascii")) + def default_scalar(self) -> np.void: + return self.to_native_dtype().type(("\x00" * self.length).encode("ascii")) - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(self.cast_value(data).tobytes()).decode("ascii") + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + return base64.standard_b64encode(self.cast_scalar(data).tobytes()).decode("ascii") - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: if check_json_str(data): - return self.to_dtype().type(base64.standard_b64decode(data)) + return self.to_native_dtype().type(base64.standard_b64decode(data)) raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover - def check_value(self, data: object) -> bool: + def check_scalar(self, data: object) -> bool: return isinstance(data, np.bytes_ | str | bytes | np.void) - def _cast_value_unsafe(self, data: object) -> np.void: - native_dtype = self.to_dtype() + def _cast_scalar_unchecked(self, data: object) -> np.void: + native_dtype = self.to_native_dtype() # Without the second argument, numpy will return a void scalar for dtype V1. # The second argument ensures that, if native_dtype is something like V10, # the result will actually be a V10 scalar. @@ -115,17 +132,18 @@ def item_size(self) -> int: return self.length +# TODO: tighten this up, get a v3 spec in place, handle endianness, etc. 
@dataclass(frozen=True, kw_only=True) class Structured(ZDType[np.dtypes.VoidDType[int], np.void], HasItemSize): dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] _zarr_v3_name = "structured" fields: tuple[tuple[str, ZDType[TBaseDType, TBaseScalar]], ...] - def default_value(self) -> np.void: - return self._cast_value_unsafe(0) + def default_scalar(self) -> np.void: + return self._cast_scalar_unchecked(0) - def _cast_value_unsafe(self, data: object) -> np.void: - na_dtype = self.to_dtype() + def _cast_scalar_unchecked(self, data: object) -> np.void: + na_dtype = self.to_native_dtype() if isinstance(data, bytes): res = np.frombuffer(data, dtype=na_dtype)[0] elif isinstance(data, list | tuple): @@ -135,7 +153,7 @@ def _cast_value_unsafe(self, data: object) -> np.void: return cast("np.void", res) @classmethod - def check_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: + def check_native_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: """ Check that this dtype is a numpy structured dtype @@ -149,10 +167,10 @@ def check_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: TypeGuard[np.dtypes.VoidDType] True if the dtype matches, False otherwise. """ - return super().check_dtype(dtype) and dtype.fields is not None + return super().check_native_dtype(dtype) and dtype.fields is not None @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: from zarr.core.dtype import get_data_type_from_native_dtype fields: list[tuple[str, ZDType[TBaseDType, TBaseScalar]]] = [] @@ -168,7 +186,13 @@ def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls(fields=tuple(fields)) - def to_json(self, zarr_format: ZarrFormat) -> JSON: + @overload + def to_json(self, zarr_format: Literal[2]) -> DTypeJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> DTypeJSON_V3: ... 
+ + def to_json(self, zarr_format: ZarrFormat) -> DTypeJSON_V3 | DTypeJSON_V2: fields = [ (f_name, f_dtype.to_json(zarr_format=zarr_format)) for f_name, f_dtype in self.fields ] @@ -178,90 +202,94 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: v3_unstable_dtype_warning(self) base_dict = {"name": self._zarr_v3_name} base_dict["configuration"] = {"fields": fields} # type: ignore[assignment] - return cast("JSON", base_dict) + return cast("DTypeJSON_V3", base_dict) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def check_json( - cls, data: JSON, zarr_format: ZarrFormat - ) -> TypeGuard[dict[str, JSON] | list[Any]]: + def check_json_v2( + cls, data: JSON, *, object_codec_id: str | None = None + ) -> TypeGuard[list[object]]: # the actual JSON form is recursive and hard to annotate, so we give up and do - # list[Any] for now - if zarr_format == 2: - return ( - not isinstance(data, str) - and isinstance(data, Sequence) - and all( - not isinstance(field, str) and isinstance(field, Sequence) and len(field) == 2 - for field in data - ) - ) - elif zarr_format == 3: - return ( - isinstance(data, dict) - and "name" in data - and "configuration" in data - and isinstance(data["configuration"], dict) - and "fields" in data["configuration"] + # list[object] for now + + return ( + not isinstance(data, str) + and isinstance(data, Sequence) + and all( + not isinstance(field, str) and isinstance(field, Sequence) and len(field) == 2 + for field in data ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + ) + + @classmethod + def check_json_v3( + cls, data: JSON + ) -> TypeGuard[NamedConfig[Literal["structured"], dict[str, Sequence[tuple[str, JSON]]]]]: + return ( + isinstance(data, dict) + and "name" in data + and data["name"] == cls._zarr_v3_name + and "configuration" in data + and isinstance(data["configuration"], dict) + and "fields" in data["configuration"] + ) @classmethod - def 
_from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: - from zarr.core.dtype import get_data_type_from_json + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: + # avoid circular import issues by importing these functions here + from zarr.core.dtype import get_data_type_from_json_v2, get_data_type_from_json_v3 # This is a horrible mess, because this data type is recursive - if cls.check_json(data, zarr_format=zarr_format): - if zarr_format == 2: + if zarr_format == 2: + if cls.check_json_v2(data): # type: ignore[arg-type] # structured dtypes are constructed directly from a list of lists + # note that we do not handle the object codec here! this will prevent structured + # dtypes from containing object dtypes. return cls( fields=tuple( # type: ignore[misc] - (f_name, get_data_type_from_json(f_dtype, zarr_format=zarr_format)) + (f_name, get_data_type_from_json_v2(f_dtype, object_codec_id=None)) # type: ignore[has-type] for f_name, f_dtype in data ) ) - elif zarr_format == 3: - if isinstance(data, dict) and "configuration" in data: - config = data["configuration"] - if isinstance(config, dict) and "fields" in config: - meta_fields = config["fields"] - fields = tuple( - (f_name, get_data_type_from_json(f_dtype, zarr_format=zarr_format)) - for f_name, f_dtype in meta_fields - ) - return cls(fields=fields) - else: - raise TypeError( - f"Invalid type: {data}. Expected a dictionary." - ) # pragma: no cover - else: - raise TypeError( - f"Invalid type: {data}. Expected a dictionary." 
- ) # pragma: no cover + else: + raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") + elif zarr_format == 3: + if cls.check_json_v3(data): # type: ignore[arg-type] + config = data["configuration"] + meta_fields = config["fields"] + fields = tuple( + (f_name, get_data_type_from_json_v3(f_dtype)) for f_name, f_dtype in meta_fields + ) + else: + raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") + else: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") - def to_dtype(self) -> np.dtypes.VoidDType[int]: + return cls(fields=fields) + + def to_native_dtype(self) -> np.dtypes.VoidDType[int]: return cast( "np.dtypes.VoidDType[int]", - np.dtype([(key, dtype.to_dtype()) for (key, dtype) in self.fields]), + np.dtype([(key, dtype.to_native_dtype()) for (key, dtype) in self.fields]), ) - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: - return bytes_to_json(self.cast_value(data).tobytes(), zarr_format) + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + return bytes_to_json(self.cast_scalar(data).tobytes(), zarr_format) - def check_value(self, data: object) -> bool: + def check_scalar(self, data: object) -> bool: # TODO: implement something here! return True - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: if check_json_str(data): as_bytes = bytes_from_json(data, zarr_format=zarr_format) - dtype = self.to_dtype() + dtype = self.to_native_dtype() return cast("np.void", np.array([as_bytes]).view(dtype)[0]) raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover @property def item_size(self) -> int: # Lets have numpy do the arithmetic here - return self.to_dtype().itemsize + return self.to_native_dtype().itemsize diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index b5b86ca387..2299b7aab1 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -3,18 +3,19 @@ import base64 import re from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Self, TypeGuard, cast +from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypedDict, TypeGuard, cast, overload import numpy as np -from zarr.core.dtype.common import HasEndianness, HasItemSize, HasLength +from zarr.core.common import NamedConfig +from zarr.core.dtype.common import HasEndianness, HasItemSize, HasLength, HasObjectCodec from zarr.core.dtype.npy.common import ( EndiannessNumpy, check_json_str, endianness_from_numpy_str, endianness_to_numpy_str, ) -from zarr.core.dtype.wrapper import ZDType +from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, ZDType if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat @@ -23,39 +24,53 @@ _NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") +class LengthBytesConfig(TypedDict): + length_bytes: int + + +# TDO: Fix this terrible name +FixedLengthASCIIJSONV3 = NamedConfig[Literal["fixed_length_ascii"], LengthBytesConfig] + + @dataclass(frozen=True, kw_only=True) class FixedLengthASCII(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize): dtype_cls = np.dtypes.BytesDType - _zarr_v3_name = "numpy.fixed_length_ascii" + _zarr_v3_name: ClassVar[Literal["fixed_length_ascii"]] = "fixed_length_ascii" @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls(length=dtype.itemsize) - def to_dtype(self) -> np.dtypes.BytesDType[int]: + def to_native_dtype(self) -> 
np.dtypes.BytesDType[int]: return self.dtype_cls(self.length) @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: """ Check that the input is a valid JSON representation of a numpy S dtype. """ - if zarr_format == 2: - # match |S1, |S2, etc - return isinstance(data, str) and re.match(r"^\|S\d+$", data) is not None - elif zarr_format == 3: - return ( - isinstance(data, dict) - and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name - and isinstance(data["configuration"], dict) - and "length_bytes" in data["configuration"] - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + # match |S1, |S2, etc + return isinstance(data, str) and re.match(r"^\|S\d+$", data) is not None + + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[FixedLengthASCIIJSONV3]: + return ( + isinstance(data, dict) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and isinstance(data["configuration"], dict) + and "length_bytes" in data["configuration"] + ) + + @overload + def to_json(self, zarr_format: Literal[2]) -> str: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> FixedLengthASCIIJSONV3: ... 
- def to_json(self, zarr_format: ZarrFormat) -> JSON: + def to_json(self, zarr_format: ZarrFormat) -> str | FixedLengthASCIIJSONV3: if zarr_format == 2: - return self.to_dtype().str + return self.to_native_dtype().str elif zarr_format == 3: return { "name": self._zarr_v3_name, @@ -64,29 +79,31 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: - return cls(length=data["configuration"]["length_bytes"]) # type: ignore[arg-type, index, call-overload] + return cls(length=data["configuration"]["length_bytes"]) # type: ignore[index, call-overload] raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_value(self) -> np.bytes_: + def default_scalar(self) -> np.bytes_: return np.bytes_(b"") - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: return base64.standard_b64encode(data).decode("ascii") # type: ignore[arg-type] - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: if check_json_str(data): - return self.to_dtype().type(base64.standard_b64decode(data.encode("ascii"))) + return self.to_native_dtype().type(base64.standard_b64decode(data.encode("ascii"))) raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover - def check_value(self, data: object) -> bool: + def check_scalar(self, data: object) -> bool: # this is generous for backwards compatibility return isinstance(data, np.bytes_ | str | bytes | int) - def _cast_value_unsafe(self, data: object) -> np.bytes_: + def _cast_scalar_unchecked(self, data: object) -> np.bytes_: # We explicitly truncate the result because of the following numpy behavior: # >>> x = np.dtype('S3').type('hello world') # >>> x @@ -94,56 +111,68 @@ def _cast_value_unsafe(self, data: object) -> np.bytes_: # >>> x.dtype # dtype('S11') - return self.to_dtype().type(data[: self.length]) # type: ignore[index] + if isinstance(data, int): + return self.to_native_dtype().type(str(data)[: self.length]) + else: + return self.to_native_dtype().type(data[: self.length]) # type: ignore[index] @property def item_size(self) -> int: return self.length +# TODO: Fix this terrible name +FixedLengthUTF32JSONV3 = NamedConfig[Literal["fixed_length_utf32"], LengthBytesConfig] + + @dataclass(frozen=True, kw_only=True) class FixedLengthUTF32( ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength, HasItemSize ): dtype_cls = np.dtypes.StrDType - _zarr_v3_name = "numpy.fixed_length_utf32" + _zarr_v3_name: ClassVar[Literal["fixed_length_utf32"]] = "fixed_length_utf32" code_point_bytes: ClassVar[int] = 4 # utf32 is 4 bytes per code point @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: byte_order = cast("EndiannessNumpy", dtype.byteorder) return cls( length=dtype.itemsize // (cls.code_point_bytes), endianness=endianness_from_numpy_str(byte_order), ) - def to_dtype(self) -> np.dtypes.StrDType[int]: + def to_native_dtype(self) -> np.dtypes.StrDType[int]: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls(self.length).newbyteorder(byte_order) @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> 
TypeGuard[JSON]: + def check_json_v2(cls, data: JSON, object_codec_id: str | None = None) -> TypeGuard[str]: """ Check that the input is a valid JSON representation of a numpy S dtype. """ - if zarr_format == 2: - # match >U1, <]U\d+$", data) is not None - elif zarr_format == 3: - return ( - isinstance(data, dict) - and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name - and "configuration" in data - and isinstance(data["configuration"], dict) - and set(data["configuration"].keys()) == {"length_bytes"} - and isinstance(data["configuration"]["length_bytes"], int) - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + return isinstance(data, str) and re.match(r"^[><]U\d+$", data) is not None + + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[FixedLengthUTF32JSONV3]: + return ( + isinstance(data, dict) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and "configuration" in data + and isinstance(data["configuration"], dict) + and set(data["configuration"].keys()) == {"length_bytes"} + and isinstance(data["configuration"]["length_bytes"], int) + ) + + @overload + def to_json(self, zarr_format: Literal[2]) -> str: ... - def to_json(self, zarr_format: ZarrFormat) -> JSON: + @overload + def to_json(self, zarr_format: Literal[3]) -> FixedLengthUTF32JSONV3: ... 
+ + def to_json(self, zarr_format: ZarrFormat) -> str | FixedLengthUTF32JSONV3: if zarr_format == 2: - return self.to_dtype().str + return self.to_native_dtype().str elif zarr_format == 3: return { "name": self._zarr_v3_name, @@ -152,29 +181,31 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: - return cls(length=data["configuration"]["length_bytes"] // cls.code_point_bytes) # type: ignore[arg-type, index, call-overload, operator] + return cls(length=data["configuration"]["length_bytes"] // cls.code_point_bytes) # type: ignore[index, call-overload] raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_value(self) -> np.str_: + def default_scalar(self) -> np.str_: return np.str_("") - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: return str(data) - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: if check_json_str(data): - return self.to_dtype().type(data) + return self.to_native_dtype().type(data) raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover - def check_value(self, data: object) -> bool: + def check_scalar(self, data: object) -> bool: # this is generous for backwards compatibility return isinstance(data, str | np.str_ | bytes | int) - def _cast_value_unsafe(self, data: object) -> np.str_: + def _cast_scalar_unchecked(self, data: object) -> np.str_: # We explicitly truncate the result because of the following numpy behavior: # >>> x = np.dtype('U3').type('hello world') # >>> x @@ -182,7 +213,10 @@ def _cast_value_unsafe(self, data: object) -> np.str_: # >>> x.dtype # dtype('U11') - return self.to_dtype().type(data[: self.length]) # type: ignore[index] + if isinstance(data, int): + return self.to_native_dtype().type(str(data)[: self.length]) + else: + return self.to_native_dtype().type(data[: self.length]) # type: ignore[index] @property def item_size(self) -> int: @@ -192,32 +226,38 @@ def item_size(self) -> int: if _NUMPY_SUPPORTS_VLEN_STRING: @dataclass(frozen=True, kw_only=True) - class VariableLengthString(ZDType[np.dtypes.StringDType, str]): # type: ignore[type-var] + class VariableLengthString(ZDType[np.dtypes.StringDType, str], HasObjectCodec): # type: ignore[type-var] dtype_cls = np.dtypes.StringDType - _zarr_v3_name = "numpy.variable_length_utf8" + _zarr_v3_name: ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" + object_codec_id = "vlen-utf8" @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls() - def to_dtype(self) -> np.dtypes.StringDType: + def to_native_dtype(self) -> np.dtypes.StringDType: return self.dtype_cls() @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + def check_json_v2( + cls, data: JSON, *, object_codec_id: str | None = None + ) -> TypeGuard[Literal["|O"]]: """ - Check that the input is a valid JSON representation of a numpy string dtype. 
+ Check that the input is a valid JSON representation of a numpy O dtype, and that the + object codec id is appropriate for variable-length UTF-8 strings. """ - if zarr_format == 2: - # TODO: take the entire metadata document in here, and - # check the compressors / filters for vlen-utf8 - # Note that we are checking for the object dtype name. - return data == "|O" - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + return data == "|O" and object_codec_id == cls.object_codec_id + + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_utf8"]]: + return data == cls._zarr_v3_name - def to_json(self, zarr_format: ZarrFormat) -> JSON: + @overload + def to_json(self, zarr_format: Literal[2]) -> Literal["|O"]: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_utf8"]: ... + + def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_utf8"]: if zarr_format == 2: # Note: unlike many other numpy data types, we don't serialize the .str attribute # of the data type to JSON. 
This is because Zarr was using `|O` for strings before the @@ -229,71 +269,83 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: return cls() - def default_value(self) -> str: + def default_scalar(self) -> str: return "" - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: return str(data) - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> str: if not check_json_str(data): raise TypeError(f"Invalid type: {data}. Expected a string.") return data - def check_value(self, data: object) -> bool: + def check_scalar(self, data: object) -> bool: return isinstance(data, str) - def _cast_value_unsafe(self, data: object) -> str: + def _cast_scalar_unchecked(self, data: object) -> str: return str(data) else: # Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead. 
@dataclass(frozen=True, kw_only=True) - class VariableLengthString(ZDType[np.dtypes.ObjectDType, str]): # type: ignore[no-redef] + class VariableLengthString(ZDType[np.dtypes.ObjectDType, str], HasObjectCodec): # type: ignore[no-redef] dtype_cls = np.dtypes.ObjectDType - _zarr_v3_name = "numpy.variable_length_utf8" + _zarr_v3_name: ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" + object_codec_id = "vlen-utf8" @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: return cls() - def to_dtype(self) -> np.dtypes.ObjectDType: + def to_native_dtype(self) -> np.dtypes.ObjectDType: return self.dtype_cls() @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + def check_json_v2( + cls, data: JSON, *, object_codec_id: str | None = None + ) -> TypeGuard[Literal["|O"]]: """ - Check that the input is a valid JSON representation of a numpy O dtype. + Check that the input is a valid JSON representation of a numpy O dtype, and that the + object codec id is appropriate for variable-length UTF-8 strings. """ - if zarr_format == 2: - # TODO: take the entire metadata document in here, and - # check the compressors / filters for vlen-utf8 - return data == "|O" - elif zarr_format == 3: - return data == cls._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + return data == "|O" and object_codec_id == cls.object_codec_id - def to_json(self, zarr_format: ZarrFormat) -> JSON: + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_utf8"]]: + return data == cls._zarr_v3_name + + @overload + def to_json(self, zarr_format: Literal[2]) -> Literal["|O"]: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_utf8"]: ... 
+ + def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_utf8"]: if zarr_format == 2: - return self.to_dtype().str + return "|O" elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: return cls() - def default_value(self) -> str: + def default_scalar(self) -> str: return "" - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: return data # type: ignore[return-value] - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> str: """ Strings pass through """ @@ -301,8 +353,8 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str: raise TypeError(f"Invalid type: {data}. 
Expected a string.") return data - def check_value(self, data: object) -> bool: + def check_scalar(self, data: object) -> bool: return isinstance(data, str) - def _cast_value_unsafe(self, data: object) -> str: + def _cast_scalar_unchecked(self, data: object) -> str: return str(data) diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 1c0e0d715c..4c5ce45442 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -1,11 +1,9 @@ from __future__ import annotations -from collections.abc import Mapping from dataclasses import dataclass from typing import ( TYPE_CHECKING, ClassVar, - Generic, Literal, Self, TypedDict, @@ -13,10 +11,12 @@ TypeVar, cast, get_args, + overload, ) import numpy as np +from zarr.core.common import NamedConfig from zarr.core.dtype.common import HasEndianness, HasItemSize from zarr.core.dtype.npy.common import ( DateTimeUnit, @@ -25,7 +25,7 @@ endianness_from_numpy_str, endianness_to_numpy_str, ) -from zarr.core.dtype.wrapper import TBaseDType, ZDType +from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat @@ -79,23 +79,14 @@ def datetimelike_to_int(data: np.datetime64 | np.timedelta64) -> int: ) _BaseTimeScalar = TypeVar("_BaseTimeScalar", bound=np.timedelta64 | np.datetime64) -TName = TypeVar("TName", bound=str) -TConfig = TypeVar("TConfig", bound=Mapping[str, object]) - - -class NamedConfig(TypedDict, Generic[TName, TConfig]): - name: TName - configuration: TConfig - class TimeConfig(TypedDict): unit: DateTimeUnit interval: int -# aspirational -DateTime64MetaParams = NamedConfig[Literal["numpy.datetime64"], TimeConfig] -TimeDelta64MetaParams = NamedConfig[Literal["numpy.timedelta64"], TimeConfig] +DateTime64JSONV3 = NamedConfig[Literal["numpy.datetime64"], TimeConfig] +TimeDelta64JSONV3 = NamedConfig[Literal["numpy.timedelta64"], TimeConfig] @dataclass(frozen=True, kw_only=True, slots=True) @@ 
-117,7 +108,7 @@ def __post_init__(self) -> None: raise ValueError(f"unit must be one of {get_args(DateTimeUnit)}, got {self.unit!r}.") @classmethod - def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: unit, scale_factor = np.datetime_data(dtype.name) unit = cast("DateTimeUnit", unit) byteorder = cast("EndiannessNumpy", dtype.byteorder) @@ -125,7 +116,7 @@ def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self: unit=unit, scale_factor=scale_factor, endianness=endianness_from_numpy_str(byteorder) ) - def to_dtype(self) -> _BaseTimeDType_co: + def to_native_dtype(self) -> _BaseTimeDType_co: # Numpy does not allow creating datetime64 or timedelta64 via # np.dtypes.{dtype_name}() # so we use np.dtype with a formatted string. @@ -133,32 +124,42 @@ def to_dtype(self) -> _BaseTimeDType_co: return np.dtype(dtype_string).newbyteorder(endianness_to_numpy_str(self.endianness)) # type: ignore[return-value] @classmethod - def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self: + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: if zarr_format == 2: - return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type] + return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] elif zarr_format == 3: unit = data["configuration"]["unit"] # type: ignore[index, call-overload] scale_factor = data["configuration"]["scale_factor"] # type: ignore[index, call-overload] - return cls(unit=unit, scale_factor=scale_factor) # type: ignore[arg-type] + return cls(unit=unit, scale_factor=scale_factor) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def to_json(self, zarr_format: ZarrFormat) -> JSON: + @overload + def to_json(self, zarr_format: Literal[2]) -> str: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> DateTime64JSONV3 | TimeDelta64JSONV3: ... 
+ + def to_json(self, zarr_format: ZarrFormat) -> str | DateTime64JSONV3 | TimeDelta64JSONV3: if zarr_format == 2: - return cast("str", self.to_dtype().str) + return cast("str", self.to_native_dtype().str) elif zarr_format == 3: - return { - "name": self._zarr_v3_name, - "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, - } + return cast( + "DateTime64JSONV3 | TimeDelta64JSONV3", + { + "name": self._zarr_v3_name, + "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, + }, + ) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> int: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: return datetimelike_to_int(data) # type: ignore[arg-type] - def check_value(self, data: object) -> bool: + def check_scalar(self, data: object) -> bool: # TODO: decide which values we should accept for datetimes. try: - np.array([data], dtype=self.to_dtype()) + np.array([data], dtype=self.to_native_dtype()) return True # noqa: TRY300 except ValueError: return False @@ -178,91 +179,90 @@ class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], Has """ dtype_cls = np.dtypes.TimeDelta64DType # type: ignore[assignment] - _zarr_v3_name = "numpy.timedelta64" + _zarr_v3_name: ClassVar[Literal["numpy.timedelta64"]] = "numpy.timedelta64" _zarr_v2_names = (">m8", " np.timedelta64: + def default_scalar(self) -> np.timedelta64: return np.timedelta64("NaT") - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: if check_json_int(data) or data == "NaT": - return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[arg-type] + return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[arg-type] raise TypeError(f"Invalid type: {data}. 
Expected an integer.") # pragma: no cover - def _cast_value_unsafe(self, data: object) -> np.timedelta64: - return self.to_dtype().type(data) # type: ignore[arg-type] + def _cast_scalar_unchecked(self, data: object) -> np.timedelta64: + return self.to_native_dtype().type(data) # type: ignore[arg-type] @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - if zarr_format == 2: - # match m[M], etc - # consider making this a standalone function - if not isinstance(data, str): - return False - if not data.startswith(cls._zarr_v2_names): - return False - if len(data) == 3: - # no unit, and - # we already checked that this string is either m8 - return True - else: - return data[4:-1].endswith(get_args(DateTimeUnit)) and data[-1] == "]" - elif zarr_format == 3: - return ( - isinstance(data, dict) - and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name - and set(data.keys()) == {"name", "configuration"} - and isinstance(data["configuration"], dict) - and set(data["configuration"].keys()) == {"unit", "scale_factor"} - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + # match m[M], etc + # consider making this a standalone function + if not isinstance(data, str): + return False + if not data.startswith(cls._zarr_v2_names): + return False + if len(data) == 3: + # no unit, and + # we already checked that this string is either m8 + return True + else: + return data[4:-1].endswith(get_args(DateTimeUnit)) and data[-1] == "]" + + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[DateTime64JSONV3]: + return ( + isinstance(data, dict) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and isinstance(data["configuration"], dict) + and set(data["configuration"].keys()) == {"unit", "scale_factor"} + ) 
@dataclass(frozen=True, kw_only=True, slots=True) class DateTime64(TimeDTypeBase[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] - _zarr_v3_name = "numpy.datetime64" + _zarr_v3_name: ClassVar[Literal["numpy.datetime64"]] = "numpy.datetime64" _zarr_v2_names = (">M8", " np.datetime64: + def default_scalar(self) -> np.datetime64: return np.datetime64("NaT") - def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: if check_json_int(data) or data == "NaT": - return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[arg-type] + return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[arg-type] raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover - def _cast_value_unsafe(self, data: object) -> np.datetime64: - return self.to_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[no-any-return, call-overload] + def _cast_scalar_unchecked(self, data: object) -> np.datetime64: + return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[no-any-return, call-overload] @classmethod - def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: - if zarr_format == 2: - # match M[M], etc - # consider making this a standalone function - if not isinstance(data, str): - return False - if not data.startswith(cls._zarr_v2_names): - return False - if len(data) == 3: - # no unit, and - # we already checked that this string is either M8 - return True - else: - return data[4:-1].endswith(get_args(DateTimeUnit)) and data[-1] == "]" - elif zarr_format == 3: - return ( - isinstance(data, dict) - and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name - and set(data["configuration"].keys()) == {"unit", "scale_factor"} - 
and data["configuration"]["unit"] in get_args(DateTimeUnit) - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + # match M[M], etc + # consider making this a standalone function + if not isinstance(data, str): + return False + if not data.startswith(cls._zarr_v2_names): + return False + if len(data) == 3: + # no unit, and + # we already checked that this string is either M8 + return True + else: + return data[4:-1].endswith(get_args(DateTimeUnit)) and data[-1] == "]" + + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[DateTime64JSONV3]: + return ( + isinstance(data, dict) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and isinstance(data["configuration"], dict) + and set(data["configuration"].keys()) == {"unit", "scale_factor"} + ) diff --git a/src/zarr/core/dtype/registry.py b/src/zarr/core/dtype/registry.py index 047f908ac6..0423f69dbe 100644 --- a/src/zarr/core/dtype/registry.py +++ b/src/zarr/core/dtype/registry.py @@ -1,5 +1,6 @@ from __future__ import annotations +import contextlib from dataclasses import dataclass, field from typing import TYPE_CHECKING, Self @@ -10,7 +11,7 @@ if TYPE_CHECKING: from importlib.metadata import EntryPoint - from zarr.core.common import JSON, ZarrFormat + from zarr.core.common import JSON from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType @@ -22,11 +23,12 @@ class DataTypeRegistry: contents: dict[str, type[ZDType[TBaseDType, TBaseScalar]]] = field( default_factory=dict, init=False ) + lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) def lazy_load(self) -> None: for e in self.lazy_load_list: - self.register(e.name, e.load()) + self.register(e.load()._zarr_v3_name, e.load()) self.lazy_load_list.clear() @@ -35,14 +37,20 @@ def register(self: Self, key: str, cls: type[ZDType[TBaseDType, 
TBaseScalar]]) - if key not in self.contents or self.contents[key] != cls: self.contents[key] = cls + def unregister(self, key: str) -> None: + """Unregister a data type by its key.""" + if key in self.contents: + del self.contents[key] + else: + raise KeyError(f"Data type '{key}' not found in registry.") + def get(self, key: str) -> type[ZDType[TBaseDType, TBaseScalar]]: return self.contents[key] def match_dtype(self, dtype: TBaseDType) -> ZDType[TBaseDType, TBaseScalar]: - self.lazy_load() if dtype == np.dtype("O"): msg = ( - "Data type resolution failed. " + f"Zarr data type resolution from {dtype} failed. " 'Attempted to resolve a zarr data type from a numpy "Object" data type, which is ' 'ambiguous, as multiple zarr data types can be represented by the numpy "Object" ' "data type. " @@ -51,18 +59,41 @@ def match_dtype(self, dtype: TBaseDType) -> ZDType[TBaseDType, TBaseScalar]: "data type, see xxxxxxxxxxx" ) raise ValueError(msg) + matched: list[ZDType[TBaseDType, TBaseScalar]] = [] + for val in self.contents.values(): + with contextlib.suppress(DataTypeValidationError): + matched.append(val.from_native_dtype(dtype)) + if len(matched) == 1: + return matched[0] + elif len(matched) > 1: + msg = ( + f"Zarr data type resolution from {dtype} failed. " + f"Multiple data type wrappers found that match dtype '{dtype}': {matched}. " + "You should unregister one of these data types, or avoid Zarr data type inference " + "entirely by providing a specific Zarr data type when creating your array." + "For more information, see xxxxxxxxxxxxxxxxxx" + ) + raise ValueError(msg) + raise ValueError(f"No data type wrapper found that matches dtype '{dtype}'") + + def match_json_v2( + self, data: JSON, *, object_codec_id: str | None = None + ) -> ZDType[TBaseDType, TBaseScalar]: + # The dtype field in zarr v2 JSON metadata is not unique across different distinct data types. + # Specifically, multiple distinct data types all use the "|O" data type representation. 
+ # These must be disambiguated by the presence of an "object codec", which is a codec + # like variable-length utf8 encoding for strings. for val in self.contents.values(): try: - return val.from_dtype(dtype) + return val.from_json_v2(data, object_codec_id=object_codec_id) except DataTypeValidationError: pass - raise ValueError(f"No data type wrapper found that matches dtype '{dtype}'") + raise ValueError(f"No data type wrapper found that matches {data}") - def match_json(self, data: JSON, zarr_format: ZarrFormat) -> ZDType[TBaseDType, TBaseScalar]: - self.lazy_load() + def match_json_v3(self, data: JSON) -> ZDType[TBaseDType, TBaseScalar]: for val in self.contents.values(): try: - return val.from_json(data, zarr_format=zarr_format) + return val.from_json_v3(data) except DataTypeValidationError: pass raise ValueError(f"No data type wrapper found that matches {data}") diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index bd9686afc1..c9b23707e8 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -23,8 +23,18 @@ from __future__ import annotations from abc import ABC, abstractmethod +from collections.abc import Mapping, Sequence from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Generic, Self, TypeGuard, TypeVar +from typing import ( + TYPE_CHECKING, + ClassVar, + Generic, + Literal, + Self, + TypeGuard, + TypeVar, + overload, +) import numpy as np @@ -46,6 +56,10 @@ TScalar_co = TypeVar("TScalar_co", bound=TBaseScalar, covariant=True) TDType_co = TypeVar("TDType_co", bound=TBaseDType, covariant=True) +# These types should include all JSON-serializable types that can be used to represent a data type. 
+DTypeJSON_V2 = str | Sequence[object] +DTypeJSON_V3 = str | Mapping[str, object] + @dataclass(frozen=True, kw_only=True, slots=True) class ZDType(Generic[TDType_co, TScalar_co], ABC): @@ -70,7 +84,7 @@ class ZDType(Generic[TDType_co, TScalar_co], ABC): _zarr_v3_name: ClassVar[str] @classmethod - def check_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[TDType_co]: + def check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[TDType_co]: """ Check that a data type matches the dtype_cls class attribute. Used as a type guard. @@ -87,7 +101,7 @@ def check_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[TDType_co]: return type(dtype) is cls.dtype_cls @classmethod - def from_dtype(cls: type[Self], dtype: TBaseDType) -> Self: + def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self: """ Wrap a dtype object. @@ -106,15 +120,15 @@ def from_dtype(cls: type[Self], dtype: TBaseDType) -> Self: TypeError If the dtype does not match the dtype_cls class attribute. """ - if cls.check_dtype(dtype): - return cls._from_dtype_unsafe(dtype) + if cls.check_native_dtype(dtype): + return cls._from_native_dtype_unsafe(dtype) raise DataTypeValidationError( f"Invalid dtype: {dtype}. Expected an instance of {cls.dtype_cls}." ) @classmethod @abstractmethod - def _from_dtype_unsafe(cls: type[Self], dtype: TBaseDType) -> Self: + def _from_native_dtype_unsafe(cls: type[Self], dtype: TBaseDType) -> Self: """ Wrap a native dtype without checking. @@ -131,7 +145,7 @@ def _from_dtype_unsafe(cls: type[Self], dtype: TBaseDType) -> Self: ... @abstractmethod - def to_dtype(self: Self) -> TDType_co: + def to_native_dtype(self: Self) -> TDType_co: """ Return an instance of the wrapped dtype. @@ -142,10 +156,10 @@ def to_dtype(self: Self) -> TDType_co: """ ... - def cast_value(self, data: object) -> TScalar_co: + def cast_scalar(self, data: object) -> TScalar_co: """ - Cast a value to the wrapped scalar type. The type is first checked for compatibility. 
If it's - incompatible with the associated scalar type, a ``TypeError`` will be raised. + Cast a scalar to the wrapped scalar type. The type is first checked for compatibility. If + it's incompatible with the associated scalar type, a ``TypeError`` will be raised. Parameters ---------- @@ -157,8 +171,8 @@ def cast_value(self, data: object) -> TScalar_co: TScalar The cast value. """ - if self.check_value(data): - return self._cast_value_unsafe(data) + if self.check_scalar(data): + return self._cast_scalar_unchecked(data) msg = ( f"The value {data} failed a type check. " f"It cannot be safely cast to a scalar compatible with {self.dtype_cls}. " @@ -168,9 +182,9 @@ def cast_value(self, data: object) -> TScalar_co: raise TypeError(msg) @abstractmethod - def check_value(self, data: object) -> bool: + def check_scalar(self, data: object) -> bool: """ - Check that a value is a valid value for the wrapped data type. + Check that a scalar is a valid value for the wrapped data type. Parameters ---------- @@ -185,9 +199,9 @@ def check_value(self, data: object) -> bool: ... @abstractmethod - def _cast_value_unsafe(self, data: object) -> TScalar_co: + def _cast_scalar_unchecked(self, data: object) -> TScalar_co: """ - Cast a value to the wrapped data type. This method should not perform any input validation. + Cast a scalar to the wrapped data type. This method should not perform any input validation. Parameters ---------- @@ -202,11 +216,12 @@ def _cast_value_unsafe(self, data: object) -> TScalar_co: ... @abstractmethod - def default_value(self) -> TScalar_co: + def default_scalar(self) -> TScalar_co: """ - Get the default value for the wrapped data type. This is a method, rather than an attribute, + Get the default scalar value for the wrapped data type. This is a method, rather than an attribute, because the default value for some data types may depend on parameters that are not known - until a concrete data type is wrapped. + until a concrete data type is wrapped. 
For example, data types parametrized by a length like + fixed-length strings or bytes will generate scalars consistent with that length. Returns ------- @@ -217,7 +232,35 @@ def default_value(self) -> TScalar_co: @classmethod @abstractmethod - def check_json(cls: type[Self], data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]: + def check_json_v2( + cls: type[Self], data: JSON, *, object_codec_id: str | None = None + ) -> TypeGuard[DTypeJSON_V2]: + """ + Check that a JSON representation of a data type is consistent with the ZDType class. + + Parameters + ---------- + data : JSON + The JSON representation of the data type. + + object_codec_id : str | None + The object codec ID, if applicable. Object codecs are specific numcodecs codecs that + zarr-python 2.x used to serialize numpy "Object" scalars. For example, a dtype field set + to "|O" with an object codec ID of "vlen-utf8" indicates that the data type is a + variable-length string. + + Zarr V3 has no such logic, so this parameter is only used for Zarr V2 compatibility. + + Returns + ------- + Bool + True if the JSON representation matches, False otherwise. + """ + ... + + @classmethod + @abstractmethod + def check_json_v3(cls: type[Self], data: JSON) -> TypeGuard[DTypeJSON_V3]: """ Check that a JSON representation of a data type matches the dtype_cls class attribute. Used as a type guard. This base implementation checks that the input is a dictionary, @@ -229,9 +272,6 @@ def check_json(cls: type[Self], data: JSON, zarr_format: ZarrFormat) -> TypeGuar data : JSON The JSON representation of the data type. - zarr_format : ZarrFormat - The zarr format version. - Returns ------- Bool @@ -239,8 +279,14 @@ def check_json(cls: type[Self], data: JSON, zarr_format: ZarrFormat) -> TypeGuar """ ... + @overload + def to_json(self, zarr_format: Literal[2]) -> DTypeJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> DTypeJSON_V3: ... 
+ @abstractmethod - def to_json(self, zarr_format: ZarrFormat) -> JSON: + def to_json(self, zarr_format: ZarrFormat) -> DTypeJSON_V2 | DTypeJSON_V3: """ Convert the wrapped data type to a JSON-serializable form. @@ -251,46 +297,73 @@ def to_json(self, zarr_format: ZarrFormat) -> JSON: Returns ------- - JSON + DTypeJSON_V2 | DTypeJSON_V3 The JSON-serializable representation of the wrapped data type """ ... @classmethod - def from_json(cls: type[Self], data: JSON, zarr_format: ZarrFormat) -> Self: + def from_json_v3(cls: type[Self], data: JSON) -> Self: """ - Wrap a JSON representation of a data type. + Wrap a Zarr V3 JSON representation of a data type. Parameters ---------- data : JSON The JSON representation of the data type. - zarr_format : ZarrFormat - The zarr format version. - Returns ------- Self The wrapped data type. """ - if cls.check_json(data, zarr_format=zarr_format): - return cls._from_json_unsafe(data, zarr_format=zarr_format) + if cls.check_json_v3(data): + return cls._from_json_unchecked(data, zarr_format=3) raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}: {data}") @classmethod - @abstractmethod - def _from_json_unsafe(cls: type[Self], data: JSON, zarr_format: ZarrFormat) -> Self: + def from_json_v2(cls: type[Self], data: JSON, *, object_codec_id: str | None) -> Self: """ - Wrap a JSON representation of a data type. + Wrap a Zarr V2 JSON representation of a data type. Parameters ---------- data : JSON The JSON representation of the data type. - zarr_format : ZarrFormat - The zarr format version. + Returns + ------- + Self + The wrapped data type. 
+ """ + if cls.check_json_v2(data, object_codec_id=object_codec_id): + return cls._from_json_unchecked(data, zarr_format=2) + raise DataTypeValidationError( + f"Invalid JSON representation of data type {cls}: {data!r}, object_codec_id={object_codec_id!r}" + ) + + @classmethod + @overload + def _from_json_unchecked(cls, data: DTypeJSON_V2, *, zarr_format: Literal[2]) -> Self: ... + @classmethod + @overload + def _from_json_unchecked(cls, data: DTypeJSON_V3, *, zarr_format: Literal[3]) -> Self: ... + + @classmethod + @abstractmethod + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: + """ + Create a ZDType instance from a JSON representation of a data type. + + This method should be called after input has been type checked, and so it should not perform + any input validation. + + Parameters + ---------- + data : JSON + The JSON representation of the data type. Returns ------- @@ -300,7 +373,7 @@ def _from_json_unsafe(cls: type[Self], data: JSON, zarr_format: ZarrFormat) -> S ... @abstractmethod - def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> JSON: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> JSON: """ Convert a single value to JSON-serializable format. @@ -319,7 +392,7 @@ def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> JSON: ... @abstractmethod - def from_json_value(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar_co: + def from_json_scalar(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar_co: """ Read a JSON-serializable value as a scalar. 
diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index e82c768b90..9296afd72a 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -9,7 +9,7 @@ from zarr.abc.metadata import Metadata from zarr.core.chunk_grids import RegularChunkGrid -from zarr.core.dtype import get_data_type_from_native_dtype +from zarr.core.dtype import get_data_type_from_json_v2 from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, TDType_co, TScalar_co, ZDType if TYPE_CHECKING: @@ -58,6 +58,9 @@ class ArrayV2MetadataDict(TypedDict): # Union of acceptable types for v2 compressors CompressorLikev2: TypeAlias = dict[str, JSON] | numcodecs.abc.Codec | None +# These are the ids of the known object codecs for zarr v2. +ObjectCodecIds = ("vlen-utf8", "vlen-bytes", "vlen-array", "pickle", "json2", "msgpack2") + @dataclass(frozen=True, kw_only=True) class ArrayV2Metadata(Metadata): @@ -99,7 +102,7 @@ def __init__( filters_parsed = parse_filters(filters) fill_value_parsed: TBaseScalar | None if fill_value is not None: - fill_value_parsed = dtype.cast_value(fill_value) + fill_value_parsed = dtype.cast_scalar(fill_value) else: fill_value_parsed = fill_value attributes_parsed = parse_attributes(attributes) @@ -148,11 +151,29 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: _data = data.copy() # Check that the zarr_format attribute is correct. _ = parse_zarr_format(_data.pop("zarr_format")) - dtype = get_data_type_from_native_dtype(_data["dtype"]) + + # To resolve a numpy object dtype array, we need to search for an object codec, + # which could be in filters or as a compressor. + # we will use a hard-coded list of object codecs for this search. 
+ object_codec_id: str | None = None + maybe_object_codecs = (data.get("filters"), data.get("compressor")) + for maybe_object_codec in maybe_object_codecs: + if isinstance(maybe_object_codec, Sequence): + for codec in maybe_object_codec: + if isinstance(codec, dict) and codec.get("id") in ObjectCodecIds: + object_codec_id = codec["id"] + break + elif ( + isinstance(maybe_object_codec, dict) + and maybe_object_codec.get("id") in ObjectCodecIds + ): + object_codec_id = maybe_object_codec["id"] + break + dtype = get_data_type_from_json_v2(data["dtype"], object_codec_id=object_codec_id) _data["dtype"] = dtype fill_value_encoded = _data.get("fill_value") if fill_value_encoded is not None: - fill_value = dtype.from_json_value(fill_value_encoded, zarr_format=2) + fill_value = dtype.from_json_scalar(fill_value_encoded, zarr_format=2) _data["fill_value"] = fill_value # zarr v2 allowed arbitrary keys here. @@ -205,11 +226,11 @@ def to_dict(self) -> dict[str, JSON]: # serialize the fill value after dtype-specific JSON encoding if self.fill_value is not None: - fill_value = self.dtype.to_json_value(self.fill_value, zarr_format=2) + fill_value = self.dtype.to_json_scalar(self.fill_value, zarr_format=2) zarray_dict["fill_value"] = fill_value # serialize the dtype after fill value-specific JSON encoding - zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2) + zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2) # type: ignore[assignment] return zarray_dict diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 80ed722836..83b9bd7bc8 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -7,7 +7,7 @@ from zarr.core.dtype import ( VariableLengthString, ZDType, - get_data_type_from_json, + get_data_type_from_json_v3, ) if TYPE_CHECKING: @@ -175,7 +175,7 @@ def __init__( chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) dimension_names_parsed = parse_dimension_names(dimension_names) # Note: relying on a 
type method is numpy-specific - fill_value_parsed = data_type.cast_value(fill_value) + fill_value_parsed = data_type.cast_scalar(fill_value) attributes_parsed = parse_attributes(attributes) codecs_parsed_partial = parse_codecs(codecs) storage_transformers_parsed = parse_storage_transformers(storage_transformers) @@ -306,12 +306,12 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: _ = parse_node_type_array(_data.pop("node_type")) data_type_json = _data.pop("data_type") - data_type = get_data_type_from_json(data_type_json, zarr_format=3) + data_type = get_data_type_from_json_v3(data_type_json) # check that the fill value is consistent with the data type try: fill = _data.pop("fill_value") - fill_value_parsed = data_type.from_json_value(fill, zarr_format=3) + fill_value_parsed = data_type.from_json_scalar(fill, zarr_format=3) except ValueError as e: raise TypeError(f"Invalid fill_value: {fill!r}") from e @@ -325,7 +325,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: def to_dict(self) -> dict[str, JSON]: out_dict = super().to_dict() - out_dict["fill_value"] = self.data_type.to_json_value( + out_dict["fill_value"] = self.data_type.to_json_scalar( self.fill_value, zarr_format=self.zarr_format ) if not isinstance(out_dict, dict): diff --git a/tests/package_with_entrypoint/__init__.py b/tests/package_with_entrypoint/__init__.py index ef605be41a..4f507ab457 100644 --- a/tests/package_with_entrypoint/__init__.py +++ b/tests/package_with_entrypoint/__init__.py @@ -10,7 +10,7 @@ from zarr.abc.codec import ArrayBytesCodec, CodecInput, CodecPipeline from zarr.codecs import BytesCodec from zarr.core.buffer import Buffer, NDBuffer -from zarr.core.common import BytesLike, ZarrFormat +from zarr.core.common import JSON, ZarrFormat from zarr.core.dtype.npy.bool import Bool @@ -75,13 +75,13 @@ class TestDataType(Bool): This is a "data type" that serializes to "test" """ - _zarr_v3_name = "test" + _zarr_v3_name = "test" # type: ignore[assignment] @classmethod - def 
from_json(cls, data: Any, zarr_format: Literal[2, 3]) -> Self: - if data == cls._zarr_v3_name: + def from_json(cls, data: JSON, zarr_format: Literal[2, 3]) -> Self: + if data == cls._zarr_v3_name: # type: ignore[has-type] return cls() raise ValueError - def to_json(self, zarr_format: ZarrFormat) -> str: - return self._zarr_v3_name + def to_json(self, zarr_format: ZarrFormat) -> str: # type: ignore[override] + return self._zarr_v3_name # type: ignore[no-any-return, has-type] diff --git a/tests/test_array.py b/tests/test_array.py index be416d6d8f..0f8e2f8343 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -214,7 +214,7 @@ def test_array_fill_value_default( ) else: arr = zarr.create_array(store=store, shape=shape, dtype=zdtype, zarr_format=3, chunks=shape) - expected_fill_value = zdtype.default_value() + expected_fill_value = zdtype.default_scalar() if isinstance(expected_fill_value, np.datetime64 | np.timedelta64): if np.isnat(expected_fill_value): assert np.isnat(arr.fill_value) @@ -995,9 +995,9 @@ def test_default_fill_value(dtype: ZDType[Any, Any], store: Store) -> None: """ a = zarr.create_array(store, shape=(5,), chunks=(5,), dtype=dtype) if isinstance(dtype, DateTime64 | TimeDelta64) and np.isnat(a.fill_value): - assert np.isnat(dtype.default_value()) + assert np.isnat(dtype.default_scalar()) else: - assert a.fill_value == dtype.default_value() + assert a.fill_value == dtype.default_scalar() @staticmethod @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @@ -1016,7 +1016,7 @@ def test_dtype_forms(dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFor name="b", shape=(5,), chunks=(5,), - dtype=dtype.to_dtype(), + dtype=dtype.to_native_dtype(), zarr_format=zarr_format, ) assert a.dtype == b.dtype @@ -1031,7 +1031,7 @@ def test_dtype_forms(dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFor name="c", shape=(5,), chunks=(5,), - dtype=dtype.to_dtype().char, + dtype=dtype.to_native_dtype().char, 
zarr_format=zarr_format, ) else: @@ -1040,7 +1040,7 @@ def test_dtype_forms(dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFor name="c", shape=(5,), chunks=(5,), - dtype=dtype.to_dtype().str, + dtype=dtype.to_native_dtype().str, zarr_format=zarr_format, ) assert a.dtype == c.dtype @@ -1297,7 +1297,7 @@ async def test_default_filters_compressors( arr = await create_array( store=store, - dtype=dtype, + dtype=dtype, # type: ignore[arg-type] shape=(10,), zarr_format=zarr_format, ) @@ -1309,14 +1309,14 @@ async def test_default_filters_compressors( compressors=sig.parameters["compressors"].default, filters=sig.parameters["filters"].default, serializer=sig.parameters["serializer"].default, - dtype=dtype, + dtype=dtype, # type: ignore[arg-type] ) elif zarr_format == 2: default_filters, default_compressors = _parse_chunk_encoding_v2( compressor=sig.parameters["compressors"].default, filters=sig.parameters["filters"].default, - dtype=dtype, + dtype=dtype, # type: ignore[arg-type] ) if default_filters is None: expected_filters = () diff --git a/tests/test_dtype/conftest.py b/tests/test_dtype/conftest.py index 2b21a57365..b2aa89afd7 100644 --- a/tests/test_dtype/conftest.py +++ b/tests/test_dtype/conftest.py @@ -17,7 +17,7 @@ with warnings.catch_warnings(): warnings.simplefilter("ignore") zdtype_examples += ( - wrapper_cls.from_dtype(np.dtype([("a", np.float64), ("b", np.int8)])), + wrapper_cls.from_native_dtype(np.dtype([("a", np.float64), ("b", np.int8)])), ) elif issubclass(wrapper_cls, HasLength): zdtype_examples += (wrapper_cls(length=1),) diff --git a/tests/test_dtype/test_npy/test_bool.py b/tests/test_dtype/test_npy/test_bool.py index 1adae57f02..03dc550a9d 100644 --- a/tests/test_dtype/test_npy/test_bool.py +++ b/tests/test_dtype/test_npy/test_bool.py @@ -2,11 +2,11 @@ import numpy as np -from tests.test_dtype.test_wrapper import _TestZDType +from tests.test_dtype.test_wrapper import BaseTestZDType, V2JsonTestParams from zarr.core.dtype.npy.bool import Bool 
-class TestBool(_TestZDType): +class TestBool(BaseTestZDType): test_cls = Bool valid_dtype = (np.dtype(np.bool_),) @@ -15,7 +15,7 @@ class TestBool(_TestZDType): np.dtype(np.float64), np.dtype(np.uint16), ) - valid_json_v2 = ("|b1",) + valid_json_v2 = (V2JsonTestParams(dtype="|b1"),) valid_json_v3 = ("bool",) invalid_json_v2 = ( "|b1", diff --git a/tests/test_dtype/test_npy/test_common.py b/tests/test_dtype/test_npy/test_common.py index 258ab48fe1..c4a82e22b0 100644 --- a/tests/test_dtype/test_npy/test_common.py +++ b/tests/test_dtype/test_npy/test_common.py @@ -45,7 +45,7 @@ def nan_equal(a: object, b: object) -> bool: return a == b -json_float_v2_cases: list[tuple[JSONFloatV2, float | np.floating[Any]]] = [ +json_float_v2_roundtrip_cases: tuple[tuple[JSONFloatV2, float | np.floating[Any]], ...] = ( ("Infinity", float("inf")), ("Infinity", np.inf), ("-Infinity", float("-inf")), @@ -53,11 +53,9 @@ def nan_equal(a: object, b: object) -> bool: ("NaN", float("nan")), ("NaN", np.nan), (1.0, 1.0), -] +) -# exactly the same as v2, for now, until we get support for the special NaN encoding defined in the -# v3 spec -json_float_v3_cases = json_float_v2_cases +json_float_v3_cases = json_float_v2_roundtrip_cases @pytest.mark.parametrize( @@ -94,13 +92,15 @@ def test_endianness_to_numpy_str(data: str | None, expected: str) -> None: endianness_to_numpy_str(data) # type: ignore[arg-type] -@pytest.mark.parametrize(("data", "expected"), json_float_v2_cases + [("SHOULD_ERR", "")]) +@pytest.mark.parametrize( + ("data", "expected"), json_float_v2_roundtrip_cases + (("SHOULD_ERR", ""),) +) def test_float_from_json_v2(data: JSONFloatV2 | str, expected: float | str) -> None: """ Test that float_from_json_v2 correctly converts a JSON string representation of a float to a float. 
This test also checks that an invalid string input raises a ``ValueError`` """ - if data in get_args(SpecialFloatStrings) or isinstance(data, float): + if data != "SHOULD_ERR": assert nan_equal(float_from_json_v2(data), expected) # type: ignore[arg-type] else: msg = f"could not convert string to float: {data!r}" @@ -108,25 +108,35 @@ def test_float_from_json_v2(data: JSONFloatV2 | str, expected: float | str) -> N float_from_json_v2(data) # type: ignore[arg-type] -@pytest.mark.parametrize(("data", "expected"), json_float_v3_cases + [("SHOULD_ERR", "")]) +@pytest.mark.parametrize( + ("data", "expected"), json_float_v3_cases + (("SHOULD_ERR", ""), ("0x", "")) +) def test_float_from_json_v3(data: JSONFloatV2 | str, expected: float | str) -> None: """ Test that float_from_json_v3 correctly converts a JSON string representation of a float to a float. This test also checks that an invalid string input raises a ``ValueError`` """ - if data in get_args(SpecialFloatStrings) or isinstance(data, float): - assert nan_equal(float_from_json_v3(data), expected) - else: + if data == "SHOULD_ERR": msg = ( f"Invalid float value: {data!r}. Expected a string starting with the hex prefix" " '0x', or one of 'NaN', 'Infinity', or '-Infinity'." ) with pytest.raises(ValueError, match=msg): float_from_json_v3(data) + elif data == "0x": + msg = ( + f"Invalid hexadecimal float value: {data!r}. " + "Expected the '0x' prefix to be followed by 4, 8, or 16 numeral characters" + ) + + with pytest.raises(ValueError, match=msg): + float_from_json_v3(data) + else: + assert nan_equal(float_from_json_v3(data), expected) # note the order of parameters relative to the order of the parametrized variable. 
-@pytest.mark.parametrize(("expected", "data"), json_float_v2_cases) +@pytest.mark.parametrize(("expected", "data"), json_float_v2_roundtrip_cases) def test_float_to_json_v2(data: float | np.floating[Any], expected: JSONFloatV2) -> None: """ Test that floats are JSON-encoded properly for zarr v2 @@ -170,7 +180,7 @@ def test_bytes_to_json(zarr_format: ZarrFormat) -> None: # note the order of parameters relative to the order of the parametrized variable. -@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v2_cases) +@pytest.mark.parametrize(("json_expected", "float_data"), json_float_v2_roundtrip_cases) def test_complex_to_json_v2( float_data: float | np.floating[Any], json_expected: JSONFloatV2 ) -> None: diff --git a/tests/test_dtype/test_npy/test_complex.py b/tests/test_dtype/test_npy/test_complex.py index 45a3a1480e..fd216d8415 100644 --- a/tests/test_dtype/test_npy/test_complex.py +++ b/tests/test_dtype/test_npy/test_complex.py @@ -4,11 +4,11 @@ import numpy as np -from tests.test_dtype.test_wrapper import _TestZDType +from tests.test_dtype.test_wrapper import BaseTestZDType, V2JsonTestParams from zarr.core.dtype.npy.complex import Complex64, Complex128 -class _BaseTestFloat(_TestZDType): +class _BaseTestFloat(BaseTestZDType): def scalar_equals(self, scalar1: object, scalar2: object) -> bool: if np.isnan(scalar1) and np.isnan(scalar2): # type: ignore[call-overload] return True @@ -23,7 +23,7 @@ class TestComplex64(_BaseTestFloat): np.dtype(np.float64), np.dtype(np.complex128), ) - valid_json_v2 = (">c8", "c8"), V2JsonTestParams(dtype="c16", "c16"), V2JsonTestParams(dtype=" bool: if np.isnan(scalar1) and np.isnan(scalar2): # type: ignore[call-overload] return True @@ -20,7 +20,7 @@ def test_hex_encoding(self, hex_string_params: tuple[str, float]) -> None: """ hex_string, expected = hex_string_params zdtype = self.test_cls() - observed = zdtype.from_json_value(hex_string, zarr_format=3) + observed = zdtype.from_json_scalar(hex_string, 
zarr_format=3) assert self.scalar_equals(observed, expected) @@ -32,8 +32,8 @@ class TestFloat16(_BaseTestFloat): np.dtype(np.uint16), np.dtype(np.float32), ) - valid_json_v2 = Float16._zarr_v2_names - valid_json_v3 = (Float16._zarr_v3_name,) + valid_json_v2 = (V2JsonTestParams(dtype=">f2"), V2JsonTestParams(dtype="f4"), V2JsonTestParams(dtype="f8"), V2JsonTestParams(dtype="i1", @@ -37,7 +37,7 @@ class TestInt8(_TestZDType): item_size_params = (Int8(),) -class TestInt16(_TestZDType): +class TestInt16(BaseTestZDType): test_cls = Int16 scalar_type = np.int16 valid_dtype = (np.dtype(">i2"), np.dtype("i2", "i2"), V2JsonTestParams(dtype="i4"), np.dtype("i4", "i4"), V2JsonTestParams(dtype="i8"), np.dtype("i8", "i8"), V2JsonTestParams(dtype="u2"), np.dtype("u2", "u2"), V2JsonTestParams(dtype="u4"), np.dtype("u4", "u4"), V2JsonTestParams(dtype="u8"), np.dtype("u8", "u8"), V2JsonTestParams(dtype="i4"), ("field2", ">f8")], - [("field1", ">i8"), ("field2", ">i4")], + V2JsonTestParams(dtype=[("field1", ">i4"), ("field2", ">f8")]), + V2JsonTestParams(dtype=[("field1", ">i8"), ("field2", ">i4")]), ) valid_json_v3 = ( { @@ -99,7 +99,7 @@ class TestStructured(_TestZDType): ), ( "field2", - {"name": "numpy.fixed_length_utf32", "configuration": {"length_bytes": 32}}, + {"name": "fixed_length_utf32", "configuration": {"length_bytes": 32}}, ), ] }, diff --git a/tests/test_dtype/test_npy/test_string.py b/tests/test_dtype/test_npy/test_string.py index 6620f45052..73c8612db4 100644 --- a/tests/test_dtype/test_npy/test_string.py +++ b/tests/test_dtype/test_npy/test_string.py @@ -2,13 +2,13 @@ import numpy as np -from tests.test_dtype.test_wrapper import _TestZDType +from tests.test_dtype.test_wrapper import BaseTestZDType, V2JsonTestParams from zarr.core.dtype import FixedLengthASCII, FixedLengthUTF32 from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING, VariableLengthString if _NUMPY_SUPPORTS_VLEN_STRING: - class TestVariableLengthString(_TestZDType): + class 
TestVariableLengthString(BaseTestZDType): test_cls = VariableLengthString # type: ignore[assignment] valid_dtype = (np.dtypes.StringDType(),) # type: ignore[assignment] invalid_dtype = ( @@ -16,15 +16,15 @@ class TestVariableLengthString(_TestZDType): np.dtype(np.float64), np.dtype("|S10"), ) - valid_json_v2 = ("|O",) - valid_json_v3 = ("numpy.variable_length_utf8",) + valid_json_v2 = (V2JsonTestParams(dtype="|O", object_codec_id="vlen-utf8"),) + valid_json_v3 = ("variable_length_utf8",) invalid_json_v2 = ( "|S10", "|f8", "invalid", ) invalid_json_v3 = ( - {"name": "numpy.variable_length_utf8", "configuration": {"invalid_key": "value"}}, + {"name": "variable_length_utf8", "configuration": {"invalid_key": "value"}}, {"name": "invalid_name"}, ) @@ -42,7 +42,7 @@ class TestVariableLengthString(_TestZDType): else: - class TestVariableLengthString(_TestZDType): # type: ignore[no-redef] + class TestVariableLengthString(BaseTestZDType): # type: ignore[no-redef] test_cls = VariableLengthString # type: ignore[assignment] valid_dtype = (np.dtype("O"),) invalid_dtype = ( @@ -50,8 +50,8 @@ class TestVariableLengthString(_TestZDType): # type: ignore[no-redef] np.dtype(np.float64), np.dtype("|S10"), ) - valid_json_v2 = ("|O",) - valid_json_v3 = ("numpy.variable_length_utf8",) + valid_json_v2 = (V2JsonTestParams(dtype="|O", object_codec_id="vlen-utf8"),) + valid_json_v3 = ("variable_length_utf8",) invalid_json_v2 = ( "|S10", "|f8", @@ -76,7 +76,7 @@ class TestVariableLengthString(_TestZDType): # type: ignore[no-redef] item_size_params = (VariableLengthString(),) -class TestFixedLengthAscii(_TestZDType): +class TestFixedLengthAscii(BaseTestZDType): test_cls = FixedLengthASCII valid_dtype = (np.dtype("|S10"), np.dtype("|S4")) invalid_dtype = ( @@ -84,15 +84,19 @@ class TestFixedLengthAscii(_TestZDType): np.dtype(np.float64), np.dtype("|U10"), ) - valid_json_v2 = ("|S0", "|S2", "|S4") - valid_json_v3 = ({"name": "numpy.fixed_length_ascii", "configuration": {"length_bytes": 10}},) + 
valid_json_v2 = ( + V2JsonTestParams(dtype="|S0"), + V2JsonTestParams(dtype="|S2"), + V2JsonTestParams(dtype="|S4"), + ) + valid_json_v3 = ({"name": "fixed_length_ascii", "configuration": {"length_bytes": 10}},) invalid_json_v2 = ( "|S", "|U10", "|f8", ) invalid_json_v3 = ( - {"name": "numpy.fixed_length_ascii", "configuration": {"length_bits": 0}}, + {"name": "fixed_length_ascii", "configuration": {"length_bits": 0}}, {"name": "numpy.fixed_length_ascii", "configuration": {"length_bits": "invalid"}}, ) @@ -118,7 +122,7 @@ class TestFixedLengthAscii(_TestZDType): ) -class TestFixedLengthUTF32(_TestZDType): +class TestFixedLengthUTF32(BaseTestZDType): test_cls = FixedLengthUTF32 valid_dtype = (np.dtype(">U10"), np.dtype("U10", "U10"), V2JsonTestParams(dtype=" bool: # This method gets overridden here to support the equivalency between NaT and # -9223372036854775808 fill values @@ -34,7 +34,12 @@ class TestDateTime64(_TestTimeBase): np.dtype(np.float64), np.dtype("timedelta64[ns]"), ) - valid_json_v2 = (">M8", ">M8[s]", " None: """ -class _TestZDType: +@dataclass(frozen=True, kw_only=True, slots=True) +class V2JsonTestParams: + dtype: str | dict[str, object] | list[object] + object_codec_id: str | None = None + + +class BaseTestZDType: """ A base class for testing ZDType subclasses. 
This class works in conjunction with the custom pytest collection function ``pytest_generate_tests`` defined in conftest.py, which applies the @@ -66,7 +73,7 @@ class _TestZDType: valid_dtype: ClassVar[tuple[TBaseDType, ...]] = () invalid_dtype: ClassVar[tuple[TBaseDType, ...]] = () - valid_json_v2: ClassVar[tuple[str | dict[str, object] | list[object], ...]] = () + valid_json_v2: ClassVar[tuple[V2JsonTestParams, ...]] = () invalid_json_v2: ClassVar[tuple[str | dict[str, object] | list[object], ...]] = () valid_json_v3: ClassVar[tuple[str | dict[str, object], ...]] = () @@ -92,37 +99,40 @@ def scalar_equals(self, scalar1: object, scalar2: object) -> bool: return scalar1 == scalar2 def test_check_dtype_valid(self, valid_dtype: TBaseDType) -> None: - assert self.test_cls.check_dtype(valid_dtype) + assert self.test_cls.check_native_dtype(valid_dtype) def test_check_dtype_invalid(self, invalid_dtype: object) -> None: - assert not self.test_cls.check_dtype(invalid_dtype) # type: ignore[arg-type] + assert not self.test_cls.check_native_dtype(invalid_dtype) # type: ignore[arg-type] def test_from_dtype_roundtrip(self, valid_dtype: Any) -> None: - zdtype = self.test_cls.from_dtype(valid_dtype) - assert zdtype.to_dtype() == valid_dtype + zdtype = self.test_cls.from_native_dtype(valid_dtype) + assert zdtype.to_native_dtype() == valid_dtype - def test_from_json_roundtrip_v2(self, valid_json_v2: Any) -> None: - zdtype = self.test_cls.from_json(valid_json_v2, zarr_format=2) - assert zdtype.to_json(zarr_format=2) == valid_json_v2 + def test_from_json_roundtrip_v2(self, valid_json_v2: V2JsonTestParams) -> None: + zdtype = self.test_cls.from_json_v2( + valid_json_v2.dtype, # type: ignore[arg-type] + object_codec_id=valid_json_v2.object_codec_id, + ) + assert zdtype.to_json(zarr_format=2) == valid_json_v2.dtype @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") def test_from_json_roundtrip_v3(self, valid_json_v3: Any) -> None: - zdtype = 
self.test_cls.from_json(valid_json_v3, zarr_format=3) + zdtype = self.test_cls.from_json_v3(valid_json_v3) assert zdtype.to_json(zarr_format=3) == valid_json_v3 - def test_scalar_roundtrip_v2(self, scalar_v2_params: tuple[Any, Any]) -> None: + def test_scalar_roundtrip_v2(self, scalar_v2_params: tuple[ZDType[Any, Any], Any]) -> None: zdtype, scalar_json = scalar_v2_params - scalar = zdtype.from_json_value(scalar_json, zarr_format=2) - assert self.json_scalar_equals(scalar_json, zdtype.to_json_value(scalar, zarr_format=2)) + scalar = zdtype.from_json_scalar(scalar_json, zarr_format=2) + assert self.json_scalar_equals(scalar_json, zdtype.to_json_scalar(scalar, zarr_format=2)) - def test_scalar_roundtrip_v3(self, scalar_v3_params: tuple[Any, Any]) -> None: + def test_scalar_roundtrip_v3(self, scalar_v3_params: tuple[ZDType[Any, Any], Any]) -> None: zdtype, scalar_json = scalar_v3_params - scalar = zdtype.from_json_value(scalar_json, zarr_format=3) - assert self.json_scalar_equals(scalar_json, zdtype.to_json_value(scalar, zarr_format=3)) + scalar = zdtype.from_json_scalar(scalar_json, zarr_format=3) + assert self.json_scalar_equals(scalar_json, zdtype.to_json_scalar(scalar, zarr_format=3)) - def test_cast_value(self, cast_value_params: tuple[Any, Any, Any]) -> None: + def test_cast_value(self, cast_value_params: tuple[ZDType[Any, Any], Any, Any]) -> None: zdtype, value, expected = cast_value_params - observed = zdtype.cast_value(value) + observed = zdtype.cast_scalar(value) assert self.scalar_equals(expected, observed) def test_item_size(self, item_size_params: ZDType[Any, Any]) -> None: @@ -131,6 +141,6 @@ def test_item_size(self, item_size_params: ZDType[Any, Any]) -> None: with a fixed scalar size. 
""" if isinstance(item_size_params, HasItemSize): - assert item_size_params.item_size == item_size_params.to_dtype().itemsize + assert item_size_params.item_size == item_size_params.to_native_dtype().itemsize else: pytest.skip(f"Dtype {item_size_params} does not implement HasItemSize") diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py index 0c650e5c29..c4225874a4 100644 --- a/tests/test_dtype_registry.py +++ b/tests/test_dtype_registry.py @@ -23,9 +23,10 @@ TBaseScalar, ZDType, data_type_registry, - get_data_type_from_json, + get_data_type_from_json_v3, parse_data_type, ) +from zarr.core.dtype.common import HasObjectCodec if TYPE_CHECKING: from collections.abc import Generator @@ -58,7 +59,7 @@ def test_override(data_type_registry_fixture: DataTypeRegistry) -> None: data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) class NewBool(Bool): - def default_value(self) -> np.bool_: + def default_scalar(self) -> np.bool_: return np.True_ data_type_registry_fixture.register(NewBool._zarr_v3_name, NewBool) @@ -96,20 +97,36 @@ def test_unregistered_dtype(data_type_registry_fixture: DataTypeRegistry) -> Non @staticmethod @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @pytest.mark.parametrize("zdtype", zdtype_examples) - def test_registered_dtypes( - zdtype: ZDType[TBaseDType, TBaseScalar], zarr_format: ZarrFormat - ) -> None: + def test_registered_dtypes_match_dtype(zdtype: ZDType[TBaseDType, TBaseScalar]) -> None: """ Test that the registered dtypes can be retrieved from the registry. 
""" skip_object_dtype(zdtype) - assert data_type_registry.match_dtype(zdtype.to_dtype()) == zdtype - assert ( - data_type_registry.match_json( - zdtype.to_json(zarr_format=zarr_format), zarr_format=zarr_format + assert data_type_registry.match_dtype(zdtype.to_native_dtype()) == zdtype + + @staticmethod + @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") + @pytest.mark.parametrize("zdtype", zdtype_examples) + def test_registered_dtypes_match_json( + zdtype: ZDType[TBaseDType, TBaseScalar], zarr_format: ZarrFormat + ) -> None: + if zarr_format == 2: + if isinstance(zdtype, HasObjectCodec): + object_codec_id = zdtype.object_codec_id + else: + object_codec_id = None + assert ( + data_type_registry.match_json_v2( + zdtype.to_json(zarr_format=zarr_format), # type: ignore[arg-type] + object_codec_id=object_codec_id, + ) + == zdtype + ) + else: + skip_object_dtype(zdtype) + assert ( + data_type_registry.match_json_v3(zdtype.to_json(zarr_format=zarr_format)) == zdtype # type: ignore[arg-type] ) - == zdtype - ) @staticmethod @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @@ -129,7 +146,7 @@ def test_match_dtype_unique( if _cls is not type(zdtype): data_type_registry_fixture.register(_cls._zarr_v3_name, _cls) - dtype_instance = zdtype.to_dtype() + dtype_instance = zdtype.to_native_dtype() msg = f"No data type wrapper found that matches dtype '{dtype_instance}'" with pytest.raises(ValueError, match=re.escape(msg)): @@ -138,7 +155,7 @@ def test_match_dtype_unique( instance_dict = zdtype.to_json(zarr_format=zarr_format) msg = f"No data type wrapper found that matches {instance_dict}" with pytest.raises(ValueError, match=re.escape(msg)): - data_type_registry_fixture.match_json(instance_dict, zarr_format=zarr_format) + data_type_registry_fixture.match_json_v3(instance_dict) # type: ignore[arg-type] # this is copied from the registry tests -- we should deduplicate @@ -161,9 +178,11 @@ def 
set_path() -> Generator[None, None, None]: def test_entrypoint_dtype(zarr_format: ZarrFormat) -> None: from package_with_entrypoint import TestDataType + data_type_registry.lazy_load() instance = TestDataType() dtype_json = instance.to_json(zarr_format=zarr_format) - assert get_data_type_from_json(dtype_json, zarr_format=zarr_format) == instance + assert get_data_type_from_json_v3(dtype_json) == instance + data_type_registry.unregister(TestDataType._zarr_v3_name) @pytest.mark.parametrize( diff --git a/tests/test_group.py b/tests/test_group.py index ac1afb539b..eb243a008c 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -23,7 +23,6 @@ from zarr.core._info import GroupInfo from zarr.core.buffer import default_buffer_prototype from zarr.core.config import config as zarr_config -from zarr.core.dtype.common import unpack_dtype_json from zarr.core.dtype.npy.int import UInt8 from zarr.core.group import ( ConsolidatedMetadata, @@ -517,7 +516,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat metadata = { "subarray": { "attributes": {}, - "dtype": unpack_dtype_json(dtype.to_json(zarr_format=zarr_format)), + "dtype": dtype.to_json(zarr_format=zarr_format), "fill_value": fill_value, "shape": (1,), "chunks": (1,), @@ -553,7 +552,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat {"configuration": {"endian": "little"}, "name": "bytes"}, {"configuration": {}, "name": "zstd"}, ), - "data_type": unpack_dtype_json(dtype.to_json(zarr_format=zarr_format)), + "data_type": dtype.to_json(zarr_format=zarr_format), "fill_value": fill_value, "node_type": "array", "shape": (1,), diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index 7c82662052..395e036db2 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -638,7 +638,7 @@ async def test_consolidated_metadata_encodes_special_chars( "consolidated_metadata" 
]["metadata"] - expected_fill_value = _time._zdtype.to_json_value(fill_value, zarr_format=2) + expected_fill_value = _time._zdtype.to_json_scalar(fill_value, zarr_format=2) if zarr_format == 2: assert root_metadata["time/.zarray"]["fill_value"] == expected_fill_value diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 0f88f52c66..68f53ded5f 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -130,10 +130,10 @@ def test_jsonify_fill_value_complex(fill_value: Any, dtype_str: str) -> None: """ zarr_format = 3 dtype = get_data_type_from_native_dtype(dtype_str) - expected = dtype.to_dtype().type(complex(*fill_value)) - observed = dtype.from_json_value(fill_value, zarr_format=zarr_format) + expected = dtype.to_native_dtype().type(complex(*fill_value)) + observed = dtype.from_json_scalar(fill_value, zarr_format=zarr_format) assert observed == expected - assert dtype.to_json_value(observed, zarr_format=zarr_format) == tuple(fill_value) + assert dtype.to_json_scalar(observed, zarr_format=zarr_format) == tuple(fill_value) @pytest.mark.parametrize("fill_value", [{"foo": 10}]) @@ -145,7 +145,7 @@ def test_parse_fill_value_invalid_type(fill_value: Any, dtype_str: str) -> None: """ dtype_instance = get_data_type_from_native_dtype(dtype_str) with pytest.raises(TypeError, match=f"Invalid type: {fill_value}"): - dtype_instance.from_json_value(fill_value, zarr_format=3) + dtype_instance.from_json_scalar(fill_value, zarr_format=3) @pytest.mark.parametrize( @@ -166,7 +166,7 @@ def test_parse_fill_value_invalid_type_sequence(fill_value: Any, dtype_str: str) """ dtype_instance = get_data_type_from_native_dtype(dtype_str) with pytest.raises(TypeError, match=re.escape(f"Invalid type: {fill_value}")): - dtype_instance.from_json_value(fill_value, zarr_format=3) + dtype_instance.from_json_scalar(fill_value, zarr_format=3) @pytest.mark.parametrize("chunk_grid", ["regular"]) @@ -268,8 +268,8 @@ async def 
test_datetime_metadata(fill_value: int, precision: str) -> None: "data_type": dtype.to_json(zarr_format=3), "chunk_key_encoding": {"name": "default", "separator": "."}, "codecs": (BytesCodec(),), - "fill_value": dtype.to_json_value( - dtype.to_dtype().type(fill_value, dtype.unit), zarr_format=3 + "fill_value": dtype.to_json_scalar( + dtype.to_native_dtype().type(fill_value, dtype.unit), zarr_format=3 ), } metadata = ArrayV3Metadata.from_dict(metadata_dict) diff --git a/tests/test_properties.py b/tests/test_properties.py index d8f70e63d7..c752721108 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -325,7 +325,7 @@ def test_array_metadata_meets_spec(meta: ArrayV2Metadata | ArrayV3Metadata) -> N assert asdict_dict["zarr_format"] == 3 # version-agnostic validations - dtype_native = meta.dtype.to_dtype() + dtype_native = meta.dtype.to_native_dtype() if dtype_native.kind == "f": assert serialized_float_is_valid(asdict_dict["fill_value"]) elif dtype_native.kind == "c": diff --git a/tests/test_regression/test_regression.py b/tests/test_regression/test_regression.py index 61ff8ebfa9..a5b77d9931 100644 --- a/tests/test_regression/test_regression.py +++ b/tests/test_regression/test_regression.py @@ -34,6 +34,7 @@ def runner_installed() -> bool: class ArrayParams: values: np.ndarray[tuple[int], np.dtype[np.generic]] fill_value: np.generic | str | int + filters: tuple[numcodecs.abc.Codec, ...] 
= () compressor: numcodecs.abc.Codec @@ -62,7 +63,8 @@ class ArrayParams: ArrayParams( values=np.array(["a", "bb", "ccc", "dddd"], dtype="O"), fill_value="1", - compressor=VLenUTF8(), + filters=(VLenUTF8(),), + compressor=GZip(), ) ] array_cases = basic_array_cases + datetime_array_cases + string_array_cases + vlen_string_cases @@ -86,9 +88,9 @@ def source_array(tmp_path: Path, request: pytest.FixtureRequest) -> Array: dtype=dtype, chunks=array_params.values.shape, compressors=compressor, + filters=array_params.filters, fill_value=array_params.fill_value, order="C", - filters=None, chunk_key_encoding=chunk_key_encoding, write_data=True, zarr_format=2, diff --git a/tests/test_v2.py b/tests/test_v2.py index ca727c9b10..392ebc2d69 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -114,7 +114,7 @@ async def test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_js ], ) def test_v2_encode_decode_with_data(dtype: ZDType[Any, Any], value: str): - expected = np.full((3,), value, dtype=dtype.to_dtype()) + expected = np.full((3,), value, dtype=dtype.to_native_dtype()) a = zarr.create( shape=(3,), zarr_format=2, @@ -286,8 +286,8 @@ def test_structured_dtype_roundtrip(fill_value: float | bytes, tmp_path: Path) - def test_parse_structured_fill_value_valid( fill_value: Any, dtype: np.dtype[Any], expected_result: Any ) -> None: - zdtype = Structured.from_dtype(dtype) - result = zdtype.cast_value(fill_value) + zdtype = Structured.from_native_dtype(dtype) + result = zdtype.cast_scalar(fill_value) assert result.dtype == expected_result.dtype assert result == expected_result if isinstance(expected_result, np.void): From 12bbb0768aae0f1cc49c6d1525b6b00fe2eb3b57 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 29 May 2025 12:55:58 +0200 Subject: [PATCH 116/129] fix storage info discrepancy in docs --- docs/user-guide/arrays.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst 
index 8264b3d489..871db60874 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -210,8 +210,8 @@ prints additional diagnostics, e.g.:: Serializer : BytesCodec(endian=) Compressors : (BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0),) No. bytes : 400000000 (381.5M) - No. bytes stored : 9696520 - Storage ratio : 41.3 + No. bytes stored : 3558573 + Storage ratio : 112.4 Chunks Initialized : 100 .. note:: From 463789b0557e8bec8c78ee76d98776aaf9b52361 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 29 May 2025 17:29:05 +0200 Subject: [PATCH 117/129] fix docstring that was troubling sphinx --- src/zarr/core/dtype/wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index c9b23707e8..f3d6b0adca 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -246,7 +246,7 @@ def check_json_v2( object_codec_id : str | None The object codec ID, if applicable. Object codecs are specific numcodecs codecs that zarr-python 2.x used to serialize numpy "Object" scalars. For example, a dtype field set - to "|O" with an object codec ID of "vlen-utf8" indicates that the data type is a + to ``"|O"`` with an object codec ID of "vlen-utf8" indicates that the data type is a variable-length string. Zarr V3 has no such logic, so this parameter is only used for Zarr V2 compatibility. 
From e665cef642a6bb37258f5ff22ce4d898c9ef433a Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 29 May 2025 21:01:14 +0200 Subject: [PATCH 118/129] wip: add vlen-bytes --- src/zarr/core/dtype/npy/vlen_bytes.py | 75 ++++++++++++++++++++++++ tests/test_regression/test_regression.py | 18 +++++- 2 files changed, 91 insertions(+), 2 deletions(-) create mode 100644 src/zarr/core/dtype/npy/vlen_bytes.py diff --git a/src/zarr/core/dtype/npy/vlen_bytes.py b/src/zarr/core/dtype/npy/vlen_bytes.py new file mode 100644 index 0000000000..6d804cac60 --- /dev/null +++ b/src/zarr/core/dtype/npy/vlen_bytes.py @@ -0,0 +1,75 @@ +from dataclasses import dataclass +from typing import ClassVar, Literal, Self, TypeGuard, overload + +import numpy as np + +from zarr.core.common import JSON, ZarrFormat +from zarr.core.dtype.common import HasObjectCodec +from zarr.core.dtype.wrapper import TBaseDType, ZDType + + +@dataclass(frozen=True, kw_only=True) +class VariableLengthString(ZDType[np.dtypes.ObjectDType, str], HasObjectCodec): # type: ignore[no-redef] + dtype_cls = np.dtypes.ObjectDType + _zarr_v3_name: ClassVar[Literal["variable_length_bytes"]] = "variable_length_bytes" + object_codec_id = "vlen-bytes" + + @classmethod + def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: + return cls() + + def to_native_dtype(self) -> np.dtypes.ObjectDType: + return self.dtype_cls() + + @classmethod + def check_json_v2( + cls, data: JSON, *, object_codec_id: str | None = None + ) -> TypeGuard[Literal["|O"]]: + """ + Check that the input is a valid JSON representation of a numpy O dtype, and that the + object codec id is appropriate for variable-length UTF-8 strings. + """ + return data == "|O" and object_codec_id == cls.object_codec_id + + @classmethod + def check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_utf8"]]: + return data == cls._zarr_v3_name + + @overload + def to_json(self, zarr_format: Literal[2]) -> Literal["|O"]: ... 
+ + @overload + def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_utf8"]: ... + + def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_utf8"]: + if zarr_format == 2: + return "|O" + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_unchecked( + cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat + ) -> Self: + return cls() + + def default_scalar(self) -> str: + return "" + + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + return data # type: ignore[return-value] + + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + """ + Strings pass through + """ + if not check_json_str(data): + raise TypeError(f"Invalid type: {data}. Expected a string.") + return data + + def check_scalar(self, data: object) -> bool: + return isinstance(data, str) + + def _cast_scalar_unchecked(self, data: object) -> str: + return str(data) diff --git a/tests/test_regression/test_regression.py b/tests/test_regression/test_regression.py index a5b77d9931..83a917dee8 100644 --- a/tests/test_regression/test_regression.py +++ b/tests/test_regression/test_regression.py @@ -7,7 +7,7 @@ import numcodecs import numpy as np import pytest -from numcodecs import LZ4, LZMA, Blosc, GZip, VLenUTF8, Zstd +from numcodecs import LZ4, LZMA, Blosc, GZip, VLenBytes, VLenUTF8, Zstd import zarr from zarr.core.array import Array @@ -67,7 +67,21 @@ class ArrayParams: compressor=GZip(), ) ] -array_cases = basic_array_cases + datetime_array_cases + string_array_cases + vlen_string_cases +vlen_bytes_cases = [ + ArrayParams( + values=np.array([b"a", b"bb", b"ccc", b"dddd"], dtype="O"), + fill_value=b"1", + filters=(VLenBytes(),), + compressor=GZip(), + ) +] +array_cases = ( + basic_array_cases + + datetime_array_cases + + string_array_cases + + vlen_string_cases + + vlen_bytes_cases +) 
@pytest.fixture From 35116af42e725c22349b26197d56d22e0f7234a8 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 29 May 2025 22:03:00 +0200 Subject: [PATCH 119/129] add vlen-bytes --- src/zarr/core/dtype/__init__.py | 3 ++ src/zarr/core/dtype/npy/vlen_bytes.py | 36 ++++++++++++------------ src/zarr/core/dtype/wrapper.py | 6 ++-- tests/test_regression/test_regression.py | 7 +++-- 4 files changed, 29 insertions(+), 23 deletions(-) diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index a8bfe2b5c4..575086cb6f 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -12,6 +12,7 @@ Structured, ) from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 +from zarr.core.dtype.npy.vlen_bytes import VariableLengthBytes if TYPE_CHECKING: from zarr.core.common import ZarrFormat @@ -88,6 +89,7 @@ | FixedLengthBytes | Structured | TimeDType + | VariableLengthBytes ) # mypy has trouble inferring the type of variablelengthstring dtype, because its class definition # depends on the installed numpy version. That's why the type: ignore statement is needed here. 
@@ -100,6 +102,7 @@ FixedLengthBytes, Structured, *TIME_DTYPE, + VariableLengthBytes, ) # This type models inputs that can be coerced to a ZDType diff --git a/src/zarr/core/dtype/npy/vlen_bytes.py b/src/zarr/core/dtype/npy/vlen_bytes.py index 6d804cac60..c25523f9ed 100644 --- a/src/zarr/core/dtype/npy/vlen_bytes.py +++ b/src/zarr/core/dtype/npy/vlen_bytes.py @@ -1,15 +1,17 @@ +import base64 from dataclasses import dataclass from typing import ClassVar, Literal, Self, TypeGuard, overload import numpy as np from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import HasObjectCodec -from zarr.core.dtype.wrapper import TBaseDType, ZDType +from zarr.core.dtype.common import HasObjectCodec, v3_unstable_dtype_warning +from zarr.core.dtype.npy.common import check_json_str +from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType @dataclass(frozen=True, kw_only=True) -class VariableLengthString(ZDType[np.dtypes.ObjectDType, str], HasObjectCodec): # type: ignore[no-redef] +class VariableLengthBytes(ZDType[np.dtypes.ObjectDType, bytes], HasObjectCodec): dtype_cls = np.dtypes.ObjectDType _zarr_v3_name: ClassVar[Literal["variable_length_bytes"]] = "variable_length_bytes" object_codec_id = "vlen-bytes" @@ -39,12 +41,13 @@ def check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_utf8"]] def to_json(self, zarr_format: Literal[2]) -> Literal["|O"]: ... @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_utf8"]: ... + def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_bytes"]: ... 
- def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_utf8"]: + def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_bytes"]: if zarr_format == 2: return "|O" elif zarr_format == 3: + v3_unstable_dtype_warning(self) return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @@ -54,22 +57,19 @@ def _from_json_unchecked( ) -> Self: return cls() - def default_scalar(self) -> str: - return "" + def default_scalar(self) -> bytes: + return b"" def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: - return data # type: ignore[return-value] + return base64.standard_b64encode(data).decode("ascii") # type: ignore[arg-type] - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> str: - """ - Strings pass through - """ - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. Expected a string.") - return data + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> bytes: + if check_json_str(data): + return base64.standard_b64decode(data.encode("ascii")) + raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover def check_scalar(self, data: object) -> bool: - return isinstance(data, str) + return isinstance(data, bytes | str) - def _cast_scalar_unchecked(self, data: object) -> str: - return str(data) + def _cast_scalar_unchecked(self, data: object) -> bytes: + return bytes(data) # type: ignore[no-any-return, call-overload] diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index f3d6b0adca..4c399bbb84 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -45,7 +45,7 @@ # This the upper bound for the scalar types we support. 
It's numpy scalars + str, # because the new variable-length string dtype in numpy does not have a corresponding scalar type -TBaseScalar = np.generic | str +TBaseScalar = np.generic | str | bytes # This is the bound for the dtypes that we support. If we support non-numpy dtypes, # then this bound will need to be widened. TBaseDType = np.dtype[np.generic] @@ -174,8 +174,8 @@ def cast_scalar(self, data: object) -> TScalar_co: if self.check_scalar(data): return self._cast_scalar_unchecked(data) msg = ( - f"The value {data} failed a type check. " - f"It cannot be safely cast to a scalar compatible with {self.dtype_cls}. " + f"The value {data!r} failed a type check. " + f"It cannot be safely cast to a scalar compatible with {self}. " f"Consult the documentation for {self} to determine the possible values that can " "be cast to scalars of the wrapped data type." ) diff --git a/tests/test_regression/test_regression.py b/tests/test_regression/test_regression.py index 83a917dee8..a1d13510c3 100644 --- a/tests/test_regression/test_regression.py +++ b/tests/test_regression/test_regression.py @@ -13,6 +13,7 @@ from zarr.core.array import Array from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding from zarr.core.dtype.npy.string import VariableLengthString +from zarr.core.dtype.npy.vlen_bytes import VariableLengthBytes from zarr.storage import LocalStore if TYPE_CHECKING: @@ -33,7 +34,7 @@ def runner_installed() -> bool: @dataclass(kw_only=True) class ArrayParams: values: np.ndarray[tuple[int], np.dtype[np.generic]] - fill_value: np.generic | str | int + fill_value: np.generic | str | int | bytes filters: tuple[numcodecs.abc.Codec, ...] 
= () compressor: numcodecs.abc.Codec @@ -92,8 +93,10 @@ def source_array(tmp_path: Path, request: pytest.FixtureRequest) -> Array: compressor = array_params.compressor chunk_key_encoding = V2ChunkKeyEncoding(separator="/") dtype: ZDTypeLike - if array_params.values.dtype == np.dtype("|O"): + if array_params.values.dtype == np.dtype("|O") and array_params.filters == (VLenUTF8(),): dtype = VariableLengthString() # type: ignore[assignment] + elif array_params.values.dtype == np.dtype("|O") and array_params.filters == (VLenBytes(),): + dtype = VariableLengthBytes() else: dtype = array_params.values.dtype z = zarr.create_array( From 73c3c4555aa8024faa0c705b3a42fe8220ae4b96 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 26 Jun 2025 14:59:22 +0200 Subject: [PATCH 120/129] wip --- src/zarr/abc/bikeshed.py | 29 --- src/zarr/abc/codec.py | 51 +++- src/zarr/codecs/bytes.py | 3 +- src/zarr/codecs/gzip.py | 61 ++++- src/zarr/codecs/numcodec.py | 129 +++++++++++ src/zarr/core/array.py | 7 +- src/zarr/core/common.py | 14 +- src/zarr/core/dtype/npy/sized.py | 295 ------------------------ src/zarr/core/dtype/npy/vlen_bytes.py | 75 ------ src/zarr/core/metadata/v2.py | 12 +- src/zarr/core/metadata/v3.py | 8 + src/zarr/registry.py | 7 +- tests/test_array.py | 2 +- tests/test_dtype/test_npy/test_sized.py | 155 ------------- tests/test_gzip.py | 26 +++ 15 files changed, 298 insertions(+), 576 deletions(-) delete mode 100644 src/zarr/abc/bikeshed.py create mode 100644 src/zarr/codecs/numcodec.py delete mode 100644 src/zarr/core/dtype/npy/sized.py delete mode 100644 src/zarr/core/dtype/npy/vlen_bytes.py delete mode 100644 tests/test_dtype/test_npy/test_sized.py create mode 100644 tests/test_gzip.py diff --git a/src/zarr/abc/bikeshed.py b/src/zarr/abc/bikeshed.py deleted file mode 100644 index 3d01c234dd..0000000000 --- a/src/zarr/abc/bikeshed.py +++ /dev/null @@ -1,29 +0,0 @@ -from collections.abc import Mapping -from typing import Generic, Self, TypeVar - -import numpy as 
np -from typing_extensions import Buffer, Protocol, runtime_checkable - -BufferOrNDArray = Buffer | np.ndarray[tuple[int, ...], np.dtype[np.generic]] -BaseConfig = Mapping[str, object] -TNCodecConfig = TypeVar("TNCodecConfig", bound=BaseConfig) - - -@runtime_checkable -class Numcodec(Protocol, Generic[TNCodecConfig]): - """ - This protocol models the numcodecs.abc.Codec interface. - """ - - codec_id: str - - def encode(self, buf: BufferOrNDArray) -> BufferOrNDArray: ... - - def decode( - self, buf: BufferOrNDArray, out: BufferOrNDArray | None = None - ) -> BufferOrNDArray: ... - - def get_config(self) -> TNCodecConfig: ... - - @classmethod - def from_config(cls, config: TNCodecConfig) -> Self: ... diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 47664abced..1c7f673272 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -1,11 +1,21 @@ from __future__ import annotations from abc import abstractmethod -from typing import TYPE_CHECKING, Generic, TypeVar +from collections.abc import Mapping +from typing import ( + TYPE_CHECKING, + Generic, + Literal, + TypedDict, + TypeVar, + overload, +) + +from typing_extensions import ReadOnly from zarr.abc.metadata import Metadata from zarr.core.buffer import Buffer, NDBuffer -from zarr.core.common import ChunkCoords, concurrent_map +from zarr.core.common import ChunkCoords, NamedConfig, ZarrFormat, concurrent_map from zarr.core.config import config if TYPE_CHECKING: @@ -34,6 +44,15 @@ CodecInput = TypeVar("CodecInput", bound=NDBuffer | Buffer) CodecOutput = TypeVar("CodecOutput", bound=NDBuffer | Buffer) +TName = TypeVar("TName", bound=str, covariant=True) + + +class CodecConfig_V2(TypedDict, Generic[TName]): + id: ReadOnly[TName] + + +CodecConfig_V3 = NamedConfig[str, Mapping[str, object]] + class BaseCodec(Metadata, Generic[CodecInput, CodecOutput]): """Generic base class for codecs. 
@@ -157,6 +176,34 @@ async def encode( """ return await _batching_helper(self._encode_single, chunks_and_specs) + @overload + def to_json(self, zarr_format: Literal[2]) -> CodecConfig_V2[str]: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> NamedConfig[str, Mapping[str, object]]: ... + + def to_json( + self, zarr_format: ZarrFormat + ) -> CodecConfig_V2[str] | NamedConfig[str, Mapping[str, object]]: + raise NotImplementedError + + @classmethod + def _from_json_v2(cls, data: Mapping[str, object]) -> Self: + raise NotImplementedError + + @classmethod + def _from_json_v3(cls, data: Mapping[str, object]) -> Self: + raise NotImplementedError + + @classmethod + def from_json(cls, data: Mapping[str, object], zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls._from_json_v2(data) + elif zarr_format == 3: + return cls._from_json_v3(data) + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." + ) # pragma: no cover + class ArrayArrayCodec(BaseCodec[NDBuffer, NDBuffer]): """Base class for array-to-array codecs.""" diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 6ef0fef60b..827e3c466f 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -3,7 +3,7 @@ import sys from dataclasses import dataclass, replace from enum import Enum -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING import numpy as np @@ -18,7 +18,6 @@ from typing import Self from zarr.core.array_spec import ArraySpec - from zarr.core.dtype.common import Endianness class Endian(Enum): diff --git a/src/zarr/codecs/gzip.py b/src/zarr/codecs/gzip.py index b6e693148e..bb7cb854b0 100644 --- a/src/zarr/codecs/gzip.py +++ b/src/zarr/codecs/gzip.py @@ -1,14 +1,15 @@ from __future__ import annotations import asyncio +from collections.abc import Mapping from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal, TypedDict, TypeGuard, overload from 
numcodecs.gzip import GZip -from zarr.abc.codec import BytesBytesCodec +from zarr.abc.codec import BytesBytesCodec, CodecConfig_V2 from zarr.core.buffer.cpu import as_numpy_array_wrapper -from zarr.core.common import JSON, parse_named_configuration +from zarr.core.common import JSON, NamedConfig, ZarrFormat, parse_named_configuration from zarr.registry import register_codec if TYPE_CHECKING: @@ -28,6 +29,16 @@ def parse_gzip_level(data: JSON) -> int: return data +class GZipSettings(TypedDict): + level: int + + +class GZipConfig_V2(CodecConfig_V2[Literal["gzip"]], GZipSettings): ... + + +GZipConfig_V3 = NamedConfig[Literal["gzip"], GZipSettings] + + @dataclass(frozen=True) class GzipCodec(BytesBytesCodec): is_fixed_size = False @@ -47,6 +58,50 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: def to_dict(self) -> dict[str, JSON]: return {"name": "gzip", "configuration": {"level": self.level}} + @overload + def to_json(self, zarr_format: Literal[2]) -> GZipConfig_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> GZipConfig_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> GZipConfig_V2 | GZipConfig_V3: + if zarr_format == 2: + return {"id": "gzip", "level": self.level} + elif zarr_format == 3: + return {"name": "gzip", "configuration": {"level": self.level}} + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." 
+ ) # pragma: no cover + + @classmethod + def _check_json_v2(cls, data: Mapping[str, object]) -> TypeGuard[GZipConfig_V2]: + return ( + set(data.keys()) == {"id", "level"} + and data["id"] == "gzip" + and isinstance(data["level"], int) + ) + + @classmethod + def _check_json_v3(cls, data: Mapping[str, object]) -> TypeGuard[GZipConfig_V3]: + return ( + set(data.keys()) == {"name", "configuration"} + and data["name"] == "gzip" + and isinstance(data["configuration"], dict) + and "level" in data["configuration"] + and isinstance(data["configuration"]["level"], int) + ) + + @classmethod + def _from_json_v2(cls, data: Mapping[str, object]) -> Self: + if cls._check_json_v2(data): + return cls(level=data["level"]) + raise ValueError(f"Invalid GZip JSON data for Zarr format 2: {data!r}") + + @classmethod + def _from_json_v3(cls, data: Mapping[str, object]) -> Self: + if cls._check_json_v3(data): + return cls(level=data["configuration"]["level"]) + raise ValueError(f"Invalid GZip JSON data for Zarr format 3: {data!r}") + async def _decode_single( self, chunk_bytes: Buffer, diff --git a/src/zarr/codecs/numcodec.py b/src/zarr/codecs/numcodec.py new file mode 100644 index 0000000000..a8670829ad --- /dev/null +++ b/src/zarr/codecs/numcodec.py @@ -0,0 +1,129 @@ +""" +Utilities for interfacing with the numcodecs library. 
+""" + +from __future__ import annotations + +import asyncio +from collections.abc import Mapping +from dataclasses import dataclass +from typing import TYPE_CHECKING, Literal, Self, overload + +import numpy as np +from typing_extensions import Protocol, runtime_checkable + +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, CodecConfig_V2 +from zarr.core.array_spec import ArraySpec +from zarr.core.buffer.core import Buffer, BufferPrototype, NDArrayLike, NDBuffer +from zarr.core.buffer.cpu import as_numpy_array_wrapper + +if TYPE_CHECKING: + from zarr.core.array_spec import ArraySpec + from zarr.core.common import BaseConfig, NamedConfig, ZarrFormat + +BufferOrNDArray = Buffer | np.ndarray[tuple[int, ...], np.dtype[np.generic]] | NDArrayLike + + +def resolve_numcodec(config: CodecConfig_V2[str]) -> Numcodec: + import numcodecs + + return numcodecs.get_codec(config) # type: ignore[no-any-return] + + +@runtime_checkable +class Numcodec(Protocol): + """ + A protocol that models the ``numcodecs.abc.Codec`` interface. + """ + + codec_id: str + + def encode(self, buf: BufferOrNDArray) -> BufferOrNDArray: ... + + def decode( + self, buf: BufferOrNDArray, out: BufferOrNDArray | None = None + ) -> BufferOrNDArray: ... + + def get_config(self) -> CodecConfig_V2[str]: ... + + @classmethod + def from_config(cls, config: CodecConfig_V2[str]) -> Self: ... + + +@dataclass(frozen=True, kw_only=True) +class NumcodecsAdapter: + _codec: Numcodec + + @overload + def to_json(self, zarr_format: Literal[2]) -> CodecConfig_V2[str]: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> NamedConfig[str, BaseConfig]: ... 
+ + def to_json( + self, zarr_format: ZarrFormat + ) -> CodecConfig_V2[str] | NamedConfig[str, BaseConfig]: + if zarr_format == 2: + return self._codec.get_config() + elif zarr_format == 3: + config = self._codec.get_config() + config_no_id = {k: v for k, v in config.items() if k != "id"} + return {"name": config["id"], "configuration": config_no_id} + raise ValueError(f"Unsupported zarr format: {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_v2(cls, data: Mapping[str, object]) -> Self: + return cls(_codec=resolve_numcodec(data)) # type: ignore[arg-type] + + @classmethod + def _from_json_v3(cls, data: Mapping[str, object]) -> Self: + raise NotImplementedError( + "This class does not support creating instances from JSON data for Zarr format 3." + ) + + def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int: + raise NotImplementedError + + +class NumcodecsBytesBytesCodec(NumcodecsAdapter, BytesBytesCodec): + async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer: + return await asyncio.to_thread( + as_numpy_array_wrapper, + self._codec.decode, + chunk_data, + chunk_spec.prototype, + ) + + def _encode(self, chunk_bytes: Buffer, prototype: BufferPrototype) -> Buffer: + encoded = self._codec.encode(chunk_bytes.as_array_like()) + if isinstance(encoded, np.ndarray): # Required for checksum codecs + return prototype.buffer.from_bytes(encoded.tobytes()) + return prototype.buffer.from_bytes(encoded) + + async def _encode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer: + return await asyncio.to_thread(self._encode, chunk_data, chunk_spec.prototype) + + +@dataclass(kw_only=True, frozen=True) +class NumcodecsArrayCodec(NumcodecsAdapter, ArrayArrayCodec): + async def _decode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: + chunk_ndarray = chunk_data.as_ndarray_like() + out = await asyncio.to_thread(self._codec.decode, chunk_ndarray) + return 
chunk_spec.prototype.nd_buffer.from_ndarray_like(out.reshape(chunk_spec.shape)) # type: ignore[union-attr] + + async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: + chunk_ndarray = chunk_data.as_ndarray_like() + out = await asyncio.to_thread(self._codec.encode, chunk_ndarray) + return chunk_spec.prototype.nd_buffer.from_ndarray_like(out) # type: ignore[arg-type] + + +@dataclass(kw_only=True, frozen=True) +class NumcodecsArrayBytesCodec(NumcodecsAdapter, ArrayBytesCodec): + async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> NDBuffer: + chunk_bytes = chunk_data.to_bytes() + out = await asyncio.to_thread(self._codec.decode, chunk_bytes) + return chunk_spec.prototype.nd_buffer.from_ndarray_like(out.reshape(chunk_spec.shape)) + + async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> Buffer: + chunk_ndarray = chunk_data.as_ndarray_like() + out = await asyncio.to_thread(self._codec.encode, chunk_ndarray) + return chunk_spec.prototype.buffer.from_bytes(out) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 4e71ee34fc..edccbfea6c 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -29,6 +29,7 @@ from zarr.abc.store import Store, set_or_delete from zarr.codecs._v2 import V2Codec from zarr.codecs.bytes import BytesCodec +from zarr.codecs.numcodec import Numcodec from zarr.core._info import ArrayInfo from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, parse_array_config from zarr.core.attributes import Attributes @@ -4776,11 +4777,11 @@ def _parse_chunk_encoding_v3( elif compressors == "auto": out_bytes_bytes = default_compressors_v3(dtype) else: - maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]] - if isinstance(compressors, dict | Codec): + maybe_bytes_bytes: Iterable[Codec | dict[str, JSON] | Numcodec] + if isinstance(compressors, dict | Codec | Numcodec): maybe_bytes_bytes = (compressors,) else: - maybe_bytes_bytes = cast("Iterable[Codec | 
dict[str, JSON]]", compressors) + maybe_bytes_bytes = compressors # type: ignore[assignment] out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 6d99f1e937..feb3d37f5b 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -18,6 +18,8 @@ overload, ) +from typing_extensions import ReadOnly + from zarr.core.config import config as zarr_config if TYPE_CHECKING: @@ -42,13 +44,15 @@ ANY_ACCESS_MODE: Final = "r", "r+", "a", "w", "w-" DimensionNames = Iterable[str | None] | None -TName = TypeVar("TName", bound=str) -TConfig = TypeVar("TConfig", bound=Mapping[str, object]) +BaseConfig = Mapping[str, object] + +TName_co = TypeVar("TName_co", bound=str, covariant=True) +TConfig_co = TypeVar("TConfig_co", bound=BaseConfig, covariant=True) -class NamedConfig(TypedDict, Generic[TName, TConfig]): - name: TName - configuration: TConfig +class NamedConfig(TypedDict, Generic[TName_co, TConfig_co]): + name: ReadOnly[TName_co] + configuration: ReadOnly[TConfig_co] def product(tup: ChunkCoords) -> int: diff --git a/src/zarr/core/dtype/npy/sized.py b/src/zarr/core/dtype/npy/sized.py deleted file mode 100644 index 69d6145ad4..0000000000 --- a/src/zarr/core/dtype/npy/sized.py +++ /dev/null @@ -1,295 +0,0 @@ -import base64 -import re -from collections.abc import Sequence -from dataclasses import dataclass -from typing import Any, ClassVar, Literal, Self, TypedDict, TypeGuard, cast, overload - -import numpy as np - -from zarr.core.common import JSON, NamedConfig, ZarrFormat -from zarr.core.dtype.common import ( - DataTypeValidationError, - HasItemSize, - HasLength, - v3_unstable_dtype_warning, -) -from zarr.core.dtype.npy.common import ( - bytes_from_json, - bytes_to_json, - check_json_str, -) -from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, TBaseScalar, ZDType - - -class FixedLengthBytesConfig(TypedDict): - length_bytes: int - - 
-FixedLengthBytesJSONV3 = NamedConfig[Literal["fixed_length_bytes"], FixedLengthBytesConfig] - - -@dataclass(frozen=True, kw_only=True) -class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize): - # np.dtypes.VoidDType is specified in an odd way in numpy - # it cannot be used to create instances of the dtype - # so we have to tell mypy to ignore this here - dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] - _zarr_v3_name: ClassVar[Literal["fixed_length_bytes"]] = "fixed_length_bytes" - - @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - return cls(length=dtype.itemsize) - - def to_native_dtype(self) -> np.dtypes.VoidDType[int]: - # Numpy does not allow creating a void type - # by invoking np.dtypes.VoidDType directly - return cast("np.dtypes.VoidDType[int]", np.dtype(f"V{self.length}")) - - @classmethod - def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: - # Check that the dtype is |V1, |V2, ... - return isinstance(data, str) and re.match(r"^\|V\d+$", data) is not None - - @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[FixedLengthBytesJSONV3]: - return ( - isinstance(data, dict) - and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name - and isinstance(data["configuration"], dict) - and set(data["configuration"].keys()) == {"length_bytes"} - ) - - @overload - def to_json(self, zarr_format: Literal[2]) -> str: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> FixedLengthBytesJSONV3: ... 
- - def to_json(self, zarr_format: ZarrFormat) -> str | FixedLengthBytesJSONV3: - if zarr_format == 2: - return self.to_native_dtype().str - elif zarr_format == 3: - return {"name": self._zarr_v3_name, "configuration": {"length_bytes": self.length}} - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls(length=data["configuration"]["length_bytes"]) # type: ignore[index, call-overload] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def check_native_dtype( - cls: type[Self], dtype: TBaseDType - ) -> TypeGuard[np.dtypes.VoidDType[Any]]: - """ - Numpy void dtype comes in two forms: - * If the ``fields`` attribute is ``None``, then the dtype represents N raw bytes. - * If the ``fields`` attribute is not ``None``, then the dtype represents a structured dtype, - - In this check we ensure that ``fields`` is ``None``. - - Parameters - ---------- - dtype : TDType - The dtype to check. - - Returns - ------- - Bool - True if the dtype matches, False otherwise. - """ - return cls.dtype_cls is type(dtype) and dtype.fields is None # type: ignore[has-type] - - def default_scalar(self) -> np.void: - return self.to_native_dtype().type(("\x00" * self.length).encode("ascii")) - - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(self.cast_scalar(data).tobytes()).decode("ascii") - - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: - if check_json_str(data): - return self.to_native_dtype().type(base64.standard_b64decode(data)) - raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover - - def check_scalar(self, data: object) -> bool: - return isinstance(data, np.bytes_ | str | bytes | np.void) - - def _cast_scalar_unchecked(self, data: object) -> np.void: - native_dtype = self.to_native_dtype() - # Without the second argument, numpy will return a void scalar for dtype V1. - # The second argument ensures that, if native_dtype is something like V10, - # the result will actually be a V10 scalar. - return native_dtype.type(data, native_dtype) - - @property - def item_size(self) -> int: - return self.length - - -# TODO: tighten this up, get a v3 spec in place, handle endianness, etc. -@dataclass(frozen=True, kw_only=True) -class Structured(ZDType[np.dtypes.VoidDType[int], np.void], HasItemSize): - dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] - _zarr_v3_name = "structured" - fields: tuple[tuple[str, ZDType[TBaseDType, TBaseScalar]], ...] - - def default_scalar(self) -> np.void: - return self._cast_scalar_unchecked(0) - - def _cast_scalar_unchecked(self, data: object) -> np.void: - na_dtype = self.to_native_dtype() - if isinstance(data, bytes): - res = np.frombuffer(data, dtype=na_dtype)[0] - elif isinstance(data, list | tuple): - res = np.array([tuple(data)], dtype=na_dtype)[0] - else: - res = np.array([data], dtype=na_dtype)[0] - return cast("np.void", res) - - @classmethod - def check_native_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: - """ - Check that this dtype is a numpy structured dtype - - Parameters - ---------- - dtype : np.dtypes.DTypeLike - The dtype to check. - - Returns - ------- - TypeGuard[np.dtypes.VoidDType] - True if the dtype matches, False otherwise. 
- """ - return super().check_native_dtype(dtype) and dtype.fields is not None - - @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - from zarr.core.dtype import get_data_type_from_native_dtype - - fields: list[tuple[str, ZDType[TBaseDType, TBaseScalar]]] = [] - - if dtype.fields is None: - raise ValueError("numpy dtype has no fields") - - # fields of a structured numpy dtype are either 2-tuples or 3-tuples. we only - # care about the first element in either case. - for key, (dtype_instance, *_) in dtype.fields.items(): - dtype_wrapped = get_data_type_from_native_dtype(dtype_instance) - fields.append((key, dtype_wrapped)) - - return cls(fields=tuple(fields)) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeJSON_V2: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> DTypeJSON_V3: ... - - def to_json(self, zarr_format: ZarrFormat) -> DTypeJSON_V3 | DTypeJSON_V2: - fields = [ - (f_name, f_dtype.to_json(zarr_format=zarr_format)) for f_name, f_dtype in self.fields - ] - if zarr_format == 2: - return fields - elif zarr_format == 3: - v3_unstable_dtype_warning(self) - base_dict = {"name": self._zarr_v3_name} - base_dict["configuration"] = {"fields": fields} # type: ignore[assignment] - return cast("DTypeJSON_V3", base_dict) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def check_json_v2( - cls, data: JSON, *, object_codec_id: str | None = None - ) -> TypeGuard[list[object]]: - # the actual JSON form is recursive and hard to annotate, so we give up and do - # list[object] for now - - return ( - not isinstance(data, str) - and isinstance(data, Sequence) - and all( - not isinstance(field, str) and isinstance(field, Sequence) and len(field) == 2 - for field in data - ) - ) - - @classmethod - def check_json_v3( - cls, data: JSON - ) -> TypeGuard[NamedConfig[Literal["structured"], dict[str, Sequence[tuple[str, JSON]]]]]: - return ( - isinstance(data, dict) - 
and "name" in data - and data["name"] == cls._zarr_v3_name - and "configuration" in data - and isinstance(data["configuration"], dict) - and "fields" in data["configuration"] - ) - - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - # avoid circular import issues by importing these functions here - from zarr.core.dtype import get_data_type_from_json_v2, get_data_type_from_json_v3 - - # This is a horrible mess, because this data type is recursive - if zarr_format == 2: - if cls.check_json_v2(data): # type: ignore[arg-type] - # structured dtypes are constructed directly from a list of lists - # note that we do not handle the object codec here! this will prevent structured - # dtypes from containing object dtypes. - return cls( - fields=tuple( # type: ignore[misc] - (f_name, get_data_type_from_json_v2(f_dtype, object_codec_id=None)) # type: ignore[has-type] - for f_name, f_dtype in data - ) - ) - else: - raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") - elif zarr_format == 3: - if cls.check_json_v3(data): # type: ignore[arg-type] - config = data["configuration"] - meta_fields = config["fields"] - fields = tuple( - (f_name, get_data_type_from_json_v3(f_dtype)) for f_name, f_dtype in meta_fields - ) - else: - raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}.") - else: - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - return cls(fields=fields) - - def to_native_dtype(self) -> np.dtypes.VoidDType[int]: - return cast( - "np.dtypes.VoidDType[int]", - np.dtype([(key, dtype.to_native_dtype()) for (key, dtype) in self.fields]), - ) - - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: - return bytes_to_json(self.cast_scalar(data).tobytes(), zarr_format) - - def check_scalar(self, data: object) -> bool: - # TODO: implement something here! 
- return True - - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: - if check_json_str(data): - as_bytes = bytes_from_json(data, zarr_format=zarr_format) - dtype = self.to_native_dtype() - return cast("np.void", np.array([as_bytes]).view(dtype)[0]) - raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover - - @property - def item_size(self) -> int: - # Lets have numpy do the arithmetic here - return self.to_native_dtype().itemsize diff --git a/src/zarr/core/dtype/npy/vlen_bytes.py b/src/zarr/core/dtype/npy/vlen_bytes.py deleted file mode 100644 index c25523f9ed..0000000000 --- a/src/zarr/core/dtype/npy/vlen_bytes.py +++ /dev/null @@ -1,75 +0,0 @@ -import base64 -from dataclasses import dataclass -from typing import ClassVar, Literal, Self, TypeGuard, overload - -import numpy as np - -from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import HasObjectCodec, v3_unstable_dtype_warning -from zarr.core.dtype.npy.common import check_json_str -from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType - - -@dataclass(frozen=True, kw_only=True) -class VariableLengthBytes(ZDType[np.dtypes.ObjectDType, bytes], HasObjectCodec): - dtype_cls = np.dtypes.ObjectDType - _zarr_v3_name: ClassVar[Literal["variable_length_bytes"]] = "variable_length_bytes" - object_codec_id = "vlen-bytes" - - @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - return cls() - - def to_native_dtype(self) -> np.dtypes.ObjectDType: - return self.dtype_cls() - - @classmethod - def check_json_v2( - cls, data: JSON, *, object_codec_id: str | None = None - ) -> TypeGuard[Literal["|O"]]: - """ - Check that the input is a valid JSON representation of a numpy O dtype, and that the - object codec id is appropriate for variable-length UTF-8 strings. 
- """ - return data == "|O" and object_codec_id == cls.object_codec_id - - @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_utf8"]]: - return data == cls._zarr_v3_name - - @overload - def to_json(self, zarr_format: Literal[2]) -> Literal["|O"]: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_bytes"]: ... - - def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_bytes"]: - if zarr_format == 2: - return "|O" - elif zarr_format == 3: - v3_unstable_dtype_warning(self) - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - return cls() - - def default_scalar(self) -> bytes: - return b"" - - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(data).decode("ascii") # type: ignore[arg-type] - - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> bytes: - if check_json_str(data): - return base64.standard_b64decode(data.encode("ascii")) - raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover - - def check_scalar(self, data: object) -> bool: - return isinstance(data, bytes | str) - - def _cast_scalar_unchecked(self, data: object) -> bytes: - return bytes(data) # type: ignore[no-any-return, call-overload] diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 9296afd72a..97e3f26750 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -3,11 +3,13 @@ import warnings from collections.abc import Iterable, Sequence from functools import cached_property -from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict +from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict, cast import numcodecs.abc +from zarr.abc.codec import Codec from zarr.abc.metadata import Metadata +from zarr.codecs.numcodec import NumcodecsAdapter from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.dtype import get_data_type_from_json_v2 from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, TDType_co, TScalar_co, ZDType @@ -58,9 +60,6 @@ class ArrayV2MetadataDict(TypedDict): # Union of acceptable types for v2 compressors CompressorLikev2: TypeAlias = dict[str, JSON] | numcodecs.abc.Codec | None -# These are the ids of the known object codecs for zarr v2. -ObjectCodecIds = ("vlen-utf8", "vlen-bytes", "vlen-array", "pickle", "json2", "msgpack2") - @dataclass(frozen=True, kw_only=True) class ArrayV2Metadata(Metadata): @@ -210,6 +209,9 @@ def to_dict(self) -> dict[str, JSON]: codec_config.pop("checksum") zarray_dict["compressor"] = codec_config + if isinstance(zarray_dict["compressor"], NumcodecsAdapter): + zarray_dict["compressor"] = zarray_dict["compressor"].to_json(zarr_format=2) + if zarray_dict["filters"] is not None: raw_filters = zarray_dict["filters"] # TODO: remove this when we can stratically type the output JSON data structure @@ -301,7 +303,7 @@ def parse_compressor(data: object) -> numcodecs.abc.Codec | None: """ Parse a potential compressor. 
""" - if data is None or isinstance(data, numcodecs.abc.Codec): + if data is None or isinstance(data, numcodecs.abc.Codec | Codec): return data if isinstance(data, dict): return numcodecs.get_codec(data) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 83b9bd7bc8..4b62b7ef3d 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, TypedDict from zarr.abc.metadata import Metadata +from zarr.codecs.numcodec import NumcodecsAdapter from zarr.core.buffer.core import default_buffer_prototype from zarr.core.dtype import ( VariableLengthString, @@ -336,6 +337,13 @@ def to_dict(self) -> dict[str, JSON]: if out_dict["dimension_names"] is None: out_dict.pop("dimension_names") + out_dict["codecs"] = [] + for codec in self.codecs: + if isinstance(codec, NumcodecsAdapter): + out_dict["codecs"].append(codec.to_json(zarr_format=3)) + else: + out_dict["codecs"].append(codec.to_dict()) + # TODO: replace the `to_dict` / `from_dict` on the `Metadata`` class with # to_json, from_json, and have ZDType inherit from `Metadata` # until then, we have this hack here, which relies on the fact that to_dict will pass through diff --git a/src/zarr/registry.py b/src/zarr/registry.py index eb345b24b1..4fa0484d25 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -177,19 +177,24 @@ def _resolve_codec(data: dict[str, JSON]) -> Codec: return get_codec_class(data["name"]).from_dict(data) # type: ignore[arg-type] -def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec: +def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec | Numcodec) -> BytesBytesCodec: """ Normalize the input to a ``BytesBytesCodec`` instance. If the input is already a ``BytesBytesCodec``, it is returned as is. If the input is a dict, it is converted to a ``BytesBytesCodec`` instance via the ``_resolve_codec`` function. 
""" + # avoid circular import, AKA a sign that this function is in the wrong place from zarr.abc.codec import BytesBytesCodec + from zarr.codecs.numcodec import Numcodec, NumcodecsBytesBytesCodec + result: BytesBytesCodec if isinstance(data, dict): result = _resolve_codec(data) if not isinstance(result, BytesBytesCodec): msg = f"Expected a dict representation of a BytesBytesCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) + elif isinstance(data, Numcodec): + return NumcodecsBytesBytesCodec(_codec=data) else: if not isinstance(data, BytesBytesCodec): raise TypeError(f"Expected a BytesBytesCodec. Got {type(data)} instead.") diff --git a/tests/test_array.py b/tests/test_array.py index 0f8e2f8343..0da68a1af8 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -7,7 +7,7 @@ import re import sys from itertools import accumulate -from typing import TYPE_CHECKING, Any, Literal, get_args +from typing import TYPE_CHECKING, Any, Literal from unittest import mock import numcodecs diff --git a/tests/test_dtype/test_npy/test_sized.py b/tests/test_dtype/test_npy/test_sized.py deleted file mode 100644 index d7aef88168..0000000000 --- a/tests/test_dtype/test_npy/test_sized.py +++ /dev/null @@ -1,155 +0,0 @@ -from __future__ import annotations - -from typing import Any - -import numpy as np - -from tests.test_dtype.test_wrapper import BaseTestZDType, V2JsonTestParams -from zarr.core.dtype import ( - FixedLengthBytes, - Float16, - Float64, - Int32, - Int64, - Structured, -) - - -class TestFixedLengthBytes(BaseTestZDType): - test_cls = FixedLengthBytes - valid_dtype = (np.dtype("|V10"),) - invalid_dtype = ( - np.dtype(np.int8), - np.dtype(np.float64), - np.dtype("|S10"), - ) - valid_json_v2 = (V2JsonTestParams(dtype="|V10"),) - valid_json_v3 = ( - {"name": "fixed_length_bytes", "configuration": {"length_bytes": 0}}, - {"name": "fixed_length_bytes", "configuration": {"length_bytes": 8}}, - ) - - invalid_json_v2 = ( - "|V", - "|S10", - "|f8", 
- ) - invalid_json_v3 = ( - {"name": "r10"}, - {"name": "r-80"}, - ) - - scalar_v2_params = ( - (FixedLengthBytes(length=0), ""), - (FixedLengthBytes(length=2), "YWI="), - (FixedLengthBytes(length=4), "YWJjZA=="), - ) - scalar_v3_params = ( - (FixedLengthBytes(length=0), ""), - (FixedLengthBytes(length=2), "YWI="), - (FixedLengthBytes(length=4), "YWJjZA=="), - ) - cast_value_params = ( - (FixedLengthBytes(length=0), b"", np.void(b"")), - (FixedLengthBytes(length=2), b"ab", np.void(b"ab")), - (FixedLengthBytes(length=4), b"abcd", np.void(b"abcd")), - ) - item_size_params = ( - FixedLengthBytes(length=0), - FixedLengthBytes(length=4), - FixedLengthBytes(length=10), - ) - - -class TestStructured(BaseTestZDType): - test_cls = Structured - valid_dtype = ( - np.dtype([("field1", np.int32), ("field2", np.float64)]), - np.dtype([("field1", np.int64), ("field2", np.int32)]), - ) - invalid_dtype = ( - np.dtype(np.int8), - np.dtype(np.float64), - np.dtype("|S10"), - ) - valid_json_v2 = ( - V2JsonTestParams(dtype=[("field1", ">i4"), ("field2", ">f8")]), - V2JsonTestParams(dtype=[("field1", ">i8"), ("field2", ">i4")]), - ) - valid_json_v3 = ( - { - "name": "structured", - "configuration": { - "fields": [ - ("field1", "int32"), - ("field2", "float64"), - ] - }, - }, - { - "name": "structured", - "configuration": { - "fields": [ - ( - "field1", - { - "name": "numpy.datetime64", - "configuration": {"unit": "s", "scale_factor": 1}, - }, - ), - ( - "field2", - {"name": "fixed_length_utf32", "configuration": {"length_bytes": 32}}, - ), - ] - }, - }, - ) - invalid_json_v2 = ( - [("field1", "|i1"), ("field2", "|f8")], - [("field1", "|S10"), ("field2", "|f8")], - ) - invalid_json_v3 = ( - { - "name": "structured", - "configuration": { - "fields": [ - ("field1", {"name": "int32", "configuration": {"endianness": "invalid"}}), - ("field2", {"name": "float64", "configuration": {"endianness": "big"}}), - ] - }, - }, - {"name": "invalid_name"}, - ) - - scalar_v2_params = ( - 
(Structured(fields=(("field1", Int32()), ("field2", Float64()))), "AQAAAAAAAAAAAPA/"), - (Structured(fields=(("field1", Float16()), ("field2", Int32()))), "AQAAAAAA"), - ) - scalar_v3_params = ( - (Structured(fields=(("field1", Int32()), ("field2", Float64()))), "AQAAAAAAAAAAAPA/"), - (Structured(fields=(("field1", Int64()), ("field2", Int32()))), "AQAAAAAAAAAAAPA/"), - ) - - cast_value_params = ( - ( - Structured(fields=(("field1", Int32()), ("field2", Float64()))), - (1, 2.0), - np.array((1, 2.0), dtype=[("field1", np.int32), ("field2", np.float64)]), - ), - ( - Structured(fields=(("field1", Int64()), ("field2", Int32()))), - (3, 4.5), - np.array((3, 4.5), dtype=[("field1", np.int64), ("field2", np.int32)]), - ), - ) - - def scalar_equals(self, scalar1: Any, scalar2: Any) -> bool: - if hasattr(scalar1, "shape") and hasattr(scalar2, "shape"): - return np.array_equal(scalar1, scalar2) - return super().scalar_equals(scalar1, scalar2) - - item_size_params = ( - Structured(fields=(("field1", Int32()), ("field2", Float64()))), - Structured(fields=(("field1", Int64()), ("field2", Int32()))), - ) diff --git a/tests/test_gzip.py b/tests/test_gzip.py new file mode 100644 index 0000000000..a092418615 --- /dev/null +++ b/tests/test_gzip.py @@ -0,0 +1,26 @@ +import json + +import pytest + +import zarr +from zarr.codecs import GzipCodec + + +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_gzip_compression(zarr_format): + store = {} + arr_in = zarr.create_array( + store=store, + dtype="int", + shape=(1,), + chunks=(10,), + zarr_format=zarr_format, + compressors=GzipCodec(), + ) + + if zarr_format == 2: + print(json.dumps(json.loads(store[".zarray"].to_bytes()), indent=2)) + else: + print(json.dumps(json.loads(store["zarr.json"].to_bytes()), indent=2)) + + arr_out = zarr.open_array(store=store, zarr_format=zarr_format) From 629557812ce45b975f2601e9434348eeebcc8be7 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 30 Jun 2025 10:25:50 +0200 Subject: [PATCH 
121/129] wip --- src/zarr/codecs/numcodec.py | 24 +++++++- src/zarr/core/metadata/v2.py | 26 +++------ src/zarr/registry.py | 105 +++++++++++++++++++++++++++++++---- 3 files changed, 126 insertions(+), 29 deletions(-) diff --git a/src/zarr/codecs/numcodec.py b/src/zarr/codecs/numcodec.py index a8670829ad..39b7b5c037 100644 --- a/src/zarr/codecs/numcodec.py +++ b/src/zarr/codecs/numcodec.py @@ -7,7 +7,7 @@ import asyncio from collections.abc import Mapping from dataclasses import dataclass -from typing import TYPE_CHECKING, Literal, Self, overload +from typing import TYPE_CHECKING, Callable, Literal, Self, TypeGuard, overload import numpy as np from typing_extensions import Protocol, runtime_checkable @@ -49,6 +49,26 @@ def get_config(self) -> CodecConfig_V2[str]: ... @classmethod def from_config(cls, config: CodecConfig_V2[str]) -> Self: ... +def is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: + """ + Check if the given object implements the Numcodec protocol. Because the @runtime_checkable + decorator does not allow issubclass checks for protocols with non-method members (i.e., attributes), + we need to manually check for the presence of the required attributes and methods. 
+ """ + return ( + isinstance(obj, type) and + hasattr(obj, "codec_id") and + isinstance(obj.codec_id, str) and + hasattr(obj, "encode") and + callable(obj.encode) and + hasattr(obj, "decode") and + callable(obj.decode) and + hasattr(obj, "get_config") and + callable(obj.get_config) and + hasattr(obj, "from_config") and + callable(obj.from_config) + ) + @dataclass(frozen=True, kw_only=True) class NumcodecsAdapter: @@ -104,7 +124,7 @@ async def _encode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buf @dataclass(kw_only=True, frozen=True) -class NumcodecsArrayCodec(NumcodecsAdapter, ArrayArrayCodec): +class NumcodecsArrayArrayCodec(NumcodecsAdapter, ArrayArrayCodec): async def _decode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: chunk_ndarray = chunk_data.as_ndarray_like() out = await asyncio.to_thread(self._codec.decode, chunk_ndarray) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 97e3f26750..02a3a18b33 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -9,7 +9,7 @@ from zarr.abc.codec import Codec from zarr.abc.metadata import Metadata -from zarr.codecs.numcodec import NumcodecsAdapter +from zarr.codecs.numcodec import Numcodec, NumcodecsAdapter from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.dtype import get_data_type_from_json_v2 from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, TDType_co, TScalar_co, ZDType @@ -202,29 +202,21 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: def to_dict(self) -> dict[str, JSON]: zarray_dict = super().to_dict() - if isinstance(zarray_dict["compressor"], numcodecs.abc.Codec): + if isinstance(zarray_dict["compressor"], Numcodec): + raise ValueError('raw numcodecs codecs are not allowed.') codec_config = zarray_dict["compressor"].get_config() # Hotfix for https://github.com/zarr-developers/zarr-python/issues/2647 if codec_config["id"] == "zstd" and not codec_config.get("checksum", False): 
codec_config.pop("checksum") zarray_dict["compressor"] = codec_config - if isinstance(zarray_dict["compressor"], NumcodecsAdapter): - zarray_dict["compressor"] = zarray_dict["compressor"].to_json(zarr_format=2) - + zarray_dict["compressor"] = self.compressor.to_json(zarr_format=2) + new_filters = [] if zarray_dict["filters"] is not None: - raw_filters = zarray_dict["filters"] - # TODO: remove this when we can stratically type the output JSON data structure - # entirely - if not isinstance(raw_filters, list | tuple): - raise TypeError("Invalid type for filters. Expected a list or tuple.") - new_filters = [] - for f in raw_filters: - if isinstance(f, numcodecs.abc.Codec): - new_filters.append(f.get_config()) - else: - new_filters.append(f) - zarray_dict["filters"] = new_filters + new_filters.append(f.to_json(zarr_format=2)) + else: + new_filters = None + zarray_dict["filters"] = new_filters # serialize the fill value after dtype-specific JSON encoding if self.fill_value is not None: diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 4fa0484d25..51fbb94029 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -10,7 +10,7 @@ if TYPE_CHECKING: from importlib.metadata import EntryPoint - + from zarr.codecs.numcodec import Numcodec from zarr.abc.codec import ( ArrayArrayCodec, ArrayBytesCodec, @@ -53,6 +53,10 @@ def register(self, cls: type[T], qualname: str | None = None) -> None: self[qualname] = cls +__filter_registries: dict[str, Registry[ArrayArrayCodec]] = defaultdict(Registry) +__serializer_registries: dict[str, Registry[ArrayBytesCodec]] = defaultdict(Registry) +__compressor_registries: dict[str, Registry[BytesBytesCodec]] = defaultdict(Registry) + __codec_registries: dict[str, Registry[Codec]] = defaultdict(Registry) __pipeline_registry: Registry[CodecPipeline] = Registry() __buffer_registry: Registry[Buffer] = Registry() @@ -117,17 +121,59 @@ def _collect_entrypoints() -> list[Registry[Any]]: def _reload_config() -> None: config.refresh() - 
def fully_qualified_name(cls: type) -> str: module = cls.__module__ return module + "." + cls.__qualname__ +def register_filter(key: str, codec_cls: type[ArrayArrayCodec]) -> None: + if key not in __filter_registries: + __filter_registries[key] = Registry() + __filter_registries[key].register(codec_cls) + +def register_serializer(key: str, codec_cls: type[ArrayBytesCodec]) -> None: + from zarr.codecs.numcodec import NumcodecsArrayBytesCodec, is_numcodec_cls + if is_numcodec_cls(codec_cls): + _codec_cls = NumcodecsArrayBytesCodec(_codec=codec_cls) + else: + _codec_cls = codec_cls + if key not in __serializer_registries: + __serializer_registries[key] = Registry() + __serializer_registries[key].register(_codec_cls) + +def register_serializer(key: str, codec_cls: type[ArrayBytesCodec]) -> None: + from zarr.codecs.numcodec import NumcodecsArrayBytesCodec, is_numcodec_cls + if is_numcodec_cls(codec_cls): + _codec_cls = NumcodecsArrayBytesCodec(_codec=codec_cls) + else: + _codec_cls = codec_cls + if key not in __serializer_registries: + __serializer_registries[key] = Registry() + __serializer_registries[key].register(_codec_cls) + +def register_compressor(key: str, codec_cls: type[BytesBytesCodec | Numcodec]) -> None: + from zarr.codecs.numcodec import NumcodecsBytesBytesCodec, is_numcodec_cls + if is_numcodec_cls(codec_cls): + _codec_cls = NumcodecsBytesBytesCodec(_codec=codec_cls) + else: + _codec_cls = codec_cls + if key not in __compressor_registries: + __compressor_registries[key] = Registry() + __compressor_registries[key].register(_codec_cls) def register_codec(key: str, codec_cls: type[Codec]) -> None: + from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec + if issubclass(codec_cls, ArrayBytesCodec): + register_serializer(key, codec_cls) + elif issubclass(codec_cls, ArrayArrayCodec): + register_filter(key, codec_cls) + else: + register_compressor(key, codec_cls) + + """ if key not in __codec_registries: __codec_registries[key] = Registry() 
__codec_registries[key].register(codec_cls) - + """ def register_pipeline(pipe_cls: type[CodecPipeline]) -> None: __pipeline_registry.register(pipe_cls) @@ -140,6 +186,41 @@ def register_ndbuffer(cls: type[NDBuffer], qualname: str | None = None) -> None: def register_buffer(cls: type[Buffer], qualname: str | None = None) -> None: __buffer_registry.register(cls, qualname) +def get_filter_class(key: str, reload_config: bool = False) -> type[ArrayArrayCodec]: + return _get_codec_class(key, __serializer_registries, reload_config=reload_config) + +def get_serializer_class(key: str, reload_config: bool = False) -> type[ArrayBytesCodec]: + return _get_codec_class(key, __serializer_registries, reload_config=reload_config) + +def get_compressor_class(key: str, reload_config: bool = False) -> type[BytesBytesCodec]: + return _get_codec_class(key, __compressor_registries, reload_config=reload_config) + +def _get_codec_class(key: str, registry: dict[str, Registry[Codec]], *, reload_config: bool = False) -> type[Codec]: + if reload_config: + _reload_config() + + if key in registry: + # logger.debug("Auto loading codec '%s' from entrypoint", codec_id) + registry[key].lazy_load() + + codec_classes = registry[key] + if not codec_classes: + raise KeyError(key) + + config_entry = config.get("codecs", {}).get(key) + if config_entry is None: + if len(codec_classes) == 1: + return next(iter(codec_classes.values())) + warnings.warn( + f"Codec '{key}' not configured in config. 
Selecting any implementation.", + stacklevel=2, + ) + return list(codec_classes.values())[-1] + selected_codec_cls = codec_classes[config_entry] + + if selected_codec_cls: + return selected_codec_cls + raise KeyError(key) def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: if reload_config: @@ -189,7 +270,7 @@ def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec | Numcodec) -> BytesB result: BytesBytesCodec if isinstance(data, dict): - result = _resolve_codec(data) + result = get_compressor_class(data["name"]).from_dict(data) if not isinstance(result, BytesBytesCodec): msg = f"Expected a dict representation of a BytesBytesCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) @@ -202,19 +283,21 @@ def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec | Numcodec) -> BytesB return result -def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec: +def _parse_array_bytes_codec(data: dict[str, JSON] | Codec | Numcodec) -> ArrayBytesCodec: """ Normalize the input to a ``ArrayBytesCodec`` instance. If the input is already a ``ArrayBytesCodec``, it is returned as is. If the input is a dict, it is converted to a ``ArrayBytesCodec`` instance via the ``_resolve_codec`` function. """ from zarr.abc.codec import ArrayBytesCodec - + from zarr.codecs.numcodec import Numcodec, NumcodecsArrayBytesCodec if isinstance(data, dict): - result = _resolve_codec(data) + result = get_serializer_class(data["name"]).from_dict(data) if not isinstance(result, ArrayBytesCodec): msg = f"Expected a dict representation of a ArrayBytesCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) + elif isinstance(data, Numcodec): + return NumcodecsArrayBytesCodec(_codec=data) else: if not isinstance(data, ArrayBytesCodec): raise TypeError(f"Expected a ArrayBytesCodec. 
Got {type(data)} instead.") @@ -222,19 +305,21 @@ def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec: return result -def _parse_array_array_codec(data: dict[str, JSON] | Codec) -> ArrayArrayCodec: +def _parse_array_array_codec(data: dict[str, JSON] | Codec | Numcodec) -> ArrayArrayCodec: """ Normalize the input to a ``ArrayArrayCodec`` instance. If the input is already a ``ArrayArrayCodec``, it is returned as is. If the input is a dict, it is converted to a ``ArrayArrayCodec`` instance via the ``_resolve_codec`` function. """ from zarr.abc.codec import ArrayArrayCodec - + from zarr.codecs.numcodec import Numcodec, NumcodecsArrayArrayCodec if isinstance(data, dict): - result = _resolve_codec(data) + result = get_filter_class(data["name"]).from_dict(data) if not isinstance(result, ArrayArrayCodec): msg = f"Expected a dict representation of a ArrayArrayCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) + elif isinstance(data, Numcodec): + return NumcodecsArrayArrayCodec(_codec=data) else: if not isinstance(data, ArrayArrayCodec): raise TypeError(f"Expected a ArrayArrayCodec. 
Got {type(data)} instead.") From 64f234e0204203f097b76016145002bdc8589293 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 3 Jul 2025 21:57:55 +0200 Subject: [PATCH 122/129] add image codecs test --- examples/image_codecs.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 examples/image_codecs.py diff --git a/examples/image_codecs.py b/examples/image_codecs.py new file mode 100644 index 0000000000..85fe95e38c --- /dev/null +++ b/examples/image_codecs.py @@ -0,0 +1,29 @@ +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "zarr @ file:///home/bennettd/dev/zarr-python/", +# "imagecodecs==2025.3.30" +# ] +# /// + +# "zarr @ git+https://github.com/zarr-developers/zarr-python.git@main", + +import numcodecs +import numpy as np +from imagecodecs.numcodecs import Jpeg + +import zarr + +numcodecs.register_codec(Jpeg) +jpg_codec = Jpeg() +store = {} + +z_w = zarr.create_array( + store=store, data=np.zeros((100, 100, 3), dtype=np.uint8), serializer=jpg_codec, zarr_format=3 +) + +# breakpoint() + +z_r = zarr.open_array(store=store, zarr_format=3) + +print(z_r.metadata.to_dict()["codecs"]) From 6eb3298e8d9cae603782c4faf0144c9b66e73c11 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 10 Jul 2025 18:45:29 +0200 Subject: [PATCH 123/129] wip --- examples/image_codecs.py | 2 - pyproject.toml | 3 +- src/zarr/abc/codec.py | 6 +- src/zarr/codecs/_v2.py | 2 +- src/zarr/codecs/blosc.py | 162 ++++++++++++++++++++++---------- src/zarr/codecs/bytes.py | 11 +-- src/zarr/codecs/gzip.py | 26 ++--- src/zarr/codecs/numcodec.py | 90 +++++++++++++----- src/zarr/codecs/vlen_utf8.py | 25 ++++- src/zarr/codecs/zstd.py | 25 ++++- src/zarr/core/array.py | 69 +++++++------- src/zarr/core/codec_pipeline.py | 108 ++++++++++++++++++++- src/zarr/core/common.py | 37 +++++++- src/zarr/core/config.py | 6 +- src/zarr/core/metadata/v2.py | 101 ++++++++++---------- src/zarr/core/metadata/v3.py | 22 +++-- src/zarr/registry.py | 91 
++++++------------ tests/test_api.py | 12 +-- tests/test_array.py | 4 +- tests/test_examples.py | 3 +- tests/test_image_codecs.py | 17 ++++ tests/test_x.py | 4 + 22 files changed, 553 insertions(+), 273 deletions(-) create mode 100644 tests/test_image_codecs.py create mode 100644 tests/test_x.py diff --git a/examples/image_codecs.py b/examples/image_codecs.py index 85fe95e38c..bfdb81e152 100644 --- a/examples/image_codecs.py +++ b/examples/image_codecs.py @@ -22,8 +22,6 @@ store=store, data=np.zeros((100, 100, 3), dtype=np.uint8), serializer=jpg_codec, zarr_format=3 ) -# breakpoint() - z_r = zarr.open_array(store=store, zarr_format=3) print(z_r.metadata.to_dict()["codecs"]) diff --git a/pyproject.toml b/pyproject.toml index 64cc4f7e1c..2450966352 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,7 +82,8 @@ test = [ "pytest-xdist", "packaging", "tomlkit", - "uv" + "uv", + "imagecodecs" ] remote_tests = [ 'zarr[remote]', diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 1c7f673272..ec3634862e 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -47,7 +47,7 @@ TName = TypeVar("TName", bound=str, covariant=True) -class CodecConfig_V2(TypedDict, Generic[TName]): +class CodecJSON_V2(TypedDict, Generic[TName]): id: ReadOnly[TName] @@ -177,13 +177,13 @@ async def encode( return await _batching_helper(self._encode_single, chunks_and_specs) @overload - def to_json(self, zarr_format: Literal[2]) -> CodecConfig_V2[str]: ... + def to_json(self, zarr_format: Literal[2]) -> CodecJSON_V2[str]: ... @overload def to_json(self, zarr_format: Literal[3]) -> NamedConfig[str, Mapping[str, object]]: ... 
def to_json( self, zarr_format: ZarrFormat - ) -> CodecConfig_V2[str] | NamedConfig[str, Mapping[str, object]]: + ) -> CodecJSON_V2[str] | NamedConfig[str, Mapping[str, object]]: raise NotImplementedError @classmethod diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 08853f27f1..d347bed420 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -19,7 +19,7 @@ @dataclass(frozen=True) -class V2Codec(ArrayBytesCodec): +class _V2Codec(ArrayBytesCodec): filters: tuple[numcodecs.abc.Codec, ...] | None compressor: numcodecs.abc.Codec | None diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 1c5e52e9a4..d26699dd14 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -1,18 +1,20 @@ from __future__ import annotations import asyncio +from collections.abc import Mapping from dataclasses import dataclass, replace from enum import Enum from functools import cached_property -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, ClassVar, Literal, NotRequired, TypedDict, get_args, overload +from typing_extensions import Final import numcodecs from numcodecs.blosc import Blosc from packaging.version import Version -from zarr.abc.codec import BytesBytesCodec +from zarr.abc.codec import BytesBytesCodec, CodecJSON_V2 from zarr.core.buffer.cpu import as_numpy_array_wrapper -from zarr.core.common import JSON, parse_enum, parse_named_configuration +from zarr.core.common import JSON, NamedConfig, NamedRequiredConfig, ZarrFormat, parse_enum, parse_named_configuration from zarr.core.dtype.common import HasItemSize from zarr.registry import register_codec @@ -22,40 +24,48 @@ from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer +class BloscConfigV2(TypedDict): + cname: BloscCname + clevel: int + shuffle: int + blocksize: int -class BloscShuffle(Enum): +class BloscConfigV3(TypedDict): + cname: BloscCname + clevel: int + shuffle: BloscShuffle + blocksize: int + typesize: int + +class 
BloscJSON_V2(CodecJSON_V2[Literal["blosc"]], BloscConfigV2): """ - Enum for shuffle filter used by blosc. + The JSON form of the Blosc codec in Zarr V2. """ - noshuffle = "noshuffle" - shuffle = "shuffle" - bitshuffle = "bitshuffle" - - @classmethod - def from_int(cls, num: int) -> BloscShuffle: - blosc_shuffle_int_to_str = { - 0: "noshuffle", - 1: "shuffle", - 2: "bitshuffle", - } - if num not in blosc_shuffle_int_to_str: - raise ValueError(f"Value must be between 0 and 2. Got {num}.") - return BloscShuffle[blosc_shuffle_int_to_str[num]] - - -class BloscCname(Enum): +class BloscJSON_V3(NamedRequiredConfig[Literal["blosc"], BloscConfigV3]): """ - Enum for compression library used by blosc. + The JSON form of the Blosc codec in Zarr V3. """ - lz4 = "lz4" - lz4hc = "lz4hc" - blosclz = "blosclz" - zstd = "zstd" - snappy = "snappy" - zlib = "zlib" +BloscShuffle = Literal["noshuffle", "shuffle", "bitshuffle"] +BLOSC_SHUFFLE: Final = ("noshuffle", "shuffle", "bitshuffle") +BloscCname = Literal["lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib"] +BLOSC_CNAME: Final = ("lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib") + +def parse_shuffle(value: object) -> BloscShuffle: + if value not in BLOSC_SHUFFLE: + raise ValueError( + f"Value must be one of {BLOSC_SHUFFLE}. Got {value} instead." + ) + return value + +def parse_cname(value: object) -> BloscCname: + if value not in BLOSC_CNAME: + raise ValueError( + f"Value must be one of {BLOSC_CNAME}. Got {value} instead." 
+ ) + return value # See https://zarr.readthedocs.io/en/stable/user-guide/performance.html#configuring-blosc numcodecs.blosc.use_threads = False @@ -90,22 +100,22 @@ class BloscCodec(BytesBytesCodec): is_fixed_size = False typesize: int | None - cname: BloscCname = BloscCname.zstd - clevel: int = 5 - shuffle: BloscShuffle | None = BloscShuffle.noshuffle - blocksize: int = 0 + cname: BloscCname + clevel: int + shuffle: BloscShuffle | None + blocksize: int def __init__( self, *, typesize: int | None = None, - cname: BloscCname | str = BloscCname.zstd, + cname: BloscCname = "zstd", clevel: int = 5, - shuffle: BloscShuffle | str | None = None, + shuffle: BloscShuffle | None = None, blocksize: int = 0, ) -> None: typesize_parsed = parse_typesize(typesize) if typesize is not None else None - cname_parsed = parse_enum(cname, BloscCname) + cname_parsed = parse_cname(cname) clevel_parsed = parse_clevel(clevel) shuffle_parsed = parse_enum(shuffle, BloscShuffle) if shuffle is not None else None blocksize_parsed = parse_blocksize(blocksize) @@ -122,21 +132,78 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: return cls(**configuration_parsed) # type: ignore[arg-type] def to_dict(self) -> dict[str, JSON]: - if self.typesize is None: - raise ValueError("`typesize` needs to be set for serialization.") - if self.shuffle is None: - raise ValueError("`shuffle` needs to be set for serialization.") + return self.to_json(zarr_format=3) return { "name": "blosc", "configuration": { "typesize": self.typesize, - "cname": self.cname.value, + "cname": self.cname, "clevel": self.clevel, - "shuffle": self.shuffle.value, + "shuffle": self.shuffle, "blocksize": self.blocksize, }, } + @classmethod + def _from_json_v2(cls, data: Mapping[str, object]) -> Self: + return cls( + typesize=data["typesize"], + cname=data["cname"], + clevel=data["clevel"], + shuffle=BLOSC_SHUFFLE[data["shuffle"]], + blocksize=data["blocksize"], + ) + + @classmethod + def _from_json_v3(cls, data: Mapping[str, object]) 
-> Self: + return cls( + typesize=data["configuration"]["typesize"], + cname=data["configuration"]["cname"], + clevel=data["configuration"]["clevel"], + shuffle=data["configuration"]["shuffle"], + blocksize=data["configuration"]["blocksize"], + ) + + @classmethod + def from_json(cls, data: Mapping[str, object], zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls._from_json_v2(data) + elif zarr_format == 3: + return cls._from_json_v3(data) + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." + ) + + @overload + def to_json(self, zarr_format: Literal[2]) -> BloscJSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> BloscJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> BloscJSON_V2 | BloscJSON_V3: + if self.typesize is None or self.shuffle is None: + raise ValueError("typesize and blocksize need to be set for encoding.") + if zarr_format == 2: + return { + "id": "blosc", + "clevel": self.clevel, + "cname": self.cname, + "shuffle": BLOSC_SHUFFLE.index(self.shuffle), + "blocksize": self.blocksize + } + elif zarr_format == 3: + return { + "name": "blosc", + "configuration": { + "clevel": self.clevel, + "cname": self.cname, + "shuffle": self.shuffle, + "typesize": self.typesize, + "blocksize": self.blocksize + }} + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." 
+ ) # pragma: no cover + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: item_size = 1 if isinstance(array_spec.dtype, HasItemSize): @@ -147,7 +214,7 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: if new_codec.shuffle is None: new_codec = replace( new_codec, - shuffle=(BloscShuffle.bitshuffle if item_size == 1 else BloscShuffle.shuffle), + shuffle="bitshuffle" if item_size == 1 else "shuffle"), ) return new_codec @@ -156,15 +223,10 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: def _blosc_codec(self) -> Blosc: if self.shuffle is None: raise ValueError("`shuffle` needs to be set for decoding and encoding.") - map_shuffle_str_to_int = { - BloscShuffle.noshuffle: 0, - BloscShuffle.shuffle: 1, - BloscShuffle.bitshuffle: 2, - } config_dict = { - "cname": self.cname.name, + "cname": self.cname, "clevel": self.clevel, - "shuffle": map_shuffle_str_to_int[self.shuffle], + "shuffle": BLOSC_SHUFFLE.index(self.shuffle), "blocksize": self.blocksize, } # See https://github.com/zarr-developers/numcodecs/pull/713 diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 827e3c466f..4dbfc0ff0d 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -74,12 +74,11 @@ async def _decode_single( ) -> NDBuffer: assert isinstance(chunk_bytes, Buffer) # TODO: remove endianness enum in favor of literal union - endian_str = cast( - "Endianness | None", self.endian.value if self.endian is not None else None - ) - new_byte_order = endianness_to_numpy_str(endian_str) - dtype = chunk_spec.dtype.to_native_dtype().newbyteorder(new_byte_order) - + endian = self.endian.value if self.endian is not None else None + if isinstance(chunk_spec.dtype, HasEndianness) and endian is not None: + dtype = replace(chunk_spec.dtype, endianness=endian).to_native_dtype() # type: ignore[call-arg] + else: + dtype = chunk_spec.dtype.to_native_dtype() as_array_like = chunk_bytes.as_array_like() if isinstance(as_array_like, 
NDArrayLike): as_nd_array_like = as_array_like diff --git a/src/zarr/codecs/gzip.py b/src/zarr/codecs/gzip.py index bb7cb854b0..23177e8755 100644 --- a/src/zarr/codecs/gzip.py +++ b/src/zarr/codecs/gzip.py @@ -7,7 +7,7 @@ from numcodecs.gzip import GZip -from zarr.abc.codec import BytesBytesCodec, CodecConfig_V2 +from zarr.abc.codec import BytesBytesCodec, CodecJSON_V2 from zarr.core.buffer.cpu import as_numpy_array_wrapper from zarr.core.common import JSON, NamedConfig, ZarrFormat, parse_named_configuration from zarr.registry import register_codec @@ -29,14 +29,18 @@ def parse_gzip_level(data: JSON) -> int: return data -class GZipSettings(TypedDict): +class GZipConfig(TypedDict): level: int +class GZipJSON_V2(CodecJSON_V2[Literal["gzip"]], GZipConfig): + """ + The JSON form of the GZip codec in Zarr V2. + """ -class GZipConfig_V2(CodecConfig_V2[Literal["gzip"]], GZipSettings): ... - - -GZipConfig_V3 = NamedConfig[Literal["gzip"], GZipSettings] +class GZipJSON_V3(NamedConfig[Literal["gzip"], GZipConfig]): + """ + The JSON form of the GZip codec in Zarr V3. + """ @dataclass(frozen=True) @@ -59,11 +63,11 @@ def to_dict(self) -> dict[str, JSON]: return {"name": "gzip", "configuration": {"level": self.level}} @overload - def to_json(self, zarr_format: Literal[2]) -> GZipConfig_V2: ... + def to_json(self, zarr_format: Literal[2]) -> GZipJSON_V2: ... @overload - def to_json(self, zarr_format: Literal[3]) -> GZipConfig_V3: ... + def to_json(self, zarr_format: Literal[3]) -> GZipJSON_V3: ... 
- def to_json(self, zarr_format: ZarrFormat) -> GZipConfig_V2 | GZipConfig_V3: + def to_json(self, zarr_format: ZarrFormat) -> GZipJSON_V2 | GZipJSON_V3: if zarr_format == 2: return {"id": "gzip", "level": self.level} elif zarr_format == 3: @@ -73,7 +77,7 @@ def to_json(self, zarr_format: ZarrFormat) -> GZipConfig_V2 | GZipConfig_V3: ) # pragma: no cover @classmethod - def _check_json_v2(cls, data: Mapping[str, object]) -> TypeGuard[GZipConfig_V2]: + def _check_json_v2(cls, data: Mapping[str, object]) -> TypeGuard[GZipJSON_V2]: return ( set(data.keys()) == {"id", "level"} and data["id"] == "gzip" @@ -81,7 +85,7 @@ def _check_json_v2(cls, data: Mapping[str, object]) -> TypeGuard[GZipConfig_V2]: ) @classmethod - def _check_json_v3(cls, data: Mapping[str, object]) -> TypeGuard[GZipConfig_V3]: + def _check_json_v3(cls, data: Mapping[str, object]) -> TypeGuard[GZipJSON_V3]: return ( set(data.keys()) == {"name", "configuration"} and data["name"] == "gzip" diff --git a/src/zarr/codecs/numcodec.py b/src/zarr/codecs/numcodec.py index 39b7b5c037..a22b46ff4e 100644 --- a/src/zarr/codecs/numcodec.py +++ b/src/zarr/codecs/numcodec.py @@ -5,28 +5,57 @@ from __future__ import annotations import asyncio -from collections.abc import Mapping from dataclasses import dataclass from typing import TYPE_CHECKING, Callable, Literal, Self, TypeGuard, overload +import numcodecs +import numcodecs.registry as numcodecs_registry import numpy as np from typing_extensions import Protocol, runtime_checkable -from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, CodecConfig_V2 +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BaseCodec, BytesBytesCodec, CodecJSON_V2 from zarr.core.array_spec import ArraySpec from zarr.core.buffer.core import Buffer, BufferPrototype, NDArrayLike, NDBuffer from zarr.core.buffer.cpu import as_numpy_array_wrapper if TYPE_CHECKING: + from collections.abc import Mapping + from zarr.core.array_spec import ArraySpec from 
zarr.core.common import BaseConfig, NamedConfig, ZarrFormat BufferOrNDArray = Buffer | np.ndarray[tuple[int, ...], np.dtype[np.generic]] | NDArrayLike +def get_numcodec_class(name: str) -> type[Numcodec]: + """Obtain a numcodec codec class by name. + + Parameters + ---------- + config : dict-like + Configuration object. + + Returns + ------- + codec : Codec -def resolve_numcodec(config: CodecConfig_V2[str]) -> Numcodec: - import numcodecs + Examples + -------- + >>> import numcodecs as codecs + >>> codec = codecs.get_codec(dict(id='zlib', level=1)) + >>> codec + Zlib(level=1) + + """ + cls = numcodecs_registry.codec_registry.get(name) + if cls is None and name in numcodecs_registry.entries: + cls = numcodecs_registry.entries[name].load() + numcodecs_registry.register_codec(cls, codec_id=name) + if cls is not None: + return cls + raise KeyError(name) + +def resolve_numcodec(config: CodecJSON_V2[str]) -> Numcodec: return numcodecs.get_codec(config) # type: ignore[no-any-return] @@ -44,10 +73,10 @@ def decode( self, buf: BufferOrNDArray, out: BufferOrNDArray | None = None ) -> BufferOrNDArray: ... - def get_config(self) -> CodecConfig_V2[str]: ... + def get_config(self) -> CodecJSON_V2[str]: ... @classmethod - def from_config(cls, config: CodecConfig_V2[str]) -> Self: ... + def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... def is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: """ @@ -71,21 +100,21 @@ def is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: @dataclass(frozen=True, kw_only=True) -class NumcodecsAdapter: - _codec: Numcodec +class NumcodecsWrapper(BaseCodec[Buffer | NDBuffer, Buffer | NDBuffer]): + codec: Numcodec @overload - def to_json(self, zarr_format: Literal[2]) -> CodecConfig_V2[str]: ... + def to_json(self, zarr_format: Literal[2]) -> CodecJSON_V2[str]: ... @overload def to_json(self, zarr_format: Literal[3]) -> NamedConfig[str, BaseConfig]: ... 
def to_json( self, zarr_format: ZarrFormat - ) -> CodecConfig_V2[str] | NamedConfig[str, BaseConfig]: + ) -> CodecJSON_V2[str] | NamedConfig[str, BaseConfig]: if zarr_format == 2: - return self._codec.get_config() + return self.codec.get_config() elif zarr_format == 3: - config = self._codec.get_config() + config = self.codec.get_config() config_no_id = {k: v for k, v in config.items() if k != "id"} return {"name": config["id"], "configuration": config_no_id} raise ValueError(f"Unsupported zarr format: {zarr_format}") # pragma: no cover @@ -104,17 +133,36 @@ def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> raise NotImplementedError -class NumcodecsBytesBytesCodec(NumcodecsAdapter, BytesBytesCodec): + def to_array_array(self) -> NumcodecsArrayArrayCodec: + """ + Use the ``_codec`` attribute to create a NumcodecsArrayArrayCodec. + """ + return NumcodecsArrayArrayCodec(codec=self.codec) + + def to_bytes_bytes(self) -> NumcodecsBytesBytesCodec: + """ + Use the ``_codec`` attribute to create a NumcodecsBytesBytesCodec. + """ + return NumcodecsBytesBytesCodec(codec=self.codec) + + def to_array_bytes(self) -> NumcodecsArrayBytesCodec: + """ + Use the ``_codec`` attribute to create a NumcodecsArrayBytesCodec. 
+ """ + return NumcodecsArrayBytesCodec(codec=self.codec) + + +class NumcodecsBytesBytesCodec(NumcodecsWrapper, BytesBytesCodec): async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer: return await asyncio.to_thread( as_numpy_array_wrapper, - self._codec.decode, + self.codec.decode, chunk_data, chunk_spec.prototype, ) def _encode(self, chunk_bytes: Buffer, prototype: BufferPrototype) -> Buffer: - encoded = self._codec.encode(chunk_bytes.as_array_like()) + encoded = self.codec.encode(chunk_bytes.as_array_like()) if isinstance(encoded, np.ndarray): # Required for checksum codecs return prototype.buffer.from_bytes(encoded.tobytes()) return prototype.buffer.from_bytes(encoded) @@ -124,26 +172,26 @@ async def _encode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buf @dataclass(kw_only=True, frozen=True) -class NumcodecsArrayArrayCodec(NumcodecsAdapter, ArrayArrayCodec): +class NumcodecsArrayArrayCodec(NumcodecsWrapper, ArrayArrayCodec): async def _decode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: chunk_ndarray = chunk_data.as_ndarray_like() - out = await asyncio.to_thread(self._codec.decode, chunk_ndarray) + out = await asyncio.to_thread(self.codec.decode, chunk_ndarray) return chunk_spec.prototype.nd_buffer.from_ndarray_like(out.reshape(chunk_spec.shape)) # type: ignore[union-attr] async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: chunk_ndarray = chunk_data.as_ndarray_like() - out = await asyncio.to_thread(self._codec.encode, chunk_ndarray) + out = await asyncio.to_thread(self.codec.encode, chunk_ndarray) return chunk_spec.prototype.nd_buffer.from_ndarray_like(out) # type: ignore[arg-type] @dataclass(kw_only=True, frozen=True) -class NumcodecsArrayBytesCodec(NumcodecsAdapter, ArrayBytesCodec): +class NumcodecsArrayBytesCodec(NumcodecsWrapper, ArrayBytesCodec): async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> NDBuffer: chunk_bytes = 
chunk_data.to_bytes() - out = await asyncio.to_thread(self._codec.decode, chunk_bytes) + out = await asyncio.to_thread(self.codec.decode, chunk_bytes) return chunk_spec.prototype.nd_buffer.from_ndarray_like(out.reshape(chunk_spec.shape)) async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> Buffer: chunk_ndarray = chunk_data.as_ndarray_like() - out = await asyncio.to_thread(self._codec.encode, chunk_ndarray) + out = await asyncio.to_thread(self.codec.encode, chunk_ndarray) return chunk_spec.prototype.buffer.from_bytes(out) diff --git a/src/zarr/codecs/vlen_utf8.py b/src/zarr/codecs/vlen_utf8.py index bad51f33ce..070af864f5 100644 --- a/src/zarr/codecs/vlen_utf8.py +++ b/src/zarr/codecs/vlen_utf8.py @@ -1,14 +1,15 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal, NotRequired, TypedDict, overload +from warnings import warn import numpy as np from numcodecs.vlen import VLenBytes, VLenUTF8 -from zarr.abc.codec import ArrayBytesCodec +from zarr.abc.codec import ArrayBytesCodec, CodecJSON_V2 from zarr.core.buffer import Buffer, NDBuffer -from zarr.core.common import JSON, parse_named_configuration +from zarr.core.common import JSON, NamedConfig, ZarrFormat, parse_named_configuration from zarr.registry import register_codec if TYPE_CHECKING: @@ -21,6 +22,14 @@ _vlen_utf8_codec = VLenUTF8() _vlen_bytes_codec = VLenBytes() +class VlenUF8Config(TypedDict): + ... + +class VLenUTF8JSON_V2(CodecJSON_V2[Literal["vlen-utf8"]]): + ... + +class VLenUTF8JSON_V3(NamedConfig[Literal["vlen-utf8"], VlenUF8Config]): + ... @dataclass(frozen=True) class VLenUTF8Codec(ArrayBytesCodec): @@ -35,6 +44,16 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: def to_dict(self) -> dict[str, JSON]: return {"name": "vlen-utf8", "configuration": {}} + @overload + def to_json(self, zarr_format: Literal[2]) -> VLenUTF8JSON_V2: ... 
+ @overload + def to_json(self, zarr_format: Literal[3]) -> VLenUTF8JSON_V3: ... + def to_json(self, zarr_format: ZarrFormat) -> VLenUTF8JSON_V2 | VLenUTF8JSON_V3: + if zarr_format == 2: + return {"id": "vlen-utf8"} + else: + return {"name": "vlen-utf8"} + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return self diff --git a/src/zarr/codecs/zstd.py b/src/zarr/codecs/zstd.py index b4a4a13c29..3476417d32 100644 --- a/src/zarr/codecs/zstd.py +++ b/src/zarr/codecs/zstd.py @@ -3,15 +3,15 @@ import asyncio from dataclasses import dataclass from functools import cached_property -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal, Required, TypedDict import numcodecs from numcodecs.zstd import Zstd from packaging.version import Version -from zarr.abc.codec import BytesBytesCodec +from zarr.abc.codec import BytesBytesCodec, CodecJSON_V2 from zarr.core.buffer.cpu import as_numpy_array_wrapper -from zarr.core.common import JSON, parse_named_configuration +from zarr.core.common import JSON, NamedConfig, ZarrFormat, parse_named_configuration from zarr.registry import register_codec if TYPE_CHECKING: @@ -20,6 +20,19 @@ from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer +class ZstdSettings(TypedDict): + level: int + +class ZstdJSON_V2(CodecJSON_V2[Literal["zstd"]], ZstdSettings): + """ + The JSON form of the Zstandard codec in Zarr v2. + """ + +class ZstdJSON_V3(NamedConfig[Literal["zstd"], ZstdSettings]): + """ + The JSON form of the GZip codec in Zarr v3. 
+ """ + configuration: Required[ZstdSettings] def parse_zstd_level(data: JSON) -> int: if isinstance(data, int): @@ -65,6 +78,12 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: def to_dict(self) -> dict[str, JSON]: return {"name": "zstd", "configuration": {"level": self.level, "checksum": self.checksum}} + def to_json(self, zarr_format: ZarrFormat) -> ZstdJSON_V2 | ZstdJSON_V3: + if zarr_format == 2: + return {"id": "zstd", "level": self.level} + else: + return {"name": "zstd", "configuration": {"level": self.level, "checksum": self.checksum}} + @cached_property def _zstd_codec(self) -> Zstd: config_dict = {"level": self.level, "checksum": self.checksum} diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index edccbfea6c..73c87f83d1 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -27,9 +27,8 @@ import zarr from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.store import Store, set_or_delete -from zarr.codecs._v2 import V2Codec from zarr.codecs.bytes import BytesCodec -from zarr.codecs.numcodec import Numcodec +from zarr.codecs.numcodec import Numcodec, NumcodecsWrapper from zarr.core._info import ArrayInfo from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, parse_array_config from zarr.core.attributes import Attributes @@ -108,6 +107,7 @@ T_ArrayMetadata, ) from zarr.core.metadata.v2 import ( + CompressorLike_V2, parse_compressor, parse_filters, ) @@ -118,13 +118,15 @@ _parse_array_array_codec, _parse_array_bytes_codec, _parse_bytes_bytes_codec, + get_codec, get_pipeline_class, ) from zarr.storage._common import StorePath, ensure_no_existing_node, make_store_path from zarr.storage._utils import _relativize_path +from collections.abc import Sequence if TYPE_CHECKING: - from collections.abc import Iterator, Sequence + from collections.abc import Iterator from typing import Self import numpy.typing as npt @@ -198,8 +200,14 @@ def create_codec_pipeline(metadata: ArrayMetadata, 
*, store: Store | None = None if isinstance(metadata, ArrayV3Metadata): return get_pipeline_class().from_codecs(metadata.codecs) elif isinstance(metadata, ArrayV2Metadata): - v2_codec = V2Codec(filters=metadata.filters, compressor=metadata.compressor) - return get_pipeline_class().from_codecs([v2_codec]) + _codecs: tuple[Codec, ...] = () + if metadata.filters is not None: + _codecs += metadata.filters + if metadata.compressor is not None: + _codecs += (metadata.compressor,) + if not any(isinstance(codec, ArrayBytesCodec) for codec in _codecs): + _codecs = (BytesCodec(endian=None),) + _codecs + return get_pipeline_class().from_codecs(_codecs) raise TypeError # pragma: no cover @@ -336,7 +344,7 @@ async def create( dimension_separator: Literal[".", "/"] | None = None, order: MemoryOrder | None = None, filters: list[dict[str, JSON]] | None = None, - compressor: CompressorLikev2 | Literal["auto"] = "auto", + compressor: CompressorLike_V2 | Literal["auto"] = "auto", # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, @@ -808,8 +816,8 @@ def _create_metadata_v2( order: MemoryOrder, dimension_separator: Literal[".", "/"] | None = None, fill_value: Any | None = DEFAULT_FILL_VALUE, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, - compressor: CompressorLikev2 = None, + filters: Iterable[Codec] | None = None, + compressor: Codec | None = None, attributes: dict[str, JSON] | None = None, ) -> ArrayV2Metadata: if dimension_separator is None: @@ -840,7 +848,7 @@ async def _create_v2( config: ArrayConfig, dimension_separator: Literal[".", "/"] | None = None, fill_value: Any | None = DEFAULT_FILL_VALUE, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[CompressorLike_V2] | None = None, compressor: CompressorLike = "auto", attributes: dict[str, JSON] | None = None, overwrite: bool = False, @@ -853,14 +861,9 @@ async def _create_v2( else: await ensure_no_existing_node(store_path, zarr_format=2) - 
compressor_parsed: CompressorLikev2 + compressor_parsed: CompressorLike_V2 if compressor == "auto": - compressor_parsed = default_compressor_v2(dtype) - elif isinstance(compressor, BytesBytesCodec): - raise ValueError( - "Cannot use a BytesBytesCodec as a compressor for zarr v2 arrays. " - "Use a numcodecs codec directly instead." - ) + _, compressor_parsed = _get_default_chunk_encoding_v2(dtype) else: compressor_parsed = compressor @@ -4657,19 +4660,24 @@ def _get_default_chunk_encoding_v3( def _get_default_chunk_encoding_v2( dtype: ZDType[TBaseDType, TBaseScalar], -) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: +) -> tuple[tuple[Codec | NumcodecsWrapper, ...] | None, Codec | NumcodecsWrapper | None]: """ Given a data type, return the default filters for that data type. This is an empty tuple. No data types have default filters. """ dtype_category = categorize_data_type(dtype) - filters = zarr_config.get("array.v2_default_filters").get(dtype_category) - compressor = zarr_config.get("array.v2_default_compressor").get(dtype_category) - if filters is not None: - filters = tuple(numcodecs.get_codec(f) for f in filters) - - return filters, numcodecs.get_codec(compressor) + filters_config = zarr_config.get("array.v2_default_filters").get(dtype_category) + compressor_config = zarr_config.get("array.v2_default_compressor").get(dtype_category) + if compressor_config is not None: + compressor = get_codec(compressor_config["name"], compressor_config.get("configuration", {})) + else: + compressor = None + if filters_config is not None: + filters = tuple(get_codec(f['name'], f.get('configuration', {})) for f in filters_config) + else: + filters = None + return filters, compressor def _parse_chunk_encoding_v2( @@ -4688,13 +4696,10 @@ def _parse_chunk_encoding_v2( if compressor is None or compressor == (): _compressor = None elif compressor == "auto": - _compressor = default_compressor_v2(dtype) - elif isinstance(compressor, tuple | list) and 
len(compressor) == 1: + _compressor = default_compressor + elif isinstance(compressor, Sequence) and len(compressor) == 1: _compressor = parse_compressor(compressor[0]) else: - if isinstance(compressor, Iterable) and not isinstance(compressor, dict): - msg = f"For Zarr format 2 arrays, the `compressor` must be a single codec. Got an iterable with type {type(compressor)} instead." - raise TypeError(msg) _compressor = parse_compressor(compressor) if filters is None: @@ -4702,14 +4707,6 @@ def _parse_chunk_encoding_v2( elif filters == "auto": _filters = default_filters_v2(dtype) else: - if isinstance(filters, Iterable): - for idx, f in enumerate(filters): - if not isinstance(f, numcodecs.abc.Codec): - msg = ( - "For Zarr format 2 arrays, all elements of `filters` must be numcodecs codecs. " - f"Element at index {idx} has type {type(f)}, which is not a numcodecs codec." - ) - raise TypeError(msg) _filters = parse_filters(filters) if isinstance(dtype, HasObjectCodec): # check the filters and the compressor for the object codec required for this data type diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 23c27e40c6..3f0d5f59bc 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -491,18 +491,121 @@ async def write( def codecs_from_list( codecs: Iterable[Codec], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: + from zarr.codecs.numcodec import NumcodecsWrapper from zarr.codecs.sharding import ShardingCodec - array_array: tuple[ArrayArrayCodec, ...] = () array_bytes_maybe: ArrayBytesCodec | None = None bytes_bytes: tuple[BytesBytesCodec, ...] 
= () + # handle two cases + # either all of the codecs are numcodecwrapper instances, in which case we set the last element + # to array-bytes and the rest to array-array + # or one of the codecs is an array-bytes, in which case we convert any preceding numcodecswrapper + # instances to array-array, and any following numcodecswrapper instances to bytes-bytes + + codecs_tup = tuple(codecs) + array_array_idcs: tuple[tuple[int, ArrayArrayCodec], ...] = () + array_bytes_idcs: tuple[tuple[int, ArrayBytesCodec], ...] = () + bytes_bytes_idcs: tuple[tuple[int, BytesBytesCodec], ...] = () + numcodec_wrapper_idcs: tuple[tuple[int, NumcodecsWrapper], ...] = () + + for idx, codec in enumerate(codecs_tup): + match codec: + case ArrayArrayCodec(): + array_array_idcs += ((idx,codec),) + case ArrayBytesCodec(): + array_bytes_idcs += ((idx,codec),) + case BytesBytesCodec(): + bytes_bytes_idcs += ((idx,codec),) + case NumcodecsWrapper(): + numcodec_wrapper_idcs += ((idx,codec),) + if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(tuple(codecs)) > 1: warn( "Combining a `sharding_indexed` codec disables partial reads and " "writes, which may lead to inefficient performance.", stacklevel=3, ) + if len(array_bytes_idcs) == 0: + if len(numcodec_wrapper_idcs) == 0: + msg = f'No ArrayBytesCodec was found, that is a big error!. Got {codecs_tup} instead.' + raise ValueError(msg) + elif len(numcodec_wrapper_idcs) == len(codecs_tup): + # convert the last entry to an array-bytes codec, and the previous codecs to + # array-array + array_bytes_maybe = codecs_tup[-1].to_array_bytes() + array_array = tuple(c.to_array_array() for c in codecs_tup[:-1]) + else: + if len(array_array_idcs) > 0: + last_array_array_idx = array_array_idcs[-1][0] + + if last_array_array_idx == len(codecs_tup) - 1: + raise ValueError( + "The last codec is an ArrayArrayCodec, but there is no ArrayBytesCodec." 
+ ) + + for idx, aac in enumerate(codecs_tup[:(last_array_array_idx + 1)]): + if isinstance(aac, NumcodecsWrapper): + array_array += (aac.to_array_array(),) + elif isinstance(aac, ArrayArrayCodec): + array_array += (aac,) + else: + msg = ( + f"Invalid codec {aac} at index {idx}. Expected an ArrayArrayCodec" + ) + raise TypeError(msg) + + if isinstance(codecs_tup[last_array_array_idx + 1], NumcodecsWrapper): + array_bytes_maybe = codecs_tup[last_array_array_idx + 1].to_array_bytes() + else: + msg = ( + f"Invalid codec {codecs_tup[last_array_array_idx + 1]} at index " + f"{last_array_array_idx + 1}." + "Expected a NumcodecsWrapper or an ArrayBytesCodec, got " + f"{type(codecs_tup[last_array_array_idx + 1])}" + ) + raise TypeError(msg) + for idx, rem in enumerate(codecs_tup[(last_array_array_idx + 2):]): + if isinstance(rem, NumcodecsWrapper): + bytes_bytes += (rem.to_bytes_bytes(),) + elif isinstance(rem, BytesBytesCodec): + bytes_bytes += (rem,) + else: + msg = ( + f"Invalid codec {rem} at index {idx}. Expected a BytesBytesCodec" + ) + raise TypeError(msg) + + elif len(array_bytes_idcs) == 1: + bb_idx, ab_codec = array_bytes_idcs[0] + array_bytes_maybe = ab_codec + + for idx, aa_codec in enumerate(codecs_tup[:bb_idx]): + if isinstance(aa_codec, NumcodecsWrapper): + array_array += (c.to_bytes_bytes(),) + elif isinstance(aa_codec, ArrayArrayCodec): + array_array += (aa_codec,) + else: + msg = ( + f"Invalid codec {aa_codec} at index {idx}. Expected an ArrayArrayCodec" + ) + raise TypeError(msg) + + if bb_idx < len(codecs_tup) - 1: + for idx, bb_codec in enumerate(codecs_tup[bb_idx + 1:]): + if isinstance(bb_codec, NumcodecsWrapper): + bytes_bytes += (bb_codec.to_bytes_bytes(),) + elif isinstance(bb_codec, BytesBytesCodec): + bytes_bytes += (bb_codec,) + else: + msg = ( + f"Invalid codec {bb_codec} at index {idx}. 
Expected a BytesBytesCodec" + ) + raise TypeError(msg) + else: + raise ValueError('More than one ArrayBytes codec found, that is a big error!') + + return array_array, array_bytes_maybe, bytes_bytes for prev_codec, cur_codec in pairwise((None, *codecs)): if isinstance(cur_codec, ArrayArrayCodec): @@ -540,7 +643,8 @@ def codecs_from_list( f"Got {type(prev_codec)} instead." ) bytes_bytes += (cur_codec,) - else: + elif isinstance(cur_codec, NumcodecsWrapper): + raise TypeError if array_bytes_maybe is None: diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index feb3d37f5b..7276c3e43d 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -12,6 +12,8 @@ Any, Generic, Literal, + NotRequired, + Required, TypedDict, TypeVar, cast, @@ -50,10 +52,39 @@ TConfig_co = TypeVar("TConfig_co", bound=BaseConfig, covariant=True) -class NamedConfig(TypedDict, Generic[TName_co, TConfig_co]): - name: ReadOnly[TName_co] - configuration: ReadOnly[TConfig_co] +class NamedConfig(TypedDict, Generic[TName, TConfig]): + """ + A typed dictionary representing an object with a name and configuration, where the configuration + is a mapping of string keys to values, e.g. another typed dictionary or a JSON object. + This class is generic with two type parameters: the type of the name (``TName``) and the type of + the configuration (``TConfig``). + + The configuration key is not required. + """ + + name: ReadOnly[TName] + """The name of the object.""" + + configuration: NotRequired[ReadOnly[TConfig]] + """The configuration of the object.""" + +class NamedRequiredConfig(NamedConfig[TName, TConfig]): + """ + A typed dictionary representing an object with a name and configuration, where the configuration + is a mapping of string keys to values, e.g. another typed dictionary or a JSON object. + + This class is generic with two type parameters: the type of the name (``TName``) and the type of + the configuration (``TConfig``). + + The configuration key is required. 
+ """ + + name: ReadOnly[TName] + """The name of the object.""" + + configuration: Required[ReadOnly[TConfig]] + """The configuration of the object.""" def product(tup: ChunkCoords) -> int: return functools.reduce(operator.mul, tup, 1) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index b53bc525cd..020ea12c7a 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -107,12 +107,12 @@ def enable_gpu(self) -> ConfigSet: "order": "C", "write_empty_chunks": False, "v2_default_compressor": { - "default": {"id": "zstd", "level": 0, "checksum": False}, - "variable-length-string": {"id": "zstd", "level": 0, "checksum": False}, + "default": {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, + "variable-length-string": {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, }, "v2_default_filters": { "default": None, - "variable-length-string": [{"id": "vlen-utf8"}], + "variable-length-string": [{"name": "vlen-utf8"}], }, "v3_default_filters": {"default": [], "variable-length-string": []}, "v3_default_serializer": { diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 02a3a18b33..bb820600fd 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -1,18 +1,19 @@ from __future__ import annotations import warnings -from collections.abc import Iterable, Sequence +from collections.abc import Iterable, Mapping, Sequence from functools import cached_property from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict, cast import numcodecs.abc -from zarr.abc.codec import Codec +from zarr.abc.codec import ArrayArrayCodec, BytesBytesCodec, Codec from zarr.abc.metadata import Metadata -from zarr.codecs.numcodec import Numcodec, NumcodecsAdapter +from zarr.codecs.numcodec import Numcodec, NumcodecsWrapper, NumcodecsArrayArrayCodec from zarr.core.chunk_grids import RegularChunkGrid -from zarr.core.dtype import get_data_type_from_json_v2 -from zarr.core.dtype.wrapper import 
TBaseDType, TBaseScalar, TDType_co, TScalar_co, ZDType +from zarr.core.dtype import get_data_type_from_json +from zarr.core.dtype.common import OBJECT_CODEC_IDS, DTypeSpec_V2 +from zarr.registry import get_codec if TYPE_CHECKING: from typing import Literal, Self @@ -58,7 +59,7 @@ class ArrayV2MetadataDict(TypedDict): # Union of acceptable types for v2 compressors -CompressorLikev2: TypeAlias = dict[str, JSON] | numcodecs.abc.Codec | None +CompressorLike_V2: TypeAlias = Mapping[str, JSON] | Numcodec | Codec @dataclass(frozen=True, kw_only=True) @@ -68,9 +69,9 @@ class ArrayV2Metadata(Metadata): dtype: ZDType[TBaseDType, TBaseScalar] fill_value: int | float | str | bytes | None = None order: MemoryOrder = "C" - filters: tuple[numcodecs.abc.Codec, ...] | None = None + filters: tuple[Codec, ...] | None = None dimension_separator: Literal[".", "/"] = "." - compressor: numcodecs.abc.Codec | None + compressor: Codec attributes: dict[str, JSON] = field(default_factory=dict) zarr_format: Literal[2] = field(init=False, default=2) @@ -83,8 +84,8 @@ def __init__( fill_value: Any, order: MemoryOrder, dimension_separator: Literal[".", "/"] = ".", - compressor: CompressorLikev2 = None, - filters: Iterable[numcodecs.abc.Codec | dict[str, JSON]] | None = None, + compressor: CompressorLike_V2 | None = None, + filters: Iterable[CompressorLike_V2] | None = None, attributes: dict[str, JSON] | None = None, ) -> None: """ @@ -153,22 +154,22 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: # To resolve a numpy object dtype array, we need to search for an object codec, # which could be in filters or as a compressor. - # we will use a hard-coded list of object codecs for this search. 
- object_codec_id: str | None = None - maybe_object_codecs = (data.get("filters"), data.get("compressor")) - for maybe_object_codec in maybe_object_codecs: - if isinstance(maybe_object_codec, Sequence): - for codec in maybe_object_codec: - if isinstance(codec, dict) and codec.get("id") in ObjectCodecIds: - object_codec_id = codec["id"] - break - elif ( - isinstance(maybe_object_codec, dict) - and maybe_object_codec.get("id") in ObjectCodecIds - ): - object_codec_id = maybe_object_codec["id"] - break - dtype = get_data_type_from_json_v2(data["dtype"], object_codec_id=object_codec_id) + # we will reference a hard-coded collection of object codec ids for this search. + + _filters, _compressor = (_data.get("filters"), _data.get("compressor")) + if _filters is not None: + _filters = cast("tuple[dict[str, JSON], ...]", _filters) + object_codec_id = get_object_codec_id(tuple(_filters) + (_compressor,)) + else: + object_codec_id = get_object_codec_id((_compressor,)) + # we add a layer of indirection here around the dtype attribute of the array metadata + # because we also need to know the object codec id, if any, to resolve the data type + dtype_spec: DTypeSpec_V2 = { + "name": _data["dtype"], + "object_codec_id": object_codec_id, + } + dtype = get_data_type_from_json(dtype_spec, zarr_format=2) + _data["dtype"] = dtype fill_value_encoded = _data.get("fill_value") if fill_value_encoded is not None: @@ -182,11 +183,9 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: expected |= {"dtype", "chunks"} # check if `filters` is an empty sequence; if so use None instead and raise a warning - filters = _data.get("filters") if ( - isinstance(filters, Sequence) - and not isinstance(filters, (str, bytes)) - and len(filters) == 0 + isinstance(_filters, Sequence) + and len(_filters) == 0 ): msg = ( "Found an empty list of filters in the array metadata document. 
" @@ -202,18 +201,13 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: def to_dict(self) -> dict[str, JSON]: zarray_dict = super().to_dict() - if isinstance(zarray_dict["compressor"], Numcodec): - raise ValueError('raw numcodecs codecs are not allowed.') - codec_config = zarray_dict["compressor"].get_config() - # Hotfix for https://github.com/zarr-developers/zarr-python/issues/2647 - if codec_config["id"] == "zstd" and not codec_config.get("checksum", False): - codec_config.pop("checksum") - zarray_dict["compressor"] = codec_config - - zarray_dict["compressor"] = self.compressor.to_json(zarr_format=2) + if self.compressor is not None: + zarray_dict["compressor"] = self.compressor.to_json(zarr_format=2) + else: + zarray_dict["compressor"] = None new_filters = [] if zarray_dict["filters"] is not None: - new_filters.append(f.to_json(zarr_format=2)) + new_filters.extend([f.to_json(zarr_format=2) for f in self.filters]) else: new_filters = None zarray_dict["filters"] = new_filters @@ -262,20 +256,24 @@ def parse_zarr_format(data: object) -> Literal[2]: raise ValueError(f"Invalid value. Expected 2. Got {data}.") -def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: +def parse_filters(data: object) -> tuple[ArrayArrayCodec, ...] | None: """ Parse a potential tuple of filters """ - out: list[numcodecs.abc.Codec] = [] + out: list[Codec | NumcodecsWrapper] = [] if data is None: return data if isinstance(data, Iterable): for idx, val in enumerate(data): - if isinstance(val, numcodecs.abc.Codec): + if isinstance(val, (Codec, NumcodecsWrapper)): out.append(val) + elif isinstance(val, Numcodec): + out.append(NumcodecsWrapper(codec=val)) elif isinstance(val, dict): - out.append(numcodecs.get_codec(val)) + name = val['id'] + codec = get_codec(name, {k: v for k, v in val.items() if k != 'id'}) + out.append(codec) else: msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. 
Got {type(val)} instead." raise TypeError(msg) @@ -285,20 +283,27 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: else: return tuple(out) # take a single codec instance and wrap it in a tuple - if isinstance(data, numcodecs.abc.Codec): + if isinstance(data, Numcodec): + return (NumcodecsWrapper(codec=data),) + elif isinstance(data, Codec): return (data,) msg = f"Invalid filters. Expected None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead." raise TypeError(msg) -def parse_compressor(data: object) -> numcodecs.abc.Codec | None: +def parse_compressor(data: object) -> Codec | NumcodecsWrapper | None: """ Parse a potential compressor. """ - if data is None or isinstance(data, numcodecs.abc.Codec | Codec): + if data is None or isinstance(data, Codec | NumcodecsWrapper): return data + if isinstance(data, Numcodec): + try: + return get_codec(data.codec_id, {k: v for k,v in data.get_config().items() if k != 'id'}) + except KeyError: + return NumcodecsWrapper(codec=data) if isinstance(data, dict): - return numcodecs.get_codec(data) + return get_codec(data['id'], {k: v for k, v in data.items() if k != 'id'}) msg = f"Invalid compressor. Expected None, a numcodecs.abc.Codec, or a dict representation of a numcodecs.abc.Codec. Got {type(data)} instead." 
raise ValueError(msg) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 4b62b7ef3d..b022a0bd65 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING, TypedDict from zarr.abc.metadata import Metadata -from zarr.codecs.numcodec import NumcodecsAdapter +from zarr.codecs.numcodec import NumcodecsWrapper from zarr.core.buffer.core import default_buffer_prototype from zarr.core.dtype import ( VariableLengthString, @@ -40,7 +40,7 @@ from zarr.core.config import config from zarr.core.metadata.common import parse_attributes from zarr.errors import MetadataValidationError, NodeTypeValidationError -from zarr.registry import get_codec_class +from zarr.registry import get_codec, get_codec_class def parse_zarr_format(data: object) -> Literal[3]: @@ -67,15 +67,19 @@ def parse_codecs(data: object) -> tuple[Codec, ...]: ): # Can't use Codec here because of mypy limitation out += (c,) else: - name_parsed, _ = parse_named_configuration(c, require_configuration=False) - out += (get_codec_class(name_parsed).from_dict(c),) + name_parsed, _config = parse_named_configuration(c, require_configuration=False) + if _config is None: + config = {} + else: + config = _config + out += (get_codec(name_parsed, config),) return out def validate_array_bytes_codec(codecs: tuple[Codec, ...]) -> ArrayBytesCodec: # ensure that we have at least one ArrayBytesCodec - abcs: list[ArrayBytesCodec] = [codec for codec in codecs if isinstance(codec, ArrayBytesCodec)] + abcs: list[ArrayBytesCodec] = [codec for codec in codecs if isinstance(codec, (ArrayBytesCodec, NumcodecsWrapper))] if len(abcs) == 0: raise ValueError("At least one ArrayBytesCodec is required.") elif len(abcs) > 1: @@ -337,12 +341,12 @@ def to_dict(self) -> dict[str, JSON]: if out_dict["dimension_names"] is None: out_dict.pop("dimension_names") - out_dict["codecs"] = [] + out_dict["codecs"] = () for codec in self.codecs: - if isinstance(codec, 
NumcodecsAdapter): - out_dict["codecs"].append(codec.to_json(zarr_format=3)) + if isinstance(codec, NumcodecsWrapper): + out_dict["codecs"] += (codec.to_json(zarr_format=3),) else: - out_dict["codecs"].append(codec.to_dict()) + out_dict["codecs"] += (codec.to_dict(),) # TODO: replace the `to_dict` / `from_dict` on the `Metadata`` class with # to_json, from_json, and have ZDType inherit from `Metadata` diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 51fbb94029..3a881c6a80 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -2,6 +2,7 @@ import warnings from collections import defaultdict +from collections.abc import Mapping from importlib.metadata import entry_points as get_entry_points from typing import TYPE_CHECKING, Any, Generic, TypeVar @@ -18,6 +19,7 @@ Codec, CodecPipeline, ) + from zarr.codecs.numcodec import Numcodec, NumcodecsWrapper from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON @@ -125,55 +127,10 @@ def fully_qualified_name(cls: type) -> str: module = cls.__module__ return module + "." 
+ cls.__qualname__ -def register_filter(key: str, codec_cls: type[ArrayArrayCodec]) -> None: - if key not in __filter_registries: - __filter_registries[key] = Registry() - __filter_registries[key].register(codec_cls) - -def register_serializer(key: str, codec_cls: type[ArrayBytesCodec]) -> None: - from zarr.codecs.numcodec import NumcodecsArrayBytesCodec, is_numcodec_cls - if is_numcodec_cls(codec_cls): - _codec_cls = NumcodecsArrayBytesCodec(_codec=codec_cls) - else: - _codec_cls = codec_cls - if key not in __serializer_registries: - __serializer_registries[key] = Registry() - __serializer_registries[key].register(_codec_cls) - -def register_serializer(key: str, codec_cls: type[ArrayBytesCodec]) -> None: - from zarr.codecs.numcodec import NumcodecsArrayBytesCodec, is_numcodec_cls - if is_numcodec_cls(codec_cls): - _codec_cls = NumcodecsArrayBytesCodec(_codec=codec_cls) - else: - _codec_cls = codec_cls - if key not in __serializer_registries: - __serializer_registries[key] = Registry() - __serializer_registries[key].register(_codec_cls) - -def register_compressor(key: str, codec_cls: type[BytesBytesCodec | Numcodec]) -> None: - from zarr.codecs.numcodec import NumcodecsBytesBytesCodec, is_numcodec_cls - if is_numcodec_cls(codec_cls): - _codec_cls = NumcodecsBytesBytesCodec(_codec=codec_cls) - else: - _codec_cls = codec_cls - if key not in __compressor_registries: - __compressor_registries[key] = Registry() - __compressor_registries[key].register(_codec_cls) - def register_codec(key: str, codec_cls: type[Codec]) -> None: - from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec - if issubclass(codec_cls, ArrayBytesCodec): - register_serializer(key, codec_cls) - elif issubclass(codec_cls, ArrayArrayCodec): - register_filter(key, codec_cls) - else: - register_compressor(key, codec_cls) - - """ if key not in __codec_registries: __codec_registries[key] = Registry() __codec_registries[key].register(codec_cls) - """ def register_pipeline(pipe_cls: type[CodecPipeline]) 
-> None: __pipeline_registry.register(pipe_cls) @@ -186,16 +143,9 @@ def register_ndbuffer(cls: type[NDBuffer], qualname: str | None = None) -> None: def register_buffer(cls: type[Buffer], qualname: str | None = None) -> None: __buffer_registry.register(cls, qualname) -def get_filter_class(key: str, reload_config: bool = False) -> type[ArrayArrayCodec]: - return _get_codec_class(key, __serializer_registries, reload_config=reload_config) - -def get_serializer_class(key: str, reload_config: bool = False) -> type[ArrayBytesCodec]: - return _get_codec_class(key, __serializer_registries, reload_config=reload_config) - -def get_compressor_class(key: str, reload_config: bool = False) -> type[BytesBytesCodec]: - return _get_codec_class(key, __compressor_registries, reload_config=reload_config) - -def _get_codec_class(key: str, registry: dict[str, Registry[Codec]], *, reload_config: bool = False) -> type[Codec]: +def _get_codec_class( + key: str, registry: dict[str, Registry[Codec]], *, reload_config: bool = False +) -> type[Codec]: if reload_config: _reload_config() @@ -204,6 +154,7 @@ def _get_codec_class(key: str, registry: dict[str, Registry[Codec]], *, reload_c registry[key].lazy_load() codec_classes = registry[key] + if not codec_classes: raise KeyError(key) @@ -222,7 +173,27 @@ def _get_codec_class(key: str, registry: dict[str, Registry[Codec]], *, reload_c return selected_codec_cls raise KeyError(key) + +def get_codec(name: str, configuration: Mapping[str, object]) -> Codec | NumcodecsWrapper: + """ + Get an instance of a codec from a name and a configuration + """ + # avoid circular import + from zarr.codecs.numcodec import NumcodecsWrapper, get_numcodec_class + try: + codec_cls = get_codec_class(name) + return codec_cls(**configuration) + except KeyError as e: + # if we can't find the codec in the zarr python registry, try the numcodecs registry + try: + codec_cls = get_numcodec_class(name) + return NumcodecsWrapper(codec=codec_cls.from_config(configuration)) + 
except KeyError: + raise KeyError(name) from e + def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: + return _get_codec_class(key, __codec_registries, reload_config=reload_config) + if reload_config: _reload_config() @@ -270,7 +241,7 @@ def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec | Numcodec) -> BytesB result: BytesBytesCodec if isinstance(data, dict): - result = get_compressor_class(data["name"]).from_dict(data) + result = get_codec(data["name"], data["configuration"]) if not isinstance(result, BytesBytesCodec): msg = f"Expected a dict representation of a BytesBytesCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) @@ -292,12 +263,12 @@ def _parse_array_bytes_codec(data: dict[str, JSON] | Codec | Numcodec) -> ArrayB from zarr.abc.codec import ArrayBytesCodec from zarr.codecs.numcodec import Numcodec, NumcodecsArrayBytesCodec if isinstance(data, dict): - result = get_serializer_class(data["name"]).from_dict(data) + result = get_codec(data["name"], data.get("configuration", {})) if not isinstance(result, ArrayBytesCodec): msg = f"Expected a dict representation of a ArrayBytesCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) elif isinstance(data, Numcodec): - return NumcodecsArrayBytesCodec(_codec=data) + return NumcodecsArrayBytesCodec(codec=data) else: if not isinstance(data, ArrayBytesCodec): raise TypeError(f"Expected a ArrayBytesCodec. 
Got {type(data)} instead.") @@ -314,12 +285,12 @@ def _parse_array_array_codec(data: dict[str, JSON] | Codec | Numcodec) -> ArrayA from zarr.abc.codec import ArrayArrayCodec from zarr.codecs.numcodec import Numcodec, NumcodecsArrayArrayCodec if isinstance(data, dict): - result = get_filter_class(data["name"]).from_dict(data) + result = get_codec(data["name"], data["configuration"]) if not isinstance(result, ArrayArrayCodec): msg = f"Expected a dict representation of a ArrayArrayCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) elif isinstance(data, Numcodec): - return NumcodecsArrayArrayCodec(_codec=data) + return NumcodecsArrayArrayCodec(codec=data) else: if not isinstance(data, ArrayArrayCodec): raise TypeError(f"Expected a ArrayArrayCodec. Got {type(data)} instead.") diff --git a/tests/test_api.py b/tests/test_api.py index 01fb40f050..75db29a19e 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1299,14 +1299,10 @@ def test_v2_without_compressor() -> None: def test_v2_with_v3_compressor() -> None: - # Check trying to create a v2 array with a v3 compressor fails - with pytest.raises( - ValueError, - match="Cannot use a BytesBytesCodec as a compressor for zarr v2 arrays. 
Use a numcodecs codec directly instead.", - ): - zarr.create( - store={}, shape=(1), dtype="uint8", zarr_format=2, compressor=zarr.codecs.BloscCodec() - ) + # Check trying to create a v2 array with a v3 compressor succeeds + zarr.create( + store={}, shape=(1), dtype="uint8", zarr_format=2, compressor=zarr.codecs.BloscCodec() + ) def add_empty_file(path: Path) -> Path: diff --git a/tests/test_array.py b/tests/test_array.py index 0da68a1af8..fd1e1f011e 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -465,7 +465,7 @@ def test_info_v2(self, chunks: tuple[int, int], shards: tuple[int, int] | None) _read_only=False, _store_type="MemoryStore", _count_bytes=512, - _compressors=(numcodecs.Zstd(),), + _compressors=(ZstdCodec(),), ) assert result == expected @@ -542,7 +542,7 @@ async def test_info_v2_async( _read_only=False, _store_type="MemoryStore", _count_bytes=512, - _compressors=(numcodecs.Zstd(),), + _compressors=(ZstdCodec(),), ) assert result == expected diff --git a/tests/test_examples.py b/tests/test_examples.py index c97766364b..620a82da59 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -71,8 +71,9 @@ def test_scripts_can_run(script_path: Path, tmp_path: Path) -> None: # We resave the script after inserting the absolute path to the local Zarr project directory, # and then test its behavior. # This allows the example to be useful to users who don't have Zarr installed, but also testable. + # --refresh ensures that uv doesn't use a cached build of our local package resave_script(script_path, dest_path) - result = subprocess.run(["uv", "run", str(dest_path)], capture_output=True, text=True) + result = subprocess.run(["uv", "run", "--refresh", str(dest_path)], capture_output=True, text=True) assert result.returncode == 0, ( f"Script at {script_path} failed to run. 
Output: {result.stdout} Error: {result.stderr}" ) diff --git a/tests/test_image_codecs.py b/tests/test_image_codecs.py new file mode 100644 index 0000000000..c4017b65de --- /dev/null +++ b/tests/test_image_codecs.py @@ -0,0 +1,17 @@ +import numcodecs +import numpy as np +from imagecodecs.numcodecs import Jpeg + +import zarr + +numcodecs.register_codec(Jpeg) +jpg_codec = Jpeg() +store = {} + +z_w = zarr.create_array( + store=store, data=np.zeros((100, 100, 3), dtype=np.uint8), serializer=jpg_codec, zarr_format=3 +) + +z_r = zarr.open_array(store=store, zarr_format=3) + +print(z_r.metadata.to_dict()["codecs"]) \ No newline at end of file diff --git a/tests/test_x.py b/tests/test_x.py new file mode 100644 index 0000000000..f38d5e4e00 --- /dev/null +++ b/tests/test_x.py @@ -0,0 +1,4 @@ +from zarr.registry import get_codec + +def test(): + c = get_codec('gzip', {"level": 1}) From e463d0af0e4158f1668834c2e555d15b7e3c2f28 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 20 Jul 2025 20:08:53 +0200 Subject: [PATCH 124/129] pass tests --- src/zarr/abc/codec.py | 16 ++- src/zarr/codecs/blosc.py | 107 +++++++++++------- src/zarr/codecs/bytes.py | 108 +++++++++++++++--- src/zarr/codecs/crc32c_.py | 66 ++++++++++- src/zarr/codecs/gzip.py | 30 +++-- src/zarr/codecs/numcodec.py | 23 ++-- src/zarr/codecs/sharding.py | 137 ++++++++++++++++++++++- src/zarr/codecs/transpose.py | 88 ++++++++++++++- src/zarr/codecs/vlen_utf8.py | 74 +++++++++++- src/zarr/codecs/zstd.py | 79 +++++++++++-- src/zarr/core/array.py | 74 +++++++----- src/zarr/core/codec_pipeline.py | 120 ++++++++++++++------ src/zarr/core/common.py | 3 +- src/zarr/core/metadata/v2.py | 24 +++- src/zarr/core/metadata/v3.py | 23 ++-- src/zarr/registry.py | 103 +++++++++-------- tests/test_array.py | 10 +- tests/test_config.py | 13 ++- tests/test_group.py | 11 +- tests/test_gzip.py | 1 + tests/test_info.py | 4 +- tests/test_metadata/test_consolidated.py | 4 +- 22 files changed, 867 insertions(+), 251 deletions(-) 
diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index ec3634862e..067a05b185 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -53,6 +53,12 @@ class CodecJSON_V2(TypedDict, Generic[TName]): CodecConfig_V3 = NamedConfig[str, Mapping[str, object]] +CodecJSON_V3 = str | CodecConfig_V3 + +# The widest type we will accept for a codec JSON +# This covers v2 and v3 +CodecJSON = str | Mapping[str, object] + class BaseCodec(Metadata, Generic[CodecInput, CodecOutput]): """Generic base class for codecs. @@ -187,15 +193,15 @@ def to_json( raise NotImplementedError @classmethod - def _from_json_v2(cls, data: Mapping[str, object]) -> Self: + def _from_json_v2(cls, data: CodecJSON) -> Self: raise NotImplementedError @classmethod - def _from_json_v3(cls, data: Mapping[str, object]) -> Self: + def _from_json_v3(cls, data: CodecJSON) -> Self: raise NotImplementedError @classmethod - def from_json(cls, data: Mapping[str, object], zarr_format: ZarrFormat) -> Self: + def from_json(cls, data: CodecJSON, zarr_format: ZarrFormat) -> Self: if zarr_format == 2: return cls._from_json_v2(data) elif zarr_format == 3: @@ -494,3 +500,7 @@ async def wrap(chunk: CodecInput | None, chunk_spec: ArraySpec) -> CodecOutput | return await func(chunk, chunk_spec) return wrap + + +# Raised when a codec JSON data is invalid +class CodecValidationError(ValueError): ... 
diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index d26699dd14..d86513a6a0 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -5,16 +5,20 @@ from dataclasses import dataclass, replace from enum import Enum from functools import cached_property -from typing import TYPE_CHECKING, ClassVar, Literal, NotRequired, TypedDict, get_args, overload -from typing_extensions import Final +from typing import TYPE_CHECKING, Final, Literal, NotRequired, TypedDict, TypeGuard, overload import numcodecs from numcodecs.blosc import Blosc from packaging.version import Version -from zarr.abc.codec import BytesBytesCodec, CodecJSON_V2 +from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2, CodecValidationError from zarr.core.buffer.cpu import as_numpy_array_wrapper -from zarr.core.common import JSON, NamedConfig, NamedRequiredConfig, ZarrFormat, parse_enum, parse_named_configuration +from zarr.core.common import ( + JSON, + NamedRequiredConfig, + ZarrFormat, + parse_named_configuration, +) from zarr.core.dtype.common import HasItemSize from zarr.registry import register_codec @@ -24,11 +28,19 @@ from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer +BloscShuffle = Literal["noshuffle", "shuffle", "bitshuffle"] +BLOSC_SHUFFLE: Final = ("noshuffle", "shuffle", "bitshuffle") + +BloscCname = Literal["lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib"] +BLOSC_CNAME: Final = ("lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib") + + class BloscConfigV2(TypedDict): cname: BloscCname clevel: int shuffle: int blocksize: int + typesize: NotRequired[int] class BloscConfigV3(TypedDict): cname: BloscCname @@ -47,18 +59,25 @@ class BloscJSON_V3(NamedRequiredConfig[Literal["blosc"], BloscConfigV3]): The JSON form of the Blosc codec in Zarr V3. 
""" -BloscShuffle = Literal["noshuffle", "shuffle", "bitshuffle"] -BLOSC_SHUFFLE: Final = ("noshuffle", "shuffle", "bitshuffle") -BloscCname = Literal["lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib"] -BLOSC_CNAME: Final = ("lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib") +def check_json_v2(data: CodecJSON) -> TypeGuard[BloscJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"id", "clevel", "cname", "shuffle", "blocksize"} + and data["id"] == "blosc" + ) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[BloscJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "blosc" + and isinstance(data["configuration"], Mapping) + and set(data["configuration"].keys()) + == {"cname", "clevel", "shuffle", "blocksize", "typesize"} + ) -def parse_shuffle(value: object) -> BloscShuffle: - if value not in BLOSC_SHUFFLE: - raise ValueError( - f"Value must be one of {BLOSC_SHUFFLE}. Got {value} instead." - ) - return value def parse_cname(value: object) -> BloscCname: if value not in BLOSC_CNAME: @@ -95,6 +114,12 @@ def parse_blocksize(data: JSON) -> int: raise TypeError(f"Value should be an int. Got {type(data)} instead.") +def parse_shuffle(data: object) -> BloscShuffle: + if data in BLOSC_SHUFFLE: + return data # type: ignore[return-value] + raise TypeError(f"Value must be one of {BLOSC_SHUFFLE}. 
Got {data} instead.") + + @dataclass(frozen=True) class BloscCodec(BytesBytesCodec): is_fixed_size = False @@ -117,7 +142,7 @@ def __init__( typesize_parsed = parse_typesize(typesize) if typesize is not None else None cname_parsed = parse_cname(cname) clevel_parsed = parse_clevel(clevel) - shuffle_parsed = parse_enum(shuffle, BloscShuffle) if shuffle is not None else None + shuffle_parsed = parse_shuffle(shuffle) if shuffle is not None else None blocksize_parsed = parse_blocksize(blocksize) object.__setattr__(self, "typesize", typesize_parsed) @@ -128,6 +153,7 @@ def __init__( @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: + return cls.from_json(data, zarr_format=3) _, configuration_parsed = parse_named_configuration(data, "blosc") return cls(**configuration_parsed) # type: ignore[arg-type] @@ -145,34 +171,37 @@ def to_dict(self) -> dict[str, JSON]: } @classmethod - def _from_json_v2(cls, data: Mapping[str, object]) -> Self: - return cls( - typesize=data["typesize"], - cname=data["cname"], - clevel=data["clevel"], - shuffle=BLOSC_SHUFFLE[data["shuffle"]], - blocksize=data["blocksize"], - ) - - @classmethod - def _from_json_v3(cls, data: Mapping[str, object]) -> Self: - return cls( - typesize=data["configuration"]["typesize"], - cname=data["configuration"]["cname"], - clevel=data["configuration"]["clevel"], - shuffle=data["configuration"]["shuffle"], - blocksize=data["configuration"]["blocksize"], + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + return cls( + cname=data["cname"], + clevel=data["clevel"], + shuffle=BLOSC_SHUFFLE[data["shuffle"]], + blocksize=data["blocksize"], + typesize=data.get("typesize", None), + ) + msg = ( + "Invalid Zarr V2 JSON representation of the blosc codec. 
" + f"Got {data!r}, expected a Mapping with keys ('id', 'cname', 'clevel', 'shuffle', 'blocksize', 'typesize')" ) + raise CodecValidationError(msg) @classmethod - def from_json(cls, data: Mapping[str, object], zarr_format: ZarrFormat) -> Self: - if zarr_format == 2: - return cls._from_json_v2(data) - elif zarr_format == 3: - return cls._from_json_v3(data) - raise ValueError( - f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + return cls( + typesize=data["configuration"]["typesize"], + cname=data["configuration"]["cname"], + clevel=data["configuration"]["clevel"], + shuffle=data["configuration"]["shuffle"], + blocksize=data["configuration"]["blocksize"], + ) + msg = ( + "Invalid Zarr V3 JSON representation of the blosc codec. " + f"Got {data!r}, expected a Mapping with keys ('name', 'configuration')" + "Where the 'configuration' key is a Mapping with keys ('cname', 'clevel', 'shuffle', 'blocksize', 'typesize')" ) + raise CodecValidationError(msg) @overload def to_json(self, zarr_format: Literal[2]) -> BloscJSON_V2: ... 
diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 4dbfc0ff0d..b139bdb157 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -1,15 +1,16 @@ from __future__ import annotations import sys +from collections.abc import Mapping from dataclasses import dataclass, replace from enum import Enum -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Final, Literal, NotRequired, TypedDict, TypeGuard, overload import numpy as np -from zarr.abc.codec import ArrayBytesCodec +from zarr.abc.codec import ArrayBytesCodec, CodecJSON, CodecJSON_V2 from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer -from zarr.core.common import JSON, parse_enum, parse_named_configuration +from zarr.core.common import JSON, NamedConfig, ZarrFormat from zarr.core.dtype.common import HasEndianness from zarr.core.dtype.npy.common import endianness_to_numpy_str from zarr.registry import register_codec @@ -29,33 +30,106 @@ class Endian(Enum): little = "little" -default_system_endian = Endian(sys.byteorder) +# TODO: unify with the endianness defined in core.dtype.common +EndiannessStr = Literal["little", "big"] +ENDIANNESS_STR: Final = "little", "big" + +default_system_endian = sys.byteorder + + +class BytesConfig(TypedDict): + endian: NotRequired[EndiannessStr] + + +class BytesJSON_V2(CodecJSON_V2[Literal["bytes"]], BytesConfig): ... + + +BytesJSON_V3 = NamedConfig[Literal["bytes"], BytesConfig] | Literal["bytes"] + + +def parse_endianness(data: object) -> EndiannessStr: + if data in ENDIANNESS_STR: + return data # type: ignore [return-value] + raise ValueError(f"Invalid endianness: {data!r}. 
Expected one of {ENDIANNESS_STR}") + + +def check_json_v2(data: CodecJSON) -> TypeGuard[BytesJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) in ({"id", "endian"}, {"id"}) + and data["id"] == "bytes" + ) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[BytesJSON_V3]: + return data == "bytes" or ( + ( + isinstance(data, Mapping) + and set(data.keys()) in ({"name"}, {"name", "configuration"}) + and data["name"] == "bytes" + ) + and isinstance(data.get("configuration", {}), Mapping) + and set(data.get("configuration", {}).keys()) in ({"endian"}, set()) + ) @dataclass(frozen=True) class BytesCodec(ArrayBytesCodec): is_fixed_size = True - endian: Endian | None + endian: EndiannessStr | None - def __init__(self, *, endian: Endian | str | None = default_system_endian) -> None: - endian_parsed = None if endian is None else parse_enum(endian, Endian) + def __init__(self, *, endian: EndiannessStr | str | None = default_system_endian) -> None: + endian_parsed = None if endian is None else parse_endianness(endian) object.__setattr__(self, "endian", endian_parsed) @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration( - data, "bytes", require_configuration=False - ) - configuration_parsed = configuration_parsed or {} - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) def to_dict(self) -> dict[str, JSON]: - if self.endian is None: + return self.to_json(zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + return cls(endian=data.get("endian", None)) + raise ValueError(f"Invalid JSON: {data}") + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + # Three different representations of the exact same codec... 
+ if data in ("bytes", {"name": "bytes"}, {"name": "bytes", "configuration": {}}): + return cls() + else: + return cls(endian=data["configuration"].get("endian", None)) + raise ValueError(f"Invalid JSON: {data}") + + @overload + def to_json(self, zarr_format: Literal[2]) -> BytesJSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> BytesJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> BytesJSON_V2 | BytesJSON_V3: + if zarr_format == 2: + if self.endian is not None: + return { + "id": "bytes", + "endian": self.endian, + } + return {"id": "bytes"} + elif zarr_format == 3: + if self.endian is not None: + return { + "name": "bytes", + "configuration": {"endian": self.endian}, + } return {"name": "bytes"} - else: - return {"name": "bytes", "configuration": {"endian": self.endian.value}} + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." + ) # pragma: no cover def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: if not isinstance(array_spec.dtype, HasEndianness): @@ -74,7 +148,7 @@ async def _decode_single( ) -> NDBuffer: assert isinstance(chunk_bytes, Buffer) # TODO: remove endianness enum in favor of literal union - endian = self.endian.value if self.endian is not None else None + endian = self.endian if self.endian is not None else None if isinstance(chunk_spec.dtype, HasEndianness) and endian is not None: dtype = replace(chunk_spec.dtype, endianness=endian).to_native_dtype() # type: ignore[call-arg] else: @@ -108,7 +182,7 @@ async def _encode_single( ): # type-ignore is a numpy bug # see https://github.com/numpy/numpy/issues/26473 - new_dtype = chunk_array.dtype.newbyteorder(self.endian.name) # type: ignore[arg-type] + new_dtype = chunk_array.dtype.newbyteorder(self.endian) # type: ignore[arg-type] chunk_array = chunk_array.astype(new_dtype) nd_array = chunk_array.as_ndarray_like() diff --git a/src/zarr/codecs/crc32c_.py b/src/zarr/codecs/crc32c_.py index 6da673ceac..81072366e7 100644 --- 
a/src/zarr/codecs/crc32c_.py +++ b/src/zarr/codecs/crc32c_.py @@ -1,14 +1,15 @@ from __future__ import annotations +from collections.abc import Mapping from dataclasses import dataclass -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Literal, TypedDict, TypeGuard, cast, overload import numpy as np import typing_extensions from crc32c import crc32c -from zarr.abc.codec import BytesBytesCodec -from zarr.core.common import JSON, parse_named_configuration +from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2, CodecValidationError +from zarr.core.common import JSON, NamedConfig, ZarrFormat, parse_named_configuration from zarr.registry import register_codec if TYPE_CHECKING: @@ -18,18 +19,77 @@ from zarr.core.buffer import Buffer +class Crc32Config(TypedDict): ... + + +class Crc32cJSON_V2(CodecJSON_V2[Literal["crc32c"]]): ... + + +class Crc32cJSON_V3(NamedConfig[Literal["crc32c"], Crc32Config]): ... + + +def check_json_v2(data: CodecJSON) -> TypeGuard[Crc32cJSON_V2]: + return isinstance(data, Mapping) and set(data.keys()) == {"id"} and data["id"] == "crc32c" + + +def check_json_v3(data: CodecJSON) -> TypeGuard[Crc32cJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) in ({"name", "configuration"}, {"name"}) + and data["name"] == "crc32c" + and data.get("configuration") in ({}, None) + ) + + @dataclass(frozen=True) class Crc32cCodec(BytesBytesCodec): is_fixed_size = True @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: + return cls.from_json(data, zarr_format=3) parse_named_configuration(data, "crc32c", require_configuration=False) return cls() + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + return cls() + msg = ( + "Invalid Zarr V2 JSON representation of the crc32c codec. 
" + f"Got {data!r}, expected a Mapping with keys ('id')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + return cls() + msg = ( + "Invalid Zarr V3 JSON representation of the crc32c codec. " + f"Got {data!r}, expected a Mapping with keys ('name')" + ) + raise CodecValidationError(msg) + def to_dict(self) -> dict[str, JSON]: + return self.to_json(zarr_format=3) return {"name": "crc32c"} + @overload + def to_json(self, zarr_format: Literal[2]) -> Crc32cJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> Crc32cJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> CodecJSON: + if zarr_format == 2: + return {"id": "crc32c"} + elif zarr_format == 3: + return {"name": "crc32c"} + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." + ) # pragma: no cover + async def _decode_single( self, chunk_bytes: Buffer, diff --git a/src/zarr/codecs/gzip.py b/src/zarr/codecs/gzip.py index 23177e8755..5fdac6beab 100644 --- a/src/zarr/codecs/gzip.py +++ b/src/zarr/codecs/gzip.py @@ -7,9 +7,13 @@ from numcodecs.gzip import GZip -from zarr.abc.codec import BytesBytesCodec, CodecJSON_V2 +from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2 from zarr.core.buffer.cpu import as_numpy_array_wrapper -from zarr.core.common import JSON, NamedConfig, ZarrFormat, parse_named_configuration +from zarr.core.common import ( + JSON, + NamedRequiredConfig, + ZarrFormat, +) from zarr.registry import register_codec if TYPE_CHECKING: @@ -37,7 +41,8 @@ class GZipJSON_V2(CodecJSON_V2[Literal["gzip"]], GZipConfig): The JSON form of the GZip codec in Zarr V2. """ -class GZipJSON_V3(NamedConfig[Literal["gzip"], GZipConfig]): + +class GZipJSON_V3(NamedRequiredConfig[Literal["gzip"], GZipConfig]): """ The JSON form of the GZip codec in Zarr V3. 
""" @@ -56,11 +61,10 @@ def __init__(self, *, level: int = 5) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "gzip") - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) def to_dict(self) -> dict[str, JSON]: - return {"name": "gzip", "configuration": {"level": self.level}} + return self.to_json(zarr_format=3) @overload def to_json(self, zarr_format: Literal[2]) -> GZipJSON_V2: ... @@ -77,17 +81,19 @@ def to_json(self, zarr_format: ZarrFormat) -> GZipJSON_V2 | GZipJSON_V3: ) # pragma: no cover @classmethod - def _check_json_v2(cls, data: Mapping[str, object]) -> TypeGuard[GZipJSON_V2]: + def _check_json_v2(cls, data: CodecJSON) -> TypeGuard[GZipJSON_V2]: return ( - set(data.keys()) == {"id", "level"} + isinstance(data, Mapping) + and set(data.keys()) == {"id", "level"} and data["id"] == "gzip" and isinstance(data["level"], int) ) @classmethod - def _check_json_v3(cls, data: Mapping[str, object]) -> TypeGuard[GZipJSON_V3]: + def _check_json_v3(cls, data: CodecJSON) -> TypeGuard[GZipJSON_V3]: return ( - set(data.keys()) == {"name", "configuration"} + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} and data["name"] == "gzip" and isinstance(data["configuration"], dict) and "level" in data["configuration"] @@ -95,13 +101,13 @@ def _check_json_v3(cls, data: Mapping[str, object]) -> TypeGuard[GZipJSON_V3]: ) @classmethod - def _from_json_v2(cls, data: Mapping[str, object]) -> Self: + def _from_json_v2(cls, data: CodecJSON) -> Self: if cls._check_json_v2(data): return cls(level=data["level"]) raise ValueError(f"Invalid GZip JSON data for Zarr format 2: {data!r}") @classmethod - def _from_json_v3(cls, data: Mapping[str, object]) -> Self: + def _from_json_v3(cls, data: CodecJSON) -> Self: if cls._check_json_v3(data): return cls(level=data["configuration"]["level"]) raise ValueError(f"Invalid GZip JSON data for 
Zarr format 3: {data!r}") diff --git a/src/zarr/codecs/numcodec.py b/src/zarr/codecs/numcodec.py index a22b46ff4e..afb15b1a0d 100644 --- a/src/zarr/codecs/numcodec.py +++ b/src/zarr/codecs/numcodec.py @@ -13,14 +13,19 @@ import numpy as np from typing_extensions import Protocol, runtime_checkable -from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BaseCodec, BytesBytesCodec, CodecJSON_V2 +from zarr.abc.codec import ( + ArrayArrayCodec, + ArrayBytesCodec, + BaseCodec, + BytesBytesCodec, + CodecJSON, + CodecJSON_V2, +) from zarr.core.array_spec import ArraySpec from zarr.core.buffer.core import Buffer, BufferPrototype, NDArrayLike, NDBuffer from zarr.core.buffer.cpu import as_numpy_array_wrapper if TYPE_CHECKING: - from collections.abc import Mapping - from zarr.core.array_spec import ArraySpec from zarr.core.common import BaseConfig, NamedConfig, ZarrFormat @@ -31,8 +36,8 @@ def get_numcodec_class(name: str) -> type[Numcodec]: Parameters ---------- - config : dict-like - Configuration object. + name : str + The name of the codec to get Returns ------- @@ -42,7 +47,7 @@ def get_numcodec_class(name: str) -> type[Numcodec]: -------- >>> import numcodecs as codecs - >>> codec = codecs.get_codec(dict(id='zlib', level=1)) + >>> codec = codecs.get_codec('zlib') >>> codec Zlib(level=1) @@ -120,11 +125,11 @@ def to_json( raise ValueError(f"Unsupported zarr format: {zarr_format}") # pragma: no cover @classmethod - def _from_json_v2(cls, data: Mapping[str, object]) -> Self: - return cls(_codec=resolve_numcodec(data)) # type: ignore[arg-type] + def _from_json_v2(cls, data: CodecJSON) -> Self: + return cls(codec=resolve_numcodec(data)) # type: ignore[arg-type] @classmethod - def _from_json_v3(cls, data: Mapping[str, object]) -> Self: + def _from_json_v3(cls, data: CodecJSON) -> Self: raise NotImplementedError( "This class does not support creating instances from JSON data for Zarr format 3." 
) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index cd8676b4d1..04d2c3554e 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -1,11 +1,22 @@ from __future__ import annotations -from collections.abc import Iterable, Mapping, MutableMapping +from collections.abc import Iterable, Mapping, MutableMapping, Sequence from dataclasses import dataclass, field, replace from enum import Enum from functools import lru_cache from operator import itemgetter -from typing import TYPE_CHECKING, Any, NamedTuple, cast +from typing import ( + TYPE_CHECKING, + Any, + Literal, + NamedTuple, + NotRequired, + Self, + TypedDict, + TypeGuard, + cast, + overload, +) import numpy as np import numpy.typing as npt @@ -15,7 +26,11 @@ ArrayBytesCodecPartialDecodeMixin, ArrayBytesCodecPartialEncodeMixin, Codec, + CodecJSON, + CodecJSON_V2, + CodecJSON_V3, CodecPipeline, + CodecValidationError, ) from zarr.abc.store import ( ByteGetter, @@ -38,6 +53,7 @@ from zarr.core.common import ( ChunkCoords, ChunkCoordsLike, + NamedRequiredConfig, parse_enum, parse_named_configuration, parse_shapelike, @@ -65,6 +81,28 @@ ShardMapping = Mapping[ChunkCoords, Buffer] ShardMutableMapping = MutableMapping[ChunkCoords, Buffer] +IndexLocation = Literal["start", "end"] + + +class ShardingConfigV2(TypedDict): + codecs: tuple[CodecJSON_V2[str], ...] + chunk_shape: tuple[int, ...] + index_codecs: tuple[CodecJSON_V2[str], ...] + index_location: NotRequired[Literal["start", "end"]] + + +class ShardingConfigV3(TypedDict): + codecs: tuple[CodecJSON_V3, ...] + chunk_shape: tuple[int, ...] + index_codecs: tuple[CodecJSON_V3, ...] + index_location: NotRequired[Literal["start", "end"]] + + +class ShardingJSON_V2(CodecJSON_V2[Literal["sharding_indexed"]], ShardingConfigV2): ... + + +class ShardingJSON_V3(NamedRequiredConfig[Literal["sharding_indexed"], ShardingConfigV3]): ... 
+ class ShardingCodecIndexLocation(Enum): """ @@ -79,6 +117,37 @@ def parse_index_location(data: object) -> ShardingCodecIndexLocation: return parse_enum(data, ShardingCodecIndexLocation) +def check_json_v2(data: CodecJSON) -> TypeGuard[ShardingJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"id", "codecs", "chunk_shape"} + and data["id"] == "sharding_indexed" + and isinstance(data["chunk_shape"], Sequence) + and not isinstance(data["chunk_shape"], str) + and isinstance(data["codecs"], Sequence) + and not isinstance(data["codecs"], str) + ) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[ShardingJSON_V3]: + # TODO: Automate this with a function that does runtime type checking on typeddicts. + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "sharding_indexed" + and isinstance(data["configuration"], Mapping) + and set(data["configuration"].keys()) + == {"codecs", "chunk_shape", "index_codecs", "index_location"} + and isinstance(data["configuration"]["chunk_shape"], Sequence) + and not isinstance(data["configuration"]["chunk_shape"], str) + and isinstance(data["configuration"]["codecs"], Sequence) + and not isinstance(data["configuration"]["codecs"], str) + and isinstance(data["configuration"]["index_codecs"], Sequence) + and not isinstance(data["configuration"]["index_codecs"], str) + and data["configuration"]["index_location"] in ("start", "end") + ) + + @dataclass(frozen=True) class _ShardingByteGetter(ByteGetter): shard_dict: ShardMapping @@ -383,14 +452,47 @@ def __setstate__(self, state: dict[str, Any]) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: + return cls.from_json(data, zarr_format=3) _, configuration_parsed = parse_named_configuration(data, "sharding_indexed") return cls(**configuration_parsed) # type: ignore[arg-type] + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + return cls( + 
codecs=data["codecs"], + index_codecs=data["index_codecs"], + index_location=data["index_location"], + chunk_shape=data["chunk_shape"], + ) + msg = ( + "Invalid Zarr V2 JSON representation of the sharding codec. " + f"Got {data!r}, expected a Mapping with keys ('id', 'codecs', 'index_codecs', 'chunk_shape', 'index_location')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + return cls( + codecs=data["configuration"]["codecs"], + index_codecs=data["configuration"]["index_codecs"], + index_location=data["configuration"]["index_location"], + chunk_shape=data["configuration"]["chunk_shape"], + ) + msg = ( + "Invalid Zarr V3 JSON representation of the sharding codec. " + f"Got {data!r}, expected a Mapping with keys ('name', 'configuration')" + "Where the 'configuration' key is a Mapping with keys ('codecs', 'index_codecs', 'index_location', 'chunk_shape')" + ) + raise CodecValidationError(msg) + @property def codec_pipeline(self) -> CodecPipeline: return get_pipeline_class().from_codecs(self.codecs) def to_dict(self) -> dict[str, JSON]: + return self.to_json(zarr_format=3) return { "name": "sharding_indexed", "configuration": { @@ -401,6 +503,37 @@ def to_dict(self) -> dict[str, JSON]: }, } + @overload + def to_json(self, zarr_format: Literal[2]) -> ShardingJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> ShardingJSON_V3: ... 
+ + def to_json(self, zarr_format: int) -> ShardingJSON_V2 | ShardingJSON_V3: + if zarr_format == 2: + return { + "id": "sharding_indexed", + "codecs": tuple(s.to_json(zarr_format=zarr_format) for s in self.codecs), + "index_codecs": tuple( + s.to_json(zarr_format=zarr_format) for s in self.index_codecs + ), + "chunk_shape": self.chunk_shape, + "index_location": self.index_location.value, + } + elif zarr_format == 3: + return { + "name": "sharding_indexed", + "configuration": { + "chunk_shape": self.chunk_shape, + "codecs": tuple(s.to_json(zarr_format=zarr_format) for s in self.codecs), + "index_codecs": tuple( + s.to_json(zarr_format=zarr_format) for s in self.index_codecs + ), + "index_location": self.index_location.value, + }, + } + raise ValueError(f"Unsupported Zarr format {zarr_format}. Expected 2 or 3.") + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: shard_spec = self._get_chunk_spec(array_spec) evolved_codecs = tuple(c.evolve_from_array_spec(array_spec=shard_spec) for c in self.codecs) diff --git a/src/zarr/codecs/transpose.py b/src/zarr/codecs/transpose.py index be89690441..4d976e529b 100644 --- a/src/zarr/codecs/transpose.py +++ b/src/zarr/codecs/transpose.py @@ -1,14 +1,19 @@ from __future__ import annotations -from collections.abc import Iterable +from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass, replace -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Literal, Self, TypedDict, TypeGuard, cast, overload import numpy as np -from zarr.abc.codec import ArrayArrayCodec +from zarr.abc.codec import ArrayArrayCodec, CodecJSON, CodecJSON_V2, CodecValidationError from zarr.core.array_spec import ArraySpec -from zarr.core.common import JSON, ChunkCoordsLike, parse_named_configuration +from zarr.core.common import ( + JSON, + ChunkCoordsLike, + NamedRequiredConfig, + ZarrFormat, +) from zarr.registry import register_codec if TYPE_CHECKING: @@ -27,6 +32,42 @@ def 
parse_transpose_order(data: JSON | Iterable[int]) -> tuple[int, ...]: return tuple(cast("Iterable[int]", data)) +class TransposeConfig(TypedDict): + order: tuple[int, ...] + + +class TransposeJSON_V2(CodecJSON_V2[Literal["transpose"]], TransposeConfig): + """ + The JSON form of the Transpose codec in Zarr V2. + """ + + +class TransposeJSON_V3(NamedRequiredConfig[Literal["transpose"], TransposeConfig]): + """ + The JSON form of the Transpose codec in Zarr V3. + """ + + +def check_json_v2(data: CodecJSON) -> TypeGuard[TransposeJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"id", "configuration"} + and data["id"] == "transpose" + and isinstance(data["order"], Sequence) + and not isinstance(data["order"], str) + ) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[TransposeJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "transpose" + and isinstance(data["configuration"], Mapping) + and set(data["configuration"].keys()) == {"order"} + ) + + @dataclass(frozen=True) class TransposeCodec(ArrayArrayCodec): is_fixed_size = True @@ -40,12 +81,47 @@ def __init__(self, *, order: ChunkCoordsLike) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "transpose") - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: str | Mapping[str, object]) -> Self: + if check_json_v2(data): + return cls(order=data["order"]) # type: ignore[arg-type] + msg = ( + "Invalid Zarr V2 JSON representation of the transpose codec. 
" + f"Got {data!r}, expected a Mapping with keys ('id', 'order')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: str | Mapping[str, object]) -> Self: + if check_json_v3(data): + return cls(order=data["configuration"]["order"]) + msg = ( + "Invalid Zarr V3 JSON representation of the transpose codec. " + f"Got {data!r}, expected a Mapping with keys ('name', 'configuration')" + "Where the 'configuration' key is a Mapping with keys ('order')" + ) + raise CodecValidationError(msg) def to_dict(self) -> dict[str, JSON]: return {"name": "transpose", "configuration": {"order": tuple(self.order)}} + @overload + def to_json(self, zarr_format: Literal[2]) -> TransposeJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> TransposeJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> CodecJSON: + if zarr_format == 2: + return {"id": "transpose", "order": self.order} + elif zarr_format == 3: + return {"name": "transpose", "configuration": {"order": self.order}} + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." 
+ ) # pragma: no cover + def validate( self, shape: tuple[int, ...], diff --git a/src/zarr/codecs/vlen_utf8.py b/src/zarr/codecs/vlen_utf8.py index 070af864f5..cf7a97f816 100644 --- a/src/zarr/codecs/vlen_utf8.py +++ b/src/zarr/codecs/vlen_utf8.py @@ -1,13 +1,12 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, Literal, NotRequired, TypedDict, overload -from warnings import warn +from typing import TYPE_CHECKING, Literal, TypedDict, TypeGuard, overload import numpy as np from numcodecs.vlen import VLenBytes, VLenUTF8 -from zarr.abc.codec import ArrayBytesCodec, CodecJSON_V2 +from zarr.abc.codec import ArrayBytesCodec, CodecJSON, CodecJSON_V2 from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON, NamedConfig, ZarrFormat, parse_named_configuration from zarr.registry import register_codec @@ -31,10 +30,20 @@ class VLenUTF8JSON_V2(CodecJSON_V2[Literal["vlen-utf8"]]): class VLenUTF8JSON_V3(NamedConfig[Literal["vlen-utf8"], VlenUF8Config]): ... +class VLenBytesConfig(TypedDict): ... + + +class VLenBytesJSON_V2(CodecJSON_V2[Literal["vlen-bytes"]]): ... 
+ + +VLenBytesJSON_V3 = NamedConfig[Literal["vlen-bytes"], VLenBytesConfig] | Literal["vlen-bytes"] + + @dataclass(frozen=True) class VLenUTF8Codec(ArrayBytesCodec): @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: + return cls.from_json(data, zarr_format=3) _, configuration_parsed = parse_named_configuration( data, "vlen-utf8", require_configuration=False ) @@ -54,6 +63,30 @@ def to_json(self, zarr_format: ZarrFormat) -> VLenUTF8JSON_V2 | VLenUTF8JSON_V3: else: return {"name": "vlen-utf8"} + @classmethod + def _check_json_v2(cls, data: CodecJSON) -> TypeGuard[VLenUTF8JSON_V2]: + return data == {"id": "vlen-utf8"} + + @classmethod + def _check_json_v3(cls, data: CodecJSON) -> TypeGuard[VLenUTF8JSON_V3]: + return data in ( + {"name": "vlen-utf8"}, + {"name": "vlen-utf8", "configuration": {}}, + "vlen-utf8", + ) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if cls._check_json_v2(data): + return cls() + raise ValueError(f"Invalid VLenUTF8 JSON data for Zarr format 2: {data!r}") + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if cls._check_json_v3(data): + return cls() + raise ValueError(f"Invalid VLenUTF8 JSON data for Zarr format 3: {data!r}") + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return self @@ -91,6 +124,7 @@ def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) - class VLenBytesCodec(ArrayBytesCodec): @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: + return cls.from_json(data, zarr_format=3) _, configuration_parsed = parse_named_configuration( data, "vlen-bytes", require_configuration=False ) @@ -100,6 +134,40 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: def to_dict(self) -> dict[str, JSON]: return {"name": "vlen-bytes", "configuration": {}} + @overload + def to_json(self, zarr_format: Literal[2]) -> VLenBytesJSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> VLenBytesJSON_V3: ... 
+ def to_json(self, zarr_format: ZarrFormat) -> VLenBytesJSON_V2 | VLenBytesJSON_V3: + if zarr_format == 2: + return {"id": "vlen-bytes"} + else: + return {"name": "vlen-bytes"} + + @classmethod + def _check_json_v2(cls, data: CodecJSON) -> TypeGuard[VLenBytesJSON_V2]: + return data == {"id": "vlen-bytes"} + + @classmethod + def _check_json_v3(cls, data: CodecJSON) -> TypeGuard[VLenBytesJSON_V3]: + return data in ( + {"name": "vlen-bytes"}, + {"name": "vlen-bytes", "configuration": {}}, + "vlen-bytes", + ) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if cls._check_json_v2(data): + return cls() + raise ValueError(f"Invalid VLenBytes JSON data for Zarr format 2: {data!r}") + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if cls._check_json_v3(data): + return cls() + raise ValueError(f"Invalid VLenBytes JSON data for Zarr format 3: {data!r}") + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return self diff --git a/src/zarr/codecs/zstd.py b/src/zarr/codecs/zstd.py index 3476417d32..69620efde3 100644 --- a/src/zarr/codecs/zstd.py +++ b/src/zarr/codecs/zstd.py @@ -1,17 +1,23 @@ from __future__ import annotations import asyncio +from collections.abc import Mapping from dataclasses import dataclass from functools import cached_property -from typing import TYPE_CHECKING, Literal, Required, TypedDict +from typing import TYPE_CHECKING, Literal, Self, TypedDict, TypeGuard, overload import numcodecs from numcodecs.zstd import Zstd from packaging.version import Version -from zarr.abc.codec import BytesBytesCodec, CodecJSON_V2 +from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2, CodecValidationError from zarr.core.buffer.cpu import as_numpy_array_wrapper -from zarr.core.common import JSON, NamedConfig, ZarrFormat, parse_named_configuration +from zarr.core.common import ( + JSON, + NamedRequiredConfig, + ZarrFormat, + parse_named_configuration, +) from zarr.registry import register_codec if 
TYPE_CHECKING: @@ -20,19 +26,40 @@ from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer -class ZstdSettings(TypedDict): + +class ZstdConfig_V2(TypedDict): level: int -class ZstdJSON_V2(CodecJSON_V2[Literal["zstd"]], ZstdSettings): + +class ZstdConfig_V3(TypedDict): + level: int + checksum: bool + + +class ZstdJSON_V2(CodecJSON_V2[Literal["zstd"]], ZstdConfig_V2): """ - The JSON form of the Zstandard codec in Zarr v2. + The JSON form of the ZStandard codec in Zarr v2. """ -class ZstdJSON_V3(NamedConfig[Literal["zstd"], ZstdSettings]): + +class ZstdJSON_V3(NamedRequiredConfig[Literal["zstd"], ZstdConfig_V3]): """ - The JSON form of the GZip codec in Zarr v3. + The JSON form of the ZStandard codec in Zarr v3. """ - configuration: Required[ZstdSettings] + + +def check_json_v2(data: CodecJSON) -> TypeGuard[ZstdJSON_V2]: + return isinstance(data, Mapping) and set(data.keys()).issuperset({"id", "level"}) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[ZstdJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "zstd" + and isinstance(data["configuration"], Mapping) + and set(data["configuration"].keys()) == {"level", "checksum"} + ) def parse_zstd_level(data: JSON) -> int: if isinstance(data, int): @@ -72,12 +99,46 @@ def __init__(self, *, level: int = 0, checksum: bool = False) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: + return cls.from_json(data, zarr_format=3) _, configuration_parsed = parse_named_configuration(data, "zstd") return cls(**configuration_parsed) # type: ignore[arg-type] + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + if "checksum" in data: + return cls(level=data["level"], checksum=data["checksum"]) + else: + return cls(level=data["level"]) + + msg = ( + "Invalid Zarr V2 JSON representation of the zstd codec. 
" + f"Got {data!r}, expected a Mapping with keys ('id', 'level')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + return cls( + level=data["configuration"]["level"], checksum=data["configuration"]["checksum"] + ) + msg = ( + "Invalid Zarr V3 JSON representation of the zstd codec. " + f"Got {data!r}, expected a Mapping with keys ('name', 'configuration')" + "Where the 'configuration' key is a Mapping with keys ('level', 'checksum')" + ) + raise CodecValidationError(msg) + def to_dict(self) -> dict[str, JSON]: return {"name": "zstd", "configuration": {"level": self.level, "checksum": self.checksum}} + @overload + def to_json(self, zarr_format: Literal[2]) -> ZstdJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> ZstdJSON_V3: ... + def to_json(self, zarr_format: ZarrFormat) -> ZstdJSON_V2 | ZstdJSON_V3: if zarr_format == 2: return {"id": "zstd", "level": self.level} diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 73c87f83d1..00c2f0beca 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -19,8 +19,6 @@ ) from warnings import warn -import numcodecs -import numcodecs.abc import numpy as np from typing_extensions import deprecated @@ -28,7 +26,10 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.store import Store, set_or_delete from zarr.codecs.bytes import BytesCodec -from zarr.codecs.numcodec import Numcodec, NumcodecsWrapper +from zarr.codecs.numcodec import Numcodec +from zarr.codecs.transpose import TransposeCodec +from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec +from zarr.codecs.zstd import ZstdCodec from zarr.core._info import ArrayInfo from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, parse_array_config from zarr.core.attributes import Attributes @@ -205,8 +206,20 @@ def create_codec_pipeline(metadata: ArrayMetadata, *, store: Store | None = 
None _codecs += metadata.filters if metadata.compressor is not None: _codecs += (metadata.compressor,) - if not any(isinstance(codec, ArrayBytesCodec) for codec in _codecs): + if not any(isinstance(codec, ArrayBytesCodec) for codec in _codecs) and not isinstance( + metadata.dtype, HasObjectCodec + ): + # The role filled by the ArrayBytesCodec was implicit in zarr v2. So a valid zarr v2-style + # chain of filters + compressor might not contain a codec identifiable as an array-bytes codec. + # In such a case, we will insert a bytes codec that applies no endian transformation. + # We skip this insertion if the data type is an instance of HasObjectCodec, because + # in zarr v2 these data types required a special codec that functioned like an array bytes codec. _codecs = (BytesCodec(endian=None),) + _codecs + if metadata.order == "F": + # Zarr V2 supports declaring the order of an array in metadata. Using the zarr v3 codec + # framework, we express C or F ordered arrays by adding a transpose codec to the front + # of the list of codecs. + _codecs = (TransposeCodec(order=tuple(reversed(range(metadata.ndim)))),) + _codecs return get_pipeline_class().from_codecs(_codecs) raise TypeError # pragma: no cover @@ -609,7 +622,7 @@ async def _create( chunks: ShapeLike | None = None, dimension_separator: Literal[".", "/"] | None = None, order: MemoryOrder | None = None, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[dict[str, JSON] | Numcodec] | None = None, compressor: CompressorLike = "auto", # runtime overwrite: bool = False, @@ -1020,7 +1033,7 @@ def size(self) -> int: return np.prod(self.metadata.shape).item() @property - def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...]: + def filters(self) -> tuple[Codec, ...] | tuple[ArrayArrayCodec, ...]: """ Filters that are applied to each chunk of the array, in order, before serializing that chunk to bytes. 
@@ -1049,7 +1062,7 @@ def serializer(self) -> ArrayBytesCodec | None: @property @deprecated("Use AsyncArray.compressors instead.") - def compressor(self) -> numcodecs.abc.Codec | None: + def compressor(self) -> Numcodec | None: """ Compressor that is applied to each chunk of the array. @@ -1062,7 +1075,7 @@ def compressor(self) -> numcodecs.abc.Codec | None: raise TypeError("`compressor` is not available for Zarr format 3 arrays.") @property - def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]: + def compressors(self) -> tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...]: """ Compressors that are applied to each chunk of the array. Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. @@ -2149,7 +2162,7 @@ def fill_value(self) -> Any: return self.metadata.fill_value @property - def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...]: + def filters(self) -> tuple[Numcodec, ...] | tuple[ArrayArrayCodec, ...]: """ Filters that are applied to each chunk of the array, in order, before serializing that chunk to bytes. @@ -2165,7 +2178,7 @@ def serializer(self) -> None | ArrayBytesCodec: @property @deprecated("Use Array.compressors instead.") - def compressor(self) -> numcodecs.abc.Codec | None: + def compressor(self) -> Numcodec | None: """ Compressor that is applied to each chunk of the array. @@ -2176,7 +2189,7 @@ def compressor(self) -> numcodecs.abc.Codec | None: return self._async_array.compressor @property - def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]: + def compressors(self) -> tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...]: """ Compressors that are applied to each chunk of the array. Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. 
@@ -3820,23 +3833,21 @@ def _build_parents( FiltersLike: TypeAlias = ( - Iterable[dict[str, JSON] | ArrayArrayCodec | numcodecs.abc.Codec] + Iterable[dict[str, JSON] | ArrayArrayCodec | Numcodec] | ArrayArrayCodec - | Iterable[numcodecs.abc.Codec] - | numcodecs.abc.Codec + | Iterable[Numcodec] + | Numcodec | Literal["auto"] | None ) # Union of acceptable types for users to pass in for both v2 and v3 compressors -CompressorLike: TypeAlias = ( - dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec | Literal["auto"] | None -) +CompressorLike: TypeAlias = dict[str, JSON] | BytesBytesCodec | Numcodec | Literal["auto"] | None CompressorsLike: TypeAlias = ( - Iterable[dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec] + Iterable[dict[str, JSON] | BytesBytesCodec | Numcodec] | dict[str, JSON] | BytesBytesCodec - | numcodecs.abc.Codec + | Numcodec | Literal["auto"] | None ) @@ -4685,13 +4696,12 @@ def _parse_chunk_encoding_v2( compressor: CompressorsLike, filters: FiltersLike, dtype: ZDType[TBaseDType, TBaseScalar], -) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: +) -> tuple[tuple[Codec, ...] | None, Codec | None]: """ Generate chunk encoding classes for Zarr format 2 arrays with optional defaults. """ - default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype) - _filters: tuple[numcodecs.abc.Codec, ...] | None - _compressor: numcodecs.abc.Codec | None + _filters: tuple[Codec, ...] 
| None + _compressor: Codec | None if compressor is None or compressor == (): _compressor = None @@ -4714,12 +4724,12 @@ def _parse_chunk_encoding_v2( if _compressor is None: object_codec_id = None else: - object_codec_id = get_object_codec_id((_compressor.get_config(),)) + object_codec_id = get_object_codec_id((_compressor.to_json(zarr_format=2),)) else: object_codec_id = get_object_codec_id( ( - *[f.get_config() for f in _filters], - _compressor.get_config() if _compressor is not None else None, + *[f.to_json(zarr_format=2) for f in _filters], + _compressor.to_json(zarr_format=2) if _compressor is not None else None, ) ) if object_codec_id is None: @@ -4759,7 +4769,9 @@ def _parse_chunk_encoding_v3( maybe_array_array = (filters,) else: maybe_array_array = cast("Iterable[Codec | dict[str, JSON]]", filters) - out_array_array = tuple(_parse_array_array_codec(c) for c in maybe_array_array) + out_array_array = tuple( + _parse_array_array_codec(c, zarr_format=3) for c in maybe_array_array + ) if serializer == "auto": out_array_bytes = default_serializer_v3(dtype) @@ -4767,7 +4779,7 @@ def _parse_chunk_encoding_v3( # TODO: ensure that the serializer is compatible with the ndarray produced by the # array-array codecs. For example, if a sequence of array-array codecs produces an # array with a single-byte data type, then the serializer should not specify endiannesss. - out_array_bytes = _parse_array_bytes_codec(serializer) + out_array_bytes = _parse_array_bytes_codec(serializer, zarr_format=3) if compressors is None: out_bytes_bytes: tuple[BytesBytesCodec, ...] 
= () @@ -4780,7 +4792,9 @@ def _parse_chunk_encoding_v3( else: maybe_bytes_bytes = compressors # type: ignore[assignment] - out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes) + out_bytes_bytes = tuple( + _parse_bytes_bytes_codec(c, zarr_format=3) for c in maybe_bytes_bytes + ) # specialize codecs as needed given the dtype @@ -4813,8 +4827,6 @@ def _parse_deprecated_compressor( compressors = () else: compressors = (compressor,) - elif zarr_format == 2 and compressor == compressors == "auto": - compressors = ({"id": "blosc"},) return compressors diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 3f0d5f59bc..53bc8dbab9 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -87,7 +87,6 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: @classmethod def from_codecs(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self: array_array_codecs, array_bytes_codec, bytes_bytes_codecs = codecs_from_list(codecs) - return cls( array_array_codecs=array_array_codecs, array_bytes_codec=array_bytes_codec, @@ -219,7 +218,6 @@ async def encode_batch( zip(chunk_array_batch, chunk_specs, strict=False) ) chunk_specs = resolve_batched(aa_codec, chunk_specs) - chunk_bytes_batch = await self.array_bytes_codec.encode( zip(chunk_array_batch, chunk_specs, strict=False) ) @@ -494,7 +492,7 @@ def codecs_from_list( from zarr.codecs.numcodec import NumcodecsWrapper from zarr.codecs.sharding import ShardingCodec array_array: tuple[ArrayArrayCodec, ...] = () - array_bytes_maybe: ArrayBytesCodec | None = None + array_bytes_maybe: ArrayBytesCodec bytes_bytes: tuple[BytesBytesCodec, ...] 
= () # handle two cases @@ -516,48 +514,73 @@ def codecs_from_list( case ArrayBytesCodec(): array_bytes_idcs += ((idx,codec),) case BytesBytesCodec(): - bytes_bytes_idcs += ((idx,codec),) - case NumcodecsWrapper(): - numcodec_wrapper_idcs += ((idx,codec),) + bytes_bytes_idcs += ((idx, codec),) + case NumcodecsWrapper(): # type: ignore[union-attr] + numcodec_wrapper_idcs += ((idx, codec),) - if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(tuple(codecs)) > 1: + if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(codecs_tup) > 1: warn( "Combining a `sharding_indexed` codec disables partial reads and " "writes, which may lead to inefficient performance.", stacklevel=3, ) if len(array_bytes_idcs) == 0: + # There is no array-bytes codec. Unless we can find a numcodec wrapper to act as an + # array-bytes codec, this is an error. if len(numcodec_wrapper_idcs) == 0: - msg = f'No ArrayBytesCodec was found, that is a big error!. Got {codecs_tup} instead.' + msg = ( + f"The codecs {codecs_tup} do not include an ArrayBytesCodec or a codec castable to an " + "ArrayBytesCodec, such as a NumcodecsWrapper. This is an invalid sequence of codecs." + ) raise ValueError(msg) elif len(numcodec_wrapper_idcs) == len(codecs_tup): - # convert the last entry to an array-bytes codec, and the previous codecs to - # array-array - array_bytes_maybe = codecs_tup[-1].to_array_bytes() - array_array = tuple(c.to_array_array() for c in codecs_tup[:-1]) + # All the codecs are numcodecs wrappers. This means we have no information about which + # codec is array-array, array-bytes, and bytes-bytes, so we we just cast the numcodecs wrappers + # into a sequence of array-array codecs terminated by a single array-bytes codec. + # This choice is almost arbitrary. 
+ # It would be equally valid to convert the first codec to an array-bytes, and the remaining + # codecs to bytes-bytes, or to pick a random codec and convert it to array-bytes, then + # converting all the preceding codecs to array-array, and the following codecs to bytes-bytes. + # But we know from experience that the Zarr V2-style chunk encoding pipelines typically + # start with array-array transformations, so casting all but one of the unknown codecs + # to array-array is a safe choice. + array_bytes_maybe = codecs_tup[-1].to_array_bytes() + array_array = tuple(c.to_array_array() for c in codecs_tup[:-1]) else: + # There are no array-bytes codecs, there is at least one numcodec wrapper, but there are + # also some array-array and / or bytes-bytes codecs if len(array_array_idcs) > 0: + # There is at least one array-array codec. We will use it as a reference point for + # casting any numcodecs wrappers. last_array_array_idx = array_array_idcs[-1][0] if last_array_array_idx == len(codecs_tup) - 1: + # The last codec is an ArrayArrayCodec, but there is no ArrayBytesCodec. This + # cannot be fixed by converting numcodecs wrappers, so we raise an exception. raise ValueError( "The last codec is an ArrayArrayCodec, but there is no ArrayBytesCodec." ) - for idx, aac in enumerate(codecs_tup[:(last_array_array_idx + 1)]): - if isinstance(aac, NumcodecsWrapper): - array_array += (aac.to_array_array(),) - elif isinstance(aac, ArrayArrayCodec): + for idx, aac in enumerate(codecs_tup[: (last_array_array_idx + 1)]): + # Iterate over the codecs leading up to the last array-array codec. + if isinstance(aac, ArrayArrayCodec): + # Any array-array codec gets added to the list of array-array codecs array_array += (aac,) + elif isinstance(aac, NumcodecsWrapper): + # Any numcodecs wrapper gets converted to an array-array codec + array_array += (aac.to_array_array(),) else: - msg = ( - f"Invalid codec {aac} at index {idx}. 
Expected an ArrayArrayCodec" - ) + # Any other kind of codec is invalid and we raise an exception. + msg = f"Invalid codec {aac} at index {idx}. Expected an ArrayArrayCodec" raise TypeError(msg) if isinstance(codecs_tup[last_array_array_idx + 1], NumcodecsWrapper): + # The codec following the last array-array codec is a numcodecs wrapper. + # We will cast it to an array-bytes codec. array_bytes_maybe = codecs_tup[last_array_array_idx + 1].to_array_bytes() else: + # The codec following the last array-array codec was a bytes bytes codec, or + # something else entirely. This is invalid and we raise an exception. msg = ( f"Invalid codec {codecs_tup[last_array_array_idx + 1]} at index " f"{last_array_array_idx + 1}." @@ -565,42 +588,69 @@ def codecs_from_list( f"{type(codecs_tup[last_array_array_idx + 1])}" ) raise TypeError(msg) - for idx, rem in enumerate(codecs_tup[(last_array_array_idx + 2):]): - if isinstance(rem, NumcodecsWrapper): - bytes_bytes += (rem.to_bytes_bytes(),) - elif isinstance(rem, BytesBytesCodec): + + start = last_array_array_idx + 2 + for idx, rem in enumerate(codecs_tup[start:]): + # We have already checked the codec after the last array-array codec, so we start + # iterating over the codecs after that. + if isinstance(rem, BytesBytesCodec): bytes_bytes += (rem,) + elif isinstance(rem, NumcodecsWrapper): + bytes_bytes += (rem.to_bytes_bytes(),) else: - msg = ( - f"Invalid codec {rem} at index {idx}. Expected a BytesBytesCodec" - ) + msg = f"Invalid codec {rem} at index {start + idx}. Expected a BytesBytesCodec" raise TypeError(msg) + else: + # there are no array-array codecs, just numcodecs wrappers and bytes-bytes codecs + first_bytes_bytes_idx = bytes_bytes_idcs[0][0] + if first_bytes_bytes_idx == 0: + raise ValueError( + "The first codec is a BytesBytesCodec, but there is no ArrayBytesCodec." + ) + else: + # Iterate over all codecs. 
Cast all numcodecs wrappers to array-array codecs, until + # the codec immediately prior to the first bytes-bytes codec, which we cast to + # an array-bytes codec. All codecs after that point are cast to bytes-bytes codecs. + for idx, bb_codec in enumerate(codecs_tup): + if idx < first_bytes_bytes_idx - 1: + # This must be a numcodecs wrapper. cast it to array-array + array_array += (bb_codec.to_array_array(),) + elif idx == first_bytes_bytes_idx - 1: + array_bytes_maybe = bb_codec.to_array_bytes() + else: + if isinstance(bb_codec, BytesBytesCodec): + bytes_bytes += (bb_codec,) + elif isinstance(bb_codec, NumcodecsWrapper): + bytes_bytes += (bb_codec.to_bytes_bytes(),) + else: + msg = f"Invalid codec {bb_codec} at index {idx}. Expected a NumcodecsWrapper" + raise TypeError(msg) elif len(array_bytes_idcs) == 1: bb_idx, ab_codec = array_bytes_idcs[0] array_bytes_maybe = ab_codec - for idx, aa_codec in enumerate(codecs_tup[:bb_idx]): - if isinstance(aa_codec, NumcodecsWrapper): - array_array += (c.to_bytes_bytes(),) - elif isinstance(aa_codec, ArrayArrayCodec): + end = bb_idx + + for idx, aa_codec in enumerate(codecs_tup[:end]): + if isinstance(aa_codec, ArrayArrayCodec): array_array += (aa_codec,) + elif isinstance(aa_codec, NumcodecsWrapper): + array_array += (aa_codec.to_array_array(),) else: msg = ( f"Invalid codec {aa_codec} at index {idx}. Expected an ArrayArrayCodec" ) raise TypeError(msg) - + start = bb_idx + 1 if bb_idx < len(codecs_tup) - 1: - for idx, bb_codec in enumerate(codecs_tup[bb_idx + 1:]): + for idx, bb_codec in enumerate(codecs_tup[start:]): if isinstance(bb_codec, NumcodecsWrapper): bytes_bytes += (bb_codec.to_bytes_bytes(),) elif isinstance(bb_codec, BytesBytesCodec): bytes_bytes += (bb_codec,) else: - msg = ( - f"Invalid codec {bb_codec} at index {idx}. Expected a BytesBytesCodec" - ) + msg = f"Invalid codec {bb_codec} at index {start + idx}. 
Expected a BytesBytesCodec" raise TypeError(msg) else: raise ValueError('More than one ArrayBytes codec found, that is a big error!') diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 7276c3e43d..71b7261f71 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -69,7 +69,8 @@ class NamedConfig(TypedDict, Generic[TName, TConfig]): configuration: NotRequired[ReadOnly[TConfig]] """The configuration of the object.""" -class NamedRequiredConfig(NamedConfig[TName, TConfig]): + +class NamedRequiredConfig(TypedDict, Generic[TName, TConfig]): """ A typed dictionary representing an object with a name and configuration, where the configuration is a mapping of string keys to values, e.g. another typed dictionary or a JSON object. diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index bb820600fd..fd9b4071e4 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -9,7 +9,8 @@ from zarr.abc.codec import ArrayArrayCodec, BytesBytesCodec, Codec from zarr.abc.metadata import Metadata -from zarr.codecs.numcodec import Numcodec, NumcodecsWrapper, NumcodecsArrayArrayCodec +from zarr.codecs.numcodec import Numcodec, NumcodecsWrapper +from zarr.core.buffer.core import default_buffer_prototype from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.dtype import get_data_type_from_json from zarr.core.dtype.common import OBJECT_CODEC_IDS, DTypeSpec_V2 @@ -105,6 +106,18 @@ def __init__( fill_value_parsed = dtype.cast_scalar(fill_value) else: fill_value_parsed = fill_value + + array_spec = ArraySpec( + shape=shape_parsed, + dtype=dtype, + fill_value=fill_value_parsed, + config=ArrayConfig.from_dict({}), # TODO: config is not needed here. + prototype=default_buffer_prototype(), # TODO: prototype is not needed here. 
+ ) + if compressor_parsed is not None: + compressor_parsed = compressor_parsed.evolve_from_array_spec(array_spec) + if filters_parsed is not None: + filters_parsed = tuple(fp.evolve_from_array_spec(array_spec) for fp in filters_parsed) attributes_parsed = parse_attributes(attributes) object.__setattr__(self, "shape", shape_parsed) @@ -271,8 +284,7 @@ def parse_filters(data: object) -> tuple[ArrayArrayCodec, ...] | None: elif isinstance(val, Numcodec): out.append(NumcodecsWrapper(codec=val)) elif isinstance(val, dict): - name = val['id'] - codec = get_codec(name, {k: v for k, v in val.items() if k != 'id'}) + codec = get_codec(val, zarr_format=2) out.append(codec) else: msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead." @@ -295,15 +307,17 @@ def parse_compressor(data: object) -> Codec | NumcodecsWrapper | None: """ Parse a potential compressor. """ + # TODO: only validate the compressor in one place. currently we do it twice, once in init_array + # and again when constructing metadata if data is None or isinstance(data, Codec | NumcodecsWrapper): return data if isinstance(data, Numcodec): try: - return get_codec(data.codec_id, {k: v for k,v in data.get_config().items() if k != 'id'}) + return get_codec(data.get_config(), zarr_format=2) except KeyError: return NumcodecsWrapper(codec=data) if isinstance(data, dict): - return get_codec(data['id'], {k: v for k, v in data.items() if k != 'id'}) + return get_codec(data, zarr_format=2) msg = f"Invalid compressor. Expected None, a numcodecs.abc.Codec, or a dict representation of a numcodecs.abc.Codec. Got {type(data)} instead." 
raise ValueError(msg) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index b022a0bd65..96e5f10bdc 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -5,11 +5,9 @@ from zarr.abc.metadata import Metadata from zarr.codecs.numcodec import NumcodecsWrapper from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.dtype import ( - VariableLengthString, - ZDType, - get_data_type_from_json_v3, -) +from zarr.core.codec_pipeline import codecs_from_list +from zarr.core.dtype import VariableLengthUTF8, ZDType, get_data_type_from_json +from zarr.core.dtype.common import check_dtype_spec_v3 if TYPE_CHECKING: from typing import Self @@ -34,7 +32,6 @@ ZARR_JSON, ChunkCoords, DimensionNames, - parse_named_configuration, parse_shapelike, ) from zarr.core.config import config @@ -67,19 +64,14 @@ def parse_codecs(data: object) -> tuple[Codec, ...]: ): # Can't use Codec here because of mypy limitation out += (c,) else: - name_parsed, _config = parse_named_configuration(c, require_configuration=False) - if _config is None: - config = {} - else: - config = _config - out += (get_codec(name_parsed, config),) + out += (get_codec(c, zarr_format=3),) return out def validate_array_bytes_codec(codecs: tuple[Codec, ...]) -> ArrayBytesCodec: # ensure that we have at least one ArrayBytesCodec - abcs: list[ArrayBytesCodec] = [codec for codec in codecs if isinstance(codec, (ArrayBytesCodec, NumcodecsWrapper))] + abcs: list[ArrayBytesCodec] = [codec for codec in codecs if isinstance(codec, ArrayBytesCodec)] if len(abcs) == 0: raise ValueError("At least one ArrayBytesCodec is required.") elif len(abcs) > 1: @@ -92,7 +84,10 @@ def validate_codecs(codecs: tuple[Codec, ...], dtype: ZDType[TBaseDType, TBaseSc """Check that the codecs are valid for the given dtype""" from zarr.codecs.sharding import ShardingCodec - abc = validate_array_bytes_codec(codecs) + array_array_codecs, array_bytes_codec, bytes_bytes_codecs = 
codecs_from_list(codecs) + _codecs = (*array_array_codecs, array_bytes_codec, *bytes_bytes_codecs) + + abc = validate_array_bytes_codec(_codecs) # Recursively resolve array-bytes codecs within sharding codecs while isinstance(abc, ShardingCodec): diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 3a881c6a80..7b07a3e0a1 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -17,11 +17,12 @@ ArrayBytesCodec, BytesBytesCodec, Codec, + CodecJSON, CodecPipeline, ) - from zarr.codecs.numcodec import Numcodec, NumcodecsWrapper + from zarr.codecs.numcodec import Numcodec from zarr.core.buffer import Buffer, NDBuffer - from zarr.core.common import JSON + from zarr.core.common import JSON, ZarrFormat __all__ = [ "Registry", @@ -174,52 +175,44 @@ def _get_codec_class( raise KeyError(key) -def get_codec(name: str, configuration: Mapping[str, object]) -> Codec | NumcodecsWrapper: +def get_codec(request: CodecJSON, *, zarr_format: ZarrFormat) -> Codec: """ Get an instance of a codec from a name and a configuration """ # avoid circular import from zarr.codecs.numcodec import NumcodecsWrapper, get_numcodec_class + + codec_name: str + if zarr_format == 2: + if isinstance(request, str): + raise TypeError( + f"Invalid request type {type(request)} for zarr format 2. Expected dict, got {request!r}" + ) + codec_name = request["id"] + codec_config = {k: v for k, v in request.items() if k != "id"} + elif zarr_format == 3: + if isinstance(request, str): + codec_name = request + codec_config = {} + else: + codec_name = request["name"] + codec_config = request.get("configuration", {}) + else: + raise ValueError( + f"Invalid zarr format. 
Must be 2 or 3, got {zarr_format!r}" + ) # pragma: no cover + try: - codec_cls = get_codec_class(name) - return codec_cls(**configuration) - except KeyError as e: + codec_cls = get_codec_class(codec_name) + return codec_cls.from_json(request, zarr_format=zarr_format) + except KeyError: # if we can't find the codec in the zarr python registry, try the numcodecs registry - try: - codec_cls = get_numcodec_class(name) - return NumcodecsWrapper(codec=codec_cls.from_config(configuration)) - except KeyError: - raise KeyError(name) from e + codec = get_numcodec_class(codec_name)(**codec_config) + return NumcodecsWrapper(codec=codec) def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: return _get_codec_class(key, __codec_registries, reload_config=reload_config) - if reload_config: - _reload_config() - - if key in __codec_registries: - # logger.debug("Auto loading codec '%s' from entrypoint", codec_id) - __codec_registries[key].lazy_load() - - codec_classes = __codec_registries[key] - if not codec_classes: - raise KeyError(key) - - config_entry = config.get("codecs", {}).get(key) - if config_entry is None: - if len(codec_classes) == 1: - return next(iter(codec_classes.values())) - warnings.warn( - f"Codec '{key}' not configured in config. Selecting any implementation.", - stacklevel=2, - ) - return list(codec_classes.values())[-1] - selected_codec_cls = codec_classes[config_entry] - - if selected_codec_cls: - return selected_codec_cls - raise KeyError(key) - def _resolve_codec(data: dict[str, JSON]) -> Codec: """ @@ -229,7 +222,9 @@ def _resolve_codec(data: dict[str, JSON]) -> Codec: return get_codec_class(data["name"]).from_dict(data) # type: ignore[arg-type] -def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec | Numcodec) -> BytesBytesCodec: +def _parse_bytes_bytes_codec( + data: dict[str, JSON] | Codec | Numcodec, *, zarr_format: ZarrFormat +) -> BytesBytesCodec: """ Normalize the input to a ``BytesBytesCodec`` instance. 
If the input is already a ``BytesBytesCodec``, it is returned as is. If the input is a dict, it @@ -237,16 +232,18 @@ def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec | Numcodec) -> BytesB """ # avoid circular import, AKA a sign that this function is in the wrong place from zarr.abc.codec import BytesBytesCodec - from zarr.codecs.numcodec import Numcodec, NumcodecsBytesBytesCodec + from zarr.codecs.numcodec import Numcodec, NumcodecsBytesBytesCodec, NumcodecsWrapper result: BytesBytesCodec if isinstance(data, dict): - result = get_codec(data["name"], data["configuration"]) + result = get_codec(data, zarr_format=zarr_format) + if isinstance(result, NumcodecsWrapper): + result = result.to_bytes_bytes() if not isinstance(result, BytesBytesCodec): msg = f"Expected a dict representation of a BytesBytesCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) elif isinstance(data, Numcodec): - return NumcodecsBytesBytesCodec(_codec=data) + return NumcodecsBytesBytesCodec(codec=data) else: if not isinstance(data, BytesBytesCodec): raise TypeError(f"Expected a BytesBytesCodec. Got {type(data)} instead.") @@ -254,16 +251,21 @@ def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec | Numcodec) -> BytesB return result -def _parse_array_bytes_codec(data: dict[str, JSON] | Codec | Numcodec) -> ArrayBytesCodec: +def _parse_array_bytes_codec( + data: dict[str, JSON] | Codec | Numcodec, *, zarr_format: ZarrFormat +) -> ArrayBytesCodec: """ Normalize the input to a ``ArrayBytesCodec`` instance. If the input is already a ``ArrayBytesCodec``, it is returned as is. If the input is a dict, it is converted to a ``ArrayBytesCodec`` instance via the ``_resolve_codec`` function. 
""" from zarr.abc.codec import ArrayBytesCodec - from zarr.codecs.numcodec import Numcodec, NumcodecsArrayBytesCodec + from zarr.codecs.numcodec import Numcodec, NumcodecsArrayBytesCodec, NumcodecsWrapper + if isinstance(data, dict): - result = get_codec(data["name"], data.get("configuration", {})) + result = get_codec(data, zarr_format=zarr_format) + if isinstance(result, NumcodecsWrapper): + result = result.to_array_bytes() if not isinstance(result, ArrayBytesCodec): msg = f"Expected a dict representation of a ArrayBytesCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) @@ -276,17 +278,22 @@ def _parse_array_bytes_codec(data: dict[str, JSON] | Codec | Numcodec) -> ArrayB return result -def _parse_array_array_codec(data: dict[str, JSON] | Codec | Numcodec) -> ArrayArrayCodec: +def _parse_array_array_codec( + data: dict[str, JSON] | Codec | Numcodec, *, zarr_format: ZarrFormat +) -> ArrayArrayCodec: """ Normalize the input to a ``ArrayArrayCodec`` instance. If the input is already a ``ArrayArrayCodec``, it is returned as is. If the input is a dict, it is converted to a ``ArrayArrayCodec`` instance via the ``_resolve_codec`` function. """ from zarr.abc.codec import ArrayArrayCodec - from zarr.codecs.numcodec import Numcodec, NumcodecsArrayArrayCodec + from zarr.codecs.numcodec import Numcodec, NumcodecsArrayArrayCodec, NumcodecsWrapper + if isinstance(data, dict): - result = get_codec(data["name"], data["configuration"]) - if not isinstance(result, ArrayArrayCodec): + result = get_codec(data, zarr_format=zarr_format) + if isinstance(result, NumcodecsWrapper): + result = result.to_array_array() + elif not isinstance(result, ArrayArrayCodec): msg = f"Expected a dict representation of a ArrayArrayCodec; got a dict representation of a {type(result)} instead." 
raise TypeError(msg) elif isinstance(data, Numcodec): diff --git a/tests/test_array.py b/tests/test_array.py index fd1e1f011e..fce204ce38 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1070,8 +1070,8 @@ def test_dtype_roundtrip( (ZstdCodec(level=3),), (ZstdCodec(level=3), GzipCodec(level=0)), ZstdCodec(level=3), - {"name": "zstd", "configuration": {"level": 3}}, - ({"name": "zstd", "configuration": {"level": 3}},), + {"name": "zstd", "configuration": {"level": 3, "checksum": True}}, + ({"name": "zstd", "configuration": {"level": 3, "checksum": True}},), ], ) @pytest.mark.parametrize( @@ -1625,12 +1625,12 @@ def test_roundtrip_numcodecs() -> None: store = MemoryStore() compressors = [ - {"name": "numcodecs.shuffle", "configuration": {"elementsize": 2}}, - {"name": "numcodecs.zlib", "configuration": {"level": 4}}, + {"name": "shuffle", "configuration": {"elementsize": 2}}, + {"name": "zlib", "configuration": {"level": 4}}, ] filters = [ { - "name": "numcodecs.fixedscaleoffset", + "name": "fixedscaleoffset", "configuration": { "scale": 100.0, "offset": 0.0, diff --git a/tests/test_config.py b/tests/test_config.py index 7878ebde58..92882f4381 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -206,7 +206,18 @@ async def _encode_single(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Bu chunks=(10,), zarr_format=3, dtype="i4", - compressors=[{"name": "blosc", "configuration": {}}], + compressors=[ + { + "name": "blosc", + "configuration": { + "typesize": 1, + "cname": "lz4", + "clevel": 1, + "blocksize": 1, + "shuffle": "bitshuffle", + }, + } + ], ) arr[:] = range(100) _mock.call.assert_called() diff --git a/tests/test_group.py b/tests/test_group.py index eb243a008c..c522406db1 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -11,7 +11,6 @@ import numpy as np import pytest -from numcodecs import Blosc import zarr import zarr.api.asynchronous @@ -21,6 +20,7 @@ from zarr.abc.store import Store from zarr.core import sync_group 
from zarr.core._info import GroupInfo +from zarr.core.array import default_compressor_v2, default_compressors_v3, default_serializer_v3 from zarr.core.buffer import default_buffer_prototype from zarr.core.config import config as zarr_config from zarr.core.dtype.npy.int import UInt8 @@ -522,7 +522,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat "chunks": (1,), "order": "C", "filters": None, - "compressor": Blosc(), + "compressor": default_compressor_v2(dtype).to_json(zarr_format=zarr_format), "zarr_format": zarr_format, }, "subgroup": { @@ -549,8 +549,11 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat "name": "default", }, "codecs": ( - {"configuration": {"endian": "little"}, "name": "bytes"}, - {"configuration": {}, "name": "zstd"}, + default_serializer_v3(dtype).to_json(zarr_format=zarr_format), + *[ + c.to_json(zarr_format=zarr_format) + for c in default_compressors_v3(dtype) + ], ), "data_type": dtype.to_json(zarr_format=zarr_format), "fill_value": fill_value, diff --git a/tests/test_gzip.py b/tests/test_gzip.py index a092418615..a72210f5a9 100644 --- a/tests/test_gzip.py +++ b/tests/test_gzip.py @@ -4,6 +4,7 @@ import zarr from zarr.codecs import GzipCodec +from zarr.core.common import ZarrFormat @pytest.mark.parametrize("zarr_format", [2, 3]) diff --git a/tests/test_info.py b/tests/test_info.py index f7369b565a..b45828c2cd 100644 --- a/tests/test_info.py +++ b/tests/test_info.py @@ -72,7 +72,7 @@ def test_array_info(zarr_format: ZarrFormat) -> None: Read-only : True Store type : MemoryStore Filters : () - Serializer : BytesCodec(endian=) + Serializer : BytesCodec(endian='little') Compressors : ()""") @@ -113,7 +113,7 @@ def test_array_info_complete( Read-only : True Store type : MemoryStore Filters : () - Serializer : BytesCodec(endian=) + Serializer : BytesCodec(endian='little') Compressors : () No. bytes : {count_bytes} ({count_bytes_formatted}) No. 
bytes stored : {count_bytes_stored} ({count_bytes_stored_formatted}) diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index 395e036db2..63703c2048 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -5,7 +5,6 @@ import numpy as np import pytest -from numcodecs import Blosc import zarr.api.asynchronous import zarr.api.synchronous @@ -17,6 +16,7 @@ open, open_consolidated, ) +from zarr.core.array import default_compressor_v2 from zarr.core.buffer import cpu, default_buffer_prototype from zarr.core.dtype import parse_data_type from zarr.core.group import ConsolidatedMetadata, GroupMetadata @@ -523,7 +523,7 @@ async def test_consolidated_metadata_v2(self): attributes={"key": "a"}, chunks=(1,), fill_value=0, - compressor=Blosc(), + compressor=default_compressor_v2(dtype), order="C", ), "g1": GroupMetadata( From 2cfc84809f172663a0a036eec4e2fedbc8b473e2 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 20 Jul 2025 20:17:05 +0200 Subject: [PATCH 125/129] expand example --- examples/image_codecs.py | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/examples/image_codecs.py b/examples/image_codecs.py index bfdb81e152..3a2f8e2c26 100644 --- a/examples/image_codecs.py +++ b/examples/image_codecs.py @@ -2,26 +2,49 @@ # requires-python = ">=3.11" # dependencies = [ # "zarr @ file:///home/bennettd/dev/zarr-python/", -# "imagecodecs==2025.3.30" +# "imagecodecs==2025.3.30", +# "pytest" # ] # /// # "zarr @ git+https://github.com/zarr-developers/zarr-python.git@main", +from typing import Literal import numcodecs import numpy as np +import pytest from imagecodecs.numcodecs import Jpeg import zarr numcodecs.register_codec(Jpeg) jpg_codec = Jpeg() -store = {} -z_w = zarr.create_array( - store=store, data=np.zeros((100, 100, 3), dtype=np.uint8), serializer=jpg_codec, zarr_format=3 -) -z_r = zarr.open_array(store=store, 
zarr_format=3) +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test(zarr_format: Literal[2, 3]) -> None: + store = {} + if zarr_format == 2: + zarr.create_array( + store=store, + data=np.zeros((100, 100, 3), dtype=np.uint8), + compressors=jpg_codec, + zarr_format=zarr_format, + ) + else: + zarr.create_array( + store=store, + data=np.zeros((100, 100, 3), dtype=np.uint8), + serializer=jpg_codec, + zarr_format=zarr_format, + ) -print(z_r.metadata.to_dict()["codecs"]) + z_r = zarr.open_array(store=store, zarr_format=zarr_format) + if zarr_format == 2: + print(z_r.metadata.to_dict()["compressor"]) + else: + print(z_r.metadata.to_dict()["codecs"]) + + +if __name__ == "__main__": + pytest.main([__file__, f"-c {__file__}", "-s"]) From 60939c25a76da980963eea222a878f5c5f0e1b63 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 21 Jul 2025 10:12:07 +0200 Subject: [PATCH 126/129] revert to main --- changes/2874.feature.rst | 9 - docs/user-guide/arrays.rst | 18 +- docs/user-guide/config.rst | 24 +- docs/user-guide/data_types.rst | 423 ++++++++++++++++++++++++++------ docs/user-guide/groups.rst | 2 + docs/user-guide/index.rst | 1 - docs/user-guide/performance.rst | 3 + 7 files changed, 367 insertions(+), 113 deletions(-) delete mode 100644 changes/2874.feature.rst diff --git a/changes/2874.feature.rst b/changes/2874.feature.rst deleted file mode 100644 index 4c50532ae0..0000000000 --- a/changes/2874.feature.rst +++ /dev/null @@ -1,9 +0,0 @@ -Adds zarr-specific data type classes. This replaces the internal use of numpy data types for zarr -v2 and a fixed set of string enums for zarr v3. This change is largely internal, but it does -change the type of the ``dtype`` and ``data_type`` fields on the ``ArrayV2Metadata`` and -``ArrayV3Metadata`` classes. It also changes the JSON metadata representation of the -variable-length string data type, but the old metadata representation can still be -used when reading arrays. 
The logic for automatically choosing the chunk encoding for a given data -type has also changed, and this necessitated changes to the ``config`` API. - -For more on this new feature, see the `documentation `_ \ No newline at end of file diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index 871db60874..67b134d442 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -184,6 +184,7 @@ which can be used to print useful diagnostics, e.g.:: Type : Array Zarr format : 3 Data type : Int32(endianness='little') + Fill value : 0 Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : C @@ -201,6 +202,7 @@ prints additional diagnostics, e.g.:: Type : Array Zarr format : 3 Data type : Int32(endianness='little') + Fill value : 0 Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : C @@ -210,7 +212,7 @@ prints additional diagnostics, e.g.:: Serializer : BytesCodec(endian=) Compressors : (BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0),) No. bytes : 400000000 (381.5M) - No. bytes stored : 3558573 + No. bytes stored : 3558573 (3.4M) Storage ratio : 112.4 Chunks Initialized : 100 @@ -244,16 +246,6 @@ built-in delta filter:: >>> z.compressors (LZMA(codec_name='numcodecs.lzma', codec_config={'filters': [{'id': 3, 'dist': 4}, {'id': 33, 'preset': 1}]}),) -The default compressor can be changed by setting the value of the using Zarr's -:ref:`user-guide-config`, e.g.:: - - >>> with zarr.config.set({'array.v2_default_compressor.default': {'id': 'blosc'}}): - ... 
z = zarr.create_array(store={}, shape=(100000000,), chunks=(1000000,), dtype='int32', zarr_format=2) - >>> z.filters - () - >>> z.compressors - (Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),) - To disable compression, set ``compressors=None`` when creating an array, e.g.:: >>> z = zarr.create_array(store='data/example-8.zarr', shape=(100000000,), chunks=(1000000,), dtype='int32', compressors=None) @@ -288,6 +280,7 @@ Here is an example using a delta filter with the Blosc compressor:: Type : Array Zarr format : 3 Data type : Int32(endianness='little') + Fill value : 0 Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : C @@ -605,6 +598,7 @@ Sharded arrays can be created by providing the ``shards`` parameter to :func:`za Type : Array Zarr format : 3 Data type : UInt8() + Fill value : 0 Shape : (10000, 10000) Shard shape : (1000, 1000) Chunk shape : (100, 100) @@ -615,7 +609,7 @@ Sharded arrays can be created by providing the ``shards`` parameter to :func:`za Serializer : BytesCodec(endian=None) Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 100000000 (95.4M) - No. bytes stored : 3981473 + No. 
bytes stored : 3981473 (3.8M) Storage ratio : 25.1 Shards Initialized : 100 diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst index 4479e30619..0ae8017ca9 100644 --- a/docs/user-guide/config.rst +++ b/docs/user-guide/config.rst @@ -43,27 +43,9 @@ This is the current default configuration:: >>> zarr.config.pprint() {'array': {'order': 'C', - 'v2_default_compressor': {'default': {'checksum': False, - 'id': 'zstd', - 'level': 0}, - 'variable-length-string': {'checksum': False, - 'id': 'zstd', - 'level': 0}}, - 'v2_default_filters': {'default': None, - 'variable-length-string': [{'id': 'vlen-utf8'}]}, - 'v3_default_compressors': {'default': [{'configuration': {'checksum': False, - 'level': 0}, - 'name': 'zstd'}], - 'variable-length-string': [{'configuration': {'checksum': False, - 'level': 0}, - 'name': 'zstd'}]}, - 'v3_default_filters': {'default': [], 'variable-length-string': []}, - 'v3_default_serializer': {'default': {'configuration': {'endian': 'little'}, - 'name': 'bytes'}, - 'variable-length-string': {'name': 'vlen-utf8'}}, - 'write_empty_chunks': False}, + 'write_empty_chunks': False}, 'async': {'concurrency': 10, 'timeout': None}, - 'buffer': 'zarr.core.buffer.cpu.Buffer', + 'buffer': 'zarr.buffer.cpu.Buffer', 'codec_pipeline': {'batch_size': 1, 'path': 'zarr.core.codec_pipeline.BatchedCodecPipeline'}, 'codecs': {'blosc': 'zarr.codecs.blosc.BloscCodec', @@ -78,5 +60,5 @@ This is the current default configuration:: 'zstd': 'zarr.codecs.zstd.ZstdCodec'}, 'default_zarr_format': 3, 'json_indent': 2, - 'ndbuffer': 'zarr.core.buffer.cpu.NDBuffer', + 'ndbuffer': 'zarr.buffer.cpu.NDBuffer', 'threading': {'max_workers': None}} diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index c101ae50fc..d4b49ca43f 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -1,34 +1,56 @@ -Data types -========== +.. 
_user-guide-data-types: -Zarr's data type model +Array data types +================ + +Zarr's Data Type Model ---------------------- -Every Zarr array has a "data type", which defines the meaning and physical layout of the -array's elements. As Zarr Python is tightly integrated with `NumPy `_, -it's easy to create arrays with NumPy data types: +Zarr is designed for interoperability with NumPy, so if you are familiar with NumPy or any other +N-dimensional array library, Zarr's model for array data types should seem familiar. However, Zarr +data types have some unique features that are described in this document. -.. code-block:: python +Zarr arrays operate under an essential design constraint: unlike NumPy arrays, Zarr arrays +are designed to be stored and accessed by other Zarr implementations. This means that, among other things, +Zarr data types must be serializable to metadata documents in accordance with the Zarr specifications, +which adds some unique aspects to the Zarr data type model. - >>> import zarr - >>> import numpy as np - >>> z = zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) - >>> z - +The following sections explain Zarr's data type model in greater detail and demonstrate the +Zarr Python APIs for working with Zarr data types. + +Array Data Types +^^^^^^^^^^^^^^^^ + +Every Zarr array has a data type, which defines the meaning of the array's elements. An array's data +type is encoded in the JSON metadata for the array. This means that the data type of an array must be +JSON-serializable. -Unlike NumPy arrays, Zarr arrays are designed to accessed by Zarr -implementations in different programming languages. This means Zarr data types must be interpreted -correctly when clients read an array. Each Zarr data type defines procedures for -encoding and decoding both the data type itself, and scalars from that data type to and from Zarr array metadata. And these serialization procedures -depend on the Zarr format. 
+In Zarr V2, the data type of an array is stored in the ``dtype`` field in array metadata. +Zarr V3 changed the name of this field to ``data_type`` and also defined new rules for the values +that can be assigned to the ``data_type`` field. -Data types in Zarr version 2 ------------------------------ +For example, in Zarr V2, the boolean array data type was represented in array metadata as the +string ``"|b1"``. In Zarr V3, the same type is represented as the string ``"bool"``. + +Scalars +^^^^^^^ + +Zarr also specifies how array elements, i.e., scalars, are encoded in array metadata. This is necessary +because Zarr uses a field in array metadata to define a default value for chunks that are not stored. +This field, called ``fill_value`` in both Zarr V2 and Zarr V3 metadata documents, contains a +JSON value that can be decoded to a scalar value compatible with the array's data type. + +For the boolean data type, the scalar encoding is simple—booleans are natively supported by +JSON, so Zarr saves booleans as JSON booleans. Other scalars, like floats or raw bytes, have +more elaborate encoding schemes, and in some cases, this scheme depends on the Zarr format version. + +Data Types in Zarr Version 2 +---------------------------- Version 2 of the Zarr format defined its data types relative to `NumPy's data types `_, -and added a few non-NumPy data types as well. Thus the JSON identifier for a NumPy-compatible data -type is just the NumPy ``str`` attribute of that data type: +and added a few non-NumPy data types as well. With one exception (`structured data types <#structured-data-type>`_), the Zarr +V2 JSON identifier for a data type is just the NumPy ``str`` attribute of that data type: .. 
code-block:: python @@ -38,89 +60,196 @@ type is just the NumPy ``str`` attribute of that data type: >>> >>> store = {} >>> np_dtype = np.dtype('int64') + >>> np_dtype.str + '>> z = zarr.create_array(store=store, shape=(1,), dtype=np_dtype, zarr_format=2) >>> dtype_meta = json.loads(store['.zarray'].to_bytes())["dtype"] >>> dtype_meta '>> assert dtype_meta == np_dtype.str .. note:: + The ``<`` character in the data type metadata encodes the `endianness `_, - or "byte order", of the data type. Following NumPy's example, + or "byte order," of the data type. As per the NumPy model, in Zarr version 2 each data type has an endianness where applicable. However, Zarr version 3 data types do not store endianness information. -In addition to defining a representation of the data type itself (which in the example above was -just a simple string ``"`_, and +`"object" <#object-data-type>`_ data types. + +Structured Data Type +^^^^^^^^^^^^^^^^^^^^ + +NumPy allows the construction of a so-called "structured" data types comprised of ordered collections +of named fields, where each field is itself a distinct NumPy data type. See the NumPy documentation +`here `_. + +Crucially, NumPy does not use a special data type for structured data types—instead, NumPy +implements structured data types as an optional feature of the so-called "Void" data type, which models +arbitrary fixed-size byte strings. The ``str`` attribute of a regular NumPy void +data type is the same as the ``str`` of a NumPy structured data type. This means that the ``str`` +attribute does not convey information about the fields contained in a structured data type. +For these reasons, Zarr V2 uses a special data type encoding for structured data types. +They are stored in JSON as lists of pairs, where the first element is a string, and the second +element is a Zarr V2 data type specification. This representation supports recursion. + +For example: + +.. 
code-block:: python + + >>> store = {} + >>> np_dtype = np.dtype([('field_a', '>i2'), ('field_b', [('subfield_c', '>f4'), ('subfield_d', 'i2')])]) + >>> np_dtype.str + '|V8' + >>> z = zarr.create_array(store=store, shape=(1,), dtype=np_dtype, zarr_format=2) + >>> dtype_meta = json.loads(store['.zarray'].to_bytes())["dtype"] + >>> dtype_meta + [['field_a', '>i2'], ['field_b', [['subfield_c', '>f4'], ['subfield_d', 'M[10s]"`` in - Zarr V2. This is more compact, but can be harder to parse. +- Zarr V3 data types do not have endianness. This is a departure from Zarr V2, where multi-byte + data types are defined with endianness information. Instead, Zarr V3 requires that the endianness + of encoded array chunks is specified in the ``codecs`` attribute of array metadata. The Zarr + V3 specification leaves the in-memory endianness of decoded array chunks as an implementation detail. For more about data types in Zarr V3, see the `V3 specification `_. -Data types in Zarr Python +Data Types in Zarr Python ------------------------- -The two Zarr formats that Zarr Python supports specify data types in two different ways: -data types in Zarr version 2 are encoded as NumPy-compatible strings, while data types in Zarr version -3 are encoded as either strings or ``JSON`` objects, -and the Zarr V3 data types don't have any associated endianness information, unlike Zarr V2 data types. +The two Zarr formats that Zarr Python supports specify data types in different ways: data types in +Zarr version 2 are encoded as NumPy-compatible strings (or lists, in the case of structured data +types), while data types in Zarr V3 are encoded as either strings or JSON objects. Zarr V3 data +types do not have any associated endianness information, unlike Zarr V2 data types. + +Zarr Python needs to support both Zarr V2 and V3, which means we need to abstract over these differences. 
+We do this with an abstract Zarr data type class: `ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_, +which provides Zarr V2 and Zarr V3 compatibility routines for "native" data types. -To abstract over these syntactical and semantic differences, Zarr Python uses a class called -`ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_ provide Zarr V2 and Zarr V3 compatibility -routines for ""native" data types. In this context, a "native" data type is a Python class, -typically defined in another library, that models an array's data type. For example, ``np.uint8`` is a native -data type defined in NumPy, which Zarr Python wraps with a ``ZDType`` instance called +In this context, a "native" data type is a Python class, typically defined in another library, that +models an array's data type. For example, ``np.dtypes.UInt8DType`` is a native data type defined in NumPy. +Zarr Python wraps the NumPy ``uint8`` with a ``ZDType`` instance called `UInt8 <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_. -Each data type supported by Zarr Python is modeled by ``ZDType`` subclass, which provides an -API for the following operations: +As of this writing, the only native data types Zarr Python supports are NumPy data types. We could +avoid the "native data type" jargon and just say "NumPy data type," but we do not want to rule out the +possibility of using non-NumPy array backends in the future. -- Wrapping / unwrapping a native data type -- Encoding / decoding a data type to / from Zarr V2 and Zarr V3 array metadata. -- Encoding / decoding a scalar value to / from Zarr V2 and Zarr V3 array metadata. 
+Each data type supported by Zarr Python is modeled by a ``ZDType`` subclass, which provides an +API for the following operations: +- Encoding and decoding a native data type +- Encoding and decoding a data type to and from Zarr V2 and Zarr V3 array metadata +- Encoding and decoding a scalar value to and from Zarr V2 and Zarr V3 array metadata +- Casting a Python object to a scalar value consistent with the data type + +List of data types +^^^^^^^^^^^^^^^^^^ + +The following section lists the data types built in to Zarr Python. With a few exceptions, Zarr +Python supports nearly all of the data types in NumPy. If you need a data type that is not listed +here, it's possible to create it yourself: see :ref:`adding-new-data-types`. + +Boolean +""""""" +- `Boolean <../api/zarr/dtype/index.html#zarr.dtype.Bool>`_ + +Integral +"""""""" +- `Signed 8-bit integer <../api/zarr/dtype/index.html#zarr.dtype.Int8>`_ +- `Signed 16-bit integer <../api/zarr/dtype/index.html#zarr.dtype.Int16>`_ +- `Signed 32-bit integer <../api/zarr/dtype/index.html#zarr.dtype.Int32>`_ +- `Signed 64-bit integer <../api/zarr/dtype/index.html#zarr.dtype.Int64>`_ +- `Unsigned 8-bit integer <../api/zarr/dtype/index.html#zarr.dtype.UInt8>`_ +- `Unsigned 16-bit integer <../api/zarr/dtype/index.html#zarr.dtype.UInt16>`_ +- `Unsigned 32-bit integer <../api/zarr/dtype/index.html#zarr.dtype.UInt32>`_ +- `Unsigned 64-bit integer <../api/zarr/dtype/index.html#zarr.dtype.UInt64>`_ + +Floating-point +"""""""""""""" +- `16-bit floating-point <../api/zarr/dtype/index.html#zarr.dtype.Float16>`_ +- `32-bit floating-point <../api/zarr/dtype/index.html#zarr.dtype.Float32>`_ +- `64-bit floating-point <../api/zarr/dtype/index.html#zarr.dtype.Float64>`_ +- `64-bit complex floating-point <../api/zarr/dtype/index.html#zarr.dtype.Complex64>`_ +- `128-bit complex floating-point <../api/zarr/dtype/index.html#zarr.dtype.Complex128>`_ + +String +"""""" +- `Fixed-length UTF-32 string 
<../api/zarr/dtype/index.html#zarr.dtype.FixedLengthUTF32>`_ +- `Variable-length UTF-8 string <../api/zarr/dtype/index.html#zarr.dtype.VariableLengthUTF8>`_ + +Bytes +""""" +- `Fixed-length null-terminated bytes <../api/zarr/dtype/index.html#zarr.dtype.NullTerminatedBytes>`_ +- `Fixed-length raw bytes <../api/zarr/dtype/index.html#zarr.dtype.RawBytes>`_ +- `Variable-length bytes <../api/zarr/dtype/index.html#zarr.dtype.VariableLengthBytes>`_ + +Temporal +"""""""" +- `DateTime64 <../api/zarr/dtype/index.html#zarr.dtype.DateTime64>`_ +- `TimeDelta64 <../api/zarr/dtype/index.html#zarr.dtype.TimeDelta64>`_ + +Struct-like +""""""""""" +- `Structured <../api/zarr/dtype/index.html#zarr.dtype.Structured>`_ Example Usage -~~~~~~~~~~~~~ +^^^^^^^^^^^^^ + +This section demonstrates the basic usage of Zarr data types. Create a ``ZDType`` from a native data type: @@ -130,7 +259,7 @@ Create a ``ZDType`` from a native data type: >>> import numpy as np >>> int8 = Int8.from_native_dtype(np.dtype('int8')) -Convert back to native data type: +Convert back to a native data type: .. code-block:: python @@ -144,14 +273,27 @@ Get the default scalar value for the data type: >>> default_value = int8.default_scalar() >>> assert default_value == np.int8(0) - -Serialize to JSON for Zarr V2 and V3 +Serialize to JSON for Zarr V2: .. code-block:: python >>> json_v2 = int8.to_json(zarr_format=2) >>> json_v2 - '|i1' + {'name': '|i1', 'object_codec_id': None} + +.. note:: + + The representation returned by ``to_json(zarr_format=2)`` is more abstract than the literal contents + of Zarr V2 array metadata, because the JSON representation used by the ``ZDType`` classes must be + distinct across different data types. As noted `earlier <#object-data-type>`_, Zarr V2 identifies + multiple distinct data types with the "object" data type identifier ``"|O"``. Extra information + is needed to disambiguate these data types from one another. That's the reason for the + ``object_codec_id`` field you see here. 
+ +And for V3: + +.. code-block:: python + >>> json_v3 = int8.to_json(zarr_format=3) >>> json_v3 'int8' @@ -170,3 +312,144 @@ Deserialize a scalar value from JSON: >>> scalar_value = int8.from_json_scalar(42, zarr_format=3) >>> assert scalar_value == np.int8(42) + +.. _adding-new-data-types: + +Adding New Data Types +^^^^^^^^^^^^^^^^^^^^^ + +Each Zarr data type is a separate Python class that inherits from +`ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_. You can define a custom data type by +writing your own subclass of `ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_ and adding +your data type to the data type registry. A complete example of this process is included below. + +The source code for this example can be found in the ``examples/custom_dtype.py`` file in the Zarr +Python project directory. + +.. literalinclude:: ../../examples/custom_dtype.py + :language: python + +Data Type Resolution +^^^^^^^^^^^^^^^^^^^^ + +Although Zarr Python uses a different data type model from NumPy, you can still define a Zarr array +with a NumPy data type object: + +.. code-block:: python + + >>> from zarr import create_array + >>> import numpy as np + >>> a = create_array({}, shape=(10,), dtype=np.dtype('int')) + >>> a + + +Or a string representation of a NumPy data type: + +.. code-block:: python + + >>> a = create_array({}, shape=(10,), dtype='>> a + + +The ``Array`` object presents itself like a NumPy array, including exposing a NumPy +data type as its ``dtype`` attribute: + +.. code-block:: python + + >>> type(a.dtype) + + +But if we inspect the metadata for the array, we can see the Zarr data type object: + +.. code-block:: python + + >>> type(a.metadata.data_type) + + +This example illustrates a general problem Zarr Python has to solve: how can we allow users to +specify a data type as a string or a NumPy ``dtype`` object, and produce the right Zarr data type +from that input? We call this process "data type resolution." 
Zarr Python also performs data type +resolution when reading stored arrays, although in this case the input is a JSON value instead +of a NumPy data type. + +For simple data types like ``int``, the solution could be extremely simple: just +maintain a lookup table that maps a NumPy data type to the Zarr data type equivalent. But not all +data types are so simple. Consider this case: + +.. code-block:: python + + >>> from zarr import create_array + >>> import warnings + >>> import numpy as np + >>> warnings.simplefilter("ignore", category=FutureWarning) + >>> a = create_array({}, shape=(10,), dtype=[('a', 'f8'), ('b', 'i8')]) + >>> a.dtype # this is the NumPy data type + dtype([('a', '>> a.metadata.data_type # this is the Zarr data type + Structured(fields=(('a', Float64(endianness='little')), ('b', Int64(endianness='little')))) + +In this example, we created a +`NumPy structured data type `_. +This data type is a container that can hold any NumPy data type, which makes it recursive. It is +not possible to make a lookup table that relates all NumPy structured data types to their Zarr +equivalents, as there is a nearly unbounded number of different structured data types. So instead of +a static lookup table, Zarr Python relies on a dynamic approach to data type resolution. + +Zarr Python defines a collection of Zarr data types. This collection, called a "data type registry," +is essentially a dictionary where the keys are strings (a canonical name for each data type), and the +values are the data type classes themselves. Dynamic data type resolution entails iterating over +these data type classes, invoking that class' `from_native_dtype <#api/dtype/ZDType.from_native_dtype>`_ +method, and returning a concrete data type instance if and only if exactly one of those constructor +invocations is successful. 
+ +In plain language, we take some user input, like a NumPy data type, offer it to all the +known data type classes, and return an instance of the one data type class that can accept that user input. + +We want to avoid a situation where the same native data type matches multiple Zarr data types; that is, +a NumPy data type should *uniquely* specify a single Zarr data type. But data type resolution is +dynamic, so it's not possible to statically guarantee this uniqueness constraint. Therefore, we +attempt data type resolution against *every* data type class, and if, for some reason, a native data +type matches multiple Zarr data types, we treat this as an error and raise an exception. + +If you have a NumPy data type and you want to get the corresponding ``ZDType`` instance, you can use +the ``parse_data_type`` function, which will use the dynamic resolution described above. ``parse_data_type`` +handles a range of input types: + +- NumPy data types: + + .. code-block:: python + + >>> import numpy as np + >>> from zarr.dtype import parse_data_type + >>> my_dtype = np.dtype('>M8[10s]') + >>> parse_data_type(my_dtype, zarr_format=2) + DateTime64(endianness='big', scale_factor=10, unit='s') + + +- NumPy data type-compatible strings: + + .. code-block:: python + + >>> dtype_str = '>M8[10s]' + >>> parse_data_type(dtype_str, zarr_format=2) + DateTime64(endianness='big', scale_factor=10, unit='s') + +- ``ZDType`` instances: + + .. code-block:: python + + >>> from zarr.dtype import DateTime64 + >>> zdt = DateTime64(endianness='big', scale_factor=10, unit='s') + >>> parse_data_type(zdt, zarr_format=2) # Use a ZDType (this is a no-op) + DateTime64(endianness='big', scale_factor=10, unit='s') + +- Python dictionaries (requires ``zarr_format=3``). These dictionaries must be consistent with the + ``JSON`` form of the data type: + + .. 
code-block:: python + + >>> dt_dict = {"name": "numpy.datetime64", "configuration": {"unit": "s", "scale_factor": 10}} + >>> parse_data_type(dt_dict, zarr_format=3) + DateTime64(endianness='little', scale_factor=10, unit='s') + >>> parse_data_type(dt_dict, zarr_format=3).to_json(zarr_format=3) + {'name': 'numpy.datetime64', 'configuration': {'unit': 's', 'scale_factor': 10}} diff --git a/docs/user-guide/groups.rst b/docs/user-guide/groups.rst index f9633dd6c1..a343c3617e 100644 --- a/docs/user-guide/groups.rst +++ b/docs/user-guide/groups.rst @@ -129,6 +129,7 @@ property. E.g.:: Type : Array Zarr format : 3 Data type : Int64(endianness='little') + Fill value : 0 Shape : (1000000,) Chunk shape : (100000,) Order : C @@ -145,6 +146,7 @@ property. E.g.:: Type : Array Zarr format : 3 Data type : Float32(endianness='little') + Fill value : 0.0 Shape : (1000, 1000) Chunk shape : (100, 100) Order : C diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst index 1c0211cf15..f92c576f32 100644 --- a/docs/user-guide/index.rst +++ b/docs/user-guide/index.rst @@ -8,7 +8,6 @@ User guide installation arrays - data_types groups attributes storage diff --git a/docs/user-guide/performance.rst b/docs/user-guide/performance.rst index 4bcffc15ff..0f31e5d7be 100644 --- a/docs/user-guide/performance.rst +++ b/docs/user-guide/performance.rst @@ -92,6 +92,7 @@ To use sharding, you need to specify the ``shards`` parameter when creating the Type : Array Zarr format : 3 Data type : UInt8() + Fill value : 0 Shape : (10000, 10000, 1000) Shard shape : (1000, 1000, 1000) Chunk shape : (100, 100, 100) @@ -122,6 +123,7 @@ ratios, depending on the correlation structure within the data. E.g.:: Type : Array Zarr format : 3 Data type : Int32(endianness='little') + Fill value : 0 Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : C @@ -141,6 +143,7 @@ ratios, depending on the correlation structure within the data. 
E.g.:: Type : Array Zarr format : 3 Data type : Int32(endianness='little') + Fill value : 0 Shape : (10000, 10000) Chunk shape : (1000, 1000) Order : F From 31c95ca38cf7c7ca7f79c5825a512846ae17acb6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 21 Jul 2025 22:27:33 +0200 Subject: [PATCH 127/129] recover from bad rebase --- src/zarr/abc/codec.py | 2 +- src/zarr/api/asynchronous.py | 31 +- src/zarr/api/synchronous.py | 2 +- src/zarr/codecs/blosc.py | 40 +- src/zarr/codecs/bytes.py | 1 - src/zarr/codecs/gzip.py | 1 + src/zarr/codecs/numcodec.py | 44 +- src/zarr/codecs/sharding.py | 12 - src/zarr/codecs/vlen_utf8.py | 18 +- src/zarr/codecs/zstd.py | 13 +- src/zarr/core/_info.py | 1 + src/zarr/core/array.py | 172 ++- src/zarr/core/codec_pipeline.py | 14 +- src/zarr/core/common.py | 14 +- src/zarr/core/config.py | 25 +- src/zarr/core/dtype/__init__.py | 124 +- src/zarr/core/dtype/common.py | 166 ++- src/zarr/core/dtype/npy/bool.py | 269 +++- src/zarr/core/dtype/npy/common.py | 69 +- src/zarr/core/dtype/npy/complex.py | 313 ++++- src/zarr/core/dtype/npy/float.py | 321 ++++- src/zarr/core/dtype/npy/int.py | 1497 +++++++++++++++++---- src/zarr/core/dtype/npy/string.py | 887 ++++++++---- src/zarr/core/dtype/npy/time.py | 839 ++++++++++-- src/zarr/core/dtype/registry.py | 161 ++- src/zarr/core/dtype/wrapper.py | 336 ++--- src/zarr/core/metadata/v2.py | 43 +- src/zarr/core/metadata/v3.py | 8 +- src/zarr/dtype.py | 88 +- src/zarr/registry.py | 21 +- src/zarr/testing/strategies.py | 20 +- tests/conftest.py | 2 +- tests/package_with_entrypoint/__init__.py | 34 +- tests/test_array.py | 58 +- tests/test_config.py | 94 +- tests/test_dtype/conftest.py | 7 +- tests/test_dtype/test_npy/test_bool.py | 5 +- tests/test_dtype/test_npy/test_common.py | 12 +- tests/test_dtype/test_npy/test_complex.py | 15 +- tests/test_dtype/test_npy/test_float.py | 23 +- tests/test_dtype/test_npy/test_int.py | 50 +- tests/test_dtype/test_npy/test_string.py | 135 +- 
tests/test_dtype/test_npy/test_time.py | 26 +- tests/test_dtype/test_wrapper.py | 67 +- tests/test_dtype_registry.py | 49 +- tests/test_examples.py | 4 +- tests/test_group.py | 5 +- tests/test_gzip.py | 7 +- tests/test_image_codecs.py | 2 +- tests/test_info.py | 4 + tests/test_metadata/test_v2.py | 1 + tests/test_metadata/test_v3.py | 2 +- tests/test_properties.py | 9 +- tests/test_regression/test_regression.py | 144 -- tests/test_v2.py | 58 +- tests/test_x.py | 4 - 56 files changed, 4622 insertions(+), 1747 deletions(-) delete mode 100644 tests/test_regression/test_regression.py delete mode 100644 tests/test_x.py diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 067a05b185..c1140ee4be 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -22,7 +22,7 @@ from collections.abc import Awaitable, Callable, Iterable from typing import Self - from zarr.abc.store import ByteGetter, ByteSetter + from zarr.abc.store import ByteGetter, ByteSetter, Store from zarr.core.array_spec import ArraySpec from zarr.core.chunk_grids import ChunkGrid from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 2a17e425c8..9a380082b0 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -9,14 +9,17 @@ import numpy.typing as npt from typing_extensions import deprecated +from zarr.abc.store import Store from zarr.core.array import ( + DEFAULT_FILL_VALUE, Array, AsyncArray, - _get_default_chunk_encoding_v2, + CompressorLike, create_array, + from_array, get_array_metadata, ) -from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, ArrayConfigParams +from zarr.core.array_spec import ArrayConfigLike, parse_array_config from zarr.core.buffer import NDArrayLike from zarr.core.common import ( JSON, @@ -28,7 +31,7 @@ _default_zarr_format, _warn_write_empty_chunks_kwarg, ) -from zarr.core.dtype import ZDTypeLike, get_data_type_from_native_dtype, 
parse_data_type +from zarr.core.dtype import ZDTypeLike, get_data_type_from_native_dtype from zarr.core.group import ( AsyncGroup, ConsolidatedMetadata, @@ -36,7 +39,8 @@ create_hierarchy, ) from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata -from zarr.errors import NodeTypeValidationError +from zarr.errors import GroupNotFoundError, NodeTypeValidationError +from zarr.storage import StorePath from zarr.storage._common import make_store_path if TYPE_CHECKING: @@ -857,7 +861,7 @@ async def open_group( async def create( shape: ChunkCoords | int, *, # Note: this is a change from v2 - chunks: ChunkCoords | int | None = None, # TODO: v2 allowed chunks=True + chunks: ChunkCoords | int | bool | None = None, dtype: ZDTypeLike | None = None, compressor: CompressorLike = "auto", fill_value: Any | None = DEFAULT_FILL_VALUE, @@ -1005,21 +1009,6 @@ async def create( _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format) or _default_zarr_format() ) - zdtype = parse_data_type(dtype, zarr_format=zarr_format) - if zarr_format == 2: - if chunks is None: - chunks = shape - default_filters, default_compressor = _get_default_chunk_encoding_v2(zdtype) - if not filters: - filters = default_filters # type: ignore[assignment] - if compressor is None: - compressor = default_compressor - elif zarr_format == 3 and chunk_shape is None: # type: ignore[redundant-expr] - if chunks is not None: - chunk_shape = chunks - chunks = None - else: - chunk_shape = shape if synchronizer is not None: warnings.warn("synchronizer is not yet implemented", RuntimeWarning, stacklevel=2) @@ -1060,7 +1049,7 @@ async def create( store_path, shape=shape, chunks=chunks, - dtype=zdtype, + dtype=dtype, compressor=compressor, fill_value=fill_value, overwrite=overwrite, diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index e25835900d..4ce02e7b6d 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -602,7 +602,7 @@ 
def create( chunks: ChunkCoords | int | bool | None = None, dtype: ZDTypeLike | None = None, compressor: CompressorLike = "auto", - fill_value: Any | None = None, # TODO: need type + fill_value: Any | None = DEFAULT_FILL_VALUE, # TODO: need type order: MemoryOrder | None = None, store: str | StoreLike | None = None, synchronizer: Any | None = None, diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index d86513a6a0..f40993b201 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -3,7 +3,6 @@ import asyncio from collections.abc import Mapping from dataclasses import dataclass, replace -from enum import Enum from functools import cached_property from typing import TYPE_CHECKING, Final, Literal, NotRequired, TypedDict, TypeGuard, overload @@ -17,7 +16,6 @@ JSON, NamedRequiredConfig, ZarrFormat, - parse_named_configuration, ) from zarr.core.dtype.common import HasItemSize from zarr.registry import register_codec @@ -42,6 +40,7 @@ class BloscConfigV2(TypedDict): blocksize: int typesize: NotRequired[int] + class BloscConfigV3(TypedDict): cname: BloscCname clevel: int @@ -49,11 +48,13 @@ class BloscConfigV3(TypedDict): blocksize: int typesize: int + class BloscJSON_V2(CodecJSON_V2[Literal["blosc"]], BloscConfigV2): """ The JSON form of the Blosc codec in Zarr V2. """ + class BloscJSON_V3(NamedRequiredConfig[Literal["blosc"], BloscConfigV3]): """ The JSON form of the Blosc codec in Zarr V3. @@ -81,11 +82,10 @@ def check_json_v3(data: CodecJSON) -> TypeGuard[BloscJSON_V3]: def parse_cname(value: object) -> BloscCname: if value not in BLOSC_CNAME: - raise ValueError( - f"Value must be one of {BLOSC_CNAME}. Got {value} instead." - ) + raise ValueError(f"Value must be one of {BLOSC_CNAME}. 
Got {value} instead.") return value + # See https://zarr.readthedocs.io/en/stable/user-guide/performance.html#configuring-blosc numcodecs.blosc.use_threads = False @@ -154,21 +154,9 @@ def __init__( @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: return cls.from_json(data, zarr_format=3) - _, configuration_parsed = parse_named_configuration(data, "blosc") - return cls(**configuration_parsed) # type: ignore[arg-type] def to_dict(self) -> dict[str, JSON]: return self.to_json(zarr_format=3) - return { - "name": "blosc", - "configuration": { - "typesize": self.typesize, - "cname": self.cname, - "clevel": self.clevel, - "shuffle": self.shuffle, - "blocksize": self.blocksize, - }, - } @classmethod def _from_json_v2(cls, data: CodecJSON) -> Self: @@ -213,22 +201,23 @@ def to_json(self, zarr_format: ZarrFormat) -> BloscJSON_V2 | BloscJSON_V3: raise ValueError("typesize and blocksize need to be set for encoding.") if zarr_format == 2: return { - "id": "blosc", + "id": "blosc", "clevel": self.clevel, "cname": self.cname, "shuffle": BLOSC_SHUFFLE.index(self.shuffle), - "blocksize": self.blocksize - } + "blocksize": self.blocksize, + } elif zarr_format == 3: return { - "name": "blosc", + "name": "blosc", "configuration": { "clevel": self.clevel, "cname": self.cname, "shuffle": self.shuffle, "typesize": self.typesize, - "blocksize": self.blocksize - }} + "blocksize": self.blocksize, + }, + } raise ValueError( f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." 
) # pragma: no cover @@ -241,10 +230,7 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: if new_codec.typesize is None: new_codec = replace(new_codec, typesize=item_size) if new_codec.shuffle is None: - new_codec = replace( - new_codec, - shuffle="bitshuffle" if item_size == 1 else "shuffle"), - ) + new_codec = replace(new_codec, shuffle="bitshuffle" if item_size == 1 else "shuffle") return new_codec diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index b139bdb157..93fe2e034d 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -12,7 +12,6 @@ from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer from zarr.core.common import JSON, NamedConfig, ZarrFormat from zarr.core.dtype.common import HasEndianness -from zarr.core.dtype.npy.common import endianness_to_numpy_str from zarr.registry import register_codec if TYPE_CHECKING: diff --git a/src/zarr/codecs/gzip.py b/src/zarr/codecs/gzip.py index 5fdac6beab..c2d3bbaa55 100644 --- a/src/zarr/codecs/gzip.py +++ b/src/zarr/codecs/gzip.py @@ -36,6 +36,7 @@ def parse_gzip_level(data: JSON) -> int: class GZipConfig(TypedDict): level: int + class GZipJSON_V2(CodecJSON_V2[Literal["gzip"]], GZipConfig): """ The JSON form of the GZip codec in Zarr V2. 
diff --git a/src/zarr/codecs/numcodec.py b/src/zarr/codecs/numcodec.py index afb15b1a0d..93eedd979c 100644 --- a/src/zarr/codecs/numcodec.py +++ b/src/zarr/codecs/numcodec.py @@ -6,10 +6,8 @@ import asyncio from dataclasses import dataclass -from typing import TYPE_CHECKING, Callable, Literal, Self, TypeGuard, overload +from typing import TYPE_CHECKING, Literal, Self, TypeGuard, overload -import numcodecs -import numcodecs.registry as numcodecs_registry import numpy as np from typing_extensions import Protocol, runtime_checkable @@ -31,6 +29,7 @@ BufferOrNDArray = Buffer | np.ndarray[tuple[int, ...], np.dtype[np.generic]] | NDArrayLike + def get_numcodec_class(name: str) -> type[Numcodec]: """Obtain a numcodec codec class by name. @@ -52,6 +51,8 @@ def get_numcodec_class(name: str) -> type[Numcodec]: Zlib(level=1) """ + import numcodecs.registry as numcodecs_registry + cls = numcodecs_registry.codec_registry.get(name) if cls is None and name in numcodecs_registry.entries: cls = numcodecs_registry.entries[name].load() @@ -60,9 +61,6 @@ def get_numcodec_class(name: str) -> type[Numcodec]: return cls raise KeyError(name) -def resolve_numcodec(config: CodecJSON_V2[str]) -> Numcodec: - return numcodecs.get_codec(config) # type: ignore[no-any-return] - @runtime_checkable class Numcodec(Protocol): @@ -83,6 +81,7 @@ def get_config(self) -> CodecJSON_V2[str]: ... @classmethod def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... + def is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: """ Check if the given object implements the Numcodec protocol. Because the @runtime_checkable @@ -90,18 +89,18 @@ def is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: we need to manually check for the presence of the required attributes and methods. 
""" return ( - isinstance(obj, type) and - hasattr(obj, "codec_id") and - isinstance(obj.codec_id, str) and - hasattr(obj, "encode") and - callable(obj.encode) and - hasattr(obj, "decode") and - callable(obj.decode) and - hasattr(obj, "get_config") and - callable(obj.get_config) and - hasattr(obj, "from_config") and - callable(obj.from_config) - ) + isinstance(obj, type) + and hasattr(obj, "codec_id") + and isinstance(obj.codec_id, str) + and hasattr(obj, "encode") + and callable(obj.encode) + and hasattr(obj, "decode") + and callable(obj.decode) + and hasattr(obj, "get_config") + and callable(obj.get_config) + and hasattr(obj, "from_config") + and callable(obj.from_config) + ) @dataclass(frozen=True, kw_only=True) @@ -113,9 +112,7 @@ def to_json(self, zarr_format: Literal[2]) -> CodecJSON_V2[str]: ... @overload def to_json(self, zarr_format: Literal[3]) -> NamedConfig[str, BaseConfig]: ... - def to_json( - self, zarr_format: ZarrFormat - ) -> CodecJSON_V2[str] | NamedConfig[str, BaseConfig]: + def to_json(self, zarr_format: ZarrFormat) -> CodecJSON_V2[str] | NamedConfig[str, BaseConfig]: if zarr_format == 2: return self.codec.get_config() elif zarr_format == 3: @@ -126,7 +123,9 @@ def to_json( @classmethod def _from_json_v2(cls, data: CodecJSON) -> Self: - return cls(codec=resolve_numcodec(data)) # type: ignore[arg-type] + raise NotADirectoryError( + "This class does not support creating instances from JSON data for Zarr format 2." + ) @classmethod def _from_json_v3(cls, data: CodecJSON) -> Self: @@ -137,7 +136,6 @@ def _from_json_v3(cls, data: CodecJSON) -> Self: def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int: raise NotImplementedError - def to_array_array(self) -> NumcodecsArrayArrayCodec: """ Use the ``_codec`` attribute to create a NumcodecsArrayArrayCodec. 
diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 04d2c3554e..54a3d0bc31 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -55,7 +55,6 @@ ChunkCoordsLike, NamedRequiredConfig, parse_enum, - parse_named_configuration, parse_shapelike, product, ) @@ -453,8 +452,6 @@ def __setstate__(self, state: dict[str, Any]) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: return cls.from_json(data, zarr_format=3) - _, configuration_parsed = parse_named_configuration(data, "sharding_indexed") - return cls(**configuration_parsed) # type: ignore[arg-type] @classmethod def _from_json_v2(cls, data: CodecJSON) -> Self: @@ -493,15 +490,6 @@ def codec_pipeline(self) -> CodecPipeline: def to_dict(self) -> dict[str, JSON]: return self.to_json(zarr_format=3) - return { - "name": "sharding_indexed", - "configuration": { - "chunk_shape": self.chunk_shape, - "codecs": tuple(s.to_dict() for s in self.codecs), - "index_codecs": tuple(s.to_dict() for s in self.index_codecs), - "index_location": self.index_location.value, - }, - } @overload def to_json(self, zarr_format: Literal[2]) -> ShardingJSON_V2: ... diff --git a/src/zarr/codecs/vlen_utf8.py b/src/zarr/codecs/vlen_utf8.py index cf7a97f816..4124883ab0 100644 --- a/src/zarr/codecs/vlen_utf8.py +++ b/src/zarr/codecs/vlen_utf8.py @@ -21,14 +21,15 @@ _vlen_utf8_codec = VLenUTF8() _vlen_bytes_codec = VLenBytes() -class VlenUF8Config(TypedDict): - ... -class VLenUTF8JSON_V2(CodecJSON_V2[Literal["vlen-utf8"]]): - ... +class VlenUF8Config(TypedDict): ... + + +class VLenUTF8JSON_V2(CodecJSON_V2[Literal["vlen-utf8"]]): ... + + +class VLenUTF8JSON_V3(NamedConfig[Literal["vlen-utf8"], VlenUF8Config]): ... -class VLenUTF8JSON_V3(NamedConfig[Literal["vlen-utf8"], VlenUF8Config]): - ... class VLenBytesConfig(TypedDict): ... 
@@ -125,11 +126,6 @@ class VLenBytesCodec(ArrayBytesCodec): @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: return cls.from_json(data, zarr_format=3) - _, configuration_parsed = parse_named_configuration( - data, "vlen-bytes", require_configuration=False - ) - configuration_parsed = configuration_parsed or {} - return cls(**configuration_parsed) def to_dict(self) -> dict[str, JSON]: return {"name": "vlen-bytes", "configuration": {}} diff --git a/src/zarr/codecs/zstd.py b/src/zarr/codecs/zstd.py index 69620efde3..f942bd5700 100644 --- a/src/zarr/codecs/zstd.py +++ b/src/zarr/codecs/zstd.py @@ -16,7 +16,6 @@ JSON, NamedRequiredConfig, ZarrFormat, - parse_named_configuration, ) from zarr.registry import register_codec @@ -61,6 +60,7 @@ def check_json_v3(data: CodecJSON) -> TypeGuard[ZstdJSON_V3]: and set(data["configuration"].keys()) == {"level", "checksum"} ) + def parse_zstd_level(data: JSON) -> int: if isinstance(data, int): if data >= 23: @@ -100,8 +100,6 @@ def __init__(self, *, level: int = 0, checksum: bool = False) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: return cls.from_json(data, zarr_format=3) - _, configuration_parsed = parse_named_configuration(data, "zstd") - return cls(**configuration_parsed) # type: ignore[arg-type] @classmethod def _from_json_v2(cls, data: CodecJSON) -> Self: @@ -125,13 +123,13 @@ def _from_json_v3(cls, data: CodecJSON) -> Self: ) msg = ( "Invalid Zarr V3 JSON representation of the zstd codec. " - f"Got {data!r}, expected a Mapping with keys ('name', 'configuration')" + f"Got {data!r}, expected a Mapping with keys ('name', 'configuration') " "Where the 'configuration' key is a Mapping with keys ('level', 'checksum')" ) raise CodecValidationError(msg) def to_dict(self) -> dict[str, JSON]: - return {"name": "zstd", "configuration": {"level": self.level, "checksum": self.checksum}} + return self.to_json(zarr_format=3) @overload def to_json(self, zarr_format: Literal[2]) -> ZstdJSON_V2: ... 
@@ -143,7 +141,10 @@ def to_json(self, zarr_format: ZarrFormat) -> ZstdJSON_V2 | ZstdJSON_V3: if zarr_format == 2: return {"id": "zstd", "level": self.level} else: - return {"name": "zstd", "configuration": {"level": self.level, "checksum": self.checksum}} + return { + "name": "zstd", + "configuration": {"level": self.level, "checksum": self.checksum}, + } @cached_property def _zstd_codec(self) -> Zstd: diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 1a16a4808a..a5b14d573a 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -81,6 +81,7 @@ class ArrayInfo: _type: Literal["Array"] = "Array" _zarr_format: ZarrFormat _data_type: ZDType[TBaseDType, TBaseScalar] + _fill_value: object _shape: tuple[int, ...] _shard_shape: tuple[int, ...] | None = None _chunk_shape: tuple[int, ...] | None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 00c2f0beca..40d5f3371d 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3,7 +3,7 @@ import json import warnings from asyncio import gather -from collections.abc import Iterable +from collections.abc import Iterable, Sequence from dataclasses import dataclass, field, replace from itertools import starmap from logging import getLogger @@ -61,18 +61,18 @@ _default_zarr_format, _warn_order_kwarg, concurrent_map, - parse_order, parse_shapelike, product, ) -from zarr.core.config import categorize_data_type from zarr.core.config import config as zarr_config from zarr.core.dtype import ( + VariableLengthBytes, + VariableLengthUTF8, ZDType, ZDTypeLike, parse_data_type, ) -from zarr.core.dtype.common import HasEndianness, HasItemSize +from zarr.core.dtype.common import HasEndianness, HasItemSize, HasObjectCodec from zarr.core.indexing import ( BasicIndexer, BasicSelection, @@ -109,6 +109,7 @@ ) from zarr.core.metadata.v2 import ( CompressorLike_V2, + get_object_codec_id, parse_compressor, parse_filters, ) @@ -119,12 +120,10 @@ _parse_array_array_codec, 
_parse_array_bytes_codec, _parse_bytes_bytes_codec, - get_codec, get_pipeline_class, ) from zarr.storage._common import StorePath, ensure_no_existing_node, make_store_path from zarr.storage._utils import _relativize_path -from collections.abc import Sequence if TYPE_CHECKING: from collections.abc import Iterator @@ -741,7 +740,10 @@ def _create_metadata_v3( shape = parse_shapelike(shape) if codecs is None: - filters, serializer, compressors = _get_default_chunk_encoding_v3(dtype) + filters = default_filters_v3(dtype) + serializer = default_serializer_v3(dtype) + compressors = default_compressors_v3(dtype) + codecs_parsed = (*filters, serializer, *compressors) else: codecs_parsed = tuple(codecs) @@ -752,8 +754,9 @@ def _create_metadata_v3( else: chunk_key_encoding_parsed = chunk_key_encoding - if fill_value is None: - # v3 spec will not allow a null fill value + if isinstance(fill_value, DefaultFillValue) or fill_value is None: + # Use dtype's default scalar for DefaultFillValue sentinel + # For v3, None is converted to DefaultFillValue behavior fill_value_parsed = dtype.default_scalar() else: fill_value_parsed = fill_value @@ -829,14 +832,20 @@ def _create_metadata_v2( order: MemoryOrder, dimension_separator: Literal[".", "/"] | None = None, fill_value: Any | None = DEFAULT_FILL_VALUE, - filters: Iterable[Codec] | None = None, - compressor: Codec | None = None, + filters: Iterable[CompressorLike_V2] | None = None, + compressor: CompressorLike_V2 | None = None, attributes: dict[str, JSON] | None = None, ) -> ArrayV2Metadata: if dimension_separator is None: dimension_separator = "." 
- if fill_value is None: - fill_value = dtype.default_scalar() # type: ignore[assignment] + + # Handle DefaultFillValue sentinel + if isinstance(fill_value, DefaultFillValue): + fill_value_parsed: Any = dtype.default_scalar() + else: + # For v2, preserve None as-is (backward compatibility) + fill_value_parsed = fill_value + return ArrayV2Metadata( shape=shape, dtype=dtype, @@ -862,7 +871,7 @@ async def _create_v2( dimension_separator: Literal[".", "/"] | None = None, fill_value: Any | None = DEFAULT_FILL_VALUE, filters: Iterable[CompressorLike_V2] | None = None, - compressor: CompressorLike = "auto", + compressor: CompressorLike_V2 | Literal["auto"] = "auto", attributes: dict[str, JSON] | None = None, overwrite: bool = False, ) -> AsyncArray[ArrayV2Metadata]: @@ -876,7 +885,7 @@ async def _create_v2( compressor_parsed: CompressorLike_V2 if compressor == "auto": - _, compressor_parsed = _get_default_chunk_encoding_v2(dtype) + compressor_parsed = default_compressor_v2(dtype) else: compressor_parsed = compressor @@ -1765,6 +1774,7 @@ def _info( return ArrayInfo( _zarr_format=self.metadata.zarr_format, _data_type=self._zdtype, + _fill_value=self.metadata.fill_value, _shape=self.shape, _order=self.order, _shard_shape=self.shards, @@ -4480,30 +4490,26 @@ async def create_array( data_parsed, shape_parsed, dtype_parsed = _parse_data_params( data=data, shape=shape, dtype=dtype ) - result = await init_array( - store_path=store_path, - shape=shape_parsed, - dtype=dtype_parsed, - chunks=chunks, - shards=shards, - filters=filters, - compressors=compressors, - serializer=serializer, - fill_value=fill_value, - order=order, - zarr_format=zarr_format, - attributes=attributes, - chunk_key_encoding=chunk_key_encoding, - dimension_names=dimension_names, - overwrite=overwrite, - config=config, - ) - - if write_data is True and data_parsed is not None: - await result._set_selection( - BasicIndexer(..., shape=result.shape, chunk_grid=result.chunk_grid), - data_parsed, - 
prototype=default_buffer_prototype(), + if data_parsed is not None: + return await from_array( + store, + data=data_parsed, + write_data=write_data, + name=name, + chunks=chunks, + shards=shards, + filters=filters, + compressors=compressors, + serializer=serializer, + fill_value=fill_value, + order=order, + zarr_format=zarr_format, + attributes=attributes, + chunk_key_encoding=chunk_key_encoding, + dimension_names=dimension_names, + storage_options=storage_options, + overwrite=overwrite, + config=config, ) else: mode: Literal["a"] = "a" @@ -4649,46 +4655,74 @@ def _parse_chunk_key_encoding( return result -def _get_default_chunk_encoding_v3( - dtype: ZDType[TBaseDType, TBaseScalar], -) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: +def default_filters_v3(dtype: ZDType[Any, Any]) -> tuple[ArrayArrayCodec, ...]: """ - Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype. + Given a data type, return the default filters for that data type. + + This is an empty tuple. No data types have default filters. """ + return () - dtype_category = categorize_data_type(dtype) - filters = zarr_config.get("array.v3_default_filters").get(dtype_category) - compressors = zarr_config.get("array.v3_default_compressors").get(dtype_category) - serializer = zarr_config.get("array.v3_default_serializer").get(dtype_category) +def default_compressors_v3(dtype: ZDType[Any, Any]) -> tuple[BytesBytesCodec, ...]: + """ + Given a data type, return the default compressors for that data type. - return ( - tuple(_parse_array_array_codec(f) for f in filters), - _parse_array_bytes_codec(serializer), - tuple(_parse_bytes_bytes_codec(c) for c in compressors), - ) + This is just a tuple containing ``ZstdCodec`` + """ + return (ZstdCodec(),) -def _get_default_chunk_encoding_v2( - dtype: ZDType[TBaseDType, TBaseScalar], -) -> tuple[tuple[Codec | NumcodecsWrapper, ...] 
| None, Codec | NumcodecsWrapper | None]: +def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec: + """ + Given a data type, return the default serializer for that data type. + + The default serializer for most data types is the ``BytesCodec``, which may or may not be + parameterized with an endianness, depending on whether the data type has endianness. Variable + length strings and variable length bytes have hard-coded serializers -- ``VLenUTF8Codec`` and + ``VLenBytesCodec``, respectively. + + """ + serializer: ArrayBytesCodec = BytesCodec(endian=None) + + if isinstance(dtype, HasEndianness): + serializer = BytesCodec(endian="little") + elif isinstance(dtype, HasObjectCodec): + if dtype.object_codec_id == "vlen-bytes": + serializer = VLenBytesCodec() + elif dtype.object_codec_id == "vlen-utf8": + serializer = VLenUTF8Codec() + else: + msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}." + raise ValueError(msg) + return serializer + + +def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[Codec] | None: """ Given a data type, return the default filters for that data type. - This is an empty tuple. No data types have default filters. + For data types that require an object codec, namely variable length data types, + this is a tuple containing the object codec. Otherwise it's ``None``. 
""" - dtype_category = categorize_data_type(dtype) - filters_config = zarr_config.get("array.v2_default_filters").get(dtype_category) - compressor_config = zarr_config.get("array.v2_default_compressor").get(dtype_category) - if compressor_config is not None: - compressor = get_codec(compressor_config["name"], compressor_config.get("configuration", {})) - else: - compressor = None - if filters_config is not None: - filters = tuple(get_codec(f['name'], f.get('configuration', {})) for f in filters_config) - else: - filters = None - return filters, compressor + if isinstance(dtype, HasObjectCodec): + if dtype.object_codec_id == "vlen-bytes": + return (VLenBytesCodec(),) + elif dtype.object_codec_id == "vlen-utf8": + return (VLenUTF8Codec(),) + else: + msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}." + raise ValueError(msg) + return None + + +def default_compressor_v2(dtype: ZDType[Any, Any]) -> BytesBytesCodec: + """ + Given a data type, return the default compressors for that data type. + + This is just the ``Zstd`` codec. + """ + return ZstdCodec() def _parse_chunk_encoding_v2( @@ -4706,7 +4740,7 @@ def _parse_chunk_encoding_v2( if compressor is None or compressor == (): _compressor = None elif compressor == "auto": - _compressor = default_compressor + _compressor = default_compressor_v2(dtype) elif isinstance(compressor, Sequence) and len(compressor) == 1: _compressor = parse_compressor(compressor[0]) else: diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 53bc8dbab9..e4141d0087 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -491,6 +491,7 @@ def codecs_from_list( ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: from zarr.codecs.numcodec import NumcodecsWrapper from zarr.codecs.sharding import ShardingCodec + array_array: tuple[ArrayArrayCodec, ...] 
= () array_bytes_maybe: ArrayBytesCodec bytes_bytes: tuple[BytesBytesCodec, ...] = () @@ -510,9 +511,9 @@ def codecs_from_list( for idx, codec in enumerate(codecs_tup): match codec: case ArrayArrayCodec(): - array_array_idcs += ((idx,codec),) + array_array_idcs += ((idx, codec),) case ArrayBytesCodec(): - array_bytes_idcs += ((idx,codec),) + array_bytes_idcs += ((idx, codec),) case BytesBytesCodec(): bytes_bytes_idcs += ((idx, codec),) case NumcodecsWrapper(): # type: ignore[union-attr] @@ -586,7 +587,7 @@ def codecs_from_list( f"{last_array_array_idx + 1}." "Expected a NumcodecsWrapper or an ArrayBytesCodec, got " f"{type(codecs_tup[last_array_array_idx + 1])}" - ) + ) raise TypeError(msg) start = last_array_array_idx + 2 @@ -638,9 +639,7 @@ def codecs_from_list( elif isinstance(aa_codec, NumcodecsWrapper): array_array += (aa_codec.to_array_array(),) else: - msg = ( - f"Invalid codec {aa_codec} at index {idx}. Expected an ArrayArrayCodec" - ) + msg = f"Invalid codec {aa_codec} at index {idx}. Expected an ArrayArrayCodec" raise TypeError(msg) start = bb_idx + 1 if bb_idx < len(codecs_tup) - 1: @@ -653,7 +652,7 @@ def codecs_from_list( msg = f"Invalid codec {bb_codec} at index {start + idx}. 
Expected a BytesBytesCodec" raise TypeError(msg) else: - raise ValueError('More than one ArrayBytes codec found, that is a big error!') + raise ValueError("More than one ArrayBytes codec found, that is a big error!") return array_array, array_bytes_maybe, bytes_bytes @@ -694,7 +693,6 @@ def codecs_from_list( ) bytes_bytes += (cur_codec,) elif isinstance(cur_codec, NumcodecsWrapper): - raise TypeError if array_bytes_maybe is None: diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 71b7261f71..e0f279adc3 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -10,6 +10,7 @@ from typing import ( TYPE_CHECKING, Any, + Final, Generic, Literal, NotRequired, @@ -52,7 +53,7 @@ TConfig_co = TypeVar("TConfig_co", bound=BaseConfig, covariant=True) -class NamedConfig(TypedDict, Generic[TName, TConfig]): +class NamedConfig(TypedDict, Generic[TName_co, TConfig_co]): """ A typed dictionary representing an object with a name and configuration, where the configuration is a mapping of string keys to values, e.g. another typed dictionary or a JSON object. @@ -63,14 +64,14 @@ class NamedConfig(TypedDict, Generic[TName, TConfig]): The configuration key is not required. """ - name: ReadOnly[TName] + name: ReadOnly[TName_co] """The name of the object.""" - configuration: NotRequired[ReadOnly[TConfig]] + configuration: NotRequired[ReadOnly[TConfig_co]] """The configuration of the object.""" -class NamedRequiredConfig(TypedDict, Generic[TName, TConfig]): +class NamedRequiredConfig(TypedDict, Generic[TName_co, TConfig_co]): """ A typed dictionary representing an object with a name and configuration, where the configuration is a mapping of string keys to values, e.g. another typed dictionary or a JSON object. @@ -81,12 +82,13 @@ class NamedRequiredConfig(TypedDict, Generic[TName, TConfig]): The configuration key is required. 
""" - name: ReadOnly[TName] + name: ReadOnly[TName_co] """The name of the object.""" - configuration: Required[ReadOnly[TConfig]] + configuration: Required[ReadOnly[TConfig_co]] """The configuration of the object.""" + def product(tup: ChunkCoords) -> int: return functools.reduce(operator.mul, tup, 1) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 020ea12c7a..cc3c33cd17 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -106,27 +106,6 @@ def enable_gpu(self) -> ConfigSet: "array": { "order": "C", "write_empty_chunks": False, - "v2_default_compressor": { - "default": {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, - "variable-length-string": {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, - }, - "v2_default_filters": { - "default": None, - "variable-length-string": [{"name": "vlen-utf8"}], - }, - "v3_default_filters": {"default": [], "variable-length-string": []}, - "v3_default_serializer": { - "default": {"name": "bytes", "configuration": {"endian": "little"}}, - "variable-length-string": {"name": "vlen-utf8"}, - }, - "v3_default_compressors": { - "default": [ - {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, - ], - "variable-length-string": [ - {"name": "zstd", "configuration": {"level": 0, "checksum": False}} - ], - }, }, "async": {"concurrency": 10, "timeout": None}, "threading": {"max_workers": None}, @@ -169,8 +148,8 @@ def categorize_data_type(dtype: ZDType[Any, Any]) -> DTypeCategory: This is used by the config system to determine how to encode arrays with the associated data type when the user has not specified a particular serialization scheme. 
""" - from zarr.core.dtype import VariableLengthString + from zarr.core.dtype import VariableLengthUTF8 - if isinstance(dtype, VariableLengthString): + if isinstance(dtype, VariableLengthUTF8): return "variable-length-string" return "default" diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 575086cb6f..aadf127c9b 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -2,17 +2,33 @@ from typing import TYPE_CHECKING, Final, TypeAlias -from zarr.core.dtype.common import DataTypeValidationError +from zarr.core.dtype.common import ( + DataTypeValidationError, + DTypeJSON, +) from zarr.core.dtype.npy.bool import Bool +from zarr.core.dtype.npy.bytes import ( + NullTerminatedBytes, + NullterminatedBytesJSON_V2, + NullTerminatedBytesJSON_V3, + RawBytes, + RawBytesJSON_V2, + RawBytesJSON_V3, + VariableLengthBytes, + VariableLengthBytesJSON_V2, +) from zarr.core.dtype.npy.complex import Complex64, Complex128 from zarr.core.dtype.npy.float import Float16, Float32, Float64 from zarr.core.dtype.npy.int import Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 -from zarr.core.dtype.npy.sized import ( - FixedLengthBytes, - Structured, +from zarr.core.dtype.npy.structured import Structured, StructuredJSON_V2, StructuredJSON_V3 +from zarr.core.dtype.npy.time import ( + DateTime64, + DateTime64JSON_V2, + DateTime64JSON_V3, + TimeDelta64, + TimeDelta64JSON_V2, + TimeDelta64JSON_V3, ) -from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 -from zarr.core.dtype.npy.vlen_bytes import VariableLengthBytes if TYPE_CHECKING: from zarr.core.common import ZarrFormat @@ -24,9 +40,11 @@ from zarr.core.common import JSON from zarr.core.dtype.npy.string import ( - FixedLengthASCII, FixedLengthUTF32, - VariableLengthString, + FixedLengthUTF32JSON_V2, + FixedLengthUTF32JSON_V3, + VariableLengthUTF8, + VariableLengthUTF8JSON_V2, ) from zarr.core.dtype.registry import DataTypeRegistry from zarr.core.dtype.wrapper import 
TBaseDType, TBaseScalar, ZDType @@ -38,9 +56,11 @@ "DataTypeRegistry", "DataTypeValidationError", "DateTime64", - "FixedLengthASCII", - "FixedLengthBytes", + "DateTime64JSON_V2", + "DateTime64JSON_V3", "FixedLengthUTF32", + "FixedLengthUTF32JSON_V2", + "FixedLengthUTF32JSON_V3", "Float16", "Float32", "Float64", @@ -48,16 +68,29 @@ "Int16", "Int32", "Int64", + "NullTerminatedBytes", + "NullTerminatedBytesJSON_V3", + "NullterminatedBytesJSON_V2", + "RawBytes", + "RawBytesJSON_V2", + "RawBytesJSON_V3", "Structured", + "StructuredJSON_V2", + "StructuredJSON_V3", "TBaseDType", "TBaseScalar", "TimeDelta64", "TimeDelta64", + "TimeDelta64JSON_V2", + "TimeDelta64JSON_V3", "UInt8", "UInt16", "UInt32", "UInt64", - "VariableLengthString", + "VariableLengthBytes", + "VariableLengthBytesJSON_V2", + "VariableLengthUTF8", + "VariableLengthUTF8JSON_V2", "ZDType", "data_type_registry", "parse_data_type", @@ -74,19 +107,22 @@ ComplexFloatDType = Complex64 | Complex128 COMPLEX_FLOAT_DTYPE: Final = Complex64, Complex128 -StringDType = FixedLengthUTF32 | VariableLengthString | FixedLengthASCII -STRING_DTYPE: Final = FixedLengthUTF32, VariableLengthString, FixedLengthASCII +StringDType = FixedLengthUTF32 | VariableLengthUTF8 +STRING_DTYPE: Final = FixedLengthUTF32, VariableLengthUTF8 TimeDType = DateTime64 | TimeDelta64 TIME_DTYPE: Final = DateTime64, TimeDelta64 +BytesDType = RawBytes | NullTerminatedBytes | VariableLengthBytes +BYTES_DTYPE: Final = RawBytes, NullTerminatedBytes, VariableLengthBytes + AnyDType = ( Bool | IntegerDType | FloatDType | ComplexFloatDType | StringDType - | FixedLengthBytes + | BytesDType | Structured | TimeDType | VariableLengthBytes @@ -99,12 +135,18 @@ *FLOAT_DTYPE, *COMPLEX_FLOAT_DTYPE, *STRING_DTYPE, - FixedLengthBytes, + *BYTES_DTYPE, Structured, *TIME_DTYPE, VariableLengthBytes, ) +# These are aliases for variable-length UTF-8 strings +# We handle them when a user requests a data type instead of using NumPy's dtype inferece because +# the default NumPy 
behavior -- to inspect the user-provided array data and choose +# an appropriately sized U dtype -- is unworkable for Zarr. +VLEN_UTF8_ALIAS: Final = ("str", str, "string") + # This type models inputs that can be coerced to a ZDType ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | Mapping[str, JSON] | str @@ -130,29 +172,59 @@ def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[TBaseDType, return data_type_registry.match_dtype(dtype=na_dtype) -def get_data_type_from_json_v3( - dtype_spec: JSON, +def get_data_type_from_json( + dtype_spec: DTypeJSON, *, zarr_format: ZarrFormat ) -> ZDType[TBaseDType, TBaseScalar]: - return data_type_registry.match_json_v3(dtype_spec) - - -def get_data_type_from_json_v2( - dtype_spec: JSON, *, object_codec_id: str | None = None -) -> ZDType[TBaseDType, TBaseScalar]: - return data_type_registry.match_json_v2(dtype_spec, object_codec_id=object_codec_id) + """ + Given a JSON representation of a data type and a Zarr format version, + attempt to create a ZDType instance from the registered ZDType classes. + """ + return data_type_registry.match_json(dtype_spec, zarr_format=zarr_format) def parse_data_type( - dtype_spec: ZDTypeLike, *, zarr_format: ZarrFormat, object_codec_id: str | None = None + dtype_spec: ZDTypeLike, + *, + zarr_format: ZarrFormat, ) -> ZDType[TBaseDType, TBaseScalar]: """ Interpret the input as a ZDType instance. + + Parameters + ---------- + dtype_spec : ZDTypeLike + The input to be interpreted as a ZDType instance. This could be a native data type + (e.g., a NumPy data type), a Python object that can be converted into a native data type, + a ZDType instance (in which case the input is returned unchanged), or a JSON object + representation of a data type. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + ZDType[TBaseDType, TBaseScalar] + The ZDType instance corresponding to the input. 
+ + Examples + -------- + >>> from zarr.dtype import parse_data_type + >>> import numpy as np + >>> parse_data_type("int32", zarr_format=2) + Int32(endianness='little') + >>> parse_data_type(np.dtype('S10'), zarr_format=2) + NullTerminatedBytes(length=10) + >>> parse_data_type({"name": "numpy.datetime64", "configuration": {"unit": "s", "scale_factor": 10}}, zarr_format=3) + DateTime64(endianness='little', scale_factor=10, unit='s') """ if isinstance(dtype_spec, ZDType): return dtype_spec # dict and zarr_format 3 means that we have a JSON object representation of the dtype if zarr_format == 3 and isinstance(dtype_spec, Mapping): - return get_data_type_from_json_v3(dtype_spec) # type: ignore[arg-type] + return get_data_type_from_json(dtype_spec, zarr_format=3) + if dtype_spec in VLEN_UTF8_ALIAS: + # If the dtype request is one of the aliases for variable-length UTF-8 strings, + # return that dtype. + return VariableLengthUTF8() # type: ignore[return-value] # otherwise, we have either a numpy dtype string, or a zarr v3 dtype string, and in either case # we can create a numpy dtype from it, and do the dtype inference from that return get_data_type_from_native_dtype(dtype_spec) # type: ignore[arg-type] diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index bbdc06c50d..3cc31df9e3 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -1,39 +1,187 @@ from __future__ import annotations import warnings +from collections.abc import Mapping, Sequence from dataclasses import dataclass -from typing import ClassVar, Final, Literal +from typing import ( + ClassVar, + Final, + Generic, + Literal, + TypedDict, + TypeGuard, + TypeVar, +) + +from typing_extensions import ReadOnly + +from zarr.core.common import NamedConfig + +EndiannessStr = Literal["little", "big"] +ENDIANNESS_STR: Final = "little", "big" -Endianness = Literal["little", "big"] SpecialFloatStrings = Literal["NaN", "Infinity", "-Infinity"] SPECIAL_FLOAT_STRINGS: 
Final = ("NaN", "Infinity", "-Infinity") + JSONFloatV2 = float | SpecialFloatStrings JSONFloatV3 = float | SpecialFloatStrings | str +ObjectCodecID = Literal["vlen-utf8", "vlen-bytes", "vlen-array", "pickle", "json2", "msgpack2"] +# These are the ids of the known object codecs for zarr v2. +OBJECT_CODEC_IDS: Final = ("vlen-utf8", "vlen-bytes", "vlen-array", "pickle", "json2", "msgpack2") + +# This is a wider type than our standard JSON type because we need +# to work with typeddict objects which are assignable to Mapping[str, object] +DTypeJSON = str | int | float | Sequence["DTypeJSON"] | None | Mapping[str, object] + +# The DTypeJSON_V2 type exists because ZDType.from_json takes a single argument, which must contain +# all the information necessary to decode the data type. Zarr v2 supports multiple distinct +# data types that all used the "|O" data type identifier. These data types can only be +# discriminated on the basis of their "object codec", i.e. a special data type specific +# compressor or filter. So to figure out what data type a zarr v2 array has, we need the +# data type identifier from metadata, as well as an object codec id if the data type identifier +# is "|O". +# So we will pack the name of the dtype alongside the name of the object codec id, if applicable, +# in a single dict, and pass that to the data type inference logic. +# These type variables have a very wide bound because the individual zdtype +# classes can perform a very specific type check. 
+
+# This is the JSON representation of a structured dtype in zarr v2
+StructuredName_V2 = Sequence["str | StructuredName_V2"]
+
+# This models the type of the name a dtype might have in zarr v2 array metadata
+DTypeName_V2 = StructuredName_V2 | str
+
+TDTypeNameV2_co = TypeVar("TDTypeNameV2_co", bound=DTypeName_V2, covariant=True)
+TObjectCodecID_co = TypeVar("TObjectCodecID_co", bound=None | str, covariant=True)
+
+
+class DTypeConfig_V2(TypedDict, Generic[TDTypeNameV2_co, TObjectCodecID_co]):
+    name: ReadOnly[TDTypeNameV2_co]
+    object_codec_id: ReadOnly[TObjectCodecID_co]
+
+
+DTypeSpec_V2 = DTypeConfig_V2[DTypeName_V2, None | str]
+
+
+def check_structured_dtype_v2_inner(data: object) -> TypeGuard[StructuredName_V2]:
+    """
+    A type guard for the inner elements of a structured dtype. This is a recursive check because
+    the type is itself recursive.
+
+    This check ensures that all the elements are 2-element sequences beginning with a string
+    and ending with either another string or another 2-element sequence beginning with a string and
+    ending with another instance of that type.
+    """
+    if isinstance(data, (str, Mapping)):
+        return False
+    if not isinstance(data, Sequence):
+        return False
+    if len(data) != 2:
+        return False
+    if not (isinstance(data[0], str)):
+        return False
+    if isinstance(data[-1], str):
+        return True
+    elif isinstance(data[-1], Sequence):
+        return check_structured_dtype_v2_inner(data[-1])
+    return False
+
+
+def check_structured_dtype_name_v2(data: Sequence[object]) -> TypeGuard[StructuredName_V2]:
+    """
+    Check that all the elements of a sequence are valid zarr v2 structured dtype identifiers
+    """
+    return all(check_structured_dtype_v2_inner(d) for d in data)
+
+
+def check_dtype_name_v2(data: object) -> TypeGuard[DTypeName_V2]:
+    """
+    Type guard for narrowing the type of a python object to a valid zarr v2 dtype name.
+ """ + if isinstance(data, str): + return True + elif isinstance(data, Sequence): + return check_structured_dtype_name_v2(data) + return False + + +def check_dtype_spec_v2(data: object) -> TypeGuard[DTypeSpec_V2]: + """ + Type guard for narrowing a python object to an instance of DTypeSpec_V2 + """ + if not isinstance(data, Mapping): + return False + if set(data.keys()) != {"name", "object_codec_id"}: + return False + if not check_dtype_name_v2(data["name"]): + return False + return isinstance(data["object_codec_id"], str | None) + + +# By comparison, The JSON representation of a dtype in zarr v3 is much simpler. +# It's either a string, or a structured dict +DTypeSpec_V3 = str | NamedConfig[str, Mapping[str, object]] + + +def check_dtype_spec_v3(data: object) -> TypeGuard[DTypeSpec_V3]: + """ + Type guard for narrowing the type of a python object to an instance of + DTypeSpec_V3, i.e either a string or a dict with a "name" field that's a string and a + "configuration" field that's a mapping with string keys. + """ + if isinstance(data, str) or ( # noqa: SIM103 + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and isinstance(data["configuration"], Mapping) + and all(isinstance(k, str) for k in data["configuration"]) + ): + return True + return False + + +def unpack_dtype_json(data: DTypeSpec_V2 | DTypeSpec_V3) -> DTypeJSON: + """ + Return the array metadata form of the dtype JSON representation. For the Zarr V3 form of dtype + metadata, this is a no-op. For the Zarr V2 form of dtype metadata, this unpacks the dtype name. + """ + if isinstance(data, Mapping) and set(data.keys()) == {"name", "object_codec_id"}: + return data["name"] + return data + class DataTypeValidationError(ValueError): ... -@dataclass(frozen=True) +class ScalarTypeValidationError(ValueError): ... 
+ + +@dataclass(frozen=True, kw_only=True) class HasLength: """ A mix-in class for data types with a length attribute, such as fixed-size collections of unicode strings, or bytes. + + Attributes + ---------- + length : int + The length of the scalars belonging to this data type. Note that this class does not assign + a unit to the length. Child classes may assign units. """ length: int -@dataclass(frozen=True) +@dataclass(frozen=True, kw_only=True) class HasEndianness: """ A mix-in class for data types with an endianness attribute """ - endianness: Endianness | None = "little" + endianness: EndiannessStr = "little" -@dataclass(frozen=True) +@dataclass(frozen=True, kw_only=True) class HasItemSize: """ A mix-in class for data types with an item size attribute. @@ -46,7 +194,7 @@ def item_size(self) -> int: raise NotImplementedError -@dataclass(frozen=True) +@dataclass(frozen=True, kw_only=True) class HasObjectCodec: """ A mix-in class for data types that require an object codec id. @@ -77,9 +225,9 @@ def v3_unstable_dtype_warning(dtype: object) -> None: """ msg = ( f"The data type ({dtype}) does not have a Zarr V3 specification. " - "That means that the representation of data saved with this data type may change without " + "That means that the representation of arrays saved with this data type may change without " "warning in a future version of Zarr Python. " - "Arrays stored with this data type may be unreadable by other Zarr libraries " + "Arrays stored with this data type may be unreadable by other Zarr libraries. " "Use this data type at your own risk! " "Check https://github.com/zarr-developers/zarr-extensions/tree/main/data-types for the " "status of data type specifications for Zarr V3." 
diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index b1800127e8..37371cd0cd 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -1,69 +1,256 @@ +from __future__ import annotations + from dataclasses import dataclass -from typing import ClassVar, Literal, Self, TypeGuard, overload +from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypeGuard, overload import numpy as np -from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import HasItemSize -from zarr.core.dtype.npy.common import check_json_bool -from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType +from zarr.core.dtype.common import ( + DataTypeValidationError, + DTypeConfig_V2, + DTypeJSON, + HasItemSize, + check_dtype_spec_v2, +) +from zarr.core.dtype.wrapper import TBaseDType, ZDType + +if TYPE_CHECKING: + from zarr.core.common import JSON, ZarrFormat @dataclass(frozen=True, kw_only=True, slots=True) class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): """ - Wrapper for numpy boolean dtype. + A Zarr data type for arrays containing booleans. + + Wraps the ``np.dtypes.BoolDType`` data type. Scalars for this data type are instances of + ``np.bool_``. Attributes ---------- - name : str - The name of the dtype. - dtype_cls : ClassVar[type[np.dtypes.BoolDType]] - The numpy dtype class. + + _zarr_v3_name : Literal["bool"] = "bool" + The Zarr v3 name of the dtype. + _zarr_v2_name : ``Literal["|b1"]`` = ``"|b1"`` + The Zarr v2 name of the dtype, which is also a string representation + of the boolean dtype used by NumPy. + dtype_cls : ClassVar[type[np.dtypes.BoolDType]] = np.dtypes.BoolDType + The NumPy dtype class. + + References + ---------- + This class implements the boolean data type defined in Zarr V2 and V3. + + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. 
""" _zarr_v3_name: ClassVar[Literal["bool"]] = "bool" - _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|b1",) + _zarr_v2_name: ClassVar[Literal["|b1"]] = "|b1" dtype_cls = np.dtypes.BoolDType @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - return cls() + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create an instance of Bool from an instance of np.dtypes.BoolDType. + + Parameters + ---------- + dtype : TBaseDType + The NumPy boolean dtype instance to convert. + + Returns + ------- + Bool + An instance of Bool. + + Raises + ------ + DataTypeValidationError + If the provided dtype is not compatible with this ZDType. + """ + if cls._check_native_dtype(dtype): + return cls() + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" + ) def to_native_dtype(self: Self) -> np.dtypes.BoolDType: + """ + Create a NumPy boolean dtype instance from this ZDType. + + Returns + ------- + np.dtypes.BoolDType + The NumPy boolean dtype. + """ return self.dtype_cls() @classmethod - def check_json_v2( - cls, data: JSON, *, object_codec_id: str | None = None - ) -> TypeGuard[Literal["|b1"]]: + def _check_json_v2( + cls, + data: DTypeJSON, + ) -> TypeGuard[DTypeConfig_V2[Literal["|b1"], None]]: """ - Check that the input is a valid JSON representation of a bool. + Check that the input is a valid JSON representation of a Bool. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + ``TypeGuard[DTypeConfig_V2[Literal["|b1"], None]]`` + True if the input is a valid JSON representation, False otherwise. 
""" - return data in cls._zarr_v2_names + return ( + check_dtype_spec_v2(data) + and data["name"] == cls._zarr_v2_name + and data["object_codec_id"] is None + ) @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[Literal["bool"]]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["bool"]]: + """ + Check that the input is a valid JSON representation of this class in Zarr V3. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + bool + True if the input is a valid JSON representation, False otherwise. + """ return data == cls._zarr_v3_name + @classmethod + def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of Bool from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Bool + An instance of Bool. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. + """ + if cls._check_json_v2(data): + return cls() + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_name!r}" + raise DataTypeValidationError(msg) + + @classmethod + def _from_json_v3(cls: type[Self], data: DTypeJSON) -> Self: + """ + Create an instance of Bool from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Bool + An instance of Bool. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. + """ + if cls._check_json_v3(data): + return cls() + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) + @overload - def to_json(self, zarr_format: Literal[2]) -> Literal["|b1"]: ... + def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["|b1"], None]: ... @overload def to_json(self, zarr_format: Literal[3]) -> Literal["bool"]: ... 
- def to_json(self, zarr_format: ZarrFormat) -> Literal["|b1", "bool"]: + def to_json( + self, zarr_format: ZarrFormat + ) -> DTypeConfig_V2[Literal["|b1"], None] | Literal["bool"]: + """ + Serialize this Bool instance to JSON. + + Parameters + ---------- + zarr_format : ZarrFormat + The Zarr format version (2 or 3). + + Returns + ------- + ``DTypeConfig_V2[Literal["|b1"], None] | Literal["bool"]`` + The JSON representation of the Bool instance. + + Raises + ------ + ValueError + If the zarr_format is not 2 or 3. + """ if zarr_format == 2: - return self.to_native_dtype().str + return {"name": self._zarr_v2_name, "object_codec_id": None} elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - return cls() + def _check_scalar(self, data: object) -> bool: + """ + Check if the input can be cast to a boolean scalar. + + Parameters + ---------- + data : object + The data to check. + + Returns + ------- + bool + True if the input can be cast to a boolean scalar, False otherwise. + """ + return True + + def cast_scalar(self, data: object) -> np.bool_: + """ + Cast the input to a numpy boolean scalar. + + Parameters + ---------- + data : object + The data to cast. + + Returns + ------- + ``np.bool_`` + The numpy boolean scalar. + + Raises + ------ + TypeError + If the input cannot be converted to a numpy boolean. + """ + if self._check_scalar(data): + return np.bool_(data) + msg = ( # pragma: no cover + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) + raise TypeError(msg) # pragma: no cover def default_scalar(self) -> np.bool_: """ @@ -71,7 +258,7 @@ def default_scalar(self) -> np.bool_: Returns ------- - np.bool_ + ``np.bool_`` The default value. 
""" return np.False_ @@ -107,20 +294,26 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: Returns ------- - np.bool_ + ``np.bool_`` The numpy boolean scalar. + + Raises + ------ + TypeError + If the input is not a valid boolean type. """ - if check_json_bool(data): - return self._cast_scalar_unchecked(data) + if self._check_scalar(data): + return np.bool_(data) raise TypeError(f"Invalid type: {data}. Expected a boolean.") # pragma: no cover - def check_scalar(self, data: object) -> bool: - # Anything can become a bool - return True - - def _cast_scalar_unchecked(self, data: object) -> np.bool_: - return np.bool_(data) - @property def item_size(self) -> int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return 1 diff --git a/src/zarr/core/dtype/npy/common.py b/src/zarr/core/dtype/npy/common.py index 03dc194a7a..67644449a0 100644 --- a/src/zarr/core/dtype/npy/common.py +++ b/src/zarr/core/dtype/npy/common.py @@ -7,6 +7,7 @@ from typing import ( TYPE_CHECKING, Any, + Final, Literal, SupportsComplex, SupportsFloat, @@ -14,12 +15,17 @@ SupportsInt, TypeGuard, TypeVar, - get_args, ) import numpy as np -from zarr.core.dtype.common import SPECIAL_FLOAT_STRINGS, Endianness, JSONFloatV2, JSONFloatV3 +from zarr.core.dtype.common import ( + ENDIANNESS_STR, + SPECIAL_FLOAT_STRINGS, + EndiannessStr, + JSONFloatV2, + JSONFloatV3, +) if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat @@ -30,7 +36,26 @@ DateTimeUnit = Literal[ "Y", "M", "W", "D", "h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as", "generic" ] -EndiannessNumpy = Literal[">", "<", "|", "="] +DATETIME_UNIT: Final = ( + "Y", + "M", + "W", + "D", + "h", + "m", + "s", + "ms", + "us", + "μs", + "ns", + "ps", + "fs", + "as", + "generic", +) + +NumpyEndiannessStr = Literal[">", "<", "="] +NUMPY_ENDIANNESS_STR: Final = ">", "<", "=" TFloatDType_co = TypeVar( "TFloatDType_co", @@ -47,18 +72,18 @@ 
TComplexScalar_co = TypeVar("TComplexScalar_co", bound=np.complex64 | np.complex128, covariant=True) -def endianness_from_numpy_str(endianness: EndiannessNumpy) -> Endianness | None: +def endianness_from_numpy_str(endianness: NumpyEndiannessStr) -> EndiannessStr: """ Convert a numpy endianness string literal to a human-readable literal value. Parameters ---------- - endianness : Literal[">", "<", "=", "|"] + endianness : Literal[">", "<", "="] The numpy string representation of the endianness. Returns ------- - Endianness or None + Endianness The human-readable representation of the endianness. Raises @@ -74,26 +99,21 @@ def endianness_from_numpy_str(endianness: EndiannessNumpy) -> Endianness | None: return "little" case ">": return "big" - case "|": - # for dtypes without byte ordering semantics - return None - raise ValueError( - f"Invalid endianness: {endianness!r}. Expected one of {get_args(EndiannessNumpy)}" - ) + raise ValueError(f"Invalid endianness: {endianness!r}. Expected one of {NUMPY_ENDIANNESS_STR}") -def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: +def endianness_to_numpy_str(endianness: EndiannessStr) -> NumpyEndiannessStr: """ Convert an endianness literal to its numpy string representation. Parameters ---------- - endianness : Endianness or None + endianness : Endianness The endianness to convert. Returns ------- - Literal[">", "<", "|"] + Literal[">", "<"] The numpy string representation of the endianness. Raises @@ -106,13 +126,22 @@ def endianness_to_numpy_str(endianness: Endianness | None) -> EndiannessNumpy: return "<" case "big": return ">" - case None: - return "|" raise ValueError( - f"Invalid endianness: {endianness!r}. Expected one of {get_args(Endianness)} or None" + f"Invalid endianness: {endianness!r}. Expected one of {ENDIANNESS_STR} or None" ) +def get_endianness_from_numpy_dtype(dtype: np.dtype[np.generic]) -> EndiannessStr: + """ + Gets the endianness from a numpy dtype that has an endianness. 
This function will + raise a ValueError if the numpy data type does not have a concrete endianness. + """ + endianness = dtype.byteorder + if dtype.byteorder in NUMPY_ENDIANNESS_STR: + return endianness_from_numpy_str(endianness) # type: ignore [arg-type] + raise ValueError(f"The dtype {dtype} has an unsupported endianness: {endianness}") + + def float_from_json_v2(data: JSONFloatV2) -> float: """ Convert a JSON float to a float (Zarr v2). @@ -355,9 +384,7 @@ def check_json_float_v2(data: JSON) -> TypeGuard[JSONFloatV2]: Bool True if the data is a float, False otherwise. """ - if data == "NaN" or data == "Infinity" or data == "-Infinity": - return True - return isinstance(data, float | int) + return data in ("NaN", "Infinity", "-Infinity") or isinstance(data, float | int) def check_json_float_v3(data: JSON) -> TypeGuard[JSONFloatV3]: diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index f7db6fe94d..2f432a9e0a 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from dataclasses import dataclass from typing import ( TYPE_CHECKING, @@ -5,13 +7,19 @@ Literal, Self, TypeGuard, - cast, + overload, ) import numpy as np -from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import HasEndianness, HasItemSize +from zarr.core.dtype.common import ( + DataTypeValidationError, + DTypeConfig_V2, + DTypeJSON, + HasEndianness, + HasItemSize, + check_dtype_spec_v2, +) from zarr.core.dtype.npy.common import ( ComplexLike, TComplexDType_co, @@ -22,75 +30,258 @@ complex_float_from_json_v3, complex_float_to_json_v2, complex_float_to_json_v3, - endianness_from_numpy_str, endianness_to_numpy_str, + get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType +from zarr.core.dtype.wrapper import TBaseDType, ZDType if TYPE_CHECKING: - from zarr.core.dtype.npy.common import 
EndiannessNumpy + from zarr.core.common import JSON, ZarrFormat @dataclass(frozen=True) class BaseComplex(ZDType[TComplexDType_co, TComplexScalar_co], HasEndianness, HasItemSize): + """ + A base class for Zarr data types that wrap NumPy complex float data types. + """ + # This attribute holds the possible zarr v2 JSON names for the data type _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create an instance of this data type from a NumPy complex dtype. + + Parameters + ---------- + dtype : TBaseDType + The native dtype to convert. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the dtype is not compatible with this data type. + """ + if cls._check_native_dtype(dtype): + return cls(endianness=get_endianness_from_numpy_dtype(dtype)) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" + ) def to_native_dtype(self) -> TComplexDType_co: + """ + Convert this class to a NumPy complex dtype with the appropriate byte order. + + Returns + ------- + TComplexDType_co + A NumPy data type object representing the complex data type with the specified byte order. + """ + byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - def to_json(self, zarr_format: ZarrFormat) -> str: + @classmethod + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: + """ + Check that the input is a valid JSON representation of this data type. + + The input data must be a mapping that contains a "name" key that is one of + the strings from cls._zarr_v2_names and an "object_codec_id" key that is None. 
+ + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + bool + True if the input is a valid JSON representation, False otherwise. + """ + return ( + check_dtype_spec_v2(data) + and data["name"] in cls._zarr_v2_names + and data["object_codec_id"] is None + ) + + @classmethod + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[str]: + """ + Check that the input is a valid JSON representation of this data type in Zarr V3. + + This method verifies that the provided data matches the expected Zarr V3 + representation, which is the string specified by the class-level attribute _zarr_v3_name. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + TypeGuard[str] + True if the input is a valid representation of this class in Zarr V3, False otherwise. + """ + + return data == cls._zarr_v3_name + + @classmethod + def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this class from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this class. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. + """ + if cls._check_json_v2(data): + # Going via numpy ensures that we get the endianness correct without + # annoying string parsing. + name = data["name"] + return cls.from_native_dtype(np.dtype(name)) + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." + raise DataTypeValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this class from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. 
+ """ + if cls._check_json_v3(data): + return cls() + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." + raise DataTypeValidationError(msg) + + @overload + def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> str: ... + + def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | str: """ - Convert the wrapped data type to a JSON-serializable form. + Serialize this object to a JSON-serializable representation. Parameters ---------- zarr_format : ZarrFormat - The zarr format version. + The Zarr format version. Supported values are 2 and 3. Returns ------- - str - The JSON-serializable representation of the wrapped data type + DTypeConfig_V2[str, None] | str + If ``zarr_format`` is 2, a dictionary with ``"name"`` and ``"object_codec_id"`` keys is + returned. + If ``zarr_format`` is 3, a string representation of the complex data type is returned. + + Raises + ------ + ValueError + If `zarr_format` is not 2 or 3. """ + if zarr_format == 2: - return self.to_native_dtype().str + return {"name": self.to_native_dtype().str, "object_codec_id": None} elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _check_scalar(self, data: object) -> TypeGuard[ComplexLike]: + """ + Check that the input is a scalar complex value. - @classmethod - def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + Parameters + ---------- + data : object + The value to check. 
+ + Returns + ------- + TypeGuard[ComplexLike] + True if the input is a scalar complex value, False otherwise. """ - Check that the input is a valid JSON representation of this data type. + return isinstance(data, ComplexLike) + + def _cast_scalar_unchecked(self, data: ComplexLike) -> TComplexScalar_co: """ - return data in cls._zarr_v2_names + Cast the provided scalar data to the native scalar type of this class. - @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[str]: - return data == cls._zarr_v3_name + Parameters + ---------- + data : ComplexLike + The data to cast. - def check_scalar(self, data: object) -> bool: - return isinstance(data, ComplexLike) + Returns + ------- + TComplexScalar_co + The casted data as a numpy complex scalar. + + Notes + ----- + This method does not perform any type checking. + The input data must be a scalar complex value. + """ + return self.to_native_dtype().type(data) # type: ignore[return-value] - def _cast_scalar_unchecked(self, data: object) -> TComplexScalar_co: - return self.to_native_dtype().type(data) # type: ignore[arg-type, return-value] + def cast_scalar(self, data: object) -> TComplexScalar_co: + """ + Attempt to cast a given object to a numpy complex scalar. + + Parameters + ---------- + data : object + The data to be cast to a numpy complex scalar. + + Returns + ------- + TComplexScalar_co + The data cast as a numpy complex scalar. + + Raises + ------ + TypeError + If the data cannot be converted to a numpy complex scalar. + """ + if self._check_scalar(data): + return self._cast_scalar_unchecked(data) + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." 
+ ) + raise TypeError(msg) def default_scalar(self) -> TComplexScalar_co: """ @@ -159,21 +350,69 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> JSON: @dataclass(frozen=True, kw_only=True) class Complex64(BaseComplex[np.dtypes.Complex64DType, np.complex64]): + """ + A Zarr data type for arrays containing 64 bit complex floats. + + Wraps the ``np.dtypes.Complex64DType`` data type. Scalars for this data type + are instances of ``np.complex64``. + + Attributes + ---------- + dtype_cls : Type[np.dtypes.Complex64DType] + The numpy dtype class for this data type. + _zarr_v3_name : ClassVar[Literal["complex64"]] + The name of this data type in Zarr V3. + _zarr_v2_names : ClassVar[tuple[Literal[">c8"], Literal["c8", "c8"], Literal["c8", " int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return 8 @dataclass(frozen=True, kw_only=True) class Complex128(BaseComplex[np.dtypes.Complex128DType, np.complex128], HasEndianness): + """ + A Zarr data type for arrays containing 64 bit complex floats. + + Wraps the ``np.dtypes.Complex128DType`` data type. Scalars for this data type + are instances of ``np.complex128``. + + Attributes + ---------- + dtype_cls : Type[np.dtypes.Complex128DType] + The numpy dtype class for this data type. + _zarr_v3_name : ClassVar[Literal["complex128"]] + The name of this data type in Zarr V3. + _zarr_v2_names : ClassVar[tuple[Literal[">c16"], Literal["c16", "c16"], Literal["c16", " int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. 
+ """ return 16 diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index 174b2338ae..3113bc5b61 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -1,42 +1,167 @@ +from __future__ import annotations + from dataclasses import dataclass -from typing import ClassVar, Self, TypeGuard, cast +from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypeGuard, overload import numpy as np -from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import HasEndianness, HasItemSize +from zarr.core.dtype.common import ( + DataTypeValidationError, + DTypeConfig_V2, + DTypeJSON, + HasEndianness, + HasItemSize, + check_dtype_spec_v2, +) from zarr.core.dtype.npy.common import ( - EndiannessNumpy, FloatLike, TFloatDType_co, TFloatScalar_co, check_json_float_v2, check_json_float_v3, - endianness_from_numpy_str, endianness_to_numpy_str, float_from_json_v2, float_from_json_v3, float_to_json_v2, float_to_json_v3, + get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType +from zarr.core.dtype.wrapper import TBaseDType, ZDType + +if TYPE_CHECKING: + from zarr.core.common import JSON, ZarrFormat @dataclass(frozen=True) class BaseFloat(ZDType[TFloatDType_co, TFloatScalar_co], HasEndianness, HasItemSize): + """ + A base class for Zarr data types that wrap NumPy float data types. + """ + # This attribute holds the possible zarr v2 JSON names for the data type _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create an instance of this ZDType from a NumPy data type. + + Parameters + ---------- + dtype : TBaseDType + The NumPy data type. 
+ + Returns + ------- + Self + An instance of this data type. + """ + if cls._check_native_dtype(dtype): + return cls(endianness=get_endianness_from_numpy_dtype(dtype)) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" + ) def to_native_dtype(self) -> TFloatDType_co: + """ + Convert the wrapped data type to a NumPy data type. + + Returns + ------- + TFloatDType_co + The NumPy data type. + """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - def to_json(self, zarr_format: ZarrFormat) -> str: + @classmethod + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: + """ + Check that the input is a valid JSON representation of this data type. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + TypeGuard[DTypeConfig_V2[str, None]] + True if the input is a valid JSON representation of this data type, False otherwise. + """ + return ( + check_dtype_spec_v2(data) + and data["name"] in cls._zarr_v2_names + and data["object_codec_id"] is None + ) + + @classmethod + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[str]: + """ + Check that the input is a valid JSON representation of this class in Zarr V3. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + TypeGuard[str] + True if the input is a valid JSON representation of this class, False otherwise. + """ + return data == cls._zarr_v3_name + + @classmethod + def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this ZDType from Zarr v2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + """ + if cls._check_json_v2(data): + # Going via NumPy ensures that we get the endianness correct without + # annoying string parsing. 
+ name = data["name"] + return cls.from_native_dtype(np.dtype(name)) + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." + raise DataTypeValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this ZDType from Zarr v3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + """ + if cls._check_json_v3(data): + return cls() + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." + raise DataTypeValidationError(msg) + + @overload + def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> str: ... + + def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | str: """ Convert the wrapped data type to a JSON-serializable form. @@ -47,56 +172,88 @@ def to_json(self, zarr_format: ZarrFormat) -> str: Returns ------- - str - The JSON-serializable representation of the wrapped data type + DTypeConfig_V2[str, None] or str + The JSON-serializable representation of the wrapped data type. + + Raises + ------ + ValueError + If zarr_format is not 2 or 3. 
""" if zarr_format == 2: - return self.to_native_dtype().str + return {"name": self.to_native_dtype().str, "object_codec_id": None} elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _check_scalar(self, data: object) -> TypeGuard[FloatLike]: + """ + Check that the input is a valid scalar value. - @classmethod - def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + Parameters + ---------- + data : object + The input to check. + + Returns + ------- + TypeGuard[FloatLike] + True if the input is a valid scalar value, False otherwise. """ - Check that the input is a valid JSON representation of this data type. + return isinstance(data, FloatLike) + + def _cast_scalar_unchecked(self, data: FloatLike) -> TFloatScalar_co: """ - return data in cls._zarr_v2_names + Cast a scalar value to a NumPy float scalar. - @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[str]: - return data == cls._zarr_v3_name + Parameters + ---------- + data : FloatLike + The scalar value to cast. - def check_scalar(self, data: object) -> TypeGuard[FloatLike]: - return isinstance(data, FloatLike) + Returns + ------- + TFloatScalar_co + The NumPy float scalar. + """ + return self.to_native_dtype().type(data) # type: ignore[return-value] - def _cast_scalar_unchecked(self, data: object) -> TFloatScalar_co: - return self.to_native_dtype().type(data) # type: ignore[return-value, arg-type] + def cast_scalar(self, data: object) -> TFloatScalar_co: + """ + Cast a scalar value to a NumPy float scalar. 
+ + Parameters + ---------- + data : object + The scalar value to cast. + + Returns + ------- + TFloatScalar_co + The NumPy float scalar. + """ + if self._check_scalar(data): + return self._cast_scalar_unchecked(data) + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) + raise TypeError(msg) def default_scalar(self) -> TFloatScalar_co: """ - Get the default value, which is 0 cast to this dtype + Get the default value, which is 0 cast to this zdtype. Returns ------- - Int scalar + TFloatScalar_co The default value. """ return self._cast_scalar_unchecked(0) def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScalar_co: """ - Read a JSON-serializable value as a numpy float. + Read a JSON-serializable value as a NumPy float scalar. Parameters ---------- @@ -107,8 +264,8 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScal Returns ------- - TScalar_co - The numpy float. + TFloatScalar_co + The NumPy float scalar. """ if zarr_format == 2: if check_json_float_v2(data): @@ -145,41 +302,119 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> float | st See the zarr specifications for details on the JSON encoding for floats. """ if zarr_format == 2: - return float_to_json_v2(self._cast_scalar_unchecked(data)) + return float_to_json_v2(self.cast_scalar(data)) elif zarr_format == 3: - return float_to_json_v3(self._cast_scalar_unchecked(data)) + return float_to_json_v3(self.cast_scalar(data)) else: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @dataclass(frozen=True, kw_only=True) class Float16(BaseFloat[np.dtypes.Float16DType, np.float16]): + """ + A Zarr data type for arrays containing 16-bit floating point numbers. + + Wraps the ``np.dtypes.Float16DType`` data type. Scalars for this data type are instances + of ``np.float16``. 
+ + Attributes + ---------- + dtype_cls : Type[np.dtypes.Float16DType] + The NumPy dtype class for this data type. + + References + ---------- + This class implements the float16 data type defined in Zarr V2 and V3. + + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. + """ + dtype_cls = np.dtypes.Float16DType _zarr_v3_name = "float16" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f2", "f2"], Literal["f2", " int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return 2 @dataclass(frozen=True, kw_only=True) class Float32(BaseFloat[np.dtypes.Float32DType, np.float32]): + """ + A Zarr data type for arrays containing 32-bit floating point numbers. + + Wraps the ``np.dtypes.Float32DType`` data type. Scalars for this data type are instances + of ``np.float32``. + + Attributes + ---------- + dtype_cls : Type[np.dtypes.Float32DType] + The NumPy dtype class for this data type. + + References + ---------- + This class implements the float32 data type defined in Zarr V2 and V3. + + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. + """ + dtype_cls = np.dtypes.Float32DType _zarr_v3_name = "float32" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f4", "f4"], Literal["f4", " int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return 4 @dataclass(frozen=True, kw_only=True) class Float64(BaseFloat[np.dtypes.Float64DType, np.float64]): + """ + A Zarr data type for arrays containing 64-bit floating point numbers. + + Wraps the ``np.dtypes.Float64DType`` data type. Scalars for this data type are instances + of ``np.float64``. + + Attributes + ---------- + dtype_cls : Type[np.dtypes.Float64DType] + The NumPy dtype class for this data type. + + References + ---------- + This class implements the float64 data type defined in Zarr V2 and V3. 
+ + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. + """ + dtype_cls = np.dtypes.Float64DType _zarr_v3_name = "float64" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">f8", "f8"], Literal["f8", " int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return 8 diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index 92705917f9..01a79142a3 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -1,5 +1,8 @@ +from __future__ import annotations + from dataclasses import dataclass from typing import ( + TYPE_CHECKING, ClassVar, Literal, Self, @@ -7,21 +10,28 @@ SupportsInt, TypeGuard, TypeVar, - cast, overload, ) import numpy as np -from zarr.core.common import JSON, ZarrFormat -from zarr.core.dtype.common import HasEndianness, HasItemSize +from zarr.core.dtype.common import ( + DataTypeValidationError, + DTypeConfig_V2, + DTypeJSON, + HasEndianness, + HasItemSize, + check_dtype_spec_v2, +) from zarr.core.dtype.npy.common import ( - EndiannessNumpy, check_json_int, - endianness_from_numpy_str, endianness_to_numpy_str, + get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType +from zarr.core.dtype.wrapper import TBaseDType, ZDType + +if TYPE_CHECKING: + from zarr.core.common import JSON, ZarrFormat _NumpyIntDType = ( np.dtypes.Int8DType @@ -43,52 +53,156 @@ @dataclass(frozen=True) class BaseInt(ZDType[TIntDType_co, TIntScalar_co], HasItemSize): - # This attribute holds the possible zarr v2 JSON names for the data type + """ + A base class for integer data types in Zarr. + + This class provides methods for serialization and deserialization of integer types + in both Zarr v2 and v3 formats, as well as methods for checking and casting scalars. 
+ """ + _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod - def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: + def _check_json_v2(cls, data: object) -> TypeGuard[DTypeConfig_V2[str, None]]: """ - Check that the input is a valid JSON representation of this data type. + Check that the input is a valid JSON representation of this integer data type in Zarr V2. + + This method verifies that the provided data matches the expected Zarr V2 representation + for this data type. The input data must be a mapping that contains a "name" key that is + one of the strings from cls._zarr_v2_names and an "object_codec_id" key that is None. + + Parameters + ---------- + data : object + The JSON data to check. + + Returns + ------- + TypeGuard[DTypeConfig_V2[str, None]] + True if the input is a valid representation of this class in Zarr V2, + False otherwise. """ - return data in cls._zarr_v2_names + + return ( + check_dtype_spec_v2(data) + and data["name"] in cls._zarr_v2_names + and data["object_codec_id"] is None + ) @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[str]: + def _check_json_v3(cls, data: object) -> TypeGuard[str]: + """ + Check that the input is a valid JSON representation of this class in Zarr V3. + + Parameters + ---------- + data : object + The JSON data to check. + + Returns + ------- + TypeGuard[str] + True if the input is a valid representation of this class in Zarr v3, + False otherwise. + """ return data == cls._zarr_v3_name - def check_scalar(self, data: object) -> TypeGuard[IntLike]: + def _check_scalar(self, data: object) -> TypeGuard[IntLike]: + """ + Check if the input object is of an IntLike type. + + This method verifies whether the provided data can be considered as an integer-like + value, which includes objects supporting integer conversion. + + Parameters + ---------- + data : object + The data to check. 
+ + Returns + ------- + TypeGuard[IntLike] + True if the data is IntLike, False otherwise. + """ + return isinstance(data, IntLike) - def _cast_scalar_unchecked(self, data: object) -> TIntScalar_co: - return self.to_native_dtype().type(data) # type: ignore[return-value, arg-type] + def _cast_scalar_unchecked(self, data: IntLike) -> TIntScalar_co: + """ + Casts a given scalar value to the native integer scalar type without type checking. + + Parameters + ---------- + data : IntLike + The scalar value to cast. + + Returns + ------- + TIntScalar_co + The casted integer scalar of the native dtype. + """ + + return self.to_native_dtype().type(data) # type: ignore[return-value] + + def cast_scalar(self, data: object) -> TIntScalar_co: + """ + Attempt to cast a given object to a NumPy integer scalar. + + Parameters + ---------- + data : object + The data to be cast to a NumPy integer scalar. + + Returns + ------- + TIntScalar_co + The data cast as a NumPy integer scalar. + + Raises + ------ + TypeError + If the data cannot be converted to a NumPy integer scalar. + """ + + if self._check_scalar(data): + return self._cast_scalar_unchecked(data) + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) + raise TypeError(msg) def default_scalar(self) -> TIntScalar_co: """ - Get the default value, which is 0 cast to this dtype + Get the default value, which is 0 cast to this dtype. Returns ------- - Int scalar + TIntScalar_co The default value. """ return self._cast_scalar_unchecked(0) def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar_co: """ - Read a JSON-serializable value as a numpy int scalar. + Read a JSON-serializable value as a NumPy int scalar. Parameters ---------- data : JSON The JSON-serializable value. zarr_format : ZarrFormat - The zarr format version. + The Zarr format version. Returns ------- - TScalar_co - The numpy scalar. 
+ TIntScalar_co + The NumPy int scalar. + + Raises + ------ + TypeError + If the input is not a valid integer type. """ if check_json_int(data): return self._cast_scalar_unchecked(data) @@ -96,14 +210,15 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: """ - Convert an object to JSON-serializable scalar. + Convert an object to a JSON serializable scalar. For the integer data types, + the JSON form is a plain integer. Parameters ---------- - data : _BaseScalar + data : object The value to convert. zarr_format : ZarrFormat - The zarr format version. + The Zarr format version. Returns ------- @@ -115,446 +230,1284 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: @dataclass(frozen=True, kw_only=True) class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): - dtype_cls = np.dtypes.Int8DType - _zarr_v3_name: ClassVar[Literal["int8"]] = "int8" - _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|i1",) + """ + A Zarr data type for arrays containing 8-bit signed integers. - @overload - def to_json(self, zarr_format: Literal[2]) -> Literal["|i1"]: ... + Wraps the ``np.dtypes.Int8DType`` data type. Scalars for this data type are + instances of ``np.int8``. - @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["int8"]: ... + Attributes + ---------- + dtype_cls : np.dtypes.Int8DType + The class of the underlying NumPy dtype. + + References + ---------- + This class implements the 8-bit signed integer data type defined in Zarr V2 and V3. + + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. 
+ """ - def to_json(self, zarr_format: ZarrFormat) -> Literal["int8", "|i1"]: + dtype_cls = np.dtypes.Int8DType + _zarr_v3_name: ClassVar[Literal["int8"]] = "int8" + _zarr_v2_names: ClassVar[tuple[Literal["|i1"]]] = ("|i1",) + + @classmethod + def from_native_dtype(cls, dtype: TBaseDType) -> Self: """ - Convert the wrapped data type to a JSON-serializable form. + Create an Int8 from a np.dtype('int8') instance. Parameters ---------- - zarr_format : ZarrFormat - The zarr format version. + dtype : TBaseDType + The np.dtype('int8') instance. Returns ------- - str - The JSON-serializable representation of the wrapped data type - """ - if zarr_format == 2: - return self.to_native_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + Self + An instance of this data type. - @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - return cls() + Raises + ------ + DataTypeValidationError + If the input data type is not a valid representation of this class Int8. + """ + if cls._check_native_dtype(dtype): + return cls() + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" + ) def to_native_dtype(self: Self) -> np.dtypes.Int8DType: + """ + Convert the Int8 instance to a np.dtype('int8') instance. + + Returns + ------- + np.dtypes.Int8DType + The np.dtype('int8') instance. + """ return self.dtype_cls() @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - return cls() + def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an Int8 from Zarr V2-flavored JSON. - @property - def item_size(self) -> int: - return 1 + Parameters + ---------- + data : DTypeJSON + The JSON data. + Returns + ------- + Self + An instance of this data type. 
-@dataclass(frozen=True, kw_only=True) -class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): - dtype_cls = np.dtypes.UInt8DType - _zarr_v3_name: ClassVar[Literal["uint8"]] = "uint8" - _zarr_v2_names: ClassVar[tuple[str, ...]] = ("|u1",) + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class Int8. + """ + if cls._check_json_v2(data): + return cls() + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_names[0]!r}" + raise DataTypeValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an Int8 from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class Int8. + """ + if cls._check_json_v3(data): + return cls() + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) @overload - def to_json(self, zarr_format: Literal[2]) -> Literal["|u1"]: ... + def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["|i1"], None]: ... @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["uint8"]: ... + def to_json(self, zarr_format: Literal[3]) -> Literal["int8"]: ... - def to_json(self, zarr_format: ZarrFormat) -> Literal["uint8", "|u1"]: + def to_json( + self, zarr_format: ZarrFormat + ) -> DTypeConfig_V2[Literal["|i1"], None] | Literal["int8"]: """ - Convert the wrapped data type to a JSON-serializable form. + Convert the data type to a JSON-serializable form. Parameters ---------- zarr_format : ZarrFormat - The zarr format version. + The Zarr format version. 
Returns ------- - str - The JSON-serializable representation of the wrapped data type + ``DTypeConfig_V2[Literal["|i1"], None] | Literal["int8"]`` + The JSON-serializable representation of the data type. + + Raises + ------ + ValueError + If the zarr_format is not 2 or 3. """ if zarr_format == 2: - return self.to_native_dtype().str + return {"name": self._zarr_v2_names[0], "object_codec_id": None} elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - return cls() - - def to_native_dtype(self: Self) -> np.dtypes.UInt8DType: - return self.dtype_cls() - - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - return cls() - @property def item_size(self) -> int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return 1 @dataclass(frozen=True, kw_only=True) -class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): - dtype_cls = np.dtypes.Int16DType - _zarr_v3_name: ClassVar[Literal["int16"]] = "int16" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i2", " Literal[">i2", " Literal["int16"]: ... + Attributes + ---------- + dtype_cls : np.dtypes.UInt8DType + The class of the underlying NumPy dtype. + + References + ---------- + This class implements the 8-bit unsigned integer data type defined in Zarr V2 and V3. - def to_json(self, zarr_format: ZarrFormat) -> Literal["int16", ">i2", "`__ and `Zarr V3 `__ specification documents for details. + """ + + dtype_cls = np.dtypes.UInt8DType + _zarr_v3_name: ClassVar[Literal["uint8"]] = "uint8" + _zarr_v2_names: ClassVar[tuple[Literal["|u1"]]] = ("|u1",) + + @classmethod + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create a UInt8 from a np.dtype('uint8') instance. 
""" - Convert the wrapped data type to a JSON-serializable form. + if cls._check_native_dtype(dtype): + return cls() + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" + ) - Parameters - ---------- - zarr_format : ZarrFormat - The zarr format version. + def to_native_dtype(self: Self) -> np.dtypes.UInt8DType: + """ + Create a NumPy unsigned 8-bit integer dtype instance from this UInt8 ZDType. Returns ------- - str - The JSON-serializable representation of the wrapped data type + np.dtypes.UInt8DType + The NumPy unsigned 8-bit integer dtype. """ - if zarr_format == 2: - return self.to_native_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + return self.dtype_cls() @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) + def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V2-flavored JSON. - def to_native_dtype(self) -> np.dtypes.Int16DType: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) + Parameters + ---------- + data : DTypeJSON + The JSON data. - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - # This ensures that we get the endianness correct without annoying string parsing - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. 
+ """ + + if cls._check_json_v2(data): return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_names[0]!r}" + raise DataTypeValidationError(msg) - @property - def item_size(self) -> int: - return 2 + @classmethod + def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + Returns + ------- + Self + An instance of this data type. -@dataclass(frozen=True, kw_only=True) -class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): - dtype_cls = np.dtypes.UInt16DType - _zarr_v3_name: ClassVar[Literal["uint16"]] = "uint16" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u2", " Literal[">u2", " DTypeConfig_V2[Literal["|u1"], None]: ... @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["uint16"]: ... + def to_json(self, zarr_format: Literal[3]) -> Literal["uint8"]: ... - def to_json(self, zarr_format: ZarrFormat) -> Literal["uint16", ">u2", " DTypeConfig_V2[Literal["|u1"], None] | Literal["uint8"]: """ - Convert the wrapped data type to a JSON-serializable form. + Convert the data type to a JSON-serializable form. Parameters ---------- zarr_format : ZarrFormat - The zarr format version. + The Zarr format version. Supported values are 2 and 3. Returns ------- - str - The JSON-serializable representation of the wrapped data type + ``DTypeConfig_V2[Literal["|u1"], None] | Literal["uint8"]`` + The JSON-serializable representation of the data type. + + Raises + ------ + ValueError + If `zarr_format` is not 2 or 3. """ if zarr_format == 2: - return self.to_native_dtype().str + # For Zarr format version 2, return a dictionary with the name and object codec ID. 
+ return {"name": self._zarr_v2_names[0], "object_codec_id": None} elif zarr_format == 3: + # For Zarr format version 3, return the v3 name as a string. return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) - - def to_native_dtype(self) -> np.dtypes.UInt16DType: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) - - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() + # Raise an error if the zarr_format is neither 2 nor 3. raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @property def item_size(self) -> int: - return 2 + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ + return 1 @dataclass(frozen=True, kw_only=True) -class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): - dtype_cls = np.dtypes.Int32DType - _zarr_v3_name: ClassVar[Literal["int32"]] = "int32" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i4", " Literal[">i4", " Literal["int32"]: ... + Attributes + ---------- + dtype_cls : np.dtypes.Int16DType + The class of the underlying NumPy dtype. + + References + ---------- + This class implements the 16-bit signed integer data type defined in Zarr V2 and V3. - def to_json(self, zarr_format: ZarrFormat) -> Literal["int32", ">i4", "`__ and `Zarr V3 `__ specification documents for details. 
+ """ + + dtype_cls = np.dtypes.Int16DType + _zarr_v3_name: ClassVar[Literal["int16"]] = "int16" + _zarr_v2_names: ClassVar[tuple[Literal[">i2"], Literal["i2", " Self: """ - Convert the wrapped data type to a JSON-serializable form. + Create an instance of this data type from a np.dtype('int16') instance. Parameters ---------- - zarr_format : ZarrFormat - The zarr format version. + dtype : np.dtype + The instance of np.dtype('int16') to create from. Returns ------- - str - The JSON-serializable representation of the wrapped data type - """ - if zarr_format == 2: - return self.to_native_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + Self + An instance of this data type. - @classmethod - def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self: - # We override the base implementation to address a windows-specific, pre-numpy 2 issue where - # ``np.dtype('i')`` is an instance of ``np.dtypes.IntDType`` that acts like `int32` instead of ``np.dtype('int32')`` - # In this case, ``type(np.dtype('i')) == np.dtypes.Int32DType`` will evaluate to ``True``, - # despite the two classes being different. Thus we will create an instance of `cls` with the - # latter dtype, after pulling in the byte order of the input - if dtype == np.dtypes.Int32DType(): - return cls._from_native_dtype_unsafe( - np.dtypes.Int32DType().newbyteorder(dtype.byteorder) - ) - else: - return super().from_native_dtype(dtype) + Raises + ------ + DataTypeValidationError + If the input data type is not an instance of np.dtype('int16'). + """ + if cls._check_native_dtype(dtype): + return cls(endianness=get_endianness_from_numpy_dtype(dtype)) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. 
Expected an instance of {cls.dtype_cls}" + ) - @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) + def to_native_dtype(self) -> np.dtypes.Int16DType: + """ + Convert the data type to a np.dtype('int16') instance. - def to_native_dtype(self) -> np.dtypes.Int32DType: + Returns + ------- + np.dtype + The np.dtype('int16') instance. + """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V2-flavored JSON. - @property - def item_size(self) -> int: - return 4 + Parameters + ---------- + data : DTypeJSON + The JSON data. + Returns + ------- + Self + An instance of this data type. -@dataclass(frozen=True, kw_only=True) -class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): - dtype_cls = np.dtypes.UInt32DType - _zarr_v3_name: ClassVar[Literal["uint32"]] = "uint32" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u4", " Literal[">u4", " Literal["uint32"]: ... - def to_json(self, zarr_format: ZarrFormat) -> Literal["uint32", ">u4", " Self: """ - Convert the wrapped data type to a JSON-serializable form. + Create an instance of this data type from Zarr V3-flavored JSON. Parameters ---------- - zarr_format : ZarrFormat - The zarr format version. + data : DTypeJSON + The JSON data. Returns ------- - str - The JSON-serializable representation of the wrapped data type + Self + An instance of this data type. 
+ + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. """ - if zarr_format == 2: - return self.to_native_dtype().str - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") + if cls._check_json_v3(data): + return cls() + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) + + @overload + def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">i2", " Literal["int16"]: ... + + def to_json( + self, zarr_format: ZarrFormat + ) -> DTypeConfig_V2[Literal[">i2", "i2", " int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ + return 2 + + +@dataclass(frozen=True, kw_only=True) +class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): + """ + A Zarr data type for arrays containing 16-bit unsigned integers. + + Wraps the ``np.dtypes.UInt16DType`` data type. Scalars for this data type are instances of + ``np.uint16``. + + Attributes + ---------- + dtype_cls : np.dtypes.UInt16DType + The class of the underlying NumPy dtype. + + References + ---------- + This class implements the unsigned 16-bit unsigned integer data type defined in Zarr V2 and V3. + + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. + """ + + dtype_cls = np.dtypes.UInt16DType + _zarr_v3_name: ClassVar[Literal["uint16"]] = "uint16" + _zarr_v2_names: ClassVar[tuple[Literal[">u2"], Literal["u2", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create an instance of this data type from a np.dtype('uint16') instance. 
- def to_native_dtype(self) -> np.dtypes.UInt32DType: + Parameters + ---------- + dtype : np.dtype + The NumPy data type. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input data type is not an instance of np.dtype('uint16'). + """ + if cls._check_native_dtype(dtype): + return cls(endianness=get_endianness_from_numpy_dtype(dtype)) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" + ) + + def to_native_dtype(self) -> np.dtypes.UInt16DType: + """ + Convert the data type to a np.dtype('uint16') instance. + + Returns + ------- + np.dtype + The np.dtype('uint16') instance. + """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: + def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. + """ + if cls._check_json_v2(data): + # Going via NumPy ensures that we get the endianness correct without + # annoying string parsing. + name = data["name"] + return cls.from_native_dtype(np.dtype(name)) + msg = f"Invalid JSON representation of UInt16. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." + raise DataTypeValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. 
+ + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. + """ + if cls._check_json_v3(data): + return cls() + msg = f"Invalid JSON representation of UInt16. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) + + @overload + def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">u2", " Literal["uint16"]: ... + + def to_json( + self, zarr_format: ZarrFormat + ) -> DTypeConfig_V2[Literal[">u2", "u2", " int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ + return 2 + + +@dataclass(frozen=True, kw_only=True) +class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): + """ + A Zarr data type for arrays containing 32-bit signed integers. + + Wraps the ``np.dtypes.Int32DType`` data type. Scalars for this data type are instances of + ``np.int32``. + + Attributes + ---------- + dtype_cls : np.dtypes.Int32DType + The class of the underlying NumPy dtype. + + References + ---------- + This class implements the 32-bit signed integer data type defined in Zarr V2 and V3. + + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. + """ + + dtype_cls = np.dtypes.Int32DType + _zarr_v3_name: ClassVar[Literal["int32"]] = "int32" + _zarr_v2_names: ClassVar[tuple[Literal[">i4"], Literal["i4", " TypeGuard[np.dtypes.Int32DType]: + """ + A type guard that checks if the input is assignable to the type of ``cls.dtype_class`` + + This method is overridden for this particular data type because of a Windows-specific issue + where np.dtype('i') creates an instance of ``np.dtypes.IntDType``, rather than an + instance of ``np.dtypes.Int32DType``, even though both represent 32-bit signed integers. + + Parameters + ---------- + dtype : TDType + The dtype to check. + + Returns + ------- + Bool + True if the dtype matches, False otherwise. 
+ """ + return super()._check_native_dtype(dtype) or dtype == np.dtypes.Int32DType() + + @classmethod + def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self: + """ + Create an Int32 from a np.dtype('int32') instance. + + Parameters + ---------- + dtype : TBaseDType + The np.dtype('int32') instance. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class Int32. + """ + if cls._check_native_dtype(dtype): + return cls(endianness=get_endianness_from_numpy_dtype(dtype)) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" + ) + + def to_native_dtype(self: Self) -> np.dtypes.Int32DType: + """ + Convert the Int32 instance to a np.dtype('int32') instance. + + Returns + ------- + np.dtypes.Int32DType + The np.dtype('int32') instance. + """ + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) + + @classmethod + def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an Int32 from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class Int32. + """ + if cls._check_json_v2(data): + # Going via NumPy ensures that we get the endianness correct without + # annoying string parsing. + name = data["name"] + return cls.from_native_dtype(np.dtype(name)) + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names!r}." + raise DataTypeValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an Int32 from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. 
+ + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class Int32. + """ + if cls._check_json_v3(data): return cls() + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) + + @overload + def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">i4", " Literal["int32"]: ... + + def to_json( + self, zarr_format: ZarrFormat + ) -> DTypeConfig_V2[Literal[">i4", "i4", " int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return 4 @dataclass(frozen=True, kw_only=True) -class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): - dtype_cls = np.dtypes.Int64DType - _zarr_v3_name: ClassVar[Literal["int64"]] = "int64" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">i8", "`__ and `Zarr V3 `__ specification documents for details. + """ + + dtype_cls = np.dtypes.UInt32DType + _zarr_v3_name: ClassVar[Literal["uint32"]] = "uint32" + _zarr_v2_names: ClassVar[tuple[Literal[">u4"], Literal["u4", " Self: + """ + Create a UInt32 from a np.dtype('uint32') instance. + + Parameters + ---------- + dtype : TBaseDType + The NumPy data type. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input data type is not a valid representation of this class 32-bit unsigned + integer. + """ + if cls._check_native_dtype(dtype): + return cls(endianness=get_endianness_from_numpy_dtype(dtype)) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" + ) + + def to_native_dtype(self) -> np.dtypes.UInt32DType: + """ + Create a NumPy unsigned 32-bit integer dtype instance from this UInt32 ZDType. + + Returns + ------- + np.dtypes.UInt32DType + The NumPy unsigned 32-bit integer dtype. 
+ """ + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) + + @classmethod + def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class 32-bit unsigned + integer. + """ + if cls._check_json_v2(data): + # Going via NumPy ensures that we get the endianness correct without + # annoying string parsing. + name = data["name"] + return cls.from_native_dtype(np.dtype(name)) + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." + raise DataTypeValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class 32-bit unsigned + integer. + """ + if cls._check_json_v3(data): + return cls() + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) @overload - def to_json(self, zarr_format: Literal[2]) -> Literal[">i8", " DTypeConfig_V2[Literal[">u4", " Literal["int64"]: ... - def to_json(self, zarr_format: ZarrFormat) -> Literal["int64", ">i8", " Literal["uint32"]: ... + def to_json( + self, zarr_format: ZarrFormat + ) -> DTypeConfig_V2[Literal[">u4", "u4", " int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. 
+ """ + return 4 + + +@dataclass(frozen=True, kw_only=True) +class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): + """ + A Zarr data type for arrays containing 64-bit signed integers. + + Wraps the ``np.dtypes.Int64DType`` data type. Scalars for this data type are instances of + ``np.int64``. + + Attributes + ---------- + dtype_cls : np.dtypes.Int64DType + The class of the underlying NumPy dtype. + + References + ---------- + This class implements the 64-bit signed integer data type defined in Zarr V2 and V3. + + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. + """ + + dtype_cls = np.dtypes.Int64DType + _zarr_v3_name: ClassVar[Literal["int64"]] = "int64" + _zarr_v2_names: ClassVar[tuple[Literal[">i8"], Literal["i8", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create an Int64 from a np.dtype('int64') instance. + + Parameters + ---------- + dtype : TBaseDType + The NumPy data type. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input data type is not a valid representation of this class 64-bit signed + integer. + """ + if cls._check_native_dtype(dtype): + return cls(endianness=get_endianness_from_numpy_dtype(dtype)) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" + ) def to_native_dtype(self) -> np.dtypes.Int64DType: + """ + Create a NumPy signed 64-bit integer dtype instance from this Int64 ZDType. + + Returns + ------- + np.dtypes.Int64DType + The NumPy signed 64-bit integer dtype. 
+ """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: + def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class 64-bit signed + integer. + """ + if cls._check_json_v2(data): + # Going via NumPy ensures that we get the endianness correct without + # annoying string parsing. + name = data["name"] + return cls.from_native_dtype(np.dtype(name)) + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." + raise DataTypeValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class 64-bit signed + integer. + """ + if cls._check_json_v3(data): + return cls() + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) + + @overload + def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">i8", " Literal["int64"]: ... + def to_json( + self, zarr_format: ZarrFormat + ) -> DTypeConfig_V2[Literal[">i8", "i8", " int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. 
+ """ return 8 @dataclass(frozen=True, kw_only=True) class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): + """ + A Zarr data type for arrays containing 64-bit unsigned integers. + + Wraps the ``np.dtypes.UInt64DType`` data type. Scalars for this data type + are instances of ``np.uint64``. + + Attributes + ---------- + dtype_cls: np.dtypes.UInt64DType + The class of the underlying NumPy dtype. + + References + ---------- + This class implements the unsigned 64-bit integer data type defined in Zarr V2 and V3. + + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. + """ + dtype_cls = np.dtypes.UInt64DType _zarr_v3_name: ClassVar[Literal["uint64"]] = "uint64" - _zarr_v2_names: ClassVar[tuple[str, ...]] = (">u8", "u8"], Literal["u8", " Literal[">u8", " np.dtypes.UInt64DType: + """ + Convert the data type to a native NumPy dtype. + Returns + ------- + np.dtypes.UInt64DType + The native NumPy dtype.eeeeeeeeeeeeeeeee + """ + byte_order = endianness_to_numpy_str(self.endianness) + return self.dtype_cls().newbyteorder(byte_order) + + @classmethod + def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class unsigned 64-bit + integer. + """ + if cls._check_json_v2(data): + # Going via NumPy ensures that we get the endianness correct without + # annoying string parsing. + name = data["name"] + return cls.from_native_dtype(np.dtype(name)) + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." + raise DataTypeValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V3-flavored JSON. 
+ + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class unsigned 64-bit + integer. + """ + if cls._check_json_v3(data): + return cls() + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) + + @overload + def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">u8", " Literal["uint64"]: ... - def to_json(self, zarr_format: ZarrFormat) -> Literal["uint64", ">u8", " DTypeConfig_V2[Literal[">u8", "u8", " Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls(endianness=endianness_from_numpy_str(byte_order)) + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create an instance of this data type from a native NumPy dtype. - def to_native_dtype(self) -> np.dtypes.UInt64DType: - byte_order = endianness_to_numpy_str(self.endianness) - return self.dtype_cls().newbyteorder(byte_order) + Parameters + ---------- + dtype : TBaseDType + The native NumPy dtype. - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls() - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input dtype is not a valid representation of this class unsigned 64-bit + integer. + """ + if cls._check_native_dtype(dtype): + return cls(endianness=get_endianness_from_numpy_dtype(dtype)) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. 
Expected an instance of {cls.dtype_cls}" + ) @property def item_size(self) -> int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return 8 diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index 2299b7aab1..32375a1c71 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -1,21 +1,39 @@ from __future__ import annotations -import base64 import re from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypedDict, TypeGuard, cast, overload +from typing import ( + TYPE_CHECKING, + ClassVar, + Literal, + Protocol, + Self, + TypedDict, + TypeGuard, + overload, + runtime_checkable, +) import numpy as np from zarr.core.common import NamedConfig -from zarr.core.dtype.common import HasEndianness, HasItemSize, HasLength, HasObjectCodec +from zarr.core.dtype.common import ( + DataTypeValidationError, + DTypeConfig_V2, + DTypeJSON, + HasEndianness, + HasItemSize, + HasLength, + HasObjectCodec, + check_dtype_spec_v2, + v3_unstable_dtype_warning, +) from zarr.core.dtype.npy.common import ( - EndiannessNumpy, check_json_str, - endianness_from_numpy_str, endianness_to_numpy_str, + get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, ZDType +from zarr.core.dtype.wrapper import TDType_co, ZDType if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat @@ -24,136 +42,174 @@ _NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType") -class LengthBytesConfig(TypedDict): - length_bytes: int +@runtime_checkable +class SupportsStr(Protocol): + def __str__(self) -> str: ... -# TDO: Fix this terrible name -FixedLengthASCIIJSONV3 = NamedConfig[Literal["fixed_length_ascii"], LengthBytesConfig] +class LengthBytesConfig(TypedDict): + """ + Configuration for a fixed-length string data type in Zarr V3. 
+ Attributes + ---------- + length_bytes : int + The length in bytes of the data associated with this configuration. + """ -@dataclass(frozen=True, kw_only=True) -class FixedLengthASCII(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize): - dtype_cls = np.dtypes.BytesDType - _zarr_v3_name: ClassVar[Literal["fixed_length_ascii"]] = "fixed_length_ascii" + length_bytes: int - @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - return cls(length=dtype.itemsize) - def to_native_dtype(self) -> np.dtypes.BytesDType[int]: - return self.dtype_cls(self.length) +class FixedLengthUTF32JSON_V2(DTypeConfig_V2[str, None]): + """ + A wrapper around the JSON representation of the ``FixedLengthUTF32`` data type in Zarr V2. - @classmethod - def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: - """ - Check that the input is a valid JSON representation of a numpy S dtype. - """ - # match |S1, |S2, etc - return isinstance(data, str) and re.match(r"^\|S\d+$", data) is not None + The ``name`` field of this class contains the value that would appear under the + ``dtype`` field in Zarr V2 array metadata. - @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[FixedLengthASCIIJSONV3]: - return ( - isinstance(data, dict) - and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name - and isinstance(data["configuration"], dict) - and "length_bytes" in data["configuration"] - ) + References + ---------- + The structure of the ``name`` field is defined in the Zarr V2 + `specification document `__. - @overload - def to_json(self, zarr_format: Literal[2]) -> str: ... + Examples + -------- - @overload - def to_json(self, zarr_format: Literal[3]) -> FixedLengthASCIIJSONV3: ... + .. 
code-block:: python - def to_json(self, zarr_format: ZarrFormat) -> str | FixedLengthASCIIJSONV3: - if zarr_format == 2: - return self.to_native_dtype().str - elif zarr_format == 3: - return { - "name": self._zarr_v3_name, - "configuration": {"length_bytes": self.length}, - } - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + { + "name": " Self: - if zarr_format == 2: - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls(length=data["configuration"]["length_bytes"]) # type: ignore[index, call-overload] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - def default_scalar(self) -> np.bytes_: - return np.bytes_(b"") - - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(data).decode("ascii") # type: ignore[arg-type] - - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: - if check_json_str(data): - return self.to_native_dtype().type(base64.standard_b64decode(data.encode("ascii"))) - raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover - - def check_scalar(self, data: object) -> bool: - # this is generous for backwards compatibility - return isinstance(data, np.bytes_ | str | bytes | int) - def _cast_scalar_unchecked(self, data: object) -> np.bytes_: - # We explicitly truncate the result because of the following numpy behavior: - # >>> x = np.dtype('S3').type('hello world') - # >>> x - # np.bytes_(b'hello world') - # >>> x.dtype - # dtype('S11') +class FixedLengthUTF32JSON_V3(NamedConfig[Literal["fixed_length_utf32"], LengthBytesConfig]): + """ + The JSON representation of the ``FixedLengthUTF32`` data type in Zarr V3. 
- if isinstance(data, int): - return self.to_native_dtype().type(str(data)[: self.length]) - else: - return self.to_native_dtype().type(data[: self.length]) # type: ignore[index] - - @property - def item_size(self) -> int: - return self.length + References + ---------- + This representation is not currently defined in an external specification. + Examples + -------- + .. code-block:: python -# TODO: Fix this terrible name -FixedLengthUTF32JSONV3 = NamedConfig[Literal["fixed_length_utf32"], LengthBytesConfig] + { + "name": "fixed_length_utf32", + "configuration": { + "length_bytes": 12 + } + """ @dataclass(frozen=True, kw_only=True) class FixedLengthUTF32( ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength, HasItemSize ): + """ + A Zarr data type for arrays containing fixed-length UTF-32 strings. + + Wraps the ``np.dtypes.StrDType`` data type. Scalars for this data type are instances of + ``np.str_``. + + Attributes + ---------- + dtype_cls : Type[np.dtypes.StrDType] + The NumPy dtype class for this data type. + _zarr_v3_name : ClassVar[Literal["fixed_length_utf32"]] + The name of this data type in Zarr V3. + code_point_bytes : ClassVar[int] = 4 + The number of bytes per code point in UTF-32, which is 4. + """ + dtype_cls = np.dtypes.StrDType _zarr_v3_name: ClassVar[Literal["fixed_length_utf32"]] = "fixed_length_utf32" code_point_bytes: ClassVar[int] = 4 # utf32 is 4 bytes per code point + def __post_init__(self) -> None: + """ + We don't allow instances of this class with length less than 1 because there is no way such + a data type can contain actual data. 
+ """ + if self.length < 1: + raise ValueError(f"length must be >= 1, got {self.length}.") + @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - byte_order = cast("EndiannessNumpy", dtype.byteorder) - return cls( - length=dtype.itemsize // (cls.code_point_bytes), - endianness=endianness_from_numpy_str(byte_order), + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create a FixedLengthUTF32 from a NumPy data type. + + Parameters + ---------- + dtype : TBaseDType + The NumPy data type. + + Returns + ------- + Self + An instance of this data type. + """ + if cls._check_native_dtype(dtype): + endianness = get_endianness_from_numpy_dtype(dtype) + return cls( + length=dtype.itemsize // (cls.code_point_bytes), + endianness=endianness, + ) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" ) def to_native_dtype(self) -> np.dtypes.StrDType[int]: + """ + Convert the FixedLengthUTF32 instance to a NumPy data type. + + Returns + ------- + np.dtypes.StrDType[int] + The NumPy data type. + """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls(self.length).newbyteorder(byte_order) @classmethod - def check_json_v2(cls, data: JSON, object_codec_id: str | None = None) -> TypeGuard[str]: + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSON_V2]: """ - Check that the input is a valid JSON representation of a numpy S dtype. + Check that the input is a valid JSON representation of a NumPy U dtype. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + TypeGuard[FixedLengthUTF32JSON_V2] + Whether the input is a valid JSON representation of a NumPy U dtype. 
""" - return isinstance(data, str) and re.match(r"^[><]U\d+$", data) is not None + return ( + check_dtype_spec_v2(data) + and isinstance(data["name"], str) + and re.match(r"^[><]U\d+$", data["name"]) is not None + and data["object_codec_id"] is None + ) @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[FixedLengthUTF32JSONV3]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSON_V3]: + """ + Check that the input is a valid JSON representation of this class in Zarr V3. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + TypeGuard[FixedLengthUTF32JSONV3] + Whether the input is a valid JSON representation of a NumPy U dtype. + """ return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} @@ -165,15 +221,31 @@ def check_json_v3(cls, data: JSON) -> TypeGuard[FixedLengthUTF32JSONV3]: ) @overload - def to_json(self, zarr_format: Literal[2]) -> str: ... + def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... @overload - def to_json(self, zarr_format: Literal[3]) -> FixedLengthUTF32JSONV3: ... + def to_json(self, zarr_format: Literal[3]) -> FixedLengthUTF32JSON_V3: ... + + def to_json( + self, zarr_format: ZarrFormat + ) -> DTypeConfig_V2[str, None] | FixedLengthUTF32JSON_V3: + """ + Convert the FixedLengthUTF32 instance to a JSON representation. + + Parameters + ---------- + zarr_format : ZarrFormat + The Zarr format to use. - def to_json(self, zarr_format: ZarrFormat) -> str | FixedLengthUTF32JSONV3: + Returns + ------- + DTypeConfig_V2[str, None] | FixedLengthUTF32JSON_V3 + The JSON representation of the data type. 
+ """ if zarr_format == 2: - return self.to_native_dtype().str + return {"name": self.to_native_dtype().str, "object_codec_id": None} elif zarr_format == 3: + v3_unstable_dtype_warning(self) return { "name": self._zarr_v3_name, "configuration": {"length_bytes": self.length * self.code_point_bytes}, @@ -181,180 +253,535 @@ def to_json(self, zarr_format: ZarrFormat) -> str | FixedLengthUTF32JSONV3: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - return cls(length=data["configuration"]["length_bytes"] // cls.code_point_bytes) # type: ignore[index, call-overload] - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create a FixedLengthUTF32 from a JSON representation of a NumPy U dtype. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + """ + if cls._check_json_v2(data): + # Construct the NumPy dtype instead of string parsing. + name = data["name"] + return cls.from_native_dtype(np.dtype(name)) + raise DataTypeValidationError( + f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string representation of a NumPy U dtype." + ) + + @classmethod + def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create a FixedLengthUTF32 from a JSON representation of a NumPy U dtype. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + """ + if cls._check_json_v3(data): + return cls(length=data["configuration"]["length_bytes"] // cls.code_point_bytes) + msg = f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected {cls._zarr_v3_name}." + raise DataTypeValidationError(msg) def default_scalar(self) -> np.str_: + """ + Return the default scalar value for this data type. + + Returns + ------- + ``np.str_`` + The default scalar value. + """ return np.str_("") def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + """ + Convert the scalar value to a JSON representation. + + Parameters + ---------- + data : object + The scalar value. + zarr_format : ZarrFormat + The Zarr format to use. + + Returns + ------- + str + The JSON representation of the scalar value. + """ return str(data) def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: + """ + Convert the JSON representation of a scalar value to the native scalar value. + + Parameters + ---------- + data : JSON + The JSON data. + zarr_format : ZarrFormat + The Zarr format to use. + + Returns + ------- + ``np.str_`` + The native scalar value. + """ if check_json_str(data): return self.to_native_dtype().type(data) raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover - def check_scalar(self, data: object) -> bool: + def _check_scalar(self, data: object) -> TypeGuard[SupportsStr]: + """ + Check that the input is a valid scalar value for this data type. + + Parameters + ---------- + data : object + The scalar value. + + Returns + ------- + TypeGuard[SupportsStr] + Whether the input is a valid scalar value for this data type. + """ # this is generous for backwards compatibility - return isinstance(data, str | np.str_ | bytes | int) + return isinstance(data, SupportsStr) + + def cast_scalar(self, data: object) -> np.str_: + """ + Cast the scalar value to the native scalar value. 
-    def _cast_scalar_unchecked(self, data: object) -> np.str_: -        # We explicitly truncate the result because of the following numpy behavior: -        # >>> x = np.dtype('U3').type('hello world') -        # >>> x -        # np.str_('hello world') -        # >>> x.dtype -        # dtype('U11') +        Parameters +        ---------- +        data : object +            The scalar value. + +        Returns +        ------- +        ``np.str_`` +            The native scalar value. +        """ +        if self._check_scalar(data): +            # We explicitly truncate before casting because of the following NumPy behavior: +            # >>> x = np.dtype('U3').type('hello world') +            # >>> x +            # np.str_('hello world') +            # >>> x.dtype +            # dtype('U11') -        if isinstance(data, int):             return self.to_native_dtype().type(str(data)[: self.length]) -        else: -            return self.to_native_dtype().type(data[: self.length])  # type: ignore[index] + +        msg = (  # pragma: no cover +            f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " +            f"data type {self}." +        ) +        raise TypeError(msg)  # pragma: no cover      @property     def item_size(self) -> int: +        """ +        The size of a single scalar in bytes. + +        Returns +        ------- +        int +            The size of a single scalar in bytes. +        """         return self.length * self.code_point_bytes -if _NUMPY_SUPPORTS_VLEN_STRING: +def check_vlen_string_json_scalar(data: object) -> TypeGuard[int | str | float]: +    """ +    Check if the input is a valid JSON scalar for a variable-length string. -    @dataclass(frozen=True, kw_only=True) -    class VariableLengthString(ZDType[np.dtypes.StringDType, str], HasObjectCodec):  # type: ignore[type-var] -        dtype_cls = np.dtypes.StringDType -        _zarr_v3_name: ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" -        object_codec_id = "vlen-utf8" +    This function is generous for backwards compatibility, as Zarr Python v2 would use ints for +    variable-length string fill values. -        @classmethod -        def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: -            return cls() +    Parameters +    ---------- +    data : object +        The JSON value to check. 
- def to_native_dtype(self) -> np.dtypes.StringDType: - return self.dtype_cls() + Returns + ------- + TypeGuard[int | str | float] + True if the input is a valid scalar for a variable-length string. + """ + return isinstance(data, int | str | float) - @classmethod - def check_json_v2( - cls, data: JSON, *, object_codec_id: str | None = None - ) -> TypeGuard[Literal["|O"]]: - """ - Check that the input is a valid JSON representation of a numpy O dtype, and that the - object codec id is appropriate for variable-length UTF-8 strings. - """ - return data == "|O" and object_codec_id == cls.object_codec_id - - @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_utf8"]]: - return data == cls._zarr_v3_name - - @overload - def to_json(self, zarr_format: Literal[2]) -> Literal["|O"]: ... - @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_utf8"]: ... - - def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_utf8"]: - if zarr_format == 2: - # Note: unlike many other numpy data types, we don't serialize the .str attribute - # of the data type to JSON. This is because Zarr was using `|O` for strings before the - # numpy variable length string data type existed, and we want to be consistent with - # that practice - return "|O" - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - return cls() - def default_scalar(self) -> str: - return "" +class VariableLengthUTF8JSON_V2(DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]]): + """ + A wrapper around the JSON representation of the ``VariableLengthUTF8`` data type in Zarr V2. 
- def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: - return str(data) + The ``name`` field of this class contains the value that would appear under the + ``dtype`` field in Zarr V2 array metadata. The ``object_codec_id`` field is always ``"vlen-utf8"``. - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> str: - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. Expected a string.") - return data + References + ---------- + The structure of the ``name`` field is defined in the Zarr V2 + `specification document `__. - def check_scalar(self, data: object) -> bool: - return isinstance(data, str) - def _cast_scalar_unchecked(self, data: object) -> str: - return str(data) + Examples + -------- + .. code-block:: python -else: - # Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead. - @dataclass(frozen=True, kw_only=True) - class VariableLengthString(ZDType[np.dtypes.ObjectDType, str], HasObjectCodec): # type: ignore[no-redef] - dtype_cls = np.dtypes.ObjectDType - _zarr_v3_name: ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" - object_codec_id = "vlen-utf8" + { + "name": "|O", + "object_codec_id": "vlen-utf8" + } + """ - @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - return cls() - def to_native_dtype(self) -> np.dtypes.ObjectDType: - return self.dtype_cls() +# VariableLengthUTF8 is defined in two places, conditioned on the version of NumPy. +# If NumPy 2 is installed, then VariableLengthUTF8 is defined with the NumPy variable length +# string dtype as the native dtype. Otherwise, VariableLengthUTF8 is defined with the NumPy object +# dtype as the native dtype. +class UTF8Base(ZDType[TDType_co, str], HasObjectCodec): + """ + A base class for variable-length UTF-8 string data types. 
-    @classmethod -    def check_json_v2( -        cls, data: JSON, *, object_codec_id: str | None = None -    ) -> TypeGuard[Literal["|O"]]: -        """ -        Check that the input is a valid JSON representation of a numpy O dtype, and that the +    Not intended for direct use, but as a base for concrete implementations. + +    Attributes +    ---------- +    object_codec_id : ClassVar[Literal["vlen-utf8"]] +        The object codec ID for this data type. + +    References +    ---------- +    This data type does not have a Zarr V3 specification. + +    The Zarr V2 data type specification can be found `here `__. +    """ + +    _zarr_v3_name: ClassVar[Literal["string"]] = "string" +    object_codec_id: ClassVar[Literal["vlen-utf8"]] = "vlen-utf8" + +    @classmethod +    def from_native_dtype(cls, dtype: TBaseDType) -> Self: +        """ +        Create an instance of this data type from a compatible NumPy data type. + + +        Parameters +        ---------- +        dtype : TBaseDType +            The native data type. + +        Returns +        ------- +        Self +            An instance of this data type. + +        Raises +        ------ +        DataTypeValidationError +            If the input is not compatible with this data type. +        """ +        if cls._check_native_dtype(dtype): +            return cls() +        raise DataTypeValidationError( +            f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" +        ) + +    @classmethod +    def _check_json_v2( +        cls, +        data: DTypeJSON, +    ) -> TypeGuard[VariableLengthUTF8JSON_V2]: +        """ +        Check if the input is a valid JSON representation of a variable-length UTF-8 string dtype +        for Zarr v2. + +        Parameters +        ---------- +        data : DTypeJSON +            The JSON data to check. + +        Returns +        ------- +        ``TypeGuard[VariableLengthUTF8JSON_V2]`` +            Whether the input is a valid JSON representation of a NumPy "object" data type, and that the object codec id is appropriate for variable-length UTF-8 strings. 
- """ - return data == "|O" and object_codec_id == cls.object_codec_id + """ + return ( + check_dtype_spec_v2(data) + and data["name"] == "|O" + and data["object_codec_id"] == cls.object_codec_id + ) - @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[Literal["variable_length_utf8"]]: - return data == cls._zarr_v3_name + @classmethod + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["variable_length_utf8"]]: + """ + Check that the input is a valid JSON representation of this class in Zarr V3. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + TypeGuard[Literal["variable_length_utf8"]] + Whether the input is a valid JSON representation of a variable length UTF-8 string + data type. + """ + return data == cls._zarr_v3_name - @overload - def to_json(self, zarr_format: Literal[2]) -> Literal["|O"]: ... + @classmethod + def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this class from a JSON representation of a NumPy "object" dtype. - @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_utf8"]: ... + Parameters + ---------- + data : DTypeJSON + The JSON data to create an instance from. - def to_json(self, zarr_format: ZarrFormat) -> Literal["|O", "variable_length_utf8"]: - if zarr_format == 2: - return "|O" - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + Returns + ------- + Self + An instance of this data type. + """ + if cls._check_json_v2(data): + return cls() + msg = ( + f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected the string '|O'" + ) + raise DataTypeValidationError(msg) - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: + @classmethod + def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this class from a JSON representation of a variable length UTF-8 + string data type. + + Parameters + ---------- + data : DTypeJSON + The JSON data to create an instance from. + + Returns + ------- + Self + An instance of this data type. + """ + if cls._check_json_v3(data): return cls() + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." + raise DataTypeValidationError(msg) + + @overload + def to_json(self, zarr_format: Literal[2]) -> VariableLengthUTF8JSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> Literal["string"]: ... + + def to_json(self, zarr_format: ZarrFormat) -> VariableLengthUTF8JSON_V2 | Literal["string"]: + """ + Convert this data type to a JSON representation. + + Parameters + ---------- + zarr_format : int + The zarr format to use for the JSON representation. + + Returns + ------- + ``VariableLengthUTF8JSON_V2 | Literal["string"]`` + The JSON representation of this data type. + """ + if zarr_format == 2: + return {"name": "|O", "object_codec_id": self.object_codec_id} + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def default_scalar(self) -> str: + """ + Return the default scalar value for this data type. + + Returns + ------- + str + The default scalar value. + """ + return "" + + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + """ + Convert a scalar value to a JSON representation. + + Parameters + ---------- + data : object + The scalar value to convert. + zarr_format : int + The zarr format to use for the JSON representation. 
+ + Returns + ------- + str + The JSON representation of the scalar value. + """ + if self._check_scalar(data): + return self._cast_scalar_unchecked(data) + raise TypeError(f"Invalid type: {data}. Expected a string.") + + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + """ + Convert a JSON representation of a scalar value to the native scalar type. + + Parameters + ---------- + data : JSON + The JSON representation of the scalar value. + zarr_format : int + The zarr format to use for the JSON representation. + + Returns + ------- + str + The native scalar type of the scalar value. + """ + if not check_vlen_string_json_scalar(data): + raise TypeError(f"Invalid type: {data}. Expected a string or number.") + return str(data) + + def _check_scalar(self, data: object) -> TypeGuard[SupportsStr]: + """ + Check that the input is a valid scalar value for this data type. + + Parameters + ---------- + data : object + The scalar value to check. + + Returns + ------- + TypeGuard[SupportsStr] + Whether the input is a valid scalar value for this data type. + """ + return isinstance(data, SupportsStr) + + def _cast_scalar_unchecked(self, data: SupportsStr) -> str: + """ + Cast a scalar value to a string. + + Parameters + ---------- + data : object + The scalar value to cast. + + Returns + ------- + str + The string representation of the scalar value. + """ + return str(data) + + def cast_scalar(self, data: object) -> str: + """ + Cast an object to a string. + + Parameters + ---------- + data : object + The value to cast. + + Returns + ------- + str + The input cast to str. + """ + if self._check_scalar(data): + return self._cast_scalar_unchecked(data) + msg = ( # pragma: no cover + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." 
+ ) + raise TypeError(msg) # pragma: no cover + - def default_scalar(self) -> str: - return "" +if _NUMPY_SUPPORTS_VLEN_STRING: + + @dataclass(frozen=True, kw_only=True) + class VariableLengthUTF8(UTF8Base[np.dtypes.StringDType]): # type: ignore[type-var] + """ + A Zarr data type for arrays containing variable-length UTF-8 strings. + + Wraps the ``np.dtypes.StringDType`` data type. Scalars for this data type are instances + of ``str``. + + + Attributes + ---------- + dtype_cls : Type[np.dtypes.StringDType] + The NumPy dtype class for this data type. + _zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" + The name of this data type in Zarr V3. + object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8" + The object codec ID for this data type. + """ - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: - return data # type: ignore[return-value] + dtype_cls = np.dtypes.StringDType - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + def to_native_dtype(self) -> np.dtypes.StringDType: """ - Strings pass through + Create a NumPy string dtype from this VariableLengthUTF8 ZDType. + + Returns + ------- + np.dtypes.StringDType + The NumPy string dtype. """ - if not check_json_str(data): - raise TypeError(f"Invalid type: {data}. Expected a string.") - return data + return self.dtype_cls() + +else: + # Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead. + @dataclass(frozen=True, kw_only=True) + class VariableLengthUTF8(UTF8Base[np.dtypes.ObjectDType]): # type: ignore[no-redef] + """ + A Zarr data type for arrays containing variable-length UTF-8 strings. + + Wraps the ``np.dtypes.ObjectDType`` data type. Scalars for this data type are instances + of ``str``. 
- def check_scalar(self, data: object) -> bool: - return isinstance(data, str) - def _cast_scalar_unchecked(self, data: object) -> str: - return str(data) + Attributes + ---------- + dtype_cls : Type[np.dtypes.ObjectDType] + The NumPy dtype class for this data type. + _zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" + The name of this data type in Zarr V3. + object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8" + The object codec ID for this data type. + """ + + dtype_cls = np.dtypes.ObjectDType + + def to_native_dtype(self) -> np.dtypes.ObjectDType: + """ + Create a NumPy object dtype from this VariableLengthUTF8 ZDType. + + Returns + ------- + np.dtypes.ObjectDType + The NumPy object dtype. + """ + return self.dtype_cls() diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 4c5ce45442..d523e16940 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -1,6 +1,7 @@ from __future__ import annotations from dataclasses import dataclass +from datetime import datetime, timedelta from typing import ( TYPE_CHECKING, ClassVar, @@ -15,22 +16,31 @@ ) import numpy as np +from typing_extensions import ReadOnly from zarr.core.common import NamedConfig -from zarr.core.dtype.common import HasEndianness, HasItemSize +from zarr.core.dtype.common import ( + DataTypeValidationError, + DTypeConfig_V2, + DTypeJSON, + HasEndianness, + HasItemSize, + check_dtype_spec_v2, +) from zarr.core.dtype.npy.common import ( + DATETIME_UNIT, DateTimeUnit, - EndiannessNumpy, check_json_int, - endianness_from_numpy_str, endianness_to_numpy_str, + get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import DTypeJSON_V2, DTypeJSON_V3, TBaseDType, ZDType +from zarr.core.dtype.wrapper import TBaseDType, ZDType if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat -_DTypeName = Literal["datetime64", "timedelta64"] +TimeDeltaLike = str | int | bytes | np.timedelta64 | timedelta | 
None +DateTimeLike = str | int | bytes | np.datetime64 | datetime | None def datetime_from_int(data: int, *, unit: DateTimeUnit, scale_factor: int) -> np.datetime64: @@ -48,7 +58,7 @@ def datetime_from_int(data: int, *, unit: DateTimeUnit, scale_factor: int) -> np Returns ------- - np.datetime64 + numpy.datetime64 The datetime64 value. """ dtype_name = f"datetime64[{scale_factor}{unit}]" @@ -61,7 +71,7 @@ def datetimelike_to_int(data: np.datetime64 | np.timedelta64) -> int: Parameters ---------- - data : np.datetime64 | np.timedelta64 + data : np.datetime64 | numpy.timedelta64 The value to convert. Returns @@ -72,30 +82,155 @@ def datetimelike_to_int(data: np.datetime64 | np.timedelta64) -> int: return data.view(np.int64).item() -_BaseTimeDType_co = TypeVar( - "_BaseTimeDType_co", +def check_json_time(data: JSON) -> TypeGuard[Literal["NaT"] | int]: + """ + Type guard to check if the input JSON data is the literal string "NaT" + or an integer. + """ + return check_json_int(data) or data == "NaT" + + +BaseTimeDType_co = TypeVar( + "BaseTimeDType_co", bound=np.dtypes.TimeDelta64DType | np.dtypes.DateTime64DType, covariant=True, ) -_BaseTimeScalar = TypeVar("_BaseTimeScalar", bound=np.timedelta64 | np.datetime64) +BaseTimeScalar_co = TypeVar( + "BaseTimeScalar_co", bound=np.timedelta64 | np.datetime64, covariant=True +) class TimeConfig(TypedDict): - unit: DateTimeUnit - interval: int + """ + The configuration for the numpy.timedelta64 or numpy.datetime64 data type in Zarr V3. + + Attributes + ---------- + unit : ReadOnly[DateTimeUnit] + A string encoding a unit of time. + scale_factor : ReadOnly[int] + A scale factor. + + Examples + -------- + .. code-block:: python + + {"unit": "ms", "scale_factor": 1} + """ + + unit: ReadOnly[DateTimeUnit] + scale_factor: ReadOnly[int] + + +class DateTime64JSON_V3(NamedConfig[Literal["numpy.datetime64"], TimeConfig]): + """ + The JSON representation of the ``numpy.datetime64`` data type in Zarr V3. 
+ + References + ---------- + This representation is defined in the ``numpy.datetime64`` + `specification document `__. + + Examples + -------- + .. code-block:: python + + { + "name": "numpy.datetime64", + "configuration": { + "unit": "ms", + "scale_factor": 1 + } + } + """ + + +class TimeDelta64JSON_V3(NamedConfig[Literal["numpy.timedelta64"], TimeConfig]): + """ + The JSON representation of the ``TimeDelta64`` data type in Zarr V3. + + References + ---------- + This representation is defined in the numpy.timedelta64 + `specification document `__. + + Examples + -------- + .. code-block:: python + + { + "name": "numpy.timedelta64", + "configuration": { + "unit": "ms", + "scale_factor": 1 + } + } + """ + + +class TimeDelta64JSON_V2(DTypeConfig_V2[str, None]): + """ + A wrapper around the JSON representation of the ``TimeDelta64`` data type in Zarr V2. + + The ``name`` field of this class contains the value that would appear under the + ``dtype`` field in Zarr V2 array metadata. + + References + ---------- + The structure of the ``name`` field is defined in the Zarr V2 + `specification document `__. + + + Examples + -------- + .. code-block:: python + + { + "name": "`__. -DateTime64JSONV3 = NamedConfig[Literal["numpy.datetime64"], TimeConfig] -TimeDelta64JSONV3 = NamedConfig[Literal["numpy.timedelta64"], TimeConfig] + Examples + -------- + .. code-block:: python + + { + "name": " None: raise ValueError(f"unit must be one of {get_args(DateTimeUnit)}, got {self.unit!r}.") @classmethod - def _from_native_dtype_unsafe(cls, dtype: TBaseDType) -> Self: - unit, scale_factor = np.datetime_data(dtype.name) - unit = cast("DateTimeUnit", unit) - byteorder = cast("EndiannessNumpy", dtype.byteorder) - return cls( - unit=unit, scale_factor=scale_factor, endianness=endianness_from_numpy_str(byteorder) + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create an instance of this class from a native NumPy data type. 
+ + Parameters + ---------- + dtype : TBaseDType + The native NumPy dtype to convert. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the dtype is not a valid representation of this class. + """ + + if cls._check_native_dtype(dtype): + unit, scale_factor = np.datetime_data(dtype.name) + unit = cast("DateTimeUnit", unit) + return cls( + unit=unit, + scale_factor=scale_factor, + endianness=get_endianness_from_numpy_dtype(dtype), + ) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" ) - def to_native_dtype(self) -> _BaseTimeDType_co: + def to_native_dtype(self) -> BaseTimeDType_co: # Numpy does not allow creating datetime64 or timedelta64 via # np.dtypes.{dtype_name}() # so we use np.dtype with a formatted string. - dtype_string = f"{self._numpy_name}[{self.scale_factor}{self.unit}]" - return np.dtype(dtype_string).newbyteorder(endianness_to_numpy_str(self.endianness)) # type: ignore[return-value] - - @classmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: - if zarr_format == 2: - return cls.from_native_dtype(np.dtype(data)) # type: ignore[arg-type] - elif zarr_format == 3: - unit = data["configuration"]["unit"] # type: ignore[index, call-overload] - scale_factor = data["configuration"]["scale_factor"] # type: ignore[index, call-overload] - return cls(unit=unit, scale_factor=scale_factor) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + """ + Convert this data type to a NumPy temporal data type with the appropriate + unit and scale factor. - @overload - def to_json(self, zarr_format: Literal[2]) -> str: ... - @overload - def to_json(self, zarr_format: Literal[3]) -> DateTime64JSONV3 | TimeDelta64JSONV3: ... 
+ Returns + ------- + BaseTimeDType_co + A NumPy data type object representing the time data type with + the specified unit, scale factor, and byte order. + """ - def to_json(self, zarr_format: ZarrFormat) -> str | DateTime64JSONV3 | TimeDelta64JSONV3: - if zarr_format == 2: - return cast("str", self.to_native_dtype().str) - elif zarr_format == 3: - return cast( - "DateTime64JSONV3 | TimeDelta64JSONV3", - { - "name": self._zarr_v3_name, - "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, - }, - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + dtype_string = f"{self._numpy_name}[{self.scale_factor}{self.unit}]" + return np.dtype(dtype_string).newbyteorder(endianness_to_numpy_str(self.endianness)) # type: ignore[return-value] def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: + """ + Convert a python object to a JSON representation of a datetime64 or timedelta64 scalar. + + Parameters + ---------- + data : object + The python object to convert. + zarr_format : ZarrFormat + The Zarr format version (2 or 3). + + Returns + ------- + int + The JSON representation of the scalar. + """ return datetimelike_to_int(data) # type: ignore[arg-type] - def check_scalar(self, data: object) -> bool: - # TODO: decide which values we should accept for datetimes. - try: - np.array([data], dtype=self.to_native_dtype()) - return True # noqa: TRY300 - except ValueError: - return False - @property def item_size(self) -> int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return 8 @dataclass(frozen=True, kw_only=True, slots=True) class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], HasEndianness): """ - A wrapper for the ``TimeDelta64`` data type defined in numpy. - Scalars of this type can be created by performing arithmetic with ``DateTime64`` scalars. 
- Like ``DateTime64``, ``TimeDelta64`` is parametrized by a unit, but unlike ``DateTime64``, the - unit for ``TimeDelta64`` is optional. - """ + A Zarr data type for arrays containing NumPy TimeDelta64 data. - dtype_cls = np.dtypes.TimeDelta64DType # type: ignore[assignment] - _zarr_v3_name: ClassVar[Literal["numpy.timedelta64"]] = "numpy.timedelta64" - _zarr_v2_names = (">m8", " np.timedelta64: - return np.timedelta64("NaT") + Attributes + ---------- + dtype_cls : Type[np.dtypesTimeDelta64DType] + The NumPy dtype class for this data type. + scale_factor : int + The scale factor for this data type. + unit : DateTimeUnit + The unit for this data type. - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: - if check_json_int(data) or data == "NaT": - return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[arg-type] - raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover + References + ---------- + The Zarr V2 representation of this data type is defined in the Zarr V2 + `specification document `__. 
- def _cast_scalar_unchecked(self, data: object) -> np.timedelta64: - return self.to_native_dtype().type(data) # type: ignore[arg-type] + The Zarr V3 representation of this data type is defined in the ``numpy.timedelta64`` + `specification document `__ + """ + + # mypy infers the type of np.dtypes.TimeDelta64DType to be + # "Callable[[Literal['Y', 'M', 'W', 'D'] | Literal['h', 'm', 's', 'ms', 'us', 'ns', 'ps', 'fs', 'as']], Never]" + dtype_cls = np.dtypes.TimeDelta64DType # type: ignore[assignment] + unit: DateTimeUnit = "generic" + scale_factor: int = 1 + _zarr_v3_name: ClassVar[Literal["numpy.timedelta64"]] = "numpy.timedelta64" + _zarr_v2_names: ClassVar[tuple[Literal[">m8"], Literal["m8", " TypeGuard[str]: + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[TimeDelta64JSON_V2]: + """ + Validate that the provided JSON input accurately represents a NumPy timedelta64 data type, + which could be in the form of strings like "m8[10s]". This method serves as a type + guard, helping to refine the type of unknown JSON input by confirming its adherence to the + expected format for NumPy timedelta64 data types. + + The JSON input should contain a "name" key with a value that matches the expected string + pattern for NumPy timedelta64 data types. The pattern includes an optional unit enclosed + within square brackets, following the base type identifier. + + Returns + ------- + bool + True if the JSON input is a valid representation of this class, + otherwise False. 
+ """ + if not check_dtype_spec_v2(data): + return False + name = data["name"] # match m[M], etc # consider making this a standalone function - if not isinstance(data, str): + if not isinstance(name, str): return False - if not data.startswith(cls._zarr_v2_names): + if not name.startswith(cls._zarr_v2_names): return False - if len(data) == 3: + if len(name) == 3: # no unit, and # we already checked that this string is either m8 return True else: - return data[4:-1].endswith(get_args(DateTimeUnit)) and data[-1] == "]" + return name[4:-1].endswith(DATETIME_UNIT) and name[-1] == "]" @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[DateTime64JSONV3]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSON_V3]: + """ + Check that the input is a valid JSON representation of this class in Zarr V3. + + Returns + ------- + TypeGuard[DateTime64JSON_V3] + True if the JSON input is a valid representation of this class, + otherwise False. + """ return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} @@ -221,44 +412,268 @@ def check_json_v3(cls, data: JSON) -> TypeGuard[DateTime64JSONV3]: and set(data["configuration"].keys()) == {"unit", "scale_factor"} ) + @classmethod + def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create a TimeDelta64 from a Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + TimeDelta64 + An instance of TimeDelta64. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. + """ + if cls._check_json_v2(data): + name = data["name"] + return cls.from_native_dtype(np.dtype(name)) + msg = ( + f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected a string " + f"representation of an instance of {cls.dtype_cls}" + ) + raise DataTypeValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create a TimeDelta64 from a Zarr V3-flavored JSON. + + The JSON representation of a TimeDelta64 in Zarr V3 is a dict with a 'name' key + with the value 'numpy.timedelta64', and a 'configuration' key with a value of a dict + with a 'unit' key and a 'scale_factor' key. + + For example: + + .. code-block:: json + + { + "name": "numpy.timedelta64", + "configuration": { + "unit": "generic", + "scale_factor": 1 + } + } + + """ + if cls._check_json_v3(data): + unit = data["configuration"]["unit"] + scale_factor = data["configuration"]["scale_factor"] + return cls(unit=unit, scale_factor=scale_factor) + msg = ( + f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a dict " + f"with a 'name' key with the value 'numpy.timedelta64', " + "and a 'configuration' key with a value of a dict with a 'unit' key and a " + "'scale_factor' key" + ) + raise DataTypeValidationError(msg) + + @overload + def to_json(self, zarr_format: Literal[2]) -> TimeDelta64JSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> TimeDelta64JSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> TimeDelta64JSON_V2 | TimeDelta64JSON_V3: + """ + Serialize this data type to JSON. + + Parameters + ---------- + zarr_format : ZarrFormat + The Zarr format version (2 or 3). + + Returns + ------- + TimeDelta64JSON_V2 | TimeDelta64JSON_V3 + The JSON representation of the data type. + + Raises + ------ + ValueError + If the zarr_format is not 2 or 3. 
+ """ + if zarr_format == 2: + name = self.to_native_dtype().str + return {"name": name, "object_codec_id": None} + elif zarr_format == 3: + return { + "name": self._zarr_v3_name, + "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, + } + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def _check_scalar(self, data: object) -> TypeGuard[TimeDeltaLike]: + """ + Check if the input is a scalar of this data type. + + Parameters + ---------- + data : object + The object to check. + + Returns + ------- + TypeGuard[TimeDeltaLike] + True if the input is a scalar of this data type, False otherwise. + """ + if data is None: + return True + return isinstance(data, str | int | bytes | np.timedelta64 | timedelta) + + def _cast_scalar_unchecked(self, data: TimeDeltaLike) -> np.timedelta64: + """ + Cast the provided scalar input to a numpy timedelta64 without any type checking. + + This method assumes that the input data is already a valid scalar of this data type, + and does not perform any validation or type checks. It directly casts the input + to a numpy timedelta64 scalar using the unit and scale factor defined in the class. + + Parameters + ---------- + data : TimeDeltaLike + The scalar input data to cast. + + Returns + ------- + numpy.timedelta64 + The input data cast as a numpy timedelta64 scalar. + """ + return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") + + def cast_scalar(self, data: object) -> np.timedelta64: + """ + Cast the input to a numpy timedelta64 scalar. If the input is not a scalar of this data type, + raise a TypeError. + """ + if self._check_scalar(data): + return self._cast_scalar_unchecked(data) + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) + raise TypeError(msg) + + def default_scalar(self) -> np.timedelta64: + """ + Return a default scalar of this data type. 
+ + This method provides a default value for the timedelta64 scalar, which is + a 'Not-a-Time' (NaT) value. + """ + return np.timedelta64("NaT") + + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: + """ + Create a scalar of this data type from JSON input. + + Parameters + ---------- + data : JSON + The JSON representation of the scalar value. + zarr_format : int + The zarr format to use for the JSON representation. + + Returns + ------- + numpy.timedelta64 + The scalar value of this data type. + + Raises + ------ + TypeError + If the input JSON is not a valid representation of a scalar for this data type. + """ + if check_json_time(data): + return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") + raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover + @dataclass(frozen=True, kw_only=True, slots=True) class DateTime64(TimeDTypeBase[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): + """ + A Zarr data type for arrays containing NumPy Datetime64 data. + + Wraps the ``np.dtypes.TimeDelta64DType`` data type. Scalars for this data type + are instances of ``np.datetime64``. + + Attributes + ---------- + dtype_cls : Type[np.dtypesTimeDelta64DType] + The numpy dtype class for this data type. + unit : DateTimeUnit + The unit of time for this data type. + scale_factor : int + The scale factor for the time unit. + + References + ---------- + The Zarr V2 representation of this data type is defined in the Zarr V2 + `specification document `__. 
+ + The Zarr V3 representation of this data type is defined in the ``numpy.datetime64`` + `specification document `__ + """ + dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] _zarr_v3_name: ClassVar[Literal["numpy.datetime64"]] = "numpy.datetime64" - _zarr_v2_names = (">M8", "M8"], Literal["M8", " np.datetime64: - return np.datetime64("NaT") - - def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: - if check_json_int(data) or data == "NaT": - return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[arg-type] - raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover - - def _cast_scalar_unchecked(self, data: object) -> np.datetime64: - return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") # type: ignore[no-any-return, call-overload] - @classmethod - def check_json_v2(cls, data: JSON, *, object_codec_id: str | None = None) -> TypeGuard[str]: - # match M[M], etc - # consider making this a standalone function - if not isinstance(data, str): + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSON_V2]: + """ + Check that the input is a valid JSON representation of this data type. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + TypeGuard[DateTime64JSON_V2] + True if the input is a valid JSON representation of a NumPy datetime64 data type, + otherwise False. 
+ """ + if not check_dtype_spec_v2(data): + return False + name = data["name"] + if not isinstance(name, str): return False - if not data.startswith(cls._zarr_v2_names): + if not name.startswith(cls._zarr_v2_names): return False - if len(data) == 3: + if len(name) == 3: # no unit, and # we already checked that this string is either M8 return True else: - return data[4:-1].endswith(get_args(DateTimeUnit)) and data[-1] == "]" + return name[4:-1].endswith(DATETIME_UNIT) and name[-1] == "]" @classmethod - def check_json_v3(cls, data: JSON) -> TypeGuard[DateTime64JSONV3]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSON_V3]: + """ + Check that the input is a valid JSON representation of this class in Zarr V3. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + TypeGuard[DateTime64JSON_V3] + True if the input is a valid JSON representation of a numpy datetime64 data type in Zarr V3, False otherwise. + """ + return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} @@ -266,3 +681,205 @@ def check_json_v3(cls, data: JSON) -> TypeGuard[DateTime64JSONV3]: and isinstance(data["configuration"], dict) and set(data["configuration"].keys()) == {"unit", "scale_factor"} ) + + @classmethod + def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from a Zarr V2-flavored JSON representation. + + This method checks if the provided JSON data is a valid representation of this class. + If valid, it creates an instance using the native NumPy dtype. Otherwise, it raises a + DataTypeValidationError. + + Parameters + ---------- + data : DTypeJSON + The JSON data to parse. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. 
+ """ + + if cls._check_json_v2(data): + name = data["name"] + return cls.from_native_dtype(np.dtype(name)) + msg = ( + f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string " + f"representation of an instance of {cls.dtype_cls}" + ) + raise DataTypeValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from a Zarr V3-flavored JSON representation. + + This method checks if the provided JSON data is a valid representation of this class. + If valid, it creates an instance using the native NumPy dtype. Otherwise, it raises a + DataTypeValidationError. + + Parameters + ---------- + data : DTypeJSON + The JSON data to parse. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. + """ + if cls._check_json_v3(data): + unit = data["configuration"]["unit"] + scale_factor = data["configuration"]["scale_factor"] + return cls(unit=unit, scale_factor=scale_factor) + msg = ( + f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a dict " + f"with a 'name' key with the value 'numpy.datetime64', " + "and a 'configuration' key with a value of a dict with a 'unit' key and a " + "'scale_factor' key" + ) + raise DataTypeValidationError(msg) + + @overload + def to_json(self, zarr_format: Literal[2]) -> DateTime64JSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> DateTime64JSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> DateTime64JSON_V2 | DateTime64JSON_V3: + """ + Serialize this data type to JSON. + + Parameters + ---------- + zarr_format : ZarrFormat + The Zarr format version (2 or 3). + + Returns + ------- + DateTime64JSON_V2 | DateTime64JSON_V3 + The JSON representation of the data type. + + Raises + ------ + ValueError + If the zarr_format is not 2 or 3. 
+ """ + if zarr_format == 2: + name = self.to_native_dtype().str + return {"name": name, "object_codec_id": None} + elif zarr_format == 3: + return { + "name": self._zarr_v3_name, + "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, + } + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def _check_scalar(self, data: object) -> TypeGuard[DateTimeLike]: + """ + Check if the input is convertible to a scalar of this data type. + + Parameters + ---------- + data : object + The object to check. + + Returns + ------- + TypeGuard[DateTimeLike] + True if the input is a scalar of this data type, False otherwise. + """ + if data is None: + return True + return isinstance(data, str | int | bytes | np.datetime64 | datetime) + + def _cast_scalar_unchecked(self, data: DateTimeLike) -> np.datetime64: + """ + Cast the input to a scalar of this data type without any type checking. + + Parameters + ---------- + data : DateTimeLike + The scalar data to cast. + + Returns + ------- + numpy.datetime64 + The input cast to a NumPy datetime scalar. + """ + return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") + + def cast_scalar(self, data: object) -> np.datetime64: + """ + Cast the input to a scalar of this data type after a type check. + + Parameters + ---------- + data : object + The scalar value to cast. + + Returns + ------- + numpy.datetime64 + The input cast to a NumPy datetime scalar. + + Raises + ------ + TypeError + If the data cannot be converted to a numpy datetime scalar. + """ + if self._check_scalar(data): + return self._cast_scalar_unchecked(data) + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) + raise TypeError(msg) + + def default_scalar(self) -> np.datetime64: + """ + Return the default scalar value for this data type. 
+ + Returns + ------- + numpy.datetime64 + The default scalar value, which is a 'Not-a-Time' (NaT) value + """ + + return np.datetime64("NaT") + + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: + """ + Read a JSON-serializable value as a scalar. + + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + numpy.datetime64 + The numpy datetime scalar. + + Raises + ------ + TypeError + If the input is not a valid integer type. + """ + if check_json_time(data): + return self._cast_scalar_unchecked(data) + raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover diff --git a/src/zarr/core/dtype/registry.py b/src/zarr/core/dtype/registry.py index 0423f69dbe..cb9ab50044 100644 --- a/src/zarr/core/dtype/registry.py +++ b/src/zarr/core/dtype/registry.py @@ -6,12 +6,15 @@ import numpy as np -from zarr.core.dtype.common import DataTypeValidationError +from zarr.core.dtype.common import ( + DataTypeValidationError, + DTypeJSON, +) if TYPE_CHECKING: from importlib.metadata import EntryPoint - from zarr.core.common import JSON + from zarr.core.common import ZarrFormat from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType @@ -20,34 +23,131 @@ # have just 1 registry class in use. @dataclass(frozen=True, kw_only=True) class DataTypeRegistry: + """ + A registry for ZDType classes. + + This registry is a mapping from Zarr data type names to their + corresponding ZDType classes. + + Attributes + ---------- + contents : dict[str, type[ZDType[TBaseDType, TBaseScalar]]] + The mapping from Zarr data type names to their corresponding + ZDType classes. 
+ """ + contents: dict[str, type[ZDType[TBaseDType, TBaseScalar]]] = field( default_factory=dict, init=False ) - lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) + _lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) - def lazy_load(self) -> None: - for e in self.lazy_load_list: + def _lazy_load(self) -> None: + """ + Load all data types from the lazy load list and register them with + the registry. After loading, clear the lazy load list. + """ + for e in self._lazy_load_list: self.register(e.load()._zarr_v3_name, e.load()) - self.lazy_load_list.clear() + self._lazy_load_list.clear() def register(self: Self, key: str, cls: type[ZDType[TBaseDType, TBaseScalar]]) -> None: - # don't register the same dtype twice + """ + Register a data type with the registry. + + Parameters + ---------- + key : str + The Zarr V3 name of the data type. + cls : type[ZDType[TBaseDType, TBaseScalar]] + The class of the data type to register. + + Notes + ----- + This method is idempotent. If the data type is already registered, this + method does nothing. + """ if key not in self.contents or self.contents[key] != cls: self.contents[key] = cls def unregister(self, key: str) -> None: - """Unregister a data type by its key.""" + """ + Unregister a data type from the registry. + + Parameters + ---------- + key : str + The key associated with the ZDType class to be unregistered. + + Returns + ------- + None + + Raises + ------ + KeyError + If the data type is not found in the registry. + """ if key in self.contents: del self.contents[key] else: raise KeyError(f"Data type '{key}' not found in registry.") def get(self, key: str) -> type[ZDType[TBaseDType, TBaseScalar]]: + """ + Retrieve a registered ZDType class by its key. + + Parameters + ---------- + key : str + The key associated with the desired ZDType class. + + Returns + ------- + type[ZDType[TBaseDType, TBaseScalar]] + The ZDType class registered under the given key. 
+ + Raises + ------ + KeyError + If the key is not found in the registry. + """ + return self.contents[key] def match_dtype(self, dtype: TBaseDType) -> ZDType[TBaseDType, TBaseScalar]: + """ + Match a native data type, e.g. a NumPy data type, to a registered ZDType. + + Parameters + ---------- + dtype : TBaseDType + The native data type to match. + + Returns + ------- + ZDType[TBaseDType, TBaseScalar] + The matched ZDType corresponding to the provided NumPy data type. + + Raises + ------ + ValueError + If the data type is a NumPy "Object" type, which is ambiguous, or if multiple + or no Zarr data types are found that match the provided dtype. + + Notes + ----- + This function attempts to resolve a Zarr data type from a given native data type. + If the dtype is a NumPy "Object" data type, it raises a ValueError, as this type + can represent multiple Zarr data types. In such cases, a specific Zarr data type + should be explicitly constructed instead of relying on dynamic resolution. + + If multiple matches are found, it will also raise a ValueError. In this case + conflicting data types must be unregistered, or the Zarr data type should be explicitly + constructed. + """ + if dtype == np.dtype("O"): msg = ( f"Zarr data type resolution from {dtype} failed. " @@ -56,7 +156,7 @@ def match_dtype(self, dtype: TBaseDType) -> ZDType[TBaseDType, TBaseScalar]: "data type. " "In this case you should construct your array by providing a specific Zarr data " 'type. For a list of Zarr data types that are compatible with the numpy "Object"' - "data type, see xxxxxxxxxxx" + "data type, see https://github.com/zarr-developers/zarr-python/issues/3117" ) raise ValueError(msg) matched: list[ZDType[TBaseDType, TBaseScalar]] = [] @@ -71,29 +171,38 @@ def match_dtype(self, dtype: TBaseDType) -> ZDType[TBaseDType, TBaseScalar]: f"Multiple data type wrappers found that match dtype '{dtype}': {matched}. 
" "You should unregister one of these data types, or avoid Zarr data type inference " "entirely by providing a specific Zarr data type when creating your array." - "For more information, see xxxxxxxxxxxxxxxxxx" + "For more information, see https://github.com/zarr-developers/zarr-python/issues/3117" ) raise ValueError(msg) - raise ValueError(f"No data type wrapper found that matches dtype '{dtype}'") + raise ValueError(f"No Zarr data type found that matches dtype '{dtype!r}'") - def match_json_v2( - self, data: JSON, *, object_codec_id: str | None = None + def match_json( + self, data: DTypeJSON, *, zarr_format: ZarrFormat ) -> ZDType[TBaseDType, TBaseScalar]: - # The dtype field in zarr v2 JSON metadata is not unique across different distinct data types. - # Specifically, multiple distinct data types all use the "|O" data type representation. - # These must be disambiguated by the presence of an "object codec", which is a codec - # like variable-length utf8 encoding for strings. - for val in self.contents.values(): - try: - return val.from_json_v2(data, object_codec_id=object_codec_id) - except DataTypeValidationError: - pass - raise ValueError(f"No data type wrapper found that matches {data}") + """ + Match a JSON representation of a data type to a registered ZDType. + + Parameters + ---------- + data : DTypeJSON + The JSON representation of a data type to match. + zarr_format : ZarrFormat + The Zarr format version to consider when matching data types. + + Returns + ------- + ZDType[TBaseDType, TBaseScalar] + The matched ZDType corresponding to the JSON representation. + + Raises + ------ + ValueError + If no matching Zarr data type is found for the given JSON data. 
+ """ - def match_json_v3(self, data: JSON) -> ZDType[TBaseDType, TBaseScalar]: for val in self.contents.values(): try: - return val.from_json_v3(data) + return val.from_json(data, zarr_format=zarr_format) except DataTypeValidationError: pass - raise ValueError(f"No data type wrapper found that matches {data}") + raise ValueError(f"No Zarr data type found that matches {data!r}") diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 4c399bbb84..776aea81d8 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -1,8 +1,8 @@ """ Wrapper for native array data types. -The `ZDType` class is an abstract base class for wrapping native array data types, e.g. numpy dtypes. -It provides a common interface for working with data types in a way that is independent of the +The ``ZDType`` class is an abstract base class for wrapping native array data types, e.g. NumPy dtypes. +``ZDType`` provides a common interface for working with data types in a way that is independent of the underlying data type system. The wrapper class encapsulates a native data type. Instances of the class can be created from a @@ -10,20 +10,20 @@ wrapper class. The wrapper class is responsible for: -- Reversibly serializing a native data type to Zarr V2 or Zarr V3 metadata. +- Serializing and deserializing a native data type to Zarr V2 or Zarr V3 metadata. This ensures that the data type can be properly stored and retrieved from array metadata. -- Reversibly serializing scalar values to Zarr V2 or Zarr V3 metadata. This is important for +- Serializing and deserializing scalar values to Zarr V2 or Zarr V3 metadata. This is important for storing a fill value for an array in a manner that is valid for the data type. 
-To add support for a new data type in Zarr, you should subclass the wrapper class and adapt its methods +You can add support for a new data type in Zarr by subclassing ``ZDType`` wrapper class and adapt its methods to support your native data type. The wrapper class must be added to a data type registry -(defined elsewhere) before ``create_array`` can properly handle the new data type. +(defined elsewhere) before array creation routines or array reading routines can use your new data +type. """ from __future__ import annotations from abc import ABC, abstractmethod -from collections.abc import Mapping, Sequence from dataclasses import dataclass from typing import ( TYPE_CHECKING, @@ -38,10 +38,9 @@ import numpy as np -from zarr.core.dtype.common import DataTypeValidationError - if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat + from zarr.core.dtype.common import DTypeJSON, DTypeSpec_V2, DTypeSpec_V3 # This the upper bound for the scalar types we support. It's numpy scalars + str, # because the new variable-length string dtype in numpy does not have a corresponding scalar type @@ -56,24 +55,19 @@ TScalar_co = TypeVar("TScalar_co", bound=TBaseScalar, covariant=True) TDType_co = TypeVar("TDType_co", bound=TBaseDType, covariant=True) -# These types should include all JSON-serializable types that can be used to represent a data type. -DTypeJSON_V2 = str | Sequence[object] -DTypeJSON_V3 = str | Mapping[str, object] - @dataclass(frozen=True, kw_only=True, slots=True) -class ZDType(Generic[TDType_co, TScalar_co], ABC): +class ZDType(ABC, Generic[TDType_co, TScalar_co]): """ Abstract base class for wrapping native array data types, e.g. numpy dtypes Attributes ---------- dtype_cls : ClassVar[type[TDType]] - The wrapped dtype class. This is a class variable. Instances of this class cannot set it. + The wrapped dtype class. This is a class variable. _zarr_v3_name : ClassVar[str] - The name given to the wrapped data type by a zarr v3 data type specification. 
Note that this - is not necessarily the same name that will appear in metadata documents, as some data types - have names that depend on their configuration. + The name given to the data type by a Zarr v3 data type specification. This is a + class variable, and it should generally be unique across different data types. """ # this class will create a native data type @@ -84,9 +78,11 @@ class ZDType(Generic[TDType_co, TScalar_co], ABC): _zarr_v3_name: ClassVar[str] @classmethod - def check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[TDType_co]: + def _check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[TDType_co]: """ - Check that a data type matches the dtype_cls class attribute. Used as a type guard. + Check that a native data type matches the dtype_cls class attribute. + + Used as a type guard. Parameters ---------- @@ -101,194 +97,87 @@ def check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[TDType_c return type(dtype) is cls.dtype_cls @classmethod + @abstractmethod def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self: """ - Wrap a dtype object. + Create a ZDType instance from a native data type. + + This method is used when taking a user-provided native data type, like a NumPy data type, + and creating the corresponding ZDType instance from them. Parameters ---------- dtype : TDType - The dtype object to wrap. + The native data type object to wrap. Returns ------- Self - The wrapped dtype. + The ZDType that wraps the native data type. Raises ------ TypeError - If the dtype does not match the dtype_cls class attribute. - """ - if cls.check_native_dtype(dtype): - return cls._from_native_dtype_unsafe(dtype) - raise DataTypeValidationError( - f"Invalid dtype: {dtype}. Expected an instance of {cls.dtype_cls}." - ) - - @classmethod - @abstractmethod - def _from_native_dtype_unsafe(cls: type[Self], dtype: TBaseDType) -> Self: - """ - Wrap a native dtype without checking. 
- - Parameters - ---------- - dtype : TDType - The native dtype to wrap. - - Returns - ------- - Self - The wrapped dtype. + If the native data type is not consistent with the wrapped data type. """ - ... + raise NotImplementedError # pragma: no cover @abstractmethod def to_native_dtype(self: Self) -> TDType_co: """ - Return an instance of the wrapped dtype. + Return an instance of the wrapped data type. This operation inverts ``from_native_dtype``. Returns ------- TDType - The unwrapped dtype. - """ - ... - - def cast_scalar(self, data: object) -> TScalar_co: - """ - Cast a scalar to the wrapped scalar type. The type is first checked for compatibility. If - it's incompatible with the associated scalar type, a ``TypeError`` will be raised. - - Parameters - ---------- - data : TScalar - The scalar value to cast. - - Returns - ------- - TScalar - The cast value. - """ - if self.check_scalar(data): - return self._cast_scalar_unchecked(data) - msg = ( - f"The value {data!r} failed a type check. " - f"It cannot be safely cast to a scalar compatible with {self}. " - f"Consult the documentation for {self} to determine the possible values that can " - "be cast to scalars of the wrapped data type." - ) - raise TypeError(msg) - - @abstractmethod - def check_scalar(self, data: object) -> bool: - """ - Check that a scalar is a valid value for the wrapped data type. - - Parameters - ---------- - data : object - A value to check. - - Returns - ------- - Bool - True if the value is valid, False otherwise. - """ - ... - - @abstractmethod - def _cast_scalar_unchecked(self, data: object) -> TScalar_co: + The native data type wrapped by this ZDType. """ - Cast a scalar to the wrapped data type. This method should not perform any input validation. - - Parameters - ---------- - data : TScalar - The scalar value to cast. - - Returns - ------- - TScalar - The cast value. - """ - ... 
+ raise NotImplementedError # pragma: no cover + @classmethod @abstractmethod - def default_scalar(self) -> TScalar_co: - """ - Get the default scalar value for the wrapped data type. This is a method, rather than an attribute, - because the default value for some data types may depend on parameters that are not known - until a concrete data type is wrapped. For example, data types parametrized by a length like - fixed-length strings or bytes will generate scalars consistent with that length. - - Returns - ------- - TScalar - The default value for this data type. - """ - ... + def _from_json_v2(cls: type[Self], data: DTypeJSON) -> Self: + raise NotImplementedError # pragma: no cover @classmethod @abstractmethod - def check_json_v2( - cls: type[Self], data: JSON, *, object_codec_id: str | None = None - ) -> TypeGuard[DTypeJSON_V2]: - """ - Check that a JSON representation of a data type is consistent with the ZDType class. - - Parameters - ---------- - data : JSON - The JSON representation of the data type. - - object_codec_id : str | None - The object codec ID, if applicable. Object codecs are specific numcodecs codecs that - zarr-python 2.x used to serialize numpy "Object" scalars. For example, a dtype field set - to ``"|O"`` with an object codec ID of "vlen-utf8" indicates that the data type is a - variable-length string. - - Zarr V3 has no such logic, so this parameter is only used for Zarr V2 compatibility. - - Returns - ------- - Bool - True if the JSON representation matches, False otherwise. - """ - ... + def _from_json_v3(cls: type[Self], data: DTypeJSON) -> Self: + raise NotImplementedError # pragma: no cover @classmethod - @abstractmethod - def check_json_v3(cls: type[Self], data: JSON) -> TypeGuard[DTypeJSON_V3]: + def from_json(cls: type[Self], data: DTypeJSON, *, zarr_format: ZarrFormat) -> Self: """ - Check that a JSON representation of a data type matches the dtype_cls class attribute. Used - as a type guard. 
This base implementation checks that the input is a dictionary, - that the key "name" is in that dictionary, and that the value of "name" - matches the _zarr_v3_name class attribute. + Create an instance of this ZDType from JSON data. Parameters ---------- - data : JSON + data : DTypeJSON The JSON representation of the data type. + zarr_format : ZarrFormat + The zarr format version. + Returns ------- - Bool - True if the JSON representation matches, False otherwise. + Self + An instance of this data type. """ - ... + if zarr_format == 2: + return cls._from_json_v2(data) + if zarr_format == 3: + return cls._from_json_v3(data) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeJSON_V2: ... + def to_json(self, zarr_format: Literal[2]) -> DTypeSpec_V2: ... @overload - def to_json(self, zarr_format: Literal[3]) -> DTypeJSON_V3: ... + def to_json(self, zarr_format: Literal[3]) -> DTypeSpec_V3: ... @abstractmethod - def to_json(self, zarr_format: ZarrFormat) -> DTypeJSON_V2 | DTypeJSON_V3: + def to_json(self, zarr_format: ZarrFormat) -> DTypeSpec_V2 | DTypeSpec_V3: """ - Convert the wrapped data type to a JSON-serializable form. + Serialize this ZDType to JSON. Parameters ---------- @@ -300,112 +189,115 @@ def to_json(self, zarr_format: ZarrFormat) -> DTypeJSON_V2 | DTypeJSON_V3: DTypeJSON_V2 | DTypeJSON_V3 The JSON-serializable representation of the wrapped data type """ - ... + raise NotImplementedError # pragma: no cover - @classmethod - def from_json_v3(cls: type[Self], data: JSON) -> Self: + @abstractmethod + def _check_scalar(self, data: object) -> bool: """ - Wrap a Zarr V3 JSON representation of a data type. + Check that an python object is a valid scalar value for the wrapped data type. Parameters ---------- - data : JSON - The JSON representation of the data type. + data : object + A value to check. Returns ------- - Self - The wrapped data type. 
+ Bool + True if the object is valid, False otherwise. """ - if cls.check_json_v3(data): - return cls._from_json_unchecked(data, zarr_format=3) - raise DataTypeValidationError(f"Invalid JSON representation of data type {cls}: {data}") + raise NotImplementedError # pragma: no cover - @classmethod - def from_json_v2(cls: type[Self], data: JSON, *, object_codec_id: str | None) -> Self: + @abstractmethod + def cast_scalar(self, data: object) -> TScalar_co: """ - Wrap a Zarr V2 JSON representation of a data type. + Cast a python object to the wrapped scalar type. + + The type of the provided scalar is first checked for compatibility. + If it's incompatible with the associated scalar type, a ``TypeError`` will be raised. Parameters ---------- - data : JSON - The JSON representation of the data type. + data : object + The python object to cast. Returns ------- - Self - The wrapped data type. + TScalar + The cast value. """ - if cls.check_json_v2(data, object_codec_id=object_codec_id): - return cls._from_json_unchecked(data, zarr_format=2) - raise DataTypeValidationError( - f"Invalid JSON representation of data type {cls}: {data!r}, object_codec_id={object_codec_id!r}" - ) - - @classmethod - @overload - def _from_json_unchecked(cls, data: DTypeJSON_V2, *, zarr_format: Literal[2]) -> Self: ... - @classmethod - @overload - def _from_json_unchecked(cls, data: DTypeJSON_V3, *, zarr_format: Literal[3]) -> Self: ... + raise NotImplementedError # pragma: no cover - @classmethod @abstractmethod - def _from_json_unchecked( - cls, data: DTypeJSON_V2 | DTypeJSON_V3, *, zarr_format: ZarrFormat - ) -> Self: + def default_scalar(self) -> TScalar_co: """ - Create a ZDType instance from a JSON representation of a data type. - - This method should be called after input has been type checked, and so it should not perform - any input validation. + Get the default scalar value for the wrapped data type. - Parameters - ---------- - data : JSON - The JSON representation of the data type. 
+ This is a method, rather than an attribute, because the default value for some data types depends on parameters that are + not known until a concrete data type is wrapped. For example, data types parametrized by a + length like fixed-length strings or bytes will generate scalars consistent with that length. Returns ------- - Self - The wrapped data type. + TScalar + The default value for this data type. """ - ... + raise NotImplementedError # pragma: no cover @abstractmethod - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> JSON: + def from_json_scalar(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar_co: """ - Convert a single value to JSON-serializable format. + Read a JSON-serializable value as a scalar. Parameters ---------- - data : object - The value to convert. + data : JSON + A JSON representation of a scalar value. zarr_format : ZarrFormat - The zarr format version. + The zarr format version. This is specified because the JSON serialization of scalars + differs between Zarr V2 and Zarr V3. Returns ------- - JSON - The JSON-serializable form of the scalar. + TScalar + The deserialized scalar value. """ - ... + raise NotImplementedError # pragma: no cover @abstractmethod - def from_json_scalar(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TScalar_co: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> JSON: """ - Read a JSON-serializable value as a scalar. + Serialize a python object to the JSON representation of a scalar. + + The value will first be cast to the scalar type associated with this ZDType, then serialized + to JSON. Parameters ---------- - data : JSON - The JSON-serializable value. + data : object + The value to convert. zarr_format : ZarrFormat - The zarr format version. + The zarr format version. This is specified because the JSON serialization of scalars + differs between Zarr V2 and Zarr V3. Returns ------- - TScalar - The native scalar value. 
+ JSON + The JSON-serialized scalar. """ - ... + raise NotImplementedError # pragma: no cover + + +def scalar_failed_type_check_msg( + cls_instance: ZDType[TBaseDType, TBaseScalar], bad_scalar: object +) -> str: + """ + Generate an error message reporting that a particular value failed a type check when attempting + to cast that value to a scalar. + """ + return ( + f"The value {bad_scalar!r} failed a type check. " + f"It cannot be safely cast to a scalar compatible with {cls_instance}. " + f"Consult the documentation for {cls_instance} to determine the possible values that can " + "be cast to scalars of the wrapped data type." + ) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index fd9b4071e4..38bc7cae85 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -5,15 +5,13 @@ from functools import cached_property from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict, cast -import numcodecs.abc - -from zarr.abc.codec import ArrayArrayCodec, BytesBytesCodec, Codec +from zarr.abc.codec import ArrayArrayCodec, Codec from zarr.abc.metadata import Metadata from zarr.codecs.numcodec import Numcodec, NumcodecsWrapper from zarr.core.buffer.core import default_buffer_prototype from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.dtype import get_data_type_from_json -from zarr.core.dtype.common import OBJECT_CODEC_IDS, DTypeSpec_V2 +from zarr.core.dtype.common import OBJECT_CODEC_IDS from zarr.registry import get_codec if TYPE_CHECKING: @@ -23,18 +21,17 @@ from zarr.core.buffer import Buffer, BufferPrototype from zarr.core.common import ChunkCoords + from zarr.core.dtype.common import DTypeSpec_V2 from zarr.core.dtype.wrapper import ( TBaseDType, TBaseScalar, TDType_co, TScalar_co, - ZDType, ) import json from dataclasses import dataclass, field, fields, replace -import numcodecs import numpy as np from zarr.core.array_spec import ArrayConfig, ArraySpec @@ -47,6 +44,9 @@ parse_shapelike, ) from 
zarr.core.config import config, parse_indexing_order +from zarr.core.dtype.wrapper import ( + ZDType, +) from zarr.core.metadata.common import parse_attributes @@ -169,7 +169,7 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: # which could be in filters or as a compressor. # we will reference a hard-coded collection of object codec ids for this search. - _filters, _compressor = (_data.get("filters"), _data.get("compressor")) + _filters, _compressor = (data.get("filters"), data.get("compressor")) if _filters is not None: _filters = cast("tuple[dict[str, JSON], ...]", _filters) object_codec_id = get_object_codec_id(tuple(_filters) + (_compressor,)) @@ -177,8 +177,9 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: object_codec_id = get_object_codec_id((_compressor,)) # we add a layer of indirection here around the dtype attribute of the array metadata # because we also need to know the object codec id, if any, to resolve the data type + dtype_spec: DTypeSpec_V2 = { - "name": _data["dtype"], + "name": data["dtype"], "object_codec_id": object_codec_id, } dtype = get_data_type_from_json(dtype_spec, zarr_format=2) @@ -196,9 +197,11 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: expected |= {"dtype", "chunks"} # check if `filters` is an empty sequence; if so use None instead and raise a warning + filters = _data.get("filters") if ( - isinstance(_filters, Sequence) - and len(_filters) == 0 + isinstance(filters, Sequence) + and not isinstance(filters, (str, bytes)) + and len(filters) == 0 ): msg = ( "Found an empty list of filters in the array metadata document. 
" @@ -231,7 +234,7 @@ def to_dict(self) -> dict[str, JSON]: zarray_dict["fill_value"] = fill_value # serialize the dtype after fill value-specific JSON encoding - zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2) # type: ignore[assignment] + zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2)["name"] # type: ignore[assignment] return zarray_dict @@ -330,3 +333,21 @@ def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata: ) raise ValueError(msg) return data + + +def get_object_codec_id(maybe_object_codecs: Sequence[JSON]) -> str | None: + """ + Inspect a sequence of codecs / filters for an "object codec", i.e. a codec + that can serialize object arrays to contiguous bytes. Zarr python + maintains a hard-coded set of object codec ids. If any element from the input + has an id that matches one of the hard-coded object codec ids, that id + is returned immediately. + """ + object_codec_id = None + for maybe_object_codec in maybe_object_codecs: + if ( + isinstance(maybe_object_codec, dict) + and maybe_object_codec.get("id") in OBJECT_CODEC_IDS + ): + return cast("str", maybe_object_codec["id"]) + return object_codec_id diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 96e5f10bdc..abb4ef50c6 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -37,7 +37,7 @@ from zarr.core.config import config from zarr.core.metadata.common import parse_attributes from zarr.errors import MetadataValidationError, NodeTypeValidationError -from zarr.registry import get_codec, get_codec_class +from zarr.registry import get_codec def parse_zarr_format(data: object) -> Literal[3]: @@ -97,7 +97,7 @@ def validate_codecs(codecs: tuple[Codec, ...], dtype: ZDType[TBaseDType, TBaseSc # TODO: use codec ID instead of class name codec_class_name = abc.__class__.__name__ # TODO: Fix typing here - if isinstance(dtype, VariableLengthString) and not codec_class_name == "VLenUTF8Codec": # type: ignore[unreachable] + if 
isinstance(dtype, VariableLengthUTF8) and not codec_class_name == "VLenUTF8Codec": # type: ignore[unreachable] raise ValueError( f"For string dtype, ArrayBytesCodec must be `VLenUTF8Codec`, got `{codec_class_name}`." ) @@ -306,7 +306,9 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: _ = parse_node_type_array(_data.pop("node_type")) data_type_json = _data.pop("data_type") - data_type = get_data_type_from_json_v3(data_type_json) + if not check_dtype_spec_v3(data_type_json): + raise ValueError(f"Invalid data_type: {data_type_json!r}") + data_type = get_data_type_from_json(data_type_json, zarr_format=3) # check that the fill value is consistent with the data type try: diff --git a/src/zarr/dtype.py b/src/zarr/dtype.py index 6e3789543b..79f3aa3a0f 100644 --- a/src/zarr/dtype.py +++ b/src/zarr/dtype.py @@ -1,3 +1,87 @@ -from zarr.core.dtype import ZDType, data_type_registry +from zarr.core.dtype import ( + Bool, + Complex64, + Complex128, + DataTypeValidationError, + DateTime64, + DateTime64JSON_V2, + DateTime64JSON_V3, + FixedLengthUTF32, + FixedLengthUTF32JSON_V2, + FixedLengthUTF32JSON_V3, + Float16, + Float32, + Float64, + Int8, + Int16, + Int32, + Int64, + NullTerminatedBytes, + NullterminatedBytesJSON_V2, + NullTerminatedBytesJSON_V3, + RawBytes, + RawBytesJSON_V2, + RawBytesJSON_V3, + Structured, + StructuredJSON_V2, + StructuredJSON_V3, + TimeDelta64, + TimeDelta64JSON_V2, + TimeDelta64JSON_V3, + UInt8, + UInt16, + UInt32, + UInt64, + VariableLengthBytes, + VariableLengthBytesJSON_V2, + VariableLengthUTF8, + VariableLengthUTF8JSON_V2, + ZDType, + data_type_registry, + parse_data_type, +) -__all__ = ["ZDType", "data_type_registry"] +__all__ = [ + "Bool", + "Complex64", + "Complex128", + "DataTypeValidationError", + "DateTime64", + "DateTime64JSON_V2", + "DateTime64JSON_V3", + "FixedLengthUTF32", + "FixedLengthUTF32JSON_V2", + "FixedLengthUTF32JSON_V3", + "Float16", + "Float32", + "Float64", + "Int8", + "Int16", + "Int32", + "Int64", + "NullTerminatedBytes", 
+ "NullTerminatedBytesJSON_V3", + "NullterminatedBytesJSON_V2", + "RawBytes", + "RawBytesJSON_V2", + "RawBytesJSON_V3", + "Structured", + "StructuredJSON_V2", + "StructuredJSON_V3", + "TimeDelta64", + "TimeDelta64", + "TimeDelta64JSON_V2", + "TimeDelta64JSON_V3", + "UInt8", + "UInt16", + "UInt32", + "UInt64", + "VariableLengthBytes", + "VariableLengthBytesJSON_V2", + "VariableLengthUTF8", + "VariableLengthUTF8JSON_V2", + "ZDType", + "data_type_registry", + "data_type_registry", + "parse_data_type", +] diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 7b07a3e0a1..996733972c 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -2,7 +2,6 @@ import warnings from collections import defaultdict -from collections.abc import Mapping from importlib.metadata import entry_points as get_entry_points from typing import TYPE_CHECKING, Any, Generic, TypeVar @@ -11,7 +10,7 @@ if TYPE_CHECKING: from importlib.metadata import EntryPoint - from zarr.codecs.numcodec import Numcodec + from zarr.abc.codec import ( ArrayArrayCodec, ArrayBytesCodec, @@ -56,10 +55,6 @@ def register(self, cls: type[T], qualname: str | None = None) -> None: self[qualname] = cls -__filter_registries: dict[str, Registry[ArrayArrayCodec]] = defaultdict(Registry) -__serializer_registries: dict[str, Registry[ArrayBytesCodec]] = defaultdict(Registry) -__compressor_registries: dict[str, Registry[BytesBytesCodec]] = defaultdict(Registry) - __codec_registries: dict[str, Registry[Codec]] = defaultdict(Registry) __pipeline_registry: Registry[CodecPipeline] = Registry() __buffer_registry: Registry[Buffer] = Registry() @@ -100,8 +95,8 @@ def _collect_entrypoints() -> list[Registry[Any]]: __ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr.ndbuffer")) __ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="ndbuffer")) - data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr.data_type")) - 
data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) + data_type_registry._lazy_load_list.extend(entry_points.select(group="zarr.data_type")) + data_type_registry._lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) __pipeline_registry.lazy_load_list.extend(entry_points.select(group="zarr.codec_pipeline")) __pipeline_registry.lazy_load_list.extend( @@ -124,15 +119,18 @@ def _collect_entrypoints() -> list[Registry[Any]]: def _reload_config() -> None: config.refresh() + def fully_qualified_name(cls: type) -> str: module = cls.__module__ return module + "." + cls.__qualname__ + def register_codec(key: str, codec_cls: type[Codec]) -> None: if key not in __codec_registries: __codec_registries[key] = Registry() __codec_registries[key].register(codec_cls) + def register_pipeline(pipe_cls: type[CodecPipeline]) -> None: __pipeline_registry.register(pipe_cls) @@ -144,6 +142,7 @@ def register_ndbuffer(cls: type[NDBuffer], qualname: str | None = None) -> None: def register_buffer(cls: type[Buffer], qualname: str | None = None) -> None: __buffer_registry.register(cls, qualname) + def _get_codec_class( key: str, registry: dict[str, Registry[Codec]], *, reload_config: bool = False ) -> type[Codec]: @@ -188,8 +187,9 @@ def get_codec(request: CodecJSON, *, zarr_format: ZarrFormat) -> Codec: raise TypeError( f"Invalid request type {type(request)} for zarr format 2. 
Expected dict, got {request!r}" ) - codec_name = request["id"] - codec_config = {k: v for k, v in request.items() if k != "id"} + else: + codec_name = request["id"] + codec_config = {k: v for k, v in request.items() if k != "id"} elif zarr_format == 3: if isinstance(request, str): codec_name = request @@ -210,6 +210,7 @@ def get_codec(request: CodecJSON, *, zarr_format: ZarrFormat) -> Codec: codec = get_numcodec_class(codec_name)(**codec_config) return NumcodecsWrapper(codec=codec) + def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: return _get_codec_class(key, __codec_registries, reload_config=reload_config) diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index 38ef7119db..d0726c3dd9 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -16,7 +16,7 @@ from zarr.core.array import Array from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding -from zarr.core.common import ZarrFormat +from zarr.core.common import JSON, ZarrFormat from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata from zarr.core.sync import sync @@ -43,21 +43,7 @@ def paths(draw: st.DrawFn, *, max_num_nodes: int | None = None) -> str: return draw(st.just("/") | keys(max_num_nodes=max_num_nodes)) -def v3_dtypes() -> st.SearchStrategy[np.dtype[Any]]: - return ( - npst.boolean_dtypes() - | npst.integer_dtypes(endianness="=") - | npst.unsigned_integer_dtypes(endianness="=") - | npst.floating_dtypes(endianness="=") - | npst.complex_number_dtypes(endianness="=") - | npst.byte_string_dtypes(endianness="=") - | npst.unicode_string_dtypes(endianness="=") - | npst.datetime64_dtypes(endianness="=") - | npst.timedelta64_dtypes(endianness="=") - ) - - -def v2_dtypes() -> st.SearchStrategy[np.dtype[Any]]: +def dtypes() -> st.SearchStrategy[np.dtype[Any]]: return ( npst.boolean_dtypes() | 
npst.integer_dtypes(endianness="=") @@ -152,7 +138,7 @@ def array_metadata( shape = draw(array_shapes()) ndim = len(shape) chunk_shape = draw(array_shapes(min_dims=ndim, max_dims=ndim)) - np_dtype = draw(v3_dtypes()) + np_dtype = draw(dtypes()) dtype = get_data_type_from_native_dtype(np_dtype) fill_value = draw(npst.from_dtype(np_dtype)) if zarr_format == 2: diff --git a/tests/conftest.py b/tests/conftest.py index 1abfb24076..4d300a1fd4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -19,7 +19,7 @@ _parse_chunk_key_encoding, ) from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition -from zarr.core.common import JSON, parse_shapelike +from zarr.core.common import JSON, DimensionNames, parse_shapelike from zarr.core.config import config as zarr_config from zarr.core.dtype import ( get_data_type_from_native_dtype, diff --git a/tests/package_with_entrypoint/__init__.py b/tests/package_with_entrypoint/__init__.py index 4f507ab457..ae86378cb5 100644 --- a/tests/package_with_entrypoint/__init__.py +++ b/tests/package_with_entrypoint/__init__.py @@ -1,5 +1,4 @@ -from collections.abc import Iterable -from typing import Any, Literal, Self +from __future__ import annotations from typing import TYPE_CHECKING @@ -10,9 +9,16 @@ from zarr.abc.codec import ArrayBytesCodec, CodecInput, CodecPipeline from zarr.codecs import BytesCodec from zarr.core.buffer import Buffer, NDBuffer -from zarr.core.common import JSON, ZarrFormat +from zarr.core.dtype.common import DataTypeValidationError, DTypeJSON, DTypeSpec_V2 from zarr.core.dtype.npy.bool import Bool +if TYPE_CHECKING: + from collections.abc import Iterable + from typing import Any, ClassVar, Literal, Self + + from zarr.core.array_spec import ArraySpec + from zarr.core.common import ZarrFormat + class TestEntrypointCodec(ArrayBytesCodec): is_fixed_size = True @@ -75,13 +81,21 @@ class TestDataType(Bool): This is a "data type" that serializes to "test" """ - _zarr_v3_name = "test" # type: ignore[assignment] + 
_zarr_v3_name: ClassVar[Literal["test"]] = "test" # type: ignore[assignment] @classmethod - def from_json(cls, data: JSON, zarr_format: Literal[2, 3]) -> Self: - if data == cls._zarr_v3_name: # type: ignore[has-type] + def from_json(cls, data: DTypeJSON, *, zarr_format: Literal[2, 3]) -> Self: + if zarr_format == 2 and data == {"name": cls._zarr_v3_name, "object_codec_id": None}: return cls() - raise ValueError - - def to_json(self, zarr_format: ZarrFormat) -> str: # type: ignore[override] - return self._zarr_v3_name # type: ignore[no-any-return, has-type] + if zarr_format == 3 and data == cls._zarr_v3_name: + return cls() + raise DataTypeValidationError( + f"Invalid JSON representation of {cls.__name__}. Got {data!r}" + ) + + def to_json(self, zarr_format: ZarrFormat) -> str | DTypeSpec_V2: # type: ignore[override] + if zarr_format == 2: + return {"name": self._zarr_v3_name, "object_codec_id": None} + if zarr_format == 3: + return self._zarr_v3_name + raise ValueError("zarr_format must be 2 or 3") diff --git a/tests/test_array.py b/tests/test_array.py index fce204ce38..6ffbab2566 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -35,25 +35,29 @@ _parse_chunk_encoding_v3, chunks_initialized, create_array, - default_filters_v2, - default_serializer_v3, ) from zarr.core.buffer import NDArrayLike, NDArrayLikeOrScalar, default_buffer_prototype from zarr.core.chunk_grids import _auto_partition -from zarr.core.common import JSON, MemoryOrder, ZarrFormat -from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.core.dtype.common import Endianness -from zarr.core.dtype.npy.common import endianness_from_numpy_str -from zarr.core.dtype.npy.float import Float32, Float64 -from zarr.core.dtype.npy.int import Int16, UInt8 -from zarr.core.dtype.npy.sized import ( +from zarr.core.chunk_key_encodings import ChunkKeyEncodingParams +from zarr.core.common import JSON, ZarrFormat +from zarr.core.dtype import ( + DateTime64, + Float32, + Float64, + Int16, 
Structured, + TimeDelta64, + UInt8, + VariableLengthUTF8, + ZDType, + parse_data_type, ) -from zarr.core.dtype.npy.string import VariableLengthString -from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 -from zarr.core.dtype.wrapper import ZDType +from zarr.core.dtype.common import ENDIANNESS_STR, EndiannessStr +from zarr.core.dtype.npy.common import NUMPY_ENDIANNESS_STR, endianness_from_numpy_str from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer, ceildiv +from zarr.core.metadata.v2 import ArrayV2Metadata +from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError from zarr.storage import LocalStore, MemoryStore, StorePath @@ -61,8 +65,6 @@ from .test_dtype.conftest import zdtype_examples if TYPE_CHECKING: - from zarr.core.array_spec import ArrayConfigLike - from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata @@ -458,6 +460,7 @@ def test_info_v2(self, chunks: tuple[int, int], shards: tuple[int, int] | None) expected = ArrayInfo( _zarr_format=2, _data_type=arr._async_array._zdtype, + _fill_value=arr.fill_value, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=None, @@ -475,6 +478,7 @@ def test_info_v3(self, chunks: tuple[int, int], shards: tuple[int, int] | None) expected = ArrayInfo( _zarr_format=3, _data_type=arr._async_array._zdtype, + _fill_value=arr.fill_value, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, @@ -500,6 +504,7 @@ def test_info_complete(self, chunks: tuple[int, int], shards: tuple[int, int] | expected = ArrayInfo( _zarr_format=3, _data_type=arr._async_array._zdtype, + _fill_value=arr.fill_value, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, @@ -535,6 +540,7 @@ async def test_info_v2_async( expected = ArrayInfo( _zarr_format=2, _data_type=Float64(), + _fill_value=arr.metadata.fill_value, _shape=(8, 8), _chunk_shape=(2, 2), _shard_shape=None, @@ -560,6 
+566,7 @@ async def test_info_v3_async( expected = ArrayInfo( _zarr_format=3, _data_type=arr._zdtype, + _fill_value=arr.metadata.fill_value, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, @@ -587,6 +594,7 @@ async def test_info_complete_async( expected = ArrayInfo( _zarr_format=3, _data_type=arr._zdtype, + _fill_value=arr.metadata.fill_value, _shape=(8, 8), _chunk_shape=chunks, _shard_shape=shards, @@ -1023,7 +1031,7 @@ def test_dtype_forms(dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFor # Structured dtypes do not have a numpy string representation that uniquely identifies them if not isinstance(dtype, Structured): - if isinstance(dtype, VariableLengthString): + if isinstance(dtype, VariableLengthUTF8): # in numpy 2.3, StringDType().str becomes the string 'StringDType()' which numpy # does not accept as a string representation of the dtype. c = zarr.create_array( @@ -1060,6 +1068,7 @@ def test_dtype_roundtrip( assert a.dtype == b.dtype @staticmethod + @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @pytest.mark.parametrize("dtype", ["uint8", "float32", "U3", "S4", "V1"]) @pytest.mark.parametrize( "compressors", @@ -1070,8 +1079,8 @@ def test_dtype_roundtrip( (ZstdCodec(level=3),), (ZstdCodec(level=3), GzipCodec(level=0)), ZstdCodec(level=3), - {"name": "zstd", "configuration": {"level": 3, "checksum": True}}, - ({"name": "zstd", "configuration": {"level": 3, "checksum": True}},), + {"name": "zstd", "configuration": {"level": 3, "checksum": False}}, + ({"name": "zstd", "configuration": {"level": 3, "checksum": False}},), ], ) @pytest.mark.parametrize( @@ -1273,7 +1282,7 @@ async def test_v2_chunk_encoding( filters=filters, ) filters_expected, compressor_expected = _parse_chunk_encoding_v2( - filters=filters, compressor=compressors, dtype=get_data_type_from_native_dtype(dtype) + filters=filters, compressor=compressors, dtype=parse_data_type(dtype, zarr_format=2) ) assert arr.metadata.zarr_format == 2 # 
guard for mypy assert arr.metadata.compressor == compressor_expected @@ -1287,9 +1296,9 @@ async def test_v2_chunk_encoding( assert arr.filters == filters_expected @staticmethod - @pytest.mark.parametrize("dtype", [UInt8(), Float32(), VariableLengthString()]) + @pytest.mark.parametrize("dtype", [UInt8(), Float32(), VariableLengthUTF8()]) async def test_default_filters_compressors( - store: MemoryStore, dtype: UInt8 | Float32 | VariableLengthString, zarr_format: ZarrFormat + store: MemoryStore, dtype: UInt8 | Float32 | VariableLengthUTF8, zarr_format: ZarrFormat ) -> None: """ Test that the default ``filters`` and ``compressors`` are used when ``create_array`` is invoked with ``filters`` and ``compressors`` unspecified. @@ -1448,17 +1457,18 @@ async def test_name(store: Store, zarr_format: ZarrFormat, path: str | None) -> ) @staticmethod - @pytest.mark.parametrize("endianness", get_args(Endianness)) + @pytest.mark.parametrize("endianness", ENDIANNESS_STR) def test_default_endianness( - store: Store, zarr_format: ZarrFormat, endianness: Endianness + store: Store, zarr_format: ZarrFormat, endianness: EndiannessStr ) -> None: """ Test that that endianness is correctly set when creating an array when not specifying a serializer """ dtype = Int16(endianness=endianness) arr = zarr.create_array(store=store, shape=(1,), dtype=dtype, zarr_format=zarr_format) - assert endianness_from_numpy_str(arr[:].dtype.byteorder) == endianness # type: ignore[union-attr] - + byte_order: str = arr[:].dtype.byteorder # type: ignore[union-attr] + assert byte_order in NUMPY_ENDIANNESS_STR + assert endianness_from_numpy_str(byte_order) == endianness # type: ignore[arg-type] @pytest.mark.parametrize("value", [1, 1.4, "a", b"a", np.array(1)]) diff --git a/tests/test_config.py b/tests/test_config.py index 92882f4381..d4b1b0496f 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,6 @@ import os from collections.abc import Iterable -from typing import TYPE_CHECKING, Any +from 
typing import Any from unittest import mock from unittest.mock import Mock @@ -18,13 +18,11 @@ Crc32cCodec, ShardingCodec, ) -from zarr.core.array import create_array from zarr.core.array_spec import ArraySpec from zarr.core.buffer import NDBuffer from zarr.core.buffer.core import Buffer from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.config import BadConfigError, config -from zarr.core.dtype import Int8, VariableLengthString from zarr.core.indexing import SelectorTuple from zarr.registry import ( fully_qualified_name, @@ -44,9 +42,6 @@ TestNDArrayLike, ) -if TYPE_CHECKING: - from zarr.core.dtype.wrapper import ZDType - def test_config_defaults_set() -> None: # regression test for available defaults @@ -58,27 +53,6 @@ def test_config_defaults_set() -> None: "array": { "order": "C", "write_empty_chunks": False, - "v2_default_compressor": { - "default": {"id": "zstd", "level": 0, "checksum": False}, - "variable-length-string": {"id": "zstd", "level": 0, "checksum": False}, - }, - "v2_default_filters": { - "default": None, - "variable-length-string": [{"id": "vlen-utf8"}], - }, - "v3_default_filters": {"default": [], "variable-length-string": []}, - "v3_default_serializer": { - "default": {"name": "bytes", "configuration": {"endian": "little"}}, - "variable-length-string": {"name": "vlen-utf8"}, - }, - "v3_default_compressors": { - "default": [ - {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, - ], - "variable-length-string": [ - {"name": "zstd", "configuration": {"level": 0, "checksum": False}} - ], - }, }, "async": {"concurrency": 10, "timeout": None}, "threading": {"max_workers": None}, @@ -99,8 +73,8 @@ def test_config_defaults_set() -> None: "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec", "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec", }, - "buffer": "zarr.core.buffer.cpu.Buffer", - "ndbuffer": "zarr.core.buffer.cpu.NDBuffer", + "buffer": "zarr.buffer.cpu.Buffer", + "ndbuffer": "zarr.buffer.cpu.NDBuffer", } ] 
) @@ -206,18 +180,7 @@ async def _encode_single(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Bu chunks=(10,), zarr_format=3, dtype="i4", - compressors=[ - { - "name": "blosc", - "configuration": { - "typesize": 1, - "cname": "lz4", - "clevel": 1, - "blocksize": 1, - "shuffle": "bitshuffle", - }, - } - ], + compressors=[BloscCodec(typesize=1, shuffle="bitshuffle").to_json(zarr_format=3)], ) arr[:] = range(100) _mock.call.assert_called() @@ -332,28 +295,31 @@ class NewCodec2(BytesCodec): get_codec_class("new_codec") -@pytest.mark.parametrize("dtype_category", ["variable-length-string", "default"]) -async def test_default_codecs(dtype_category: str) -> None: +@pytest.mark.parametrize( + "key", + [ + "array.v2_default_compressor.numeric", + "array.v2_default_compressor.string", + "array.v2_default_compressor.bytes", + "array.v2_default_filters.string", + "array.v2_default_filters.bytes", + "array.v3_default_filters.numeric", + "array.v3_default_filters.raw", + "array.v3_default_filters.bytes", + "array.v3_default_serializer.numeric", + "array.v3_default_serializer.string", + "array.v3_default_serializer.bytes", + "array.v3_default_compressors.string", + "array.v3_default_compressors.bytes", + "array.v3_default_compressors", + ], +) +def test_deprecated_config(key: str) -> None: """ - Test that the default compressors are sensitive to the current setting of the config. 
+ Test that a valuerror is raised when setting the default chunk encoding for a given + data type category """ - zdtype: ZDType[Any, Any] - if dtype_category == "variable-length-string": - zdtype = VariableLengthString() - else: - zdtype = Int8() - expected_compressors = (GzipCodec(),) - new_conf = { - f"array.v3_default_compressors.{dtype_category}": [ - c.to_dict() for c in expected_compressors - ] - } - with config.set(new_conf): - arr = await create_array( - shape=(100,), - chunks=(100,), - dtype=zdtype, - zarr_format=3, - store=MemoryStore(), - ) - assert arr.compressors == expected_compressors + + with pytest.raises(ValueError): + with zarr.config.set({key: "foo"}): + pass diff --git a/tests/test_dtype/conftest.py b/tests/test_dtype/conftest.py index b2aa89afd7..0650d143c6 100644 --- a/tests/test_dtype/conftest.py +++ b/tests/test_dtype/conftest.py @@ -6,7 +6,7 @@ from zarr.core.dtype import data_type_registry from zarr.core.dtype.common import HasLength -from zarr.core.dtype.npy.sized import Structured +from zarr.core.dtype.npy.structured import Structured from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 from zarr.core.dtype.wrapper import ZDType @@ -65,7 +65,4 @@ class TestB(TestExample): for fixture_name in metafunc.fixturenames: if hasattr(metafunc.cls, fixture_name): params = getattr(metafunc.cls, fixture_name) - if len(params) == 0: - msg = f"{metafunc.cls}.{fixture_name} is empty. Please provide a non-empty sequence of values." 
- raise ValueError(msg) - metafunc.parametrize(fixture_name, params, scope="class") + metafunc.parametrize(fixture_name, params, scope="class", ids=str) diff --git a/tests/test_dtype/test_npy/test_bool.py b/tests/test_dtype/test_npy/test_bool.py index 03dc550a9d..da30214b3b 100644 --- a/tests/test_dtype/test_npy/test_bool.py +++ b/tests/test_dtype/test_npy/test_bool.py @@ -2,7 +2,7 @@ import numpy as np -from tests.test_dtype.test_wrapper import BaseTestZDType, V2JsonTestParams +from tests.test_dtype.test_wrapper import BaseTestZDType from zarr.core.dtype.npy.bool import Bool @@ -15,7 +15,7 @@ class TestBool(BaseTestZDType): np.dtype(np.float64), np.dtype(np.uint16), ) - valid_json_v2 = (V2JsonTestParams(dtype="|b1"),) + valid_json_v2 = ({"name": "|b1", "object_codec_id": None},) valid_json_v3 = ("bool",) invalid_json_v2 = ( "|b1", @@ -38,4 +38,5 @@ class TestBool(BaseTestZDType): (Bool(), np.True_, np.True_), (Bool(), np.False_, np.False_), ) + invalid_scalar_params = (None,) item_size_params = (Bool(),) diff --git a/tests/test_dtype/test_npy/test_common.py b/tests/test_dtype/test_npy/test_common.py index c4a82e22b0..d39d308112 100644 --- a/tests/test_dtype/test_npy/test_common.py +++ b/tests/test_dtype/test_npy/test_common.py @@ -9,9 +9,9 @@ import numpy as np import pytest -from zarr.core.dtype.common import Endianness, JSONFloatV2, SpecialFloatStrings +from zarr.core.dtype.common import ENDIANNESS_STR, JSONFloatV2, SpecialFloatStrings from zarr.core.dtype.npy.common import ( - EndiannessNumpy, + NumpyEndiannessStr, bytes_from_json, bytes_to_json, check_json_bool, @@ -67,10 +67,10 @@ def test_endianness_from_numpy_str(data: str, expected: str | None) -> None: Test that endianness_from_numpy_str correctly converts a numpy str literal to a human-readable literal value. 
This test also checks that an invalid string input raises a ``ValueError`` """ - if data in get_args(EndiannessNumpy): + if data in get_args(NumpyEndiannessStr): assert endianness_from_numpy_str(data) == expected # type: ignore[arg-type] else: - msg = f"Invalid endianness: {data!r}. Expected one of {get_args(EndiannessNumpy)}" + msg = f"Invalid endianness: {data!r}. Expected one of {get_args(NumpyEndiannessStr)}" with pytest.raises(ValueError, match=re.escape(msg)): endianness_from_numpy_str(data) # type: ignore[arg-type] @@ -84,10 +84,10 @@ def test_endianness_to_numpy_str(data: str | None, expected: str) -> None: Test that endianness_to_numpy_str correctly converts a human-readable literal value to a numpy str literal. This test also checks that an invalid string input raises a ``ValueError`` """ - if data in get_args(Endianness) + (None,): + if data in ENDIANNESS_STR: assert endianness_to_numpy_str(data) == expected # type: ignore[arg-type] else: - msg = f"Invalid endianness: {data!r}. Expected one of {get_args(Endianness)}" + msg = f"Invalid endianness: {data!r}. 
Expected one of {ENDIANNESS_STR}" with pytest.raises(ValueError, match=re.escape(msg)): endianness_to_numpy_str(data) # type: ignore[arg-type] diff --git a/tests/test_dtype/test_npy/test_complex.py b/tests/test_dtype/test_npy/test_complex.py index fd216d8415..b4ce42be58 100644 --- a/tests/test_dtype/test_npy/test_complex.py +++ b/tests/test_dtype/test_npy/test_complex.py @@ -4,7 +4,7 @@ import numpy as np -from tests.test_dtype.test_wrapper import BaseTestZDType, V2JsonTestParams +from tests.test_dtype.test_wrapper import BaseTestZDType from zarr.core.dtype.npy.complex import Complex64, Complex128 @@ -23,7 +23,10 @@ class TestComplex64(_BaseTestFloat): np.dtype(np.float64), np.dtype(np.complex128), ) - valid_json_v2 = (V2JsonTestParams(dtype=">c8"), V2JsonTestParams(dtype="c8", "object_codec_id": None}, + {"name": "c16"), V2JsonTestParams(dtype="c16", "object_codec_id": None}, + {"name": "f2"), V2JsonTestParams(dtype="f2", "object_codec_id": None}, + {"name": "f4"), V2JsonTestParams(dtype="f4", "object_codec_id": None}, + {"name": "f8"), V2JsonTestParams(dtype="f8", "object_codec_id": None}, + {"name": "i1", @@ -34,6 +34,7 @@ class TestInt8(BaseTestZDType): (Int8(), 1, np.int8(1)), (Int8(), -1, np.int8(-1)), ) + invalid_scalar_params = ((Int8(), {"set!"}), (Int8(), ("tuple",))) item_size_params = (Int8(),) @@ -46,7 +47,10 @@ class TestInt16(BaseTestZDType): np.dtype(np.uint16), np.dtype(np.float64), ) - valid_json_v2 = (V2JsonTestParams(dtype=">i2"), V2JsonTestParams(dtype="i2", "object_codec_id": None}, + {"name": "i4"), np.dtype("i4"), np.dtype("i4"), V2JsonTestParams(dtype="i4", "object_codec_id": None}, + {"name": "i8"), V2JsonTestParams(dtype="i8", "object_codec_id": None}, + {"name": "u2"), V2JsonTestParams(dtype="u2", "object_codec_id": None}, + {"name": "u4"), V2JsonTestParams(dtype="u4", "object_codec_id": None}, + {"name": "u8"), V2JsonTestParams(dtype="u8", "object_codec_id": None}, + {"name": "U10"), V2JsonTestParams(dtype="U10", "object_codec_id": 
None}, + {"name": " None: + """ + Test that we get a warning when serializing a dtype without a zarr v3 spec to json + when zarr_format is 3 + """ + with pytest.raises(UnstableSpecificationWarning): + zdtype.to_json(zarr_format=3) + + +def test_invalid_size() -> None: + """ + Test that it's impossible to create a data type that has no length + """ + length = 0 + msg = f"length must be >= 1, got {length}." + with pytest.raises(ValueError, match=msg): + FixedLengthUTF32(length=length) diff --git a/tests/test_dtype/test_npy/test_time.py b/tests/test_dtype/test_npy/test_time.py index 96281434cd..b94b600cbf 100644 --- a/tests/test_dtype/test_npy/test_time.py +++ b/tests/test_dtype/test_npy/test_time.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from tests.test_dtype.test_wrapper import BaseTestZDType, V2JsonTestParams +from tests.test_dtype.test_wrapper import BaseTestZDType from zarr.core.dtype.npy.common import DateTimeUnit from zarr.core.dtype.npy.time import DateTime64, TimeDelta64, datetime_from_int @@ -35,10 +35,10 @@ class TestDateTime64(_TestTimeBase): np.dtype("timedelta64[ns]"), ) valid_json_v2 = ( - V2JsonTestParams(dtype=">M8"), - V2JsonTestParams(dtype=">M8[s]"), - V2JsonTestParams(dtype="M8", "object_codec_id": None}, + {"name": ">M8[s]", "object_codec_id": None}, + {"name": "m8", "object_codec_id": None}, + {"name": ">m8[s]", "object_codec_id": None}, + {"name": " None: """ -@dataclass(frozen=True, kw_only=True, slots=True) -class V2JsonTestParams: - dtype: str | dict[str, object] | list[object] - object_codec_id: str | None = None - - class BaseTestZDType: """ A base class for testing ZDType subclasses. This class works in conjunction with the custom @@ -66,6 +60,16 @@ class BaseTestZDType: A tuple of invalid JSON representations for Zarr format version 3. cast_value_params : ClassVar[tuple[tuple[Any, Any, Any], ...]] A tuple of (dtype, value, expected) tuples for testing ZDType.cast_value. 
+ scalar_v2_params : ClassVar[tuple[Any, ...]] + A tuple of (dtype, scalar json) tuples for testing + ZDType.from_json_scalar / ZDType.to_json_scalar for zarr v2 + scalar_v3_params : ClassVar[tuple[Any, ...]] + A tuple of (dtype, scalar json) tuples for testing + ZDType.from_json_scalar / ZDType.to_json_scalar for zarr v3 + invalid_scalar_params : ClassVar[tuple[Any, ...]] + A tuple of (dtype, value) tuples, where each value is expected to fail ZDType.cast_value. + item_size_params : ClassVar[tuple[Any, ...]] + A tuple of (dtype, expected) tuples for testing ZDType.item_size """ test_cls: type[ZDType[TBaseDType, TBaseScalar]] @@ -73,20 +77,23 @@ class BaseTestZDType: valid_dtype: ClassVar[tuple[TBaseDType, ...]] = () invalid_dtype: ClassVar[tuple[TBaseDType, ...]] = () - valid_json_v2: ClassVar[tuple[V2JsonTestParams, ...]] = () + valid_json_v2: ClassVar[tuple[DTypeSpec_V2, ...]] = () invalid_json_v2: ClassVar[tuple[str | dict[str, object] | list[object], ...]] = () - valid_json_v3: ClassVar[tuple[str | dict[str, object], ...]] = () + valid_json_v3: ClassVar[tuple[DTypeSpec_V3, ...]] = () invalid_json_v3: ClassVar[tuple[str | dict[str, object], ...]] = () # for testing scalar round-trip serialization, we need a tuple of (data type json, scalar json) # pairs. the first element of the pair is used to create a dtype instance, and the second # element is the json serialization of the scalar that we want to round-trip. - scalar_v2_params: ClassVar[tuple[tuple[Any, Any], ...]] = () + scalar_v2_params: ClassVar[tuple[tuple[ZDType[Any, Any], Any], ...]] = () scalar_v3_params: ClassVar[tuple[tuple[Any, Any], ...]] = () - cast_value_params: ClassVar[tuple[tuple[Any, Any, Any], ...]] - item_size_params: ClassVar[tuple[ZDType[Any, Any], ...]] + cast_value_params: ClassVar[tuple[tuple[ZDType[Any, Any], Any, Any], ...]] = () + # Some data types, like bool and string, can consume any python object as a scalar. 
+ # So we allow passing None in to this test to indicate that it should be skipped. + invalid_scalar_params: ClassVar[tuple[tuple[ZDType[Any, Any], Any], ...] | tuple[None]] = () + item_size_params: ClassVar[tuple[ZDType[Any, Any], ...]] = () def json_scalar_equals(self, scalar1: object, scalar2: object) -> bool: # An equality check for json-encoded scalars. This defaults to regular equality, @@ -99,25 +106,22 @@ def scalar_equals(self, scalar1: object, scalar2: object) -> bool: return scalar1 == scalar2 def test_check_dtype_valid(self, valid_dtype: TBaseDType) -> None: - assert self.test_cls.check_native_dtype(valid_dtype) + assert self.test_cls._check_native_dtype(valid_dtype) def test_check_dtype_invalid(self, invalid_dtype: object) -> None: - assert not self.test_cls.check_native_dtype(invalid_dtype) # type: ignore[arg-type] + assert not self.test_cls._check_native_dtype(invalid_dtype) # type: ignore[arg-type] def test_from_dtype_roundtrip(self, valid_dtype: Any) -> None: zdtype = self.test_cls.from_native_dtype(valid_dtype) assert zdtype.to_native_dtype() == valid_dtype - def test_from_json_roundtrip_v2(self, valid_json_v2: V2JsonTestParams) -> None: - zdtype = self.test_cls.from_json_v2( - valid_json_v2.dtype, # type: ignore[arg-type] - object_codec_id=valid_json_v2.object_codec_id, - ) - assert zdtype.to_json(zarr_format=2) == valid_json_v2.dtype + def test_from_json_roundtrip_v2(self, valid_json_v2: DTypeSpec_V2) -> None: + zdtype = self.test_cls.from_json(valid_json_v2, zarr_format=2) + assert zdtype.to_json(zarr_format=2) == valid_json_v2 @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") - def test_from_json_roundtrip_v3(self, valid_json_v3: Any) -> None: - zdtype = self.test_cls.from_json_v3(valid_json_v3) + def test_from_json_roundtrip_v3(self, valid_json_v3: DTypeSpec_V3) -> None: + zdtype = self.test_cls.from_json(valid_json_v3, zarr_format=3) assert zdtype.to_json(zarr_format=3) == valid_json_v3 def 
test_scalar_roundtrip_v2(self, scalar_v2_params: tuple[ZDType[Any, Any], Any]) -> None: @@ -134,6 +138,21 @@ def test_cast_value(self, cast_value_params: tuple[ZDType[Any, Any], Any, Any]) zdtype, value, expected = cast_value_params observed = zdtype.cast_scalar(value) assert self.scalar_equals(expected, observed) + # check that casting is idempotent + assert self.scalar_equals(zdtype.cast_scalar(observed), observed) + + def test_invalid_scalar( + self, invalid_scalar_params: tuple[ZDType[Any, Any], Any] | None + ) -> None: + if invalid_scalar_params is None: + pytest.skip(f"No test data provided for {self}.{__name__}") + zdtype, data = invalid_scalar_params + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {zdtype}." + ) + with pytest.raises(TypeError, match=re.escape(msg)): + zdtype.cast_scalar(data) def test_item_size(self, item_size_params: ZDType[Any, Any]) -> None: """ @@ -143,4 +162,4 @@ def test_item_size(self, item_size_params: ZDType[Any, Any]) -> None: if isinstance(item_size_params, HasItemSize): assert item_size_params.item_size == item_size_params.to_native_dtype().itemsize else: - pytest.skip(f"Dtype {item_size_params} does not implement HasItemSize") + pytest.skip(f"Data type {item_size_params} does not implement HasItemSize") diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py index c4225874a4..95ede9e1d7 100644 --- a/tests/test_dtype_registry.py +++ b/tests/test_dtype_registry.py @@ -21,12 +21,12 @@ Int16, TBaseDType, TBaseScalar, + VariableLengthUTF8, ZDType, data_type_registry, - get_data_type_from_json_v3, + get_data_type_from_json, parse_data_type, ) -from zarr.core.dtype.common import HasObjectCodec if TYPE_CHECKING: from collections.abc import Generator @@ -85,14 +85,14 @@ def test_unregistered_dtype(data_type_registry_fixture: DataTypeRegistry) -> Non """ Test that match_dtype raises an error if the dtype is not registered. 
""" - outside_dtype = "int8" - with pytest.raises( - ValueError, match=f"No data type wrapper found that matches dtype '{outside_dtype}'" - ): - data_type_registry_fixture.match_dtype(np.dtype(outside_dtype)) + outside_dtype_name = "int8" + outside_dtype = np.dtype(outside_dtype_name) + msg = f"No Zarr data type found that matches dtype '{outside_dtype!r}'" + with pytest.raises(ValueError, match=re.escape(msg)): + data_type_registry_fixture.match_dtype(outside_dtype) with pytest.raises(KeyError): - data_type_registry_fixture.get(outside_dtype) + data_type_registry_fixture.get(outside_dtype_name) @staticmethod @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @@ -110,23 +110,12 @@ def test_registered_dtypes_match_dtype(zdtype: ZDType[TBaseDType, TBaseScalar]) def test_registered_dtypes_match_json( zdtype: ZDType[TBaseDType, TBaseScalar], zarr_format: ZarrFormat ) -> None: - if zarr_format == 2: - if isinstance(zdtype, HasObjectCodec): - object_codec_id = zdtype.object_codec_id - else: - object_codec_id = None - assert ( - data_type_registry.match_json_v2( - zdtype.to_json(zarr_format=zarr_format), # type: ignore[arg-type] - object_codec_id=object_codec_id, - ) - == zdtype - ) - else: - skip_object_dtype(zdtype) - assert ( - data_type_registry.match_json_v3(zdtype.to_json(zarr_format=zarr_format)) == zdtype # type: ignore[arg-type] + assert ( + data_type_registry.match_json( + zdtype.to_json(zarr_format=zarr_format), zarr_format=zarr_format ) + == zdtype + ) @staticmethod @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @@ -148,14 +137,14 @@ def test_match_dtype_unique( dtype_instance = zdtype.to_native_dtype() - msg = f"No data type wrapper found that matches dtype '{dtype_instance}'" + msg = f"No Zarr data type found that matches dtype '{dtype_instance!r}'" with pytest.raises(ValueError, match=re.escape(msg)): data_type_registry_fixture.match_dtype(dtype_instance) instance_dict = 
zdtype.to_json(zarr_format=zarr_format) - msg = f"No data type wrapper found that matches {instance_dict}" + msg = f"No Zarr data type found that matches {instance_dict!r}" with pytest.raises(ValueError, match=re.escape(msg)): - data_type_registry_fixture.match_json_v3(instance_dict) # type: ignore[arg-type] + data_type_registry_fixture.match_json(instance_dict, zarr_format=zarr_format) # this is copied from the registry tests -- we should deduplicate @@ -178,16 +167,18 @@ def set_path() -> Generator[None, None, None]: def test_entrypoint_dtype(zarr_format: ZarrFormat) -> None: from package_with_entrypoint import TestDataType - data_type_registry.lazy_load() + data_type_registry._lazy_load() instance = TestDataType() dtype_json = instance.to_json(zarr_format=zarr_format) - assert get_data_type_from_json_v3(dtype_json) == instance + assert get_data_type_from_json(dtype_json, zarr_format=zarr_format) == instance data_type_registry.unregister(TestDataType._zarr_v3_name) @pytest.mark.parametrize( ("dtype_params", "expected", "zarr_format"), [ + ("str", VariableLengthUTF8(), 2), + ("str", VariableLengthUTF8(), 3), ("int8", Int8(), 3), (Int8(), Int8(), 3), (">i2", Int16(endianness="big"), 2), diff --git a/tests/test_examples.py b/tests/test_examples.py index 620a82da59..8b5705c317 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -73,7 +73,9 @@ def test_scripts_can_run(script_path: Path, tmp_path: Path) -> None: # This allows the example to be useful to users who don't have Zarr installed, but also testable. # --refresh ensures that uv doesn't use a cached build of our local package resave_script(script_path, dest_path) - result = subprocess.run(["uv", "run", "--refresh", str(dest_path)], capture_output=True, text=True) + result = subprocess.run( + ["uv", "run", "--refresh", str(dest_path)], capture_output=True, text=True + ) assert result.returncode == 0, ( f"Script at {script_path} failed to run. 
Output: {result.stdout} Error: {result.stderr}" ) diff --git a/tests/test_group.py b/tests/test_group.py index c522406db1..e6d382321c 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -23,6 +23,7 @@ from zarr.core.array import default_compressor_v2, default_compressors_v3, default_serializer_v3 from zarr.core.buffer import default_buffer_prototype from zarr.core.config import config as zarr_config +from zarr.core.dtype.common import unpack_dtype_json from zarr.core.dtype.npy.int import UInt8 from zarr.core.group import ( ConsolidatedMetadata, @@ -516,7 +517,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat metadata = { "subarray": { "attributes": {}, - "dtype": dtype.to_json(zarr_format=zarr_format), + "dtype": unpack_dtype_json(dtype.to_json(zarr_format=zarr_format)), "fill_value": fill_value, "shape": (1,), "chunks": (1,), @@ -555,7 +556,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat for c in default_compressors_v3(dtype) ], ), - "data_type": dtype.to_json(zarr_format=zarr_format), + "data_type": unpack_dtype_json(dtype.to_json(zarr_format=zarr_format)), "fill_value": fill_value, "node_type": "array", "shape": (1,), diff --git a/tests/test_gzip.py b/tests/test_gzip.py index a72210f5a9..ae7b68be5d 100644 --- a/tests/test_gzip.py +++ b/tests/test_gzip.py @@ -4,13 +4,12 @@ import zarr from zarr.codecs import GzipCodec -from zarr.core.common import ZarrFormat @pytest.mark.parametrize("zarr_format", [2, 3]) -def test_gzip_compression(zarr_format): +def test_gzip_compression(zarr_format) -> None: store = {} - arr_in = zarr.create_array( + zarr.create_array( store=store, dtype="int", shape=(1,), @@ -24,4 +23,4 @@ def test_gzip_compression(zarr_format): else: print(json.dumps(json.loads(store["zarr.json"].to_bytes()), indent=2)) - arr_out = zarr.open_array(store=store, zarr_format=zarr_format) + zarr.open_array(store=store, zarr_format=zarr_format) diff --git 
a/tests/test_image_codecs.py b/tests/test_image_codecs.py index c4017b65de..3372ac9bdf 100644 --- a/tests/test_image_codecs.py +++ b/tests/test_image_codecs.py @@ -14,4 +14,4 @@ z_r = zarr.open_array(store=store, zarr_format=3) -print(z_r.metadata.to_dict()["codecs"]) \ No newline at end of file +print(z_r.metadata.to_dict()["codecs"]) diff --git a/tests/test_info.py b/tests/test_info.py index b45828c2cd..08f2318dc2 100644 --- a/tests/test_info.py +++ b/tests/test_info.py @@ -54,6 +54,7 @@ def test_array_info(zarr_format: ZarrFormat) -> None: info = ArrayInfo( _zarr_format=zarr_format, _data_type=Int32(), + _fill_value=0, _shape=(100, 100), _chunk_shape=(10, 100), _order="C", @@ -66,6 +67,7 @@ def test_array_info(zarr_format: ZarrFormat) -> None: Type : Array Zarr format : {zarr_format} Data type : Int32(endianness='little') + Fill value : 0 Shape : (100, 100) Chunk shape : (10, 100) Order : C @@ -92,6 +94,7 @@ def test_array_info_complete( info = ArrayInfo( _zarr_format=zarr_format, _data_type=Int32(), + _fill_value=0, _shape=(100, 100), _chunk_shape=(10, 100), _order="C", @@ -107,6 +110,7 @@ def test_array_info_complete( Type : Array Zarr format : {zarr_format} Data type : Int32(endianness='little') + Fill value : 0 Shape : (100, 100) Chunk shape : (10, 100) Order : C diff --git a/tests/test_metadata/test_v2.py b/tests/test_metadata/test_v2.py index 88fb107433..a2894529aa 100644 --- a/tests/test_metadata/test_v2.py +++ b/tests/test_metadata/test_v2.py @@ -3,6 +3,7 @@ import json from typing import TYPE_CHECKING, Literal +import numpy as np import pytest import zarr.api.asynchronous diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 68f53ded5f..4f385afa6d 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -20,7 +20,7 @@ parse_dimension_names, parse_zarr_format, ) -from zarr.errors import MetadataValidationError +from zarr.errors import MetadataValidationError, NodeTypeValidationError if 
TYPE_CHECKING: from collections.abc import Sequence diff --git a/tests/test_properties.py b/tests/test_properties.py index c752721108..27f847fa69 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -75,10 +75,11 @@ def deep_equal(a: Any, b: Any) -> bool: return a == b -@given(data=st.data(), zarr_format=zarr_formats) -def test_array_roundtrip(data: st.DataObject, zarr_format: int) -> None: - nparray = data.draw(numpy_arrays(zarr_formats=st.just(zarr_format))) - zarray = data.draw(arrays(arrays=st.just(nparray), zarr_formats=st.just(zarr_format))) +@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") +@given(data=st.data()) +def test_array_roundtrip(data: st.DataObject) -> None: + nparray = data.draw(numpy_arrays()) + zarray = data.draw(arrays(arrays=st.just(nparray))) assert_array_equal(nparray, zarray[:]) diff --git a/tests/test_regression/test_regression.py b/tests/test_regression/test_regression.py deleted file mode 100644 index a1d13510c3..0000000000 --- a/tests/test_regression/test_regression.py +++ /dev/null @@ -1,144 +0,0 @@ -import subprocess -from dataclasses import dataclass -from itertools import product -from pathlib import Path -from typing import TYPE_CHECKING - -import numcodecs -import numpy as np -import pytest -from numcodecs import LZ4, LZMA, Blosc, GZip, VLenBytes, VLenUTF8, Zstd - -import zarr -from zarr.core.array import Array -from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding -from zarr.core.dtype.npy.string import VariableLengthString -from zarr.core.dtype.npy.vlen_bytes import VariableLengthBytes -from zarr.storage import LocalStore - -if TYPE_CHECKING: - from zarr.core.dtype import ZDTypeLike - - -def runner_installed() -> bool: - """ - Check if a PEP-723 compliant python script runner is installed. 
- """ - try: - subprocess.check_output(["uv", "--version"]) - return True # noqa: TRY300 - except FileNotFoundError: - return False - - -@dataclass(kw_only=True) -class ArrayParams: - values: np.ndarray[tuple[int], np.dtype[np.generic]] - fill_value: np.generic | str | int | bytes - filters: tuple[numcodecs.abc.Codec, ...] = () - compressor: numcodecs.abc.Codec - - -basic_codecs = GZip(), Blosc(), LZ4(), LZMA(), Zstd() -basic_dtypes = "|b", ">i2", ">i4", ">f4", ">f8", "c8", "c16", "M8[10us]", "m8[4ps]" -string_dtypes = ">S1", "U4" - -basic_array_cases = [ - ArrayParams(values=np.arange(4, dtype=dtype), fill_value=1, compressor=codec) - for codec, dtype in product(basic_codecs, basic_dtypes) -] -datetime_array_cases = [ - ArrayParams(values=np.ones((4,), dtype=dtype), fill_value=1, compressor=codec) - for codec, dtype in product(basic_codecs, datetime_dtypes) -] -string_array_cases = [ - ArrayParams( - values=np.array(["aaaa", "bbbb", "ccccc", "dddd"], dtype=dtype), - fill_value="foo", - compressor=codec, - ) - for codec, dtype in product(basic_codecs, string_dtypes) -] -vlen_string_cases = [ - ArrayParams( - values=np.array(["a", "bb", "ccc", "dddd"], dtype="O"), - fill_value="1", - filters=(VLenUTF8(),), - compressor=GZip(), - ) -] -vlen_bytes_cases = [ - ArrayParams( - values=np.array([b"a", b"bb", b"ccc", b"dddd"], dtype="O"), - fill_value=b"1", - filters=(VLenBytes(),), - compressor=GZip(), - ) -] -array_cases = ( - basic_array_cases - + datetime_array_cases - + string_array_cases - + vlen_string_cases - + vlen_bytes_cases -) - - -@pytest.fixture -def source_array(tmp_path: Path, request: pytest.FixtureRequest) -> Array: - dest = tmp_path / "in" - store = LocalStore(dest) - array_params: ArrayParams = request.param - compressor = array_params.compressor - chunk_key_encoding = V2ChunkKeyEncoding(separator="/") - dtype: ZDTypeLike - if array_params.values.dtype == np.dtype("|O") and array_params.filters == (VLenUTF8(),): - dtype = VariableLengthString() # type: 
ignore[assignment] - elif array_params.values.dtype == np.dtype("|O") and array_params.filters == (VLenBytes(),): - dtype = VariableLengthBytes() - else: - dtype = array_params.values.dtype - z = zarr.create_array( - store, - shape=array_params.values.shape, - dtype=dtype, - chunks=array_params.values.shape, - compressors=compressor, - filters=array_params.filters, - fill_value=array_params.fill_value, - order="C", - chunk_key_encoding=chunk_key_encoding, - write_data=True, - zarr_format=2, - ) - z[:] = array_params.values - return z - - -# TODO: make this dynamic based on the installed scripts -script_paths = [Path(__file__).resolve().parent / "scripts" / "v2.18.py"] - - -@pytest.mark.skipif(not runner_installed(), reason="no python script runner installed") -@pytest.mark.parametrize( - "source_array", array_cases, indirect=True, ids=tuple(map(str, array_cases)) -) -@pytest.mark.parametrize("script_path", script_paths) -def test_roundtrip(source_array: Array, tmp_path: Path, script_path: Path) -> None: - out_path = tmp_path / "out" - copy_op = subprocess.run( - [ - "uv", - "run", - script_path, - str(source_array.store).removeprefix("file://"), - str(out_path), - ], - capture_output=True, - text=True, - ) - assert copy_op.returncode == 0 - out_array = zarr.open_array(store=out_path, mode="r", zarr_format=2) - assert source_array.metadata.to_dict() == out_array.metadata.to_dict() - assert np.array_equal(source_array[:], out_array[:]) diff --git a/tests/test_v2.py b/tests/test_v2.py index 392ebc2d69..4d17305995 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -16,8 +16,10 @@ from zarr import config from zarr.abc.store import Store from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.dtype import FixedLengthASCII, FixedLengthUTF32, Structured, VariableLengthString +from zarr.core.dtype import FixedLengthUTF32, Structured, VariableLengthUTF8 +from zarr.core.dtype.npy.bytes import NullTerminatedBytes from zarr.core.dtype.wrapper import 
ZDType +from zarr.core.group import Group from zarr.core.sync import sync from zarr.storage import MemoryStore, StorePath @@ -68,35 +70,31 @@ def test_codec_pipeline() -> None: ("|V10", "|V10", b"X", "WAAAAAAAAAAAAA=="), ], ) -async def test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_json) -> None: - with config.set( - { - "array.v2_default_filters.bytes": [{"id": "vlen-bytes"}], - "array.v2_default_compressor.bytes": None, - } - ): - store = zarr.storage.MemoryStore() - g = zarr.group(store=store, zarr_format=2) - g.create_array( - name="foo", shape=(3,), chunks=(3,), dtype=dtype, fill_value=fill_value, compressor=None - ) +async def test_v2_encode_decode( + dtype: str, expected_dtype: str, fill_value: bytes, fill_value_json: str +) -> None: + store = zarr.storage.MemoryStore() + g = zarr.group(store=store, zarr_format=2) + g.create_array( + name="foo", shape=(3,), chunks=(3,), dtype=dtype, fill_value=fill_value, compressor=None + ) result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype()) assert result is not None - serialized = json.loads(result.to_bytes()) - expected = { - "chunks": [3], - "compressor": None, - "dtype": expected_dtype, - "fill_value": fill_value_json, - "filters": None, - "order": "C", - "shape": [3], - "zarr_format": 2, - "dimension_separator": ".", - } - assert serialized == expected + serialized = json.loads(result.to_bytes()) + expected = { + "chunks": [3], + "compressor": None, + "dtype": expected_dtype, + "fill_value": fill_value_json, + "filters": None, + "order": "C", + "shape": [3], + "zarr_format": 2, + "dimension_separator": ".", + } + assert serialized == expected data = zarr.open_array(store=store, path="foo")[:] np.testing.assert_equal(data, np.full((3,), b"X", dtype=dtype)) @@ -108,12 +106,12 @@ async def test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_js @pytest.mark.parametrize( ("dtype", "value"), [ - (FixedLengthASCII(length=1), b"Y"), + 
(NullTerminatedBytes(length=1), b"Y"), (FixedLengthUTF32(length=1), "Y"), - (VariableLengthString(), "Y"), + (VariableLengthUTF8(), "Y"), ], ) -def test_v2_encode_decode_with_data(dtype: ZDType[Any, Any], value: str): +def test_v2_encode_decode_with_data(dtype: ZDType[Any, Any], value: str) -> None: expected = np.full((3,), value, dtype=dtype.to_native_dtype()) a = zarr.create( shape=(3,), @@ -229,7 +227,7 @@ def test_v2_non_contiguous(numpy_order: Literal["C", "F"], zarr_order: Literal[" def test_default_compressor_deprecation_warning() -> None: with pytest.warns(DeprecationWarning, match="default_compressor is deprecated"): - zarr.storage.default_compressor = "zarr.codecs.zstd.ZstdCodec()" + zarr.storage.default_compressor = "zarr.codecs.zstd.ZstdCodec()" # type: ignore[attr-defined] @pytest.mark.parametrize("fill_value", [None, (b"", 0, 0.0)], ids=["no_fill", "fill"]) diff --git a/tests/test_x.py b/tests/test_x.py deleted file mode 100644 index f38d5e4e00..0000000000 --- a/tests/test_x.py +++ /dev/null @@ -1,4 +0,0 @@ -from zarr.registry import get_codec - -def test(): - c = get_codec('gzip', {"level": 1}) From a2bc6555976b57579dfcf605b44ac3abb80237c5 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 21 Jul 2025 22:35:15 +0200 Subject: [PATCH 128/129] remove off-target changes --- src/zarr/core/array_spec.py | 2 +- src/zarr/core/buffer/core.py | 2 +- src/zarr/core/chunk_grids.py | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index 5d4321da82..279bf6edf0 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -63,7 +63,7 @@ def from_dict(cls, data: ArrayConfigParams) -> Self: """ kwargs_out: ArrayConfigParams = {} for f in fields(ArrayConfig): - field_name = cast(Literal["order", "write_empty_chunks"], f.name) + field_name = cast("Literal['order', 'write_empty_chunks']", f.name) if field_name not in data: kwargs_out[field_name] = 
zarr_config.get(f"array.{field_name}") else: diff --git a/src/zarr/core/buffer/core.py b/src/zarr/core/buffer/core.py index a8577b282b..19125b838f 100644 --- a/src/zarr/core/buffer/core.py +++ b/src/zarr/core/buffer/core.py @@ -475,7 +475,7 @@ def as_scalar(self) -> ScalarType: """Returns the buffer as a scalar value""" if self._data.size != 1: raise ValueError("Buffer does not contain a single scalar value") - return cast(ScalarType, self.as_numpy_array()[()]) + return cast("ScalarType", self.as_numpy_array()[()]) @property def dtype(self) -> np.dtype[Any]: diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 6701aca182..4bf03c89de 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -63,7 +63,10 @@ def _guess_chunks( """ if isinstance(shape, int): shape = (shape,) - typesize = max(typesize, 1) + + if typesize == 0: + return shape + ndims = len(shape) # require chunks to have non-zero length for all dimensions chunks = np.maximum(np.array(shape, dtype="=f8"), 1) From 50c6b483b04ca37982a78f484917ba0f80596422 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 22 Jul 2025 11:24:35 +0200 Subject: [PATCH 129/129] update imagecodecs example --- examples/image_codecs.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/image_codecs.py b/examples/image_codecs.py index 3a2f8e2c26..2020e49e3b 100644 --- a/examples/image_codecs.py +++ b/examples/image_codecs.py @@ -1,7 +1,7 @@ # /// script # requires-python = ">=3.11" # dependencies = [ -# "zarr @ file:///home/bennettd/dev/zarr-python/", +# "zarr @ git+https://github.com/d-v-b/zarr-python.git@a2bc6555", # "imagecodecs==2025.3.30", # "pytest" # ] @@ -25,21 +25,22 @@ def test(zarr_format: Literal[2, 3]) -> None: store = {} if zarr_format == 2: - zarr.create_array( + z_w = zarr.create_array( store=store, data=np.zeros((100, 100, 3), dtype=np.uint8), compressors=jpg_codec, zarr_format=zarr_format, ) else: - zarr.create_array( + z_w = 
zarr.create_array( store=store, data=np.zeros((100, 100, 3), dtype=np.uint8), serializer=jpg_codec, zarr_format=zarr_format, ) - + z_w[:] = 2 z_r = zarr.open_array(store=store, zarr_format=zarr_format) + assert np.all(z_r[:] == 2) if zarr_format == 2: print(z_r.metadata.to_dict()["compressor"]) else: