From c1d155037df4dcf318b7f800a8cd1e1e9be721b2 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 19 Jun 2025 18:46:29 +0200 Subject: [PATCH 01/42] add int2 example, and expand dtype docs --- docs/user-guide/data_types.rst | 95 +++++++++++++++++++ examples/custom_dtype.py | 167 +++++++++++++++++++++++++++++++++ 2 files changed, 262 insertions(+) create mode 100644 examples/custom_dtype.py diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 87c8efc1f5..33aa09374b 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -170,3 +170,98 @@ Deserialize a scalar value from JSON: >>> scalar_value = int8.from_json_scalar(42, zarr_format=3) >>> assert scalar_value == np.int8(42) + +Adding new data types +~~~~~~~~~~~~~~~~~~~~~ + +Each Zarr data type is a separate Python class that inherits from +`ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_. You can define a custom data type by +writing your own subclass of `ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_ and adding +your data type to the data type registry. A complete example of this process is included below. + +The source code for this example can be found in the ``examples/custom_dtype.py`` file in the Zarr +Python project directory. + +.. literalinclude:: ../../examples/custom_dtype.py + :language: python + + +Data type resolution +~~~~~~~~~~~~~~~~~~~~ + +Although Zarr Python uses a different data type model from NumPy, you can still define a Zarr array +with a NumPy data type object: + +.. code-block:: python + + >>> from zarr import create_array + >>> import numpy as np + >>> a = create_array({}, shape=(10,), dtype=np.dtype('int')) + >>> a + + +Or a string representation of a NumPy data type: + +.. code-block:: python + + >>> a = create_array({}, shape=(10,), dtype='>> a + + +The ``Array`` object presents itself like a NumPy array, including exposing a NumPy +data type as its ``dtype`` attribute: + +.. code-block:: python + + >>> type(a.dtype) + + +But if we inspect the metadata for the array, we can see the Zarr data type object: + +.. code-block:: python + + >>> type(a.metadata.data_type) + + +This example illustrates a general problem Zarr Python has to solve -- how can we allow users to +specify a data type as a string, or a NumPy ``dtype`` object, and produce the right Zarr data type +from that input? We call this process "data type resolution". Zarr Python also performs data type +resolution when reading stored arrays, although in this case the input is a ``JSON`` value instead +of a NumPy data type. + +For simple data types like ``int`` the solution could be extremely simple: just +maintain a lookup table that relates a NumPy data type to the Zarr data type equivalent. But not all +data types are so simple. Consider this case: + +.. code-block:: python + + >>> from zarr import create_array + >>> import numpy as np + >>> a = create_array({}, shape=(10,), dtype=[('a', np.dtype('float')), ('b', 'i8')]) + >>> a.dtype # this is the NumPy data type + dtype([('a', '>> a.metadata.data_type # this is the Zarr data type + Structured(fields=(('a', Float64(endianness='little')), ('b', Int64(endianness='little')))) + +In this example, we created a +`NumPy structured data type `_. +This data type is a container that can contain any NumPy data type, which makes it recursive. It is +not possible to make a lookup table that relates all NumPy structured data types to their Zarr +equivalents, as there is a nearly unbounded number of different structured data types. 
So instead of +a static lookup table, Zarr Python relies on a dynamic approach to data type resolution. + +Zarr Python defines a collection of Zarr data types. This collection, called a "data type registry", +is essentially a dict where the keys are strings (a canonical name for each data type), and the values are +the data type classes themselves. Dynamic data type resolution entails iterating over these data +type classes, invoking a special class constructor defined on each one, and returning a concrete +data type instance if and only if exactly 1 of those constructor invocations was successful. + +In plain language, we take some user input (a NumPy array), offer it to all the known data type +classes, and return an instance of the one data type class that could accept that user input. + +We want to avoid a situation where the same NumPy data type matches multiple Zarr data types. I.e., +a NumPy data type should uniquely specify a single Zarr data type. But data type resolution is +dynamic, so it's not possible to guarantee this uniqueness constraint. So we attempt data type +resolution against every data type class, and if for some reason a NumPy data type matches multiple +Zarr data types, we treat this as an error and raise an exception. + diff --git a/examples/custom_dtype.py b/examples/custom_dtype.py new file mode 100644 index 0000000000..8d10bc299f --- /dev/null +++ b/examples/custom_dtype.py @@ -0,0 +1,167 @@ +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "zarr@git+https://github.com/zarr-developers/zarr-python.git@main", +# "ml_dtypes==0.5.1", +# "pytest==8.4.1" +# ] +# /// +# + +""" +Demonstrate how to extend Zarr Python by defining a new data type +""" + +import json +import sys +from pathlib import Path +from typing import ClassVar, Literal, Self, TypeGuard + +import ml_dtypes # necessary to add extra dtypes to NumPy +import numpy as np +import pytest + +import zarr +from zarr.core.common import JSON, ZarrFormat +from zarr.core.dtype import ZDType, data_type_registry +from zarr.core.dtype.common import ( + DataTypeValidationError, + DTypeConfig_V2, + DTypeJSON, + check_dtype_spec_v2, +) + +int2_dtype_cls = type(np.dtype("int2")) +int2_scalar_cls = ml_dtypes.int2 + + +class Int2(ZDType[int2_dtype_cls, int2_scalar_cls]): + """ + This class provides a Zarr compatibility layer around the int2 data type and the int2 + scalar type. + """ + + # This field is as the key for the data type in the internal data type registry, and also + # as the identifier for the data type when serializaing the data type to disk for zarr v3 + _zarr_v3_name: ClassVar[Literal["int2"]] = "int2" + # this field will be used internally + _zarr_v2_name: ClassVar[Literal["int2"]] = "int2" + + # we bind a class variable to the native data type class so we can create instances of it + dtype_cls = int2_dtype_cls + + @classmethod + def from_native_dtype(cls, dtype: np.dtype) -> Self: + """Create an instance of this ZDType from a native dtype.""" + if cls._check_native_dtype(dtype): + return cls() + raise DataTypeValidationError( + f"Invalid data type: {dtype}. 
Expected an instance of {cls.dtype_cls}" + ) + + def to_native_dtype(self: Self) -> int2_dtype_cls: + """Create an int2 dtype instance from this ZDType""" + return self.dtype_cls() + + @classmethod + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[Literal["|b1"], None]]: + """Type check for Zarr v2-flavored JSON""" + return ( + check_dtype_spec_v2(data) and data["name"] == "int2" and data["object_codec_id"] is None + ) + + @classmethod + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["int2"]]: + """Type check for Zarr v3-flavored JSON""" + return data == cls._zarr_v3_name + + @classmethod + def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this ZDType from zarr v3-flavored JSON. + """ + if cls._check_json_v2(data): + return cls() + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_name!r}" + raise DataTypeValidationError(msg) + + @classmethod + def _from_json_v3(cls: type[Self], data: DTypeJSON) -> Self: + """ + Create an instance of this ZDType from zarr v3-flavored JSON. + """ + if cls._check_json_v3(data): + return cls() + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) + + def to_json( + self, zarr_format: ZarrFormat + ) -> DTypeConfig_V2[Literal["int2"], None] | Literal["int2"]: + """Serialize this ZDType to v2- or v3-flavored JSON""" + if zarr_format == 2: + return {"name": "int2", "object_codec_id": None} + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def _check_scalar(self, data: object) -> TypeGuard[int]: + """Check if a python object is a valid scalar""" + return isinstance(data, (int, int2_scalar_cls)) + + def cast_scalar(self, data: object) -> ml_dtypes.int2: + """ + Attempt to cast a python object to an int2. Might fail pending a type check. + """ + if self._check_scalar(data): + return ml_dtypes.int2(data) + msg = f"Cannot convert object with type {type(data)} to a 2-bit integer." + raise TypeError(msg) + + def default_scalar(self) -> ml_dtypes.int2: + """Get the default scalar value""" + return ml_dtypes.int2(0) + + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: + """Convert a python object to a scalar.""" + return int(data) + + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> ml_dtypes.int2: + """ + Read a JSON-serializable value as a scalar. The base definition of this method + requires that it take a zarr_format parameter, because some data types serialize scalars + differently in zarr v2 and v3 + """ + if self._check_scalar(data): + return ml_dtypes.int2(data) + raise TypeError(f"Invalid type: {data}. 
Expected an int.") + + +# after defining dtype class, it must be registered with the data type registry so zarr can use it +data_type_registry.register(Int2._zarr_v3_name, Int2) + + +# this parametrized function will create arrays in zarr v2 and v3 using our new data type +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_custom_dtype(tmp_path: Path, zarr_format: Literal[2, 3]) -> None: + # create array and write values + z_w = zarr.create_array( + store=tmp_path, shape=(4,), dtype="int2", zarr_format=zarr_format, compressors=None + ) + z_w[:] = [-1, -2, 0, 1] + + # open the array + z_r = zarr.open_array(tmp_path, mode="r") + + print(z_r.info_complete()) + + # look at the array metadata + if zarr_format == 2: + meta_file = tmp_path / ".zarray" + else: + meta_file = tmp_path / "zarr.json" + print(json.dumps(json.loads(meta_file.read_text()), indent=2)) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-s", __file__, f"-c {__file__}"])) From 6e4a93810316761c545569de3a801c01bac65414 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 19 Jun 2025 19:06:35 +0200 Subject: [PATCH 02/42] specify zarr with a direct local file reference for the dtype example --- examples/custom_dtype.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/custom_dtype.py b/examples/custom_dtype.py index 8d10bc299f..0e83431ba0 100644 --- a/examples/custom_dtype.py +++ b/examples/custom_dtype.py @@ -1,7 +1,7 @@ # /// script # requires-python = ">=3.11" # dependencies = [ -# "zarr@git+https://github.com/zarr-developers/zarr-python.git@main", +# "zarr @ {root}", # "ml_dtypes==0.5.1", # "pytest==8.4.1" # ] From 8d18eed595c911bec06d3d22ebe15e37546eefdb Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 19 Jun 2025 19:09:44 +0200 Subject: [PATCH 03/42] add comment on pep-723 metadata --- examples/custom_dtype.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/custom_dtype.py b/examples/custom_dtype.py index 0e83431ba0..354fabe850 100644 --- a/examples/custom_dtype.py +++ b/examples/custom_dtype.py @@ -7,6 +7,9 @@ # ] # /// # +# Note: the zarr version must be changed in order to run this outside of the +# zarr source tree. For example, to make this script truly stand-alone, specify the zarr +# dependency as just "zarr" """ Demonstrate how to extend Zarr Python by defining a new data type From bfb20889ae7787d488335fa078da9c4b820658e8 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 19 Jun 2025 19:23:02 +0200 Subject: [PATCH 04/42] ignore future warning in docs --- docs/user-guide/data_types.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 33aa09374b..55057c0bdf 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -236,8 +236,10 @@ data types are so simple. Consider this case: .. 
code-block:: python >>> from zarr import create_array + >>> import warnings >>> import numpy as np - >>> a = create_array({}, shape=(10,), dtype=[('a', np.dtype('float')), ('b', 'i8')]) + >>> warnings.simplefilter("ignore", category=FutureWarning) + >>> a = create_array({}, shape=(10,), dtype=[('a', 'f8'), ('b', 'i8')]) >>> a.dtype # this is the NumPy data type dtype([('a', '>> a.metadata.data_type # this is the Zarr data type From 893540ff8ba18a1ef264c775d9865b2d0b316a35 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 22 Jun 2025 18:38:01 +0200 Subject: [PATCH 05/42] re-export vlen-bytes --- src/zarr/core/dtype/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 735690d4bc..d970d3da8a 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -58,6 +58,7 @@ "UInt16", "UInt32", "UInt64", + "VariableLengthBytes", "VariableLengthUTF8", "ZDType", "data_type_registry", From 15ebfa6536bf8b8d4126e07ad7ffb910a4517dc3 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 22 Jun 2025 18:46:42 +0200 Subject: [PATCH 06/42] make examples stand-alone and testable via script dependency modification at test time --- examples/custom_dtype.py | 111 ++++++++++++++++++++++++++++++++------- pyproject.toml | 3 ++ tests/test_examples.py | 74 ++++++++++++++++++++++++++ 3 files changed, 170 insertions(+), 18 deletions(-) create mode 100644 tests/test_examples.py diff --git a/examples/custom_dtype.py b/examples/custom_dtype.py index 354fabe850..0534f38685 100644 --- a/examples/custom_dtype.py +++ b/examples/custom_dtype.py @@ -1,7 +1,7 @@ # /// script # requires-python = ">=3.11" # dependencies = [ -# "zarr @ {root}", +# "zarr @ git+https://github.com/zarr-developers/zarr-python.git@main", # "ml_dtypes==0.5.1", # "pytest==8.4.1" # ] @@ -18,7 +18,7 @@ import json import sys from pathlib import Path -from typing import ClassVar, Literal, Self, TypeGuard +from typing import ClassVar, Literal, Self, TypeGuard, overload import ml_dtypes # necessary to add extra dtypes to NumPy import numpy as np @@ -34,14 +34,17 @@ check_dtype_spec_v2, ) +# This is the int2 array data type int2_dtype_cls = type(np.dtype("int2")) + +# This is the int2 scalar type int2_scalar_cls = ml_dtypes.int2 class Int2(ZDType[int2_dtype_cls, int2_scalar_cls]): """ - This class provides a Zarr compatibility layer around the int2 data type and the int2 - scalar type. + This class provides a Zarr compatibility layer around the int2 data type (the ``dtype`` of a + NumPy array of type int2) and the int2 scalar type (the ``dtype`` of the scalar value inside an int2 array). """ # This field is as the key for the data type in the internal data type registry, and also @@ -68,53 +71,104 @@ def to_native_dtype(self: Self) -> int2_dtype_cls: @classmethod def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[Literal["|b1"], None]]: - """Type check for Zarr v2-flavored JSON""" + """ + Type check for Zarr v2-flavored JSON. + + This will check that the input is a dict like this: + .. code-block:: json + + { + "name": "int2", + "object_codec_id": None + } + + Note that this representation differs from the ``dtype`` field looks like in zarr v2 metadata. + Specifically, whatever goes into the ``dtype`` field in metadata is assigned to the ``name`` field here. + + See the Zarr docs for more information about the JSON encoding for data types. 
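+
+        As a rough sketch of the expected behavior (assuming the class defined in this
+        example script), the check accepts the dict form shown above and rejects other names:
+
+        >>> Int2._check_json_v2({"name": "int2", "object_codec_id": None})
+        True
+        >>> Int2._check_json_v2({"name": "|i1", "object_codec_id": None})
+        False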
+ """ return ( check_dtype_spec_v2(data) and data["name"] == "int2" and data["object_codec_id"] is None ) @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["int2"]]: - """Type check for Zarr v3-flavored JSON""" + """ + Type check for Zarr V3-flavored JSON. + + Checks that the input is the string "int2". + """ return data == cls._zarr_v3_name @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: """ - Create an instance of this ZDType from zarr v3-flavored JSON. + Create an instance of this ZDType from Zarr V3-flavored JSON. """ if cls._check_json_v2(data): return cls() + # This first does a type check on the input, and if that passes we create an instance of the ZDType. msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_name!r}" raise DataTypeValidationError(msg) @classmethod def _from_json_v3(cls: type[Self], data: DTypeJSON) -> Self: """ - Create an instance of this ZDType from zarr v3-flavored JSON. + Create an instance of this ZDType from Zarr V3-flavored JSON. + + This first does a type check on the input, and if that passes we create an instance of the ZDType. """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) + @overload # type: ignore[override] + def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["int2"], None]: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> Literal["int2"]: ... + def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[Literal["int2"], None] | Literal["int2"]: - """Serialize this ZDType to v2- or v3-flavored JSON""" + """ + Serialize this ZDType to v2- or v3-flavored JSON + + If the zarr_format is 2, then return a dict like this: + .. code-block:: json + + { + "name": "int2", + "object_codec_id": None + } + + If the zarr_format is 3, then return the string "int2" + + """ if zarr_format == 2: return {"name": "int2", "object_codec_id": None} elif zarr_format == 3: return self._zarr_v3_name raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def _check_scalar(self, data: object) -> TypeGuard[int]: - """Check if a python object is a valid scalar""" + def _check_scalar(self, data: object) -> TypeGuard[int | ml_dtypes.int2]: + """ + Check if a python object is a valid int2-compatible scalar + + The strictness of this type check is an implementation degree of freedom. + You could be strict here, and only accept int2 values, or be open and accept any integer + or any object and rely on exceptions from the int2 constructor that will be called in + cast_scalar. + """ return isinstance(data, (int, int2_scalar_cls)) def cast_scalar(self, data: object) -> ml_dtypes.int2: """ - Attempt to cast a python object to an int2. Might fail pending a type check. + Attempt to cast a python object to an int2. + + We first perform a type check to ensure that the input type is appropriate, and if that + passes we call the int2 scalar constructor. """ if self._check_scalar(data): return ml_dtypes.int2(data) @@ -122,18 +176,35 @@ def cast_scalar(self, data: object) -> ml_dtypes.int2: raise TypeError(msg) def default_scalar(self) -> ml_dtypes.int2: - """Get the default scalar value""" + """ + Get the default scalar value. This will be used when automatically selecting a fill value. 
+ """ return ml_dtypes.int2(0) def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: - """Convert a python object to a scalar.""" - return int(data) + """ + Convert a python object to a JSON representation of an int2 scalar. + This is necessary for taking user input for the ``fill_value`` attribute in array metadata. + + In this implementation, we optimistically convert the input to an int, + and then check that it lies in the acceptable range for this data type. + """ + # We could add a type check here, but we don't need to for this example + val: int = int(data) # type: ignore[call-overload] + if val not in (-2, -1, 0, 1): + raise ValueError("Invalid value. Expected -2, -1, 0, or 1.") + return val def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> ml_dtypes.int2: """ - Read a JSON-serializable value as a scalar. The base definition of this method - requires that it take a zarr_format parameter, because some data types serialize scalars - differently in zarr v2 and v3 + Read a JSON-serializable value as an int2 scalar. + + We first perform a type check to ensure that the JSON value is well-formed, then call the + int2 scalar constructor. + + The base definition of this method requires that it take a zarr_format parameter because + other data types serialize scalars differently in zarr v2 and v3, but we don't use this here. + """ if self._check_scalar(data): return ml_dtypes.int2(data) @@ -167,4 +238,8 @@ def test_custom_dtype(tmp_path: Path, zarr_format: Literal[2, 3]) -> None: if __name__ == "__main__": + # Run the example with printed output, and a dummy pytest configuration file specified. + # Without the dummy configuration file, at test time pytest will attempt to use the + # configuration file in the project root, which will error because Zarr is using some + # plugins that are not installed in this example. sys.exit(pytest.main(["-s", __file__, f"-c {__file__}"])) diff --git a/pyproject.toml b/pyproject.toml index 6c18563a1f..f34b220c9c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,6 +80,9 @@ test = [ "mypy", "hypothesis", "pytest-xdist", + "packaging", + "tomlkit", + "uv" ] remote_tests = [ 'zarr[remote]', diff --git a/tests/test_examples.py b/tests/test_examples.py new file mode 100644 index 0000000000..8e26785c46 --- /dev/null +++ b/tests/test_examples.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +import re +import subprocess +from pathlib import Path +from typing import Final + +import pytest +import tomlkit +from packaging.requirements import Requirement + +examples_dir = "examples" +script_paths = Path(examples_dir).glob("*.py") + +PEP_723_REGEX: Final = r"(?m)^# /// (?P[a-zA-Z0-9-]+)$\s(?P(^#(| .*)$\s)+)^# ///$" + +# This is the absolute path to the local Zarr installation. Moving this test to a different directory will break it. +ZARR_PROJECT_PATH = Path(".").absolute() + + +def set_dep(script: str, dependency: str) -> str: + """ + Set a dependency in a PEP-723 script header. + If the package is already in the list, it will be replaced. + If the package is not already in the list, it will be added. 
+ + Source code modified from + https://packaging.python.org/en/latest/specifications/inline-script-metadata/#reference-implementation + """ + match = re.search(PEP_723_REGEX, script) + + if match is None: + raise ValueError(f"PEP-723 header not found in {script}") + + content = "".join( + line[2:] if line.startswith("# ") else line[1:] + for line in match.group("content").splitlines(keepends=True) + ) + + config = tomlkit.parse(content) + for idx, dep in enumerate(tuple(config["dependencies"])): + if Requirement(dep).name == Requirement(dependency).name: + config["dependencies"][idx] = dependency + + new_content = "".join( + f"# {line}" if line.strip() else f"#{line}" + for line in tomlkit.dumps(config).splitlines(keepends=True) + ) + + start, end = match.span("content") + return script[:start] + new_content + script[end:] + + +def resave_script(source_path: Path, dest_path: Path) -> None: + """ + Read a script from source_path and save it to dest_path after inserting the absolute path to the + local Zarr project directory in the PEP-723 header. + """ + source_text = source_path.read_text() + dest_text = set_dep(source_text, f"zarr @ file:///{ZARR_PROJECT_PATH}") + dest_path.write_text(dest_text) + + +@pytest.mark.parametrize("script_path", script_paths) +def test_scripts_can_run(script_path: Path, tmp_path: Path) -> None: + dest_path = tmp_path / script_path.name + # We resave the script after inserting the absolute path to the local Zarr project directory, + # and then test its behavior. + # This allows the example to be useful to users who don't have Zarr installed, but also testable. + resave_script(script_path, dest_path) + result = subprocess.run(["uv", "run", str(dest_path)], capture_output=True, text=True) + assert result.returncode == 0, ( + f"Script at {script_path} failed to run. Output: {result.stdout} Error: {result.stderr}" + ) From 383acfc4cf4bb57ad18f8300261cc6cc036ee056 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 23 Jun 2025 14:12:00 +0200 Subject: [PATCH 07/42] docstrings --- docs/conf.py | 4 +- docs/user-guide/data_types.rst | 228 ++++++-- src/zarr/core/dtype/common.py | 14 +- src/zarr/core/dtype/npy/bool.py | 165 +++++- src/zarr/core/dtype/npy/bytes.py | 911 ++++++++++++++++++++++++++++- src/zarr/core/dtype/npy/complex.py | 197 ++++++- src/zarr/core/dtype/npy/float.py | 180 +++++- src/zarr/core/dtype/npy/int.py | 866 +++++++++++++++++++++++++-- src/zarr/core/dtype/npy/string.py | 384 ++++++++++++ src/zarr/core/dtype/npy/time.py | 425 +++++++++++++- src/zarr/core/group.py | 2 +- src/zarr/dtype.py | 60 +- 12 files changed, 3284 insertions(+), 152 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 68bf003ad5..c565c97b54 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -56,7 +56,7 @@ autoapi_member_order = "groupwise" autoapi_root = "api" autoapi_keep_files = True -autoapi_options = [ 'members', 'undoc-members', 'show-inheritance', 'show-module-summary', 'imported-members', ] +autoapi_options = [ 'members', 'undoc-members', 'show-inheritance', 'show-module-summary', 'imported-members', 'inherited-members'] def skip_submodules( app: sphinx.application.Sphinx, @@ -124,7 +124,7 @@ def skip_submodules( # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. 
# This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "talks"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "talks", "api"] # The reST default role (used for this markup: `text`) to use for all # documents. diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 55057c0bdf..eed121a313 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -1,33 +1,54 @@ -Data types -========== +Array Data types +================ Zarr's data type model ---------------------- -Every Zarr array has a "data type", which defines the meaning and physical layout of the -array's elements. As Zarr Python is tightly integrated with `NumPy `_, -it's easy to create arrays with NumPy data types: +Zarr is designed for interperability with NumPy, so if you are familiar with NumPy or any other +N-dimensional array library then Zarr's model for array data types should seem familiar. But Zarr +data types have extra constraints that are important to be aware of. -.. code-block:: python +Zarr arrays operate under an essential design constraint: Unlike NumPy arrays, Zarr arrays +are designed to stored and accessed by other Zarr implementations. This means that, among other things, +Zarr data types must be serializable to metadata documents in accordance with the Zarr specifications, +and this adds some unique texture to the Zarr data type model. - >>> import zarr - >>> import numpy as np - >>> z = zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) - >>> z - +The following sections will explain Zarr's data type model in greater detail, and demonstrate the +Zarr Python APIs for working with Zarr data types. + +Array data types +^^^^^^^^^^^^^^^^ + +Every Zarr array has a data type, which defines the meaning of the array's elements. An array's data +type is encoded in the metadata for the array, which means that a declaration of the +data type of an array must be `JSON`-serializable. + +In Zarr V2, the data type of an array is stored in the ``dtype`` field in array metadata. +Zarr V3 changed the name of this field to ``data_type``, and also defined new rules for the values +assignable to the ``data_type`` field. + +For example, in Zarr V2, the boolean array data type was represented in array metadata as the +string ``"|b1"``. In Zarr V3, the same type is represented as the string ``"bool"``. + +Scalars +^^^^^^^ + +Zarr also defines how array elements, i.e. scalars, are encoded in array metadata. This is necessary +because Zarr uses a field in array metadata to define a default value for chunks that are not stored. +This field, called ``fill_value`` in both Zarr V2 and Zarr V3 metadata documents, contains a +``JSON`` value that can be decoded to a scalar value compatible with the array's data type. + +For the boolean data type, the scalar encoding is simple -- booleans are natively supported by +``JSON``, so zarr saves booleans as ``JSON`` booleans. Other scalars, like floats or raw bytes, have +more elaborate encoding schemes, and in some cases this scheme depends on the Zarr format version. -Unlike NumPy arrays, Zarr arrays are designed to accessed by Zarr -implementations in different programming languages. This means Zarr data types must be interpreted -correctly when clients read an array. Each Zarr data type defines procedures for -encoding and decoding both the data type itself, and scalars from that data type to and from Zarr array metadata. 
And these serialization procedures -depend on the Zarr format. Data types in Zarr version 2 ----------------------------- Version 2 of the Zarr format defined its data types relative to `NumPy's data types `_, -and added a few non-NumPy data types as well. Thus the JSON identifier for a NumPy-compatible data +and added a few non-NumPy data types as well. With one exception, the Zarr V2 JSON identifier for a data type is just the NumPy ``str`` attribute of that data type: .. code-block:: python @@ -38,45 +59,89 @@ type is just the NumPy ``str`` attribute of that data type: >>> >>> store = {} >>> np_dtype = np.dtype('int64') + >>> np_dtype.str + '>> z = zarr.create_array(store=store, shape=(1,), dtype=np_dtype, zarr_format=2) >>> dtype_meta = json.loads(store['.zarray'].to_bytes())["dtype"] >>> dtype_meta '>> assert dtype_meta == np_dtype.str .. note:: + The ``<`` character in the data type metadata encodes the `endianness `_, or "byte order", of the data type. Following NumPy's example, in Zarr version 2 each data type has an endianness where applicable. However, Zarr version 3 data types do not store endianness information. -In addition to defining a representation of the data type itself (which in the example above was -just a simple string ``"`_, and +`"object" <#object-data-type>` data types. + +Structured data type +^^^^^^^^^^^^^^^^^^^^ + +NumPy allows the construction of a so-called "structured" data type comprised of an ordered collection +of named fields, where each field is itself a distinct NumPy data type. See the NumPy documentation +`here `_ . + +Crucially, NumPy does not use a special data type for structured data types -- instead, NumPy +implemented structured data types as an option feature of the so-called "Void" data type, which models +arbitrary fixed-size byte strings. Since the ``str`` attribute of an unstructured NumPy void +data type is the same as the ``str`` of Numpy structured data type, Zarr V2 uses a special data type +encoding for structured data types that distinguishes the two. + +For example: + +.. code-block:: python + + >>> store = {} + >>> np_dtype = np.dtype([('field_a', '>i2'), ('field_b', '>f4')]) + >>> np_dtype.str + '|V6' + >>> z = zarr.create_array(store=store, shape=(1,), dtype=np_dtype, zarr_format=2) + >>> dtype_meta = json.loads(store['.zarray'].to_bytes())["dtype"] + >>> dtype_meta + [['field_a', '>i2'], ['field_b', '>f4']] + +Object data type +^^^^^^^^^^^^^^^^ -More broadly, each Zarr data type defines its own rules for how scalars of that type are stored in -``JSON``. +The NumPy "object" is essentially an array of references to arbitrary Python objects. +It can model arrays of variable-length UTF-8 strings, or arrays of +variable-length byte strings, or even arrays of variable-length arrays, each with their own distinct data type. +This makes the "object" data type expressive, but also complicated for Zarr V2. Remember that, with +the exception of "structured" data types, Zarr V2 uses the NumPy string representation of a data type +to identify it in metadata. + +An "object" array of variable-length UTF-8 strings, and an "object" array of variable-length byte strings, +have logically separate data types, but in NumPy they would both have the same array data type: "object". + +So Zarr V2 disambiguated different "object" data type arrays on the basis of their chunk encoding, +i.e. the codecs declared in the ``filters`` and ``compressor`` attributes of array metadata. 
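+
+For example, a variable-length string array created with Zarr V2 is declared with the generic
+``"|O"`` data type, and the string-specific information is carried by an object codec in the
+``filters`` attribute. The sketch below, using an in-memory store, illustrates this; the exact
+codec configuration shown here is indicative and may vary between releases:
+
+.. code-block:: python
+
+    >>> import json
+    >>> import zarr
+    >>> store = {}
+    >>> z = zarr.create_array(store=store, shape=(3,), dtype=str, zarr_format=2)
+    >>> meta = json.loads(store['.zarray'].to_bytes())
+    >>> meta['dtype']
+    '|O'
+    >>> meta['filters']
+    [{'id': 'vlen-utf8'}]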
+ +If an array with data type "object" used the `"vlen-utf8"` codec, then it was interpreted as an +array of variable-length strings. If an array with data type "object" used the ``"vlen-bytes"`` +codec, then it was interpreted as an array of variable-length byte strings. + +This means that the ``dtype`` field alone does not specify a data type in Zarr V2. The name of the +object codec used, if one was used, is also required. Although this fact can be ignored for many +simple numeric data types, any comprehensive approach to Zarr V2 data types must either reject +the "object" data types, or include the "object codec" identifier in the ``JSON`` form +of the basic data type model. Data types in Zarr version 3 ----------------------------- -Zarr V3 brings several key changes to how data types are represented: +The NumPy-based Zarr V2 data type representation was effective for simple data types but struggled +for more complex data types, like "object" and "structured" data types. To address these limitations, +Zarr V3 brought several key changes to how data types are represented: -- Zarr V3 identifies the basic data types as strings like ``"int8"``, ``"int16"``, etc. +- Instead of copying NumPy character codecs, Zarr V3 defines an identifier for each data type. + The basic data types are identified by strings like ``"int8"``, ``"int16"``, etc, and data types + that require a configuration can be identified by a ``JSON`` object. - By contrast, Zarr V2 uses the NumPy character code representation for data types: - In Zarr V2, ``int8`` is represented as ``"|i1"``. -- A Zarr V3 data type does not have endianness. This is a departure from Zarr V2, where multi-byte - data types are defined with endianness information. Instead, Zarr V3 requires that endianness, - where applicable, is specified in the ``codecs`` attribute of array metadata. -- While some Zarr V3 data types are identified by strings, others can be identified by a ``JSON`` - object. For example, consider this specification of a ``datetime`` data type: + For example, this ``JSON`` object declares a ``datetime`` data type: .. code-block:: json @@ -89,34 +154,79 @@ Zarr V3 brings several key changes to how data types are represented: } - Zarr V2 generally uses structured string representations to convey the same information. The - data type given in the previous example would be represented as the string ``">M[10s]"`` in - Zarr V2. This is more compact, but can be harder to parse. +- Zarr V3 data types does not have endianness. This is a departure from Zarr V2, where multi-byte + data types are defined with endianness information. Instead, Zarr V3 requires that the endianness + of encoded array chunks is specified in the ``codecs`` attribute of array metadata. And the Zarr + V3 spec leaves the in-memory endianness of decoded array chunks as an implementation detail. For more about data types in Zarr V3, see the `V3 specification `_. + Data types in Zarr Python ------------------------- The two Zarr formats that Zarr Python supports specify data types in two different ways: -data types in Zarr version 2 are encoded as NumPy-compatible strings, while data types in Zarr version -3 are encoded as either strings or ``JSON`` objects, -and the Zarr V3 data types don't have any associated endianness information, unlike Zarr V2 data types. 
+data types in Zarr version 2 are encoded as NumPy-compatible strings +(or lists, in the case of structured data types), while data types in Zarr V3 are encoded as either +strings or ``JSON`` objects, and the Zarr V3 data types don't have any associated endianness +information, unlike Zarr V2 data types. -To abstract over these syntactical and semantic differences, Zarr Python uses a class called -`ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_ provide Zarr V2 and Zarr V3 compatibility -routines for ""native" data types. In this context, a "native" data type is a Python class, -typically defined in another library, that models an array's data type. For example, ``np.uint8`` is a native -data type defined in NumPy, which Zarr Python wraps with a ``ZDType`` instance called -`UInt8 <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_. +Zarr Python needs to support both Zarr V2 and V3, which means we need to abstract over these differences +We do this with an abstract Zarr data type class: `ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_, +which provides Zarr V2 and Zarr V3 compatibility routines for "native" data types. + +In this context, a "native" data type is a Python class, typically defined in another library, that +models an array's data type. For example, ``np.dtypes.UInt8DType`` is a native data type defined in NumPy. +Zarr Python wraps the NumPy ``uint8`` with a ``ZDType`` instance called `UInt8 <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_. + +As of this writing the only native data types Zarr Python supports are NumPy data types. We could +avoid the "native data type" jargon and just say "NumPy data type", but we don't want to rule out the +possibility of using non-NumPy array backends in the future. Each data type supported by Zarr Python is modeled by ``ZDType`` subclass, which provides an API for the following operations: -- Wrapping / unwrapping a native data type +- Encoding / decoding a native data type - Encoding / decoding a data type to / from Zarr V2 and Zarr V3 array metadata. - Encoding / decoding a scalar value to / from Zarr V2 and Zarr V3 array metadata. +- Casting a Python object to a scalar value consistent with the data type + +The following section lists the data types built in to Zarr Python. 
+ +Boolean types +^^^^^^^^^^^^^ +- `Boolean <../api/zarr/dtype/index.html#zarr.dtype.Bool>`_ + +Integral types +^^^^^^^^^^^^^^ +- `Signed 8-bit integer <../api/zarr/dtype/index.html#zarr.dtype.Int8>`_ +- `Signed 16-bit integer <../api/zarr/dtype/index.html#zarr.dtype.Int16>`_ +- `Signed 32-bit integer <../api/zarr/dtype/index.html#zarr.dtype.Int32>`_ +- `Signed 64-bit integer <../api/zarr/dtype/index.html#zarr.dtype.Int64>`_ +- `Unsigned 8-bit integer <../api/zarr/dtype/index.html#zarr.dtype.UInt8>`_ +- `Unsigned 16-bit integer <../api/zarr/dtype/index.html#zarr.dtype.UInt16>`_ +- `Unsigned 32-bit integer <../api/zarr/dtype/index.html#zarr.dtype.UInt32>`_ +- `Unsigned 64-bit integer <../api/zarr/dtype/index.html#zarr.dtype.UInt64>`_ + +Floating-point types +^^^^^^^^^^^^^^^^^^^^ +- `16-bit floating-point <../api/zarr/dtype/index.html#zarr.dtype.Float16>`_ +- `32-bit floating-point <../api/zarr/dtype/index.html#zarr.dtype.Float32>`_ +- `64-bit floating-point <../api/zarr/dtype/index.html#zarr.dtype.Float64>`_ +- `64-bit complex floating-point <../api/zarr/dtype/index.html#zarr.dtype.Complex64>`_ +- `128-bit complex floating-point <../api/zarr/dtype/index.html#zarr.dtype.Complex128>`_ + +String types +^^^^^^^^^^^^ +- `Fixed-length UTF-32 string <../api/zarr/dtype/index.html#zarr.dtype.FixedLengthUTF32>`_ +- `Variable-length UTF-8 string <../api/zarr/dtype/index.html#zarr.dtype.VariableLengthUTF8>`_ + +Byte string types +^^^^^^^^^^^^^^^^^ +- `Fixed-length null-terminated bytes <../api/zarr/dtype/index.html#zarr.dtype.NullTerminatedBytes>`_ +- `Fixed-length raw bytes <../api/zarr/dtype/index.html#zarr.dtype.RawBytes>`_ +- `Variable-length bytes <../api/zarr/dtype/index.html#zarr.dtype.VariableLengthBytes>`_ Example Usage @@ -145,13 +255,27 @@ Get the default scalar value for the data type: >>> assert default_value == np.int8(0) -Serialize to JSON for Zarr V2 and V3 +Serialize to JSON for Zarr V2 .. code-block:: python >>> json_v2 = int8.to_json(zarr_format=2) >>> json_v2 {'name': '|i1', 'object_codec_id': None} + +.. note:: + + The representation returned by ``to_json`` is more abstract than the literal contents of Zarr V2 + array metadata, because the ``JSON`` representation used by the `ZDType` classes must be distinct across + different data types. Zarr V2 identifies multiple distinct data types with the "object" data type + identifier ``"|O"``, which means extra information is needed to disambiguate these data types from + one another. That's the reason for the ``object_codec_id`` field you see here. See the + `section <#object-data-type>`_ on the "object" data type for more information. + +And V3: + +.. code-block:: python + >>> json_v3 = int8.to_json(zarr_format=3) >>> json_v3 'int8' @@ -261,9 +385,9 @@ data type instance if and only if exactly 1 of those constructor invocations was In plain language, we take some user input (a NumPy array), offer it to all the known data type classes, and return an instance of the one data type class that could accept that user input. -We want to avoid a situation where the same NumPy data type matches multiple Zarr data types. I.e., +We want to avoid a situation where the same native data type matches multiple Zarr data types. I.e., a NumPy data type should uniquely specify a single Zarr data type. But data type resolution is dynamic, so it's not possible to guarantee this uniqueness constraint. 
So we attempt data type -resolution against every data type class, and if for some reason a NumPy data type matches multiple +resolution against every data type class, and if for some reason a native data type matches multiple Zarr data types, we treat this as an error and raise an exception. diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 6f61b6775e..59ee500dc5 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -151,17 +151,23 @@ class DataTypeValidationError(ValueError): ... class ScalarTypeValidationError(ValueError): ... -@dataclass(frozen=True) +@dataclass(frozen=True, kw_only=True) class HasLength: """ A mix-in class for data types with a length attribute, such as fixed-size collections of unicode strings, or bytes. + + Attributes + ---------- + length : int + The length of the scalars belonging to this data type. Note that this class does not assign + a unit to the length. Child classes may assign units. """ length: int -@dataclass(frozen=True) +@dataclass(frozen=True, kw_only=True) class HasEndianness: """ A mix-in class for data types with an endianness attribute @@ -170,7 +176,7 @@ class HasEndianness: endianness: EndiannessStr = "little" -@dataclass(frozen=True) +@dataclass(frozen=True, kw_only=True) class HasItemSize: """ A mix-in class for data types with an item size attribute. @@ -183,7 +189,7 @@ def item_size(self) -> int: raise NotImplementedError -@dataclass(frozen=True) +@dataclass(frozen=True, kw_only=True) class HasObjectCodec: """ A mix-in class for data types that require an object codec id. diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index d8d52468bf..62202e74de 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -21,14 +21,23 @@ @dataclass(frozen=True, kw_only=True, slots=True) class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): """ - Wrapper for numpy boolean dtype. + The boolean data type. Attributes ---------- - name : str - The name of the dtype. - dtype_cls : ClassVar[type[np.dtypes.BoolDType]] - The numpy dtype class. + _zarr_v3_name : Literal["bool"] = "bool" + The Zarr v3 name of the dtype. + _zarr_v2_name : Literal["|b1"] = "|b1" + The Zarr v2 name of the dtype, which is also a string representation + of the boolean dtype used by NumPy. + dtype_cls : ClassVar[type[np.dtypes.BoolDType]] = np.dtypes.BoolDType + The NumPy dtype class. + + Notes + ----- + This class implements the boolean data type defined in Zarr V2 and V3. + You can read the formal specification of that data type in the respective + `specification document `_ """ _zarr_v3_name: ClassVar[Literal["bool"]] = "bool" @@ -38,7 +47,22 @@ class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: """ - Create a Bool from a np.dtype('bool') instance. + Create an instance of Bool from an instance of np.dtypes.BoolDType. + + Parameters + ---------- + dtype : TBaseDType + The NumPy boolean dtype instance to convert. + + Returns + ------- + Bool + An instance of Bool. + + Raises + ------ + DataTypeValidationError + If the provided dtype is not compatible with this ZDType. """ if cls._check_native_dtype(dtype): return cls() @@ -48,7 +72,12 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: def to_native_dtype(self: Self) -> np.dtypes.BoolDType: """ - Create a NumPy boolean dtype instance from this ZDType + Create a NumPy boolean dtype instance from this ZDType. 
+ + Returns + ------- + np.dtypes.BoolDType + The NumPy boolean dtype. """ return self.dtype_cls() @@ -59,6 +88,16 @@ def _check_json_v2( ) -> TypeGuard[DTypeConfig_V2[Literal["|b1"], None]]: """ Check that the input is a valid JSON representation of a Bool. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + bool + True if the input is a valid JSON representation, False otherwise. """ return ( check_dtype_spec_v2(data) @@ -68,10 +107,41 @@ def _check_json_v2( @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["bool"]]: + """ + Check that the input is a valid JSON representation of a Bool in Zarr V3 format. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + bool + True if the input is a valid JSON representation, False otherwise. + """ return data == cls._zarr_v3_name @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of Bool from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Bool + An instance of Bool. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of a Bool. + """ if cls._check_json_v2(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_name!r}" @@ -79,6 +149,24 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls: type[Self], data: DTypeJSON) -> Self: + """ + Create an instance of Bool from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Bool + An instance of Bool. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of a Bool. + """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" @@ -93,6 +181,24 @@ def to_json(self, zarr_format: Literal[3]) -> Literal["bool"]: ... def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[Literal["|b1"], None] | Literal["bool"]: + """ + Serialize this Bool instance to JSON. + + Parameters + ---------- + zarr_format : ZarrFormat + The Zarr format version (2 or 3). + + Returns + ------- + DTypeConfig_V2[Literal["|b1"], None] or Literal["bool"] + The JSON representation of the Bool instance. + + Raises + ------ + ValueError + If the zarr_format is not 2 or 3. + """ if zarr_format == 2: return {"name": self._zarr_v2_name, "object_codec_id": None} elif zarr_format == 3: @@ -100,10 +206,40 @@ def to_json( raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def _check_scalar(self, data: object) -> bool: - # Anything can become a bool + """ + Check if the input can be cast to a boolean scalar. + + Parameters + ---------- + data : object + The data to check. + + Returns + ------- + bool + True if the input can be cast to a boolean scalar, False otherwise. + """ return True def cast_scalar(self, data: object) -> np.bool_: + """ + Cast the input to a numpy boolean scalar. + + Parameters + ---------- + data : object + The data to cast. + + Returns + ------- + np.bool_ + The numpy boolean scalar. + + Raises + ------ + TypeError + If the input cannot be converted to a numpy boolean. + """ if self._check_scalar(data): return np.bool_(data) msg = f"Cannot convert object with type {type(data)} to a numpy boolean." 
@@ -153,6 +289,11 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: ------- np.bool_ The numpy boolean scalar. + + Raises + ------ + TypeError + If the input is not a valid boolean type. """ if self._check_scalar(data): return np.bool_(data) @@ -160,4 +301,12 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: @property def item_size(self) -> int: + """ + Return the item size of the boolean dtype. + + Returns + ------- + int + The item size in bytes. + """ return 1 diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index e363c75053..470dbbc770 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -3,7 +3,7 @@ import base64 import re from dataclasses import dataclass -from typing import Any, ClassVar, Literal, Self, TypedDict, TypeGuard, cast, overload +from typing import ClassVar, Literal, Self, TypedDict, TypeGuard, cast, overload import numpy as np @@ -34,11 +34,81 @@ class FixedLengthBytesConfig(TypedDict): @dataclass(frozen=True, kw_only=True) class NullTerminatedBytes(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize): + """ + A Zarr data type for arrays containing null-terminated bytes. This class wraps the NumPy + ``np.dtypes.BytesDType`` data type. Scalars for this data type are instances of ``np.bytes_``. + + This data type is parametrized by an integral length which specifies size in bytes of each + scalar. Because this data type uses null-terminated semantics, indexing into + NumPy arrays with this data type may return fewer than ``length`` bytes. + + Attributes + ---------- + length : int + The length of the bytes. + + dtype_cls: ClassVar[type[np.dtypes.BytesDType[int]]] = np.dtypes.BytesDType + The NumPy data type wrapped by this ZDType. + + Methods + ------- + to_json(zarr_format) : dict + Convert the NullTerminatedBytes to JSON data. + + from_json(data, zarr_format) : NullTerminatedBytes + Create NullTerminatedBytes from JSON data. + + cast_scalar(data) : np.bytes_ + Cast a python object to np.bytes_. + + default_scalar() : np.bytes_ + Return the default scalar value. + + to_json_scalar(data, zarr_format) : str + Convert input to a scalar and return as JSON data. + + from_json_scalar(data, zarr_format) : np.bytes_ + Create np.bytes_ from JSON data. + + item_size : int + Return the item size, in bytes, of the data type. + + Notes + ----- + This data type is designed for compatibility with NumPy arrays that use the NumPy ``bytes`` data type. + It may not be desirable for usage outside of that context. If compatibility + with the NumPy ``bytes`` data type is not essential, consider using the ``RawBytes`` + or ``VariableLengthBytes`` data types instead. + """ + dtype_cls = np.dtypes.BytesDType _zarr_v3_name: ClassVar[Literal["null_terminated_bytes"]] = "null_terminated_bytes" @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create an instance of NullTerminatedBytes from an instance of np.dtypes.BytesDType. + + This method checks if the provided data type is an instance of np.dtypes.BytesDType. + If so, it returns a new instance of NullTerminatedBytes with a length equal to the + length of input data type. + + Parameters + ---------- + dtype : TBaseDType + The native dtype to convert. + + Returns + ------- + NullTerminatedBytes + An instance of NullTerminatedBytes with the specified length. + + Raises + ------ + DataTypeValidationError + If the dtype is not compatible with NullTerminatedBytes. 
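+
+        Examples
+        --------
+        A minimal sketch; the length is taken from the itemsize of the NumPy dtype:
+
+        >>> import numpy as np
+        >>> NullTerminatedBytes.from_native_dtype(np.dtype("S10")).length
+        10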
+ """ + if cls._check_native_dtype(dtype): return cls(length=dtype.itemsize) raise DataTypeValidationError( @@ -46,14 +116,36 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self) -> np.dtypes.BytesDType[int]: + """ + Create a NumPy bytes dtype from this NullTerminatedBytes ZDType. + + Returns + ------- + np.dtypes.BytesDType[int] + A NumPy data type object representing null-terminated bytes with a specified length. + """ + return self.dtype_cls(self.length) @classmethod def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: """ - Check that the input is a valid representation of a numpy S dtype. We expect - something like ``{"name": "|S10", "object_codec_id": None}`` + Check that the input is a valid JSON representation of NullTerminatedBytes in Zarr V2. + + The input data must be a mapping that contains a "name" key that matches the pattern + "|S" and an "object_codec_id" key that is None. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + bool + True if the input data is a valid representation, False otherwise. """ + return ( check_dtype_spec_v2(data) and isinstance(data["name"], str) @@ -63,16 +155,62 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[NullTerminatedBytesJSONV3]: + """ + Check that the input is a valid representation of NullTerminatedBytes in Zarr V3. + + The input must be a mapping with the following structure: + + { + "name": "null_terminated_bytes", + "configuration": { + "length_bytes": + } + } + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + True if the input is a valid representation of NullTerminatedBytes in Zarr V3, False + otherwise. + """ return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} and data["name"] == cls._zarr_v3_name and isinstance(data["configuration"], dict) and "length_bytes" in data["configuration"] + and isinstance(data["configuration"]["length_bytes"], int) ) @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of NullTerminatedBytes from Zarr V2-flavored JSON. + + This method checks if the input data is a valid representation of + NullTerminatedBytes in Zarr V2. If so, it returns a new instance of + NullTerminatedBytes with a ``length`` as specified in the input data. + + Parameters + ---------- + data : DTypeJSON + The JSON data to parse. + + Returns + ------- + NullTerminatedBytes + An instance of this ZDType. + + Raises + ------ + DataTypeValidationError + If the input data is not a valid representation of NullTerminatedBytes. + """ + if cls._check_json_v2(data): name = data["name"] return cls(length=int(name[2:])) @@ -81,6 +219,28 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of NullTerminatedBytes from Zarr V3-flavored JSON. + + This method checks if the input data is a valid representation of + NullTerminatedBytes in Zarr V3. If so, it returns a new instance of + NullTerminatedBytes with a ``length`` as specified in the input data. + + Parameters + ---------- + data : DTypeJSON + The JSON data to parse. + + Returns + ------- + NullTerminatedBytes + An instance of this NullTerminatedBytes. + + Raises + ------ + DataTypeValidationError + If the input data is not a valid representation of NullTerminatedBytes. 
+ """ if cls._check_json_v3(data): return cls(length=data["configuration"]["length_bytes"]) msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" @@ -95,6 +255,31 @@ def to_json(self, zarr_format: Literal[3]) -> NullTerminatedBytesJSONV3: ... def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[str, None] | NullTerminatedBytesJSONV3: + """ + Generate a JSON representation of NullTerminatedBytes. + + If zarr_format is 2, the return value will be a dictionary with the form + { + "name": "|S", + "object_codec_id": None + } + + If zarr_format is 3, the resulting JSON will be a dictionary with the form + { + "name": "null_terminated_bytes", + "configuration": {"length_bytes": self.length} + } + + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + DTypeConfig_V2[str, None] | NullTerminatedBytesJSONV3 + The JSON-serializable representation of the data type + """ if zarr_format == 2: return {"name": self.to_native_dtype().str, "object_codec_id": None} elif zarr_format == 3: @@ -106,36 +291,134 @@ def to_json( raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def _check_scalar(self, data: object) -> TypeGuard[BytesLike]: - # this is generous for backwards compatibility + """ + Check if the provided data is of type BytesLike. + + This method is used to verify if the input data can be considered as a + scalar of bytes-like type, which includes numpy bytes, strings, bytes, + and integers. + + Parameters + ---------- + data : object + The data to check. + + Returns + ------- + TypeGuard[BytesLike] + True if the data is bytes-like, False otherwise. + """ + return isinstance(data, BytesLike) def _cast_scalar_unchecked(self, data: BytesLike) -> np.bytes_: - # We explicitly truncate the result because of the following numpy behavior: - # >>> x = np.dtype('S3').type('hello world') - # >>> x - # np.bytes_(b'hello world') - # >>> x.dtype - # dtype('S11') + """ + Cast the provided scalar data to ``np.bytes_``, truncating if necessary. + Parameters + ---------- + data : BytesLike + The data to cast. + + Returns + ------- + np.bytes_ + The casted data as a numpy bytes scalar. + + Notes + ----- + This method does not perform any type checking. + The input data must be bytes-like. + """ if isinstance(data, int): return self.to_native_dtype().type(str(data)[: self.length]) else: return self.to_native_dtype().type(data[: self.length]) def cast_scalar(self, data: object) -> np.bytes_: + """ + Attempt to cast a given object to a numpy bytes scalar. + + This method first checks if the provided data is a valid scalar that can be + converted to a numpy bytes scalar. If the check succeeds, the unchecked casting + operation is performed. If the data is not valid, a TypeError is raised. + + Parameters + ---------- + data : object + The data to be cast to a numpy bytes scalar. + + Returns + ------- + np.bytes_ + The data cast as a numpy bytes scalar. + + Raises + ------ + TypeError + If the data cannot be converted to a numpy bytes scalar. + """ + if self._check_scalar(data): return self._cast_scalar_unchecked(data) msg = f"Cannot convert object with type {type(data)} to a numpy bytes scalar." raise TypeError(msg) def default_scalar(self) -> np.bytes_: + """ + Return a default scalar value, which for this data type is an empty byte string. + + Returns + ------- + np.bytes_ + The default scalar value. 
+ """ return np.bytes_(b"") def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + """ + Convert a scalar to a JSON-serializable string representation. + + This method encodes the given scalar as a numpy bytes scalar and then + encodes the bytes as a base64-encoded string. + + Parameters + ---------- + data : object + The scalar to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + str + A string representation of the scalar. + """ as_bytes = self.cast_scalar(data) return base64.standard_b64encode(as_bytes).decode("ascii") def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: + """ + Read a JSON-serializable value as np.bytes_. + + Parameters + ---------- + data : JSON + The JSON-serializable base64-encoded string. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + np.bytes_ + The numpy bytes scalar obtained from decoding the base64 string. + + Raises + ------ + TypeError + If the input data is not a base64-encoded string. + """ + if check_json_str(data): return self.to_native_dtype().type(base64.standard_b64decode(data.encode("ascii"))) raise TypeError( @@ -144,6 +427,14 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: @property def item_size(self) -> int: + """ + Get the item size of the raw bytes. + + Returns + ------- + int + The size of each item in bytes, equivalent to the length attribute. + """ return self.length @@ -152,14 +443,65 @@ class RawBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize # np.dtypes.VoidDType is specified in an odd way in numpy # it cannot be used to create instances of the dtype # so we have to tell mypy to ignore this here + + """ + A Zarr data type for arrays containing raw bytes. This class wraps the NumPy ``void`` data type. + Scalars for this data type are instances of ``np.void``. + + This data type is parametrized by an integral length which specifies size in bytes of each + scalar belonging to this data type. + + Attributes + ---------- + length : int + The length of the bytes. + + dtype_cls: ClassVar[type[np.dtypes.VoidDType[int]]] = np.dtypes.VoidDtype + The NumPy data type wrapped by this ZDType. + + Methods + ------- + to_json(zarr_format) : dict + Convert RawBytes to JSON data. + + from_json(data, zarr_format) : NullTerminatedBytes + Create RawBytes from JSON data. + + cast_scalar(data) : np.void_ + Cast a python object to np.void. + + default_scalar() : np.void_ + Return the default scalar value. + + to_json_scalar(data, zarr_format) : str + Convert input to a scalar and return as JSON data. + + from_json_scalar(data, zarr_format) : np.bytes_ + Create a np.void from JSON data. + + item_size : int + Return the item size, in bytes, of the data type. + + Notes + ----- + Although the NumPy "Void" data type is used to create "structured" data types in NumPy, this + class does not support structured data types. + + See the ``Structured`` data type for this functionality. + + """ + dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] _zarr_v3_name: ClassVar[Literal["raw_bytes"]] = "raw_bytes" @classmethod def _check_native_dtype( cls: type[Self], dtype: TBaseDType - ) -> TypeGuard[np.dtypes.VoidDType[Any]]: + ) -> TypeGuard[np.dtypes.VoidDType[int]]: """ + Check that the input is a NumPy void dtype with no fields. + + Numpy void dtype comes in two forms: * If the ``fields`` attribute is ``None``, then the dtype represents N raw bytes. 
* If the ``fields`` attribute is not ``None``, then the dtype represents a structured dtype, @@ -168,18 +510,42 @@ def _check_native_dtype( Parameters ---------- - dtype : TDType + dtype : TDBaseDType The dtype to check. Returns ------- Bool - True if the dtype matches, False otherwise. + True if the dtype is an instance of np.dtypes.VoidDType with no fields, False otherwise. """ return cls.dtype_cls is type(dtype) and dtype.fields is None # type: ignore[has-type] @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create an instance of RawBytes from an instance of np.dtypes.VoidDType. + + This method checks if the provided data type is compatible with RawBytes. The input + must be an instance of np.dtypes.VoidDType, and have no fields. If the input is compatible, + this method returns an instance of RawBytes with the specified length. + + + Parameters + ---------- + dtype : TBaseDType + The native dtype to convert. + + Returns + ------- + RawBytes + An instance of RawBytes with the specified length. + + Raises + ------ + DataTypeValidationError + If the dtype is not compatible with RawBytes. + """ + if cls._check_native_dtype(dtype): return cls(length=dtype.itemsize) raise DataTypeValidationError( @@ -187,6 +553,14 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self) -> np.dtypes.VoidDType[int]: + """ + Create a NumPy void dtype from this RawBytes ZDType. + + Returns + ------- + np.dtypes.VoidDType[int] + A NumPy data type object representing raw bytes with a specified length. + """ # Numpy does not allow creating a void type # by invoking np.dtypes.VoidDType directly return cast("np.dtypes.VoidDType[int]", np.dtype(f"V{self.length}")) @@ -194,8 +568,24 @@ def to_native_dtype(self) -> np.dtypes.VoidDType[int]: @classmethod def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: """ - Check that the input is a valid representation of a numpy S dtype. We expect - something like ``{"name": "|V10", "object_codec_id": None}`` + Check that the input is a valid representation of RawBytes in Zarr V2. + + The input data must be a mapping with the following structure: + + { + "name": "|V", + "object_codec_id": None + } + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + True if the input is a valid representation of RawBytes in Zarr V3, False otherwise. + """ return ( check_dtype_spec_v2(data) @@ -206,16 +596,61 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[RawBytesJSONV3]: + """ + Check that the input is a valid representation of RawBytes in Zarr V3. + + The input must be a mapping with the following structure: + + { + "name": "raw_bytes", + "configuration": { + "length_bytes": + } + } + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + True if the input is a valid representation of RawBytes in Zarr V3, False + otherwise. + """ return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} and data["name"] == cls._zarr_v3_name and isinstance(data["configuration"], dict) and set(data["configuration"].keys()) == {"length_bytes"} + and isinstance(data["configuration"]["length_bytes"], int) ) @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of RawBytes from Zarr V2-flavored JSON. + + This method checks if the input data is a valid representation of + RawBytes in Zarr V2. 
If so, it returns a new instance of + RawBytes with a ``length`` as specified in the input data. + + Parameters + ---------- + data : DTypeJSON + The JSON data to parse. + + Returns + ------- + RawBytes + An instance of this ZDType. + + Raises + ------ + DataTypeValidationError + If the input data is not a valid representation of this RawBytes. + """ if cls._check_json_v2(data): name = data["name"] return cls(length=int(name[2:])) @@ -224,6 +659,28 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of RawBytes from Zarr V3-flavored JSON. + + This method checks if the input data is a valid representation of + RawBytes in Zarr V3. If so, it returns a new instance of + RawBytes with a ``length`` as specified in the input data. + + Parameters + ---------- + data : DTypeJSON + The JSON data to parse. + + Returns + ------- + RawBytes + An instance of RawBytes. + + Raises + ------ + DataTypeValidationError + If the input data is not a valid representation of RawBytes. + """ if cls._check_json_v3(data): return cls(length=data["configuration"]["length_bytes"]) msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" @@ -236,6 +693,31 @@ def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... def to_json(self, zarr_format: Literal[3]) -> RawBytesJSONV3: ... def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | RawBytesJSONV3: + """ + Generate a JSON representation of RawBytes. + + If zarr_format is 2, the return value will be a dictionary with the form + { + "name": "|V", + "object_codec_id": None + } + + If zarr_format is 3, the resulting JSON will be a dictionary with the form + { + "name": "raw_bytes", + "configuration": {"length_bytes": self.length} + } + + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + DTypeConfig_V2[str, None] | RawBytesJSONV3 + The JSON-serializable representation of the data type. + """ if zarr_format == 2: return {"name": self.to_native_dtype().str, "object_codec_id": None} elif zarr_format == 3: @@ -243,10 +725,44 @@ def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | RawByt return {"name": self._zarr_v3_name, "configuration": {"length_bytes": self.length}} raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def _check_scalar(self, data: object) -> bool: + def _check_scalar(self, data: object) -> TypeGuard[np.bytes_ | str | bytes | np.void]: + """ + Check if the provided data can be cast to np.void. + + This method is used to verify if the input data can be considered as a + scalar of bytes-like type, which includes np.bytes_, np.void, strings, and bytes objects. + + Parameters + ---------- + data : object + The data to check. + + Returns + ------- + TypeGuard[np.bytes_ | str | bytes | np.void] + True if the data is void-scalar-like, False otherwise. + """ return isinstance(data, np.bytes_ | str | bytes | np.void) def _cast_scalar_unchecked(self, data: object) -> np.void: + """ + Cast the provided scalar data to np.void. + + Parameters + ---------- + data : BytesLike + The data to cast. + + Returns + ------- + np.void + The casted data as a numpy void scalar. + + Notes + ----- + This method does not perform any type checking. + The input data must be castable to np.void. 
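+
+        Examples
+        --------
+        An illustrative sketch, assuming ``RawBytes`` is importable in the current
+        scope:
+
+        >>> RawBytes(length=4)._cast_scalar_unchecked(b"abcd").tobytes()
+        b'abcd'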
+ """ native_dtype = self.to_native_dtype() # Without the second argument, numpy will return a void scalar for dtype V1. # The second argument ensures that, if native_dtype is something like V10, @@ -254,35 +770,172 @@ def _cast_scalar_unchecked(self, data: object) -> np.void: return native_dtype.type(data, native_dtype) def cast_scalar(self, data: object) -> np.void: + """ + Attempt to cast a given object to a numpy void scalar. + + This method first checks if the provided data is a valid scalar that can be + converted to a numpy void scalar. If the check succeeds, the unchecked casting + operation is performed. If the data is not valid, a TypeError is raised. + + Parameters + ---------- + data : object + The data to be cast to a numpy void scalar. + + Returns + ------- + np.void + The data cast as a numpy void scalar. + + Raises + ------ + TypeError + If the data cannot be converted to a numpy void scalar. + """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) msg = f"Cannot convert object with type {type(data)} to a numpy void scalar." raise TypeError(msg) def default_scalar(self) -> np.void: + """ + Return the default scalar value for this data type. + + The default scalar is a numpy void scalar of the same length as the data type, + filled with zero bytes. + + Returns + ------- + np.void + The default scalar value. + """ return self.to_native_dtype().type(("\x00" * self.length).encode("ascii")) def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(self.cast_scalar(data).tobytes()).decode("ascii") + """ + Convert a scalar to a JSON-serializable string representation. + + This method converts the given scalar to bytes and then + encodes the bytes as a base64-encoded string. + + Parameters + ---------- + data : object + The scalar to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + str + A string representation of the scalar. + """ + as_bytes = self.cast_scalar(data) + return base64.standard_b64encode(as_bytes.tobytes()).decode("ascii") def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: + """ + Read a JSON-serializable value as a np.void. + + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + np.void + The numpy void scalar. + + Raises + ------ + TypeError + If the data is not a string, or if the string is not a valid base64 encoding. + """ if check_json_str(data): return self.to_native_dtype().type(base64.standard_b64decode(data)) raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover @property def item_size(self) -> int: + """ + Get the item size of the raw bytes. + + Returns + ------- + int + The size of each item in bytes, equivalent to the length attribute. + """ return self.length @dataclass(frozen=True, kw_only=True) class VariableLengthBytes(ZDType[np.dtypes.ObjectDType, bytes], HasObjectCodec): + """ + A Zarr data type for arrays containing variable-length bytes. This class wraps the NumPy + "object" data type. Scalars for this data type are instances of plain python bytes. + + Attributes + ---------- + dtype_cls: ClassVar[type[np.dtypes.ObjectDType]] = np.dtypes.ObjectDType + The NumPy data type wrapped by this ZDType. + + Methods + ------- + to_json(zarr_format) : dict + Convert the VariableLengthBytes to JSON data. + + from_json(data, zarr_format) : VariableLengthBytes + Create VariableLengthBytes from JSON data. 
+ + cast_scalar(data) : bytes + Cast a python object to bytes. + + default_scalar() : bytes + Return the default scalar value. + + to_json_scalar(data, zarr_format) : str + Convert input to a scalar and return as JSON data. + + from_json_scalar(data, zarr_format) : bytes + Create bytes from JSON data. + + Notes + ----- + Because this data type uses the NumPy "object" data type, it does not guarantee a compact memory + representation of array data. Therefore a "vlen-bytes" codec is needed to ensure that the array + data can be persisted to storage. + """ + dtype_cls = np.dtypes.ObjectDType _zarr_v3_name: ClassVar[Literal["variable_length_bytes"]] = "variable_length_bytes" object_codec_id: ClassVar[Literal["vlen-bytes"]] = "vlen-bytes" @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create an instance of VariableLengthBytes from an instance of np.dtypes.ObjectDType. + + This method checks if the provided data type is an instance of np.dtypes.ObjectDType. + If so, it returns an instance of VariableLengthBytes. + + Parameters + ---------- + dtype : TBaseDType + The native dtype to convert. + + Returns + ------- + VariableLengthBytes + An instance of VariableLengthBytes. + + Raises + ------ + DataTypeValidationError + If the dtype is not compatible with VariableLengthBytes. + """ if cls._check_native_dtype(dtype): return cls() raise DataTypeValidationError( @@ -290,6 +943,14 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self) -> np.dtypes.ObjectDType: + """ + Create a NumPy object dtype from this VariableLengthBytes ZDType. + + Returns + ------- + np.dtypes.ObjectDType + A NumPy data type object representing variable-length bytes. + """ return self.dtype_cls() @classmethod @@ -298,21 +959,73 @@ def _check_json_v2( data: DTypeJSON, ) -> TypeGuard[DTypeConfig_V2[Literal["|O"], Literal["vlen-bytes"]]]: """ - Check that the input is a valid JSON representation of a numpy O dtype, and that the - object codec id is appropriate for variable-length UTF-8 strings. + Check that the input is a valid JSON representation of a NumPy O dtype, and that the + object codec id is appropriate for variable-length bytes strings. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + True if the input is a valid representation of VariableLengthBytes in Zarr V2, False + otherwise. """ - return ( - check_dtype_spec_v2(data) - and data["name"] == "|O" - and data["object_codec_id"] == cls.object_codec_id - ) + # Check that the input is a valid JSON representation of a Zarr v2 data type spec. + if not check_dtype_spec_v2(data): + return False + + # Check that the object codec id is appropriate for variable-length bytes strings. + if data["name"] != "|O": + return False + return data["object_codec_id"] == cls.object_codec_id @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["variable_length_bytes"]]: + """ + Check that the input is a valid representation of VariableLengthBytes in Zarr V3. + + This method verifies that the provided data matches the expected Zarr V3 representation + for VariableLengthBytes, which is the string "variable_length_bytes". + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + TypeGuard[Literal["variable_length_bytes"]] + True if the input is a valid representation of VariableLengthBytes in Zarr V3, False otherwise. 
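+
+        Examples
+        --------
+        A minimal sketch, assuming ``VariableLengthBytes`` is importable in the
+        current scope:
+
+        >>> VariableLengthBytes._check_json_v3("variable_length_bytes")
+        True
+        >>> VariableLengthBytes._check_json_v3("|O")
+        False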
+ """ + return data == cls._zarr_v3_name @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this VariableLengthBytes from Zarr V2-flavored JSON. + + This method checks if the input data is a valid representation of this VariableLengthBytes + in Zarr V2. If so, it returns a new instance VariableLengthBytes. + + Parameters + ---------- + data : DTypeJSON + The JSON data to parse. + + Returns + ------- + VariableLengthBytes + An instance of this ZDType. + + Raises + ------ + DataTypeValidationError + If the input data is not a valid representation of this class. + """ + if cls._check_json_v2(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string '|O' and an object_codec_id of {cls.object_codec_id}" @@ -320,6 +1033,29 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of VariableLengthBytes from Zarr V3-flavored JSON. + + This method checks if the input data is a valid representation of + VariableLengthBytes in Zarr V3. If so, it returns a new instance of + VariableLengthBytes. + + Parameters + ---------- + data : DTypeJSON + The JSON data to parse. + + Returns + ------- + VariableLengthBytes + An instance of VariableLengthBytes. + + Raises + ------ + DataTypeValidationError + If the input data is not a valid representation of VariableLengthBytes. + """ + if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" @@ -336,6 +1072,27 @@ def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_bytes"]: def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-bytes"]] | Literal["variable_length_bytes"]: + """ + Convert the variable-length bytes data type to a JSON-serializable form. + + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. Accepted values are 2 and 3. + + Returns + ------- + DTypeConfig_V2[Literal["|O"], Literal["vlen-bytes"]] | Literal["variable_length_bytes"] + The JSON-serializable representation of the variable-length bytes data type. + For zarr_format 2, returns a dictionary with "name" and "object_codec_id". + For zarr_format 3, returns a string identifier "variable_length_bytes". + + Raises + ------ + ValueError + If zarr_format is not 2 or 3. + """ + if zarr_format == 2: return {"name": "|O", "object_codec_id": self.object_codec_id} elif zarr_format == 3: @@ -344,25 +1101,131 @@ def to_json( raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_scalar(self) -> bytes: + """ + Return the default scalar value for the variable-length bytes data type. + + Returns + ------- + bytes + The default scalar value, which is an empty byte string. + """ + return b"" def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + """ + Convert a scalar to a JSON-serializable string representation. + + This method encodes the given scalar as bytes and then + encodes the bytes as a base64-encoded string. + + Parameters + ---------- + data : object + The scalar to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + str + A string representation of the scalar. 
+ """ return base64.standard_b64encode(data).decode("ascii") # type: ignore[arg-type] def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> bytes: + """ + Decode a base64-encoded JSON string to bytes. + + Parameters + ---------- + data : JSON + The JSON-serializable base64-encoded string. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + bytes + The decoded bytes from the base64 string. + + Raises + ------ + TypeError + If the input data is not a base64-encoded string. + """ + if check_json_str(data): return base64.standard_b64decode(data.encode("ascii")) raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover def _check_scalar(self, data: object) -> TypeGuard[BytesLike]: + """ + Check if the provided data is of type BytesLike. + + This method is used to verify if the input data can be considered as a + scalar of bytes-like type, which includes numpy bytes, strings, bytes, + and integers. + + Parameters + ---------- + data : object + The data to check. + + Returns + ------- + TypeGuard[BytesLike] + True if the data is bytes-like, False otherwise. + """ return isinstance(data, BytesLike) def _cast_scalar_unchecked(self, data: BytesLike) -> bytes: + """ + Cast the provided scalar data to bytes. + + Parameters + ---------- + data : BytesLike + The data to cast. + + Returns + ------- + bytes + The casted data as bytes. + + Notes + ----- + This method does not perform any type checking. + The input data must be bytes-like. + """ if isinstance(data, str): return bytes(data, encoding="utf-8") return bytes(data) def cast_scalar(self, data: object) -> bytes: + """ + Attempt to cast a given object to a bytes scalar. + + This method first checks if the provided data is a valid scalar that can be + converted to a bytes scalar. If the check succeeds, the unchecked casting + operation is performed. If the data is not valid, a TypeError is raised. + + Parameters + ---------- + data : object + The data to be cast to a bytes scalar. + + Returns + ------- + bytes + The data cast as a bytes scalar. + + Raises + ------ + TypeError + If the data cannot be converted to a bytes scalar. + """ + if self._check_scalar(data): return self._cast_scalar_unchecked(data) msg = f"Cannot convert object with type {type(data)} to bytes." diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index 38e506f1bc..5dc316ed0a 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -41,11 +41,33 @@ @dataclass(frozen=True) class BaseComplex(ZDType[TComplexDType_co, TComplexScalar_co], HasEndianness, HasItemSize): + """ + A base class for complex data types + """ + # This attribute holds the possible zarr v2 JSON names for the data type _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create an instance of this data type from a NumPy complex dtype. + + Parameters + ---------- + dtype : TBaseDType + The native dtype to convert. + + Returns + ------- + Self + An instance of this data type with the specified endianness. + + Raises + ------ + DataTypeValidationError + If the dtype is not compatible with this data type. 
+ """ if cls._check_native_dtype(dtype): return cls(endianness=get_endianness_from_numpy_dtype(dtype)) raise DataTypeValidationError( @@ -53,6 +75,15 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self) -> TComplexDType_co: + """ + Convert this complex data type to a NumPy complex dtype with the appropriate byte order. + + Returns + ------- + TComplexDType_co + A NumPy data type object representing the complex data type with the specified byte order. + """ + byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] @@ -60,6 +91,19 @@ def to_native_dtype(self) -> TComplexDType_co: def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: """ Check that the input is a valid JSON representation of this data type. + + The input data must be a mapping that contains a "name" key that is one of + the strings from cls._zarr_v2_names and an "object_codec_id" key that is None. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + bool + True if the input is a valid JSON representation, False otherwise. """ return ( check_dtype_spec_v2(data) @@ -69,10 +113,45 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[str]: + """ + Check that the input is a valid JSON representation of this data type in Zarr V3. + + This method verifies that the provided data matches the expected Zarr V3 + representation, which is the string specified by the class-level attribute _zarr_v3_name. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + TypeGuard[str] + True if the input is a valid representation of this data type in Zarr V3, False otherwise. + """ + return data == cls._zarr_v3_name @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this complex data type from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this complex data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this complex data type. + """ if cls._check_json_v2(data): # Going via numpy ensures that we get the endianness correct without # annoying string parsing. @@ -83,6 +162,24 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this complex data type from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this complex data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this complex data type. + """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." @@ -96,18 +193,25 @@ def to_json(self, zarr_format: Literal[3]) -> str: ... def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | str: """ - Convert the wrapped data type to a JSON-serializable form. + Serialize this complex data type to a JSON-compatible representation. Parameters ---------- zarr_format : ZarrFormat - The zarr format version. + The Zarr format version. Supported values are 2 and 3. 
Returns ------- - str - The JSON-serializable representation of the wrapped data type + DTypeConfig_V2[str, None] | str + If `zarr_format` is 2, a dictionary with "name" and "object_codec_id" is returned. + If `zarr_format` is 3, a string representation of the complex data type is returned. + + Raises + ------ + ValueError + If `zarr_format` is not 2 or 3. """ + if zarr_format == 2: return {"name": self.to_native_dtype().str, "object_codec_id": None} elif zarr_format == 3: @@ -115,12 +219,61 @@ def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | str: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def _check_scalar(self, data: object) -> TypeGuard[ComplexLike]: + """ + Check that the input is a scalar complex value. + + Parameters + ---------- + data : object + The value to check. + + Returns + ------- + TypeGuard[ComplexLike] + True if the input is a scalar complex value, False otherwise. + """ return isinstance(data, ComplexLike) def _cast_scalar_unchecked(self, data: ComplexLike) -> TComplexScalar_co: + """ + Cast the provided scalar data to the native scalar type of this complex data type. + + Parameters + ---------- + data : ComplexLike + The data to cast. + + Returns + ------- + TComplexScalar_co + The casted data as a numpy complex scalar. + + Notes + ----- + This method does not perform any type checking. + The input data must be a scalar complex value. + """ return self.to_native_dtype().type(data) # type: ignore[return-value] def cast_scalar(self, data: object) -> TComplexScalar_co: + """ + Attempt to cast a given object to a numpy complex scalar. + + Parameters + ---------- + data : object + The data to be cast to a numpy complex scalar. + + Returns + ------- + TComplexScalar_co + The data cast as a numpy complex scalar. + + Raises + ------ + TypeError + If the data cannot be converted to a numpy complex scalar. + """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) msg = f"Cannot convert object with type {type(data)} to a numpy float scalar." @@ -193,9 +346,25 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> JSON: @dataclass(frozen=True, kw_only=True) class Complex64(BaseComplex[np.dtypes.Complex64DType, np.complex64]): + """ + A Zarr data type for arrays containing 64 bit complex floats. + + This class wraps the NumPy ``np.dtypes.Complex64DType`` data type. Scalars for this data type + are instances of ``np.complex64``. + + Attributes + ---------- + dtype_cls : Type[np.dtypes.Complex64DType] + The numpy dtype class for this data type. + _zarr_v3_name : ClassVar[Literal["complex64"]] + The name of this data type in Zarr V3. + _zarr_v2_names : ClassVar[tuple[Literal[">c8"], Literal["c8", "c8"], Literal["c8", " int: @@ -204,9 +373,25 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class Complex128(BaseComplex[np.dtypes.Complex128DType, np.complex128], HasEndianness): + """ + A Zarr data type for arrays containing 64 bit complex floats. + + This class wraps the NumPy ``np.dtypes.Complex128DType`` data type. Scalars for this data type + are instances of ``np.complex128``. + + Attributes + ---------- + dtype_cls : Type[np.dtypes.Complex128DType] + The numpy dtype class for this data type. + _zarr_v3_name : ClassVar[Literal["complex128"]] + The name of this data type in Zarr V3. 
+ _zarr_v2_names : ClassVar[tuple[Literal[">c16"], Literal["c16", "c16"], Literal["c16", " int: diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index 7b7243993f..3ea5076599 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -35,11 +35,33 @@ @dataclass(frozen=True) class BaseFloat(ZDType[TFloatDType_co, TFloatScalar_co], HasEndianness, HasItemSize): + """ + Base class for numpy float data types. + + Attributes + ---------- + _zarr_v2_names : ClassVar[tuple[str, ...]] + The possible Zarr V2 JSON names for the data type. + """ + # This attribute holds the possible zarr v2 JSON names for the data type _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create an instance of this ZDType from a numpy data type. + + Parameters + ---------- + dtype : TBaseDType + The numpy data type. + + Returns + ------- + Self + An instance of this ZDType. + """ if cls._check_native_dtype(dtype): return cls(endianness=get_endianness_from_numpy_dtype(dtype)) raise DataTypeValidationError( @@ -47,6 +69,14 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self) -> TFloatDType_co: + """ + Convert the wrapped data type to a numpy data type. + + Returns + ------- + TFloatDType_co + The numpy data type. + """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] @@ -54,6 +84,16 @@ def to_native_dtype(self) -> TFloatDType_co: def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: """ Check that the input is a valid JSON representation of this data type. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + TypeGuard[DTypeConfig_V2[str, None]] + True if the input is a valid JSON representation of this data type, False otherwise. """ return ( check_dtype_spec_v2(data) @@ -63,10 +103,36 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[str]: + """ + Check that the input is a valid JSON representation of this data type. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + TypeGuard[str] + True if the input is a valid JSON representation of this data type, False otherwise. + """ return data == cls._zarr_v3_name @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this ZDType from Zarr v2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this ZDType. + """ if cls._check_json_v2(data): # Going via numpy ensures that we get the endianness correct without # annoying string parsing. @@ -77,6 +143,19 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this ZDType from Zarr v3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this ZDType. + """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." 
@@ -109,12 +188,51 @@ def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | str: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def _check_scalar(self, data: object) -> TypeGuard[FloatLike]: + """ + Check that the input is a valid scalar value. + + Parameters + ---------- + data : object + The input to check. + + Returns + ------- + TypeGuard[FloatLike] + True if the input is a valid scalar value, False otherwise. + """ return isinstance(data, FloatLike) def _cast_scalar_unchecked(self, data: FloatLike) -> TFloatScalar_co: + """ + Cast a scalar value to a numpy float scalar. + + Parameters + ---------- + data : FloatLike + The scalar value to cast. + + Returns + ------- + TFloatScalar_co + The numpy float scalar. + """ return self.to_native_dtype().type(data) # type: ignore[return-value] def cast_scalar(self, data: object) -> TFloatScalar_co: + """ + Cast a scalar value to a numpy float scalar. + + Parameters + ---------- + data : object + The scalar value to cast. + + Returns + ------- + TFloatScalar_co + The numpy float scalar. + """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) msg = f"Cannot convert object with type {type(data)} to a numpy float scalar." @@ -122,7 +240,7 @@ def cast_scalar(self, data: object) -> TFloatScalar_co: def default_scalar(self) -> TFloatScalar_co: """ - Get the default value, which is 0 cast to this dtype + Get the default value, which is 0 cast to this dtype. Returns ------- @@ -133,7 +251,7 @@ def default_scalar(self) -> TFloatScalar_co: def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScalar_co: """ - Read a JSON-serializable value as a numpy float. + Read a JSON-serializable value as a numpy float scalar. Parameters ---------- @@ -144,8 +262,8 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScal Returns ------- - TScalar_co - The numpy float. + TFloatScalar_co + The numpy float scalar. """ if zarr_format == 2: if check_json_float_v2(data): @@ -191,32 +309,86 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> float | st @dataclass(frozen=True, kw_only=True) class Float16(BaseFloat[np.dtypes.Float16DType, np.float16]): + """ + A Zarr data type for arrays containing 16-bit floating point numbers. This class wraps the + NumPy np.dtypes.Float16DType data type. Scalars for this data type are instances of + ``np.float16``. + + Attributes + ---------- + dtype_cls : Type[np.dtypes.Float16DType] + The numpy dtype class for this data type. + _zarr_v3_name : ClassVar[Literal["float16"]] + The name of this data type in Zarr V3. + _zarr_v2_names : ClassVar[tuple[Literal[">f2"], Literal["f2"], Literal["f2", " int: + """ + Return the item size of the data type in bytes. + """ return 2 @dataclass(frozen=True, kw_only=True) class Float32(BaseFloat[np.dtypes.Float32DType, np.float32]): + """ + A Zarr data type for arrays containing 32-bit floating point numbers. This class wraps the + NumPy np.dtypes.Float32DType data type. Scalars for this data type are instances of + ``np.float32``. + + Attributes + ---------- + dtype_cls : Type[np.dtypes.Float32DType] + The numpy dtype class for this data type. + _zarr_v3_name : ClassVar[Literal["float32"]] + The name of this data type in Zarr V3. + _zarr_v2_names : ClassVar[tuple[Literal[">f4"], Literal["f4"], Literal["f4", " int: + """ + Return the item size of the data type in bytes. 
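+
+        Examples
+        --------
+        A minimal sketch:
+
+        >>> Float32().item_size
+        4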
+ """ return 4 @dataclass(frozen=True, kw_only=True) class Float64(BaseFloat[np.dtypes.Float64DType, np.float64]): + """ + A Zarr data type for arrays containing 64-bit floating point numbers. This class wraps the + NumPy np.dtypes.Float64DType data type. Scalars for this data type are instances of + ``np.float64``. + + Attributes + ---------- + dtype_cls : Type[np.dtypes.Float64DType] + The numpy dtype class for this data type. + _zarr_v3_name : ClassVar[Literal["float64"]] + The name of this data type in Zarr V3. + _zarr_v2_names : ClassVar[tuple[Literal[">f8"], Literal["f8"], Literal["f8", " int: + """ + Return the item size of the data type in bytes. + """ return 8 diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index 79d3ce2d47..4c44352e68 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -53,14 +53,60 @@ @dataclass(frozen=True) class BaseInt(ZDType[TIntDType_co, TIntScalar_co], HasItemSize): - # This attribute holds the possible zarr V2 JSON names for the data type + """ + A base class for integer data types in Zarr. + + This class provides methods for serialization and deserialization of integer types + in both Zarr v2 and v3 formats, as well as methods for checking and casting scalars. + + Attributes + ---------- + _zarr_v2_names : ClassVar[tuple[str, ...]] + Possible Zarr V2 JSON names for the data type. + + Methods + ------- + _check_json_v2(data: object) -> TypeGuard[DTypeConfig_V2[str, None]]: + Check if input is a valid JSON representation for Zarr v2. + _check_json_v3(data: object) -> TypeGuard[str]: + Check if JSON value is consistent with Zarr v3 for this data type. + _check_scalar(data: object) -> TypeGuard[IntLike]: + Check if a Python object is IntLike. + _cast_scalar_unchecked(data: IntLike) -> TIntScalar_co: + Create an integer without type checking of the input. + cast_scalar(data: object) -> TIntScalar_co: + Convert object to numpy integer, raising TypeError if invalid. + default_scalar() -> TIntScalar_co: + Get the default value, which is 0 cast to this dtype. + from_json_scalar(data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar_co: + Read a JSON-serializable value as a numpy int scalar. + to_json_scalar(data: object, *, zarr_format: ZarrFormat) -> int: + Convert an object to JSON-serializable scalar. + """ + _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod def _check_json_v2(cls, data: object) -> TypeGuard[DTypeConfig_V2[str, None]]: """ - Check that the input is a valid JSON representation of this data type. + Check that the input is a valid JSON representation of this integer data type in Zarr V2. + + This method verifies that the provided data matches the expected Zarr V2 representation + for this data type. The input data must be a mapping that contains a "name" key that is + one of the strings from cls._zarr_v2_names and an "object_codec_id" key that is None. + + Parameters + ---------- + data : object + The JSON data to check. + + Returns + ------- + TypeGuard[DTypeConfig_V2[str, None]] + True if the input is a valid representation of this integer data type in Zarr V2, + False otherwise. """ + return ( check_dtype_spec_v2(data) and data["name"] in cls._zarr_v2_names @@ -70,23 +116,82 @@ def _check_json_v2(cls, data: object) -> TypeGuard[DTypeConfig_V2[str, None]]: @classmethod def _check_json_v3(cls, data: object) -> TypeGuard[str]: """ - Check that a JSON value is consistent with the zarr v3 spec for this data type. + Check if JSON value is consistent with Zarr v3 for this data type. 
+ + This method verifies whether the provided data matches the expected Zarr v3 + representation for this data type, which is the string specified by the + class-level attribute _zarr_v3_name. + + Parameters + ---------- + data : object + The JSON data to check. + + Returns + ------- + TypeGuard[str] + True if the input is a valid representation of this data type in Zarr v3, + False otherwise. """ return data == cls._zarr_v3_name def _check_scalar(self, data: object) -> TypeGuard[IntLike]: """ - Check that a python object is IntLike + Check if the input object is of an IntLike type. + + This method verifies whether the provided data can be considered as an integer-like + value, which includes objects supporting integer conversion. + + Parameters + ---------- + data : object + The data to check. + + Returns + ------- + TypeGuard[IntLike] + True if the data is IntLike, False otherwise. """ + return isinstance(data, IntLike) def _cast_scalar_unchecked(self, data: IntLike) -> TIntScalar_co: """ - Create an integer without any type checking of the input. + Casts a given scalar value to the native integer scalar type without type checking. + + Parameters + ---------- + data : IntLike + The scalar value to cast. + + Returns + ------- + TIntScalar_co + The casted integer scalar of the native dtype. """ + return self.to_native_dtype().type(data) # type: ignore[return-value] def cast_scalar(self, data: object) -> TIntScalar_co: + """ + Attempt to cast a given object to a numpy integer scalar. + + Parameters + ---------- + data : object + The data to be cast to a numpy integer scalar. + + Returns + ------- + TIntScalar_co + The data cast as a numpy integer scalar. + + Raises + ------ + TypeError + If the data cannot be converted to a numpy integer scalar. + """ + if self._check_scalar(data): return self._cast_scalar_unchecked(data) msg = f"Cannot convert object with type {type(data)} to a numpy integer." @@ -94,11 +199,11 @@ def cast_scalar(self, data: object) -> TIntScalar_co: def default_scalar(self) -> TIntScalar_co: """ - Get the default value, which is 0 cast to this dtype + Get the default value, which is 0 cast to this dtype. Returns ------- - Int scalar + TIntScalar_co The default value. """ return self._cast_scalar_unchecked(0) @@ -116,8 +221,13 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar Returns ------- - TScalar_co - The numpy scalar. + TIntScalar_co + The numpy int scalar. + + Raises + ------ + TypeError + If the input is not a valid integer type. """ if check_json_int(data): return self._cast_scalar_unchecked(data) @@ -125,11 +235,12 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: """ - Convert an object to JSON-serializable scalar. + Convert an object to a JSON serializable scalar. For the integer data types, + the JSON form is a plain integer. Parameters ---------- - data : _BaseScalar + data : object The value to convert. zarr_format : ZarrFormat The zarr format version. @@ -144,6 +255,22 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: @dataclass(frozen=True, kw_only=True) class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): + """ + A Zarr data type for 8-bit signed integers. + + This class wraps the NumPy np.dtypes.Int8DType data type. Scalars for this data type are + instances of np.int8. + + Attributes + ---------- + dtype_cls : np.dtypes.Int8DType + The class of the underlying numpy dtype. 
+ _zarr_v3_name : ClassVar[Literal["int8"]] = "int8" + The name of this data type in Zarr V3. + _zarr_v2_names : ClassVar[tuple[Literal["|i1"]]] = ("|i1",) + The names of this data type in Zarr V2. + """ + dtype_cls = np.dtypes.Int8DType _zarr_v3_name: ClassVar[Literal["int8"]] = "int8" _zarr_v2_names: ClassVar[tuple[Literal["|i1"]]] = ("|i1",) @@ -151,7 +278,22 @@ class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: """ - Create a Int8 from a np.dtype('int8') instance. + Create an Int8 from a np.dtype('int8') instance. + + Parameters + ---------- + dtype : TBaseDType + The np.dtype('int8') instance. + + Returns + ------- + Self + An instance of Int8. + + Raises + ------ + DataTypeValidationError + If the input data type is not a valid representation of an Int8. """ if cls._check_native_dtype(dtype): return cls() @@ -160,10 +302,36 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self: Self) -> np.dtypes.Int8DType: + """ + Convert the Int8 instance to a np.dtype('int8') instance. + + Returns + ------- + np.dtypes.Int8DType + The np.dtype('int8') instance. + """ return self.dtype_cls() @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an Int8 from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of Int8. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of an Int8. + """ if cls._check_json_v2(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_names[0]!r}" @@ -171,6 +339,24 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an Int8 from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of Int8. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of an Int8. + """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" @@ -186,7 +372,7 @@ def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[Literal["|i1"], None] | Literal["int8"]: """ - Convert the wrapped data type to a JSON-serializable form. + Convert the data type to a JSON-serializable form. Parameters ---------- @@ -195,8 +381,13 @@ def to_json( Returns ------- - str - The JSON-serializable representation of the wrapped data type + DTypeConfig_V2 or str + The JSON-serializable representation of the data type. + + Raises + ------ + ValueError + If the zarr_format is not 2 or 3. """ if zarr_format == 2: return {"name": self._zarr_v2_names[0], "object_codec_id": None} @@ -206,11 +397,33 @@ def to_json( @property def item_size(self) -> int: + """ + Return the size of the item in bytes. + + Returns + ------- + int + The size of the item in bytes. + """ return 1 @dataclass(frozen=True, kw_only=True) class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): + """ + A Zarr data type for arrays containing 8-bit unsigned integers. This class wraps the NumPy + np.dtypes.UInt8DType data type. Scalars for this data type are instances np.uint8. + + Attributes + ---------- + dtype_cls : np.dtypes.UInt8DType + The class of the underlying numpy dtype. 
+ _zarr_v3_name : ClassVar[Literal["uint8"]] = "uint8" + The name of this data type in Zarr V3. + _zarr_v2_names : ClassVar[tuple[Literal["|u1"]]] = ("|u1",) + The names of this data type in Zarr V2. + """ + dtype_cls = np.dtypes.UInt8DType _zarr_v3_name: ClassVar[Literal["uint8"]] = "uint8" _zarr_v2_names: ClassVar[tuple[Literal["|u1"]]] = ("|u1",) @@ -218,7 +431,7 @@ class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: """ - Create a Bool from a np.dtype('uint8') instance. + Create a UInt8 from a np.dtype('uint8') instance. """ if cls._check_native_dtype(dtype): return cls() @@ -227,10 +440,38 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self: Self) -> np.dtypes.UInt8DType: + """ + Create a NumPy unsigned 8-bit integer dtype instance from this UInt8 ZDType. + + Returns + ------- + np.dtypes.UInt8DType + The NumPy unsigned 8-bit integer dtype. + """ + return self.dtype_cls() @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this data type. + """ + if cls._check_json_v2(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_names[0]!r}" @@ -238,6 +479,24 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this data type. + """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" @@ -253,37 +512,80 @@ def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[Literal["|u1"], None] | Literal["uint8"]: """ - Convert the wrapped data type to a JSON-serializable form. + Convert the data type to a JSON-serializable form. Parameters ---------- zarr_format : ZarrFormat - The zarr format version. + The zarr format version. Supported values are 2 and 3. Returns ------- - str - The JSON-serializable representation of the wrapped data type + DTypeConfig_V2[Literal["|u1"], None] | Literal["uint8"] + The JSON-serializable representation of the data type. + + Raises + ------ + ValueError + If `zarr_format` is not 2 or 3. """ if zarr_format == 2: + # For Zarr format version 2, return a dictionary with the name and object codec ID. return {"name": self._zarr_v2_names[0], "object_codec_id": None} elif zarr_format == 3: + # For Zarr format version 3, return the v3 name as a string. return self._zarr_v3_name + # Raise an error if the zarr_format is neither 2 nor 3. raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @property def item_size(self) -> int: + """ + The size of the item in bytes. For an unsigned 8-bit integer, this is always 1 byte. + """ return 1 @dataclass(frozen=True, kw_only=True) class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): + """ + A Zarr data type for arrays containing 16-bit signed integers. 
This class wraps the NumPy + np.dtypes.Int16DType data type. Scalars for this data type are instances np.int16. + + Attributes + ---------- + dtype_cls : np.dtypes.Int16DType + The class of the underlying numpy dtype. + _zarr_v3_name : ClassVar[Literal["int16"]] = "int16" + The name of this data type in Zarr V3. + _zarr_v2_names : ClassVar[tuple[Literal[">i2"], Literal["i2", "i2"], Literal["i2", " Self: + """ + Create an instance of this data type from a np.dtype('int16') instance. + + Parameters + ---------- + dtype : np.dtype + The instance of np.dtype('int16') to create from. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input data type is not an instance of np.dtype('int16'). + """ if cls._check_native_dtype(dtype): return cls(endianness=get_endianness_from_numpy_dtype(dtype)) raise DataTypeValidationError( @@ -291,11 +593,37 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self) -> np.dtypes.Int16DType: + """ + Convert the data type to a np.dtype('int16') instance. + + Returns + ------- + np.dtype + The np.dtype('int16') instance. + """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this data type. + """ if cls._check_json_v2(data): # Going via numpy ensures that we get the endianness correct without # annoying string parsing. @@ -306,6 +634,24 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this data type. + """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" @@ -321,17 +667,32 @@ def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[Literal[">i2", "i2" or "i2", " int: + """ + The size of the item in bytes. For an unsigned 16-bit integer, this is always 2 bytes. + """ return 2 @dataclass(frozen=True, kw_only=True) class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): + """ + A Zarr data type for arrays containing 16-bit unsigned integers. + + This class wraps the NumPy np.dtypes.UInt16DType data type. + Scalars for this data type are instances np.uint16. + + Attributes + ---------- + dtype_cls : np.dtypes.UInt16DType + The class of the underlying numpy dtype. + _zarr_v3_name : ClassVar[Literal["uint16"]] = "uint16" + The name of this data type in Zarr V3. + _zarr_v2_names : ClassVar[tuple[Literal[">u2"], Literal["u2", "u2"], Literal["u2", " Self: + """ + Create an instance of this data type from a np.dtype('uint16') instance. + + Parameters + ---------- + dtype : np.dtype + The numpy data type. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input data type is not an instance of np.dtype('uint16'). 
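+
+        Examples
+        --------
+        An illustrative sketch (the exact repr may differ):
+
+        >>> import numpy as np
+        >>> UInt16.from_native_dtype(np.dtype(">u2"))
+        UInt16(endianness='big')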
+ """ if cls._check_native_dtype(dtype): return cls(endianness=get_endianness_from_numpy_dtype(dtype)) raise DataTypeValidationError( @@ -360,11 +758,37 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self) -> np.dtypes.UInt16DType: + """ + Convert the data type to a np.dtype('uint16') instance. + + Returns + ------- + np.dtype + The np.dtype('uint16') instance. + """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this data type. + """ if cls._check_json_v2(data): # Going via numpy ensures that we get the endianness correct without # annoying string parsing. @@ -375,6 +799,24 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this data type. + """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of UInt16. Got {data!r}, expected the string {cls._zarr_v3_name!r}" @@ -390,17 +832,32 @@ def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[Literal[">u2", "u2" or "u2", " int: + """ + The size of the item in bytes. For an unsigned 16-bit integer, this is always 2 bytes. + """ return 2 @dataclass(frozen=True, kw_only=True) class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): + """ + A Zarr data type for arrays containing 32-bit signed integers. This class wraps the NumPy + np.dtypes.Int32DType data type. Scalars for this data type are instances np.int32. + + Attributes + ---------- + dtype_cls : np.dtypes.Int32DType + The class of the underlying numpy dtype. + _zarr_v3_name : ClassVar[Literal["int32"]] = "int32" + The name of this data type in Zarr V3. + _zarr_v2_names : ClassVar[tuple[Literal[">i4"], Literal["i4", "i4"], Literal["i4", " Self: + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create an Int32 from a np.dtype('int32') instance. + + Parameters + ---------- + dtype : TBaseDType + The np.dtype('int32') instance. + + Returns + ------- + Int32 + The Int32 instance. + """ if cls._check_native_dtype(dtype): return cls(endianness=get_endianness_from_numpy_dtype(dtype)) raise DataTypeValidationError( f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" ) - def to_native_dtype(self) -> np.dtypes.Int32DType: + def to_native_dtype(self: Self) -> np.dtypes.Int32DType: + """ + Convert the Int32 instance to a np.dtype('int32') instance. + + Returns + ------- + np.dtypes.Int32DType + The np.dtype('int32') instance. + """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an Int32 from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Int32 + The Int32 instance. 
+ + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of an Int32. + """ if cls._check_json_v2(data): # Going via numpy ensures that we get the endianness correct without # annoying string parsing. name = data["name"] return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names!r}." raise DataTypeValidationError(msg) @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an Int32 from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Int32 + The Int32 instance. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of an Int32. + """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" @@ -459,7 +990,7 @@ def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[Literal[">i4", " int: + """ + The size of the item in bytes. For an signed 32-bit integer, this is always 4 bytes. + """ return 4 @dataclass(frozen=True, kw_only=True) class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): + """ + A Zarr data type for arrays containing 32-bit unsigned integers. + + This class wraps the NumPy np.dtypes.UInt32DType data type. Scalars for this data type are + instances np.uint32. + + Attributes + ---------- + dtype_cls : np.dtypes.UInt32DType + The class of the underlying numpy dtype. + _zarr_v3_name : ClassVar[Literal["uint32"]] = "uint32" + The name of this data type in Zarr V3. + _zarr_v2_names : ClassVar[tuple[Literal[">u4"], Literal["u4", "u4"], Literal["u4", " Self: + """ + Create a UInt32 from a np.dtype('uint32') instance. + + Parameters + ---------- + dtype : TBaseDType + The NumPy data type. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input data type is not a valid representation of a 32-bit unsigned + integer. + """ if cls._check_native_dtype(dtype): return cls(endianness=get_endianness_from_numpy_dtype(dtype)) raise DataTypeValidationError( @@ -498,11 +1067,38 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self) -> np.dtypes.UInt32DType: + """ + Create a NumPy unsigned 32-bit integer dtype instance from this UInt32 ZDType. + + Returns + ------- + np.dtypes.UInt32DType + The NumPy unsigned 32-bit integer dtype. + """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of a 32-bit unsigned + integer. + """ if cls._check_json_v2(data): # Going via numpy ensures that we get the endianness correct without # annoying string parsing. @@ -513,6 +1109,25 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V3-flavored JSON. 
+ + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of a 32-bit unsigned + integer. + """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" @@ -526,7 +1141,7 @@ def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[Literal[">u4", " int: + """ + The size of the item in bytes. For an unsigned 32-bit integer, this is always 4 bytes. + """ return 4 @dataclass(frozen=True, kw_only=True) class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): + """ + A Zarr data type for arrays containing 64-bit signed integers. + + This class wraps the NumPy np.dtypes.Int64DType data type. Scalars for this data type are + instances np.int64. + + Attributes + ---------- + dtype_cls : np.dtypes.Int64DType + The class of the underlying numpy dtype. + _zarr_v3_name : ClassVar[Literal["int64"]] = "int64" + The name of this data type in Zarr V3. + _zarr_v2_names : ClassVar[tuple[Literal[">i8"], Literal["i8", "i8"], Literal["i8", " Self: + """ + Create an Int64 from a np.dtype('int64') instance. + + Parameters + ---------- + dtype : TBaseDType + The NumPy data type. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input data type is not a valid representation of a 64-bit signed + integer. + """ if cls._check_native_dtype(dtype): return cls(endianness=get_endianness_from_numpy_dtype(dtype)) raise DataTypeValidationError( @@ -565,11 +1218,38 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self) -> np.dtypes.Int64DType: + """ + Create a NumPy signed 64-bit integer dtype instance from this Int64 ZDType. + + Returns + ------- + np.dtypes.Int64DType + The NumPy signed 64-bit integer dtype. + """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of a 64-bit signed + integer. + """ if cls._check_json_v2(data): # Going via numpy ensures that we get the endianness correct without # annoying string parsing. @@ -580,6 +1260,25 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of a 64-bit signed + integer. + """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" @@ -593,7 +1292,7 @@ def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[Literal[">i8", " int: + """ + The size of the item in bytes. For a signed 64-bit integer, this is always 8 bytes. 
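+
+        Examples
+        --------
+        A minimal usage sketch, assuming the ``Int64`` class exported from ``zarr.dtype``:
+
+        >>> from zarr.dtype import Int64
+        >>> Int64().item_size
+        8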
+ """ return 8 @dataclass(frozen=True, kw_only=True) class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): + """ + A Zarr data type for arrays containing 64-bit unsigned integers. + + This data type wraps the NumPy np.dtypes.UInt64DType data type. Scalars for this + data type are instances of np.uint64. + + + Attributes + ---------- + dtype_cls: np.dtypes.UInt64DType + The class of the underlying numpy dtype. + _zarr_v3_name: ClassVar[Literal["uint64"]] = "uint64" + The name of this data type in Zarr V3. + _zarr_v2_names: ClassVar[tuple[Literal[">u8"], Literal["u8", "u8"], Literal["u8", " np.dtypes.UInt64DType: + """ + Convert the data type to a native numpy dtype. + + Returns + ------- + np.dtypes.UInt64DType + The native numpy dtype. + """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of an unsigned 64-bit + integer. + """ if cls._check_json_v2(data): # Going via numpy ensures that we get the endianness correct without # annoying string parsing. @@ -639,6 +1385,25 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of an unsigned 64-bit + integer. + """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" @@ -646,7 +1411,6 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: @overload # type: ignore[override] def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">u8", " Literal["uint64"]: ... @@ -654,7 +1418,7 @@ def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[Literal[">u8", " Self: + """ + Create an instance of this data type from a native numpy dtype. + + Parameters + ---------- + dtype : TBaseDType + The native numpy dtype. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input dtype is not a valid representation of an unsigned 64-bit + integer. + """ if cls._check_native_dtype(dtype): return cls(endianness=get_endianness_from_numpy_dtype(dtype)) raise DataTypeValidationError( @@ -683,4 +1466,7 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: @property def item_size(self) -> int: + """ + The size of the item in bytes. For an unsigned 64-bit integer, this is always 8 bytes. + """ return 8 diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index 4a1114617a..3399eb3ae4 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -59,12 +59,40 @@ class LengthBytesConfig(TypedDict): class FixedLengthUTF32( ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength, HasItemSize ): + """ + A Zarr data type for arrays containing fixed-length UTF-32 strings. + + This class wraps the NumPy np.dtypes.StrDType data type. 
Scalars for this data type are instances of np.str_. + + Attributes + ---------- + dtype_cls : Type[np.dtypes.StrDType] + The numpy dtype class for this data type. + _zarr_v3_name : ClassVar[Literal["fixed_length_utf32"]] + The name of this data type in Zarr V3. + code_point_bytes : ClassVar[int] = 4 + The number of bytes per code point in UTF-32. + """ + dtype_cls = np.dtypes.StrDType _zarr_v3_name: ClassVar[Literal["fixed_length_utf32"]] = "fixed_length_utf32" code_point_bytes: ClassVar[int] = 4 # utf32 is 4 bytes per code point @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create a FixedLengthUTF32 from a NumPy data type. + + Parameters + ---------- + dtype : TBaseDType + The NumPy data type. + + Returns + ------- + Self + An instance of this data type. + """ if cls._check_native_dtype(dtype): endianness = get_endianness_from_numpy_dtype(dtype) return cls( @@ -76,6 +104,14 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self) -> np.dtypes.StrDType[int]: + """ + Convert the FixedLengthUTF32 instance to a NumPy data type. + + Returns + ------- + np.dtypes.StrDType[int] + The NumPy data type. + """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls(self.length).newbyteorder(byte_order) @@ -83,6 +119,16 @@ def to_native_dtype(self) -> np.dtypes.StrDType[int]: def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: """ Check that the input is a valid JSON representation of a numpy U dtype. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + TypeGuard[DTypeConfig_V2[str, None]] + Whether the input is a valid JSON representation of a numpy U dtype. """ return ( check_dtype_spec_v2(data) @@ -93,6 +139,19 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSONV3]: + """ + Check that the input is a valid JSON representation of a numpy U dtype. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + TypeGuard[FixedLengthUTF32JSONV3] + Whether the input is a valid JSON representation of a numpy U dtype. + """ return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} @@ -112,6 +171,19 @@ def to_json(self, zarr_format: Literal[3]) -> FixedLengthUTF32JSONV3: ... def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[str, None] | FixedLengthUTF32JSONV3: + """ + Convert the FixedLengthUTF32 instance to a JSON representation. + + Parameters + ---------- + zarr_format : ZarrFormat + The Zarr format to use. + + Returns + ------- + DTypeConfig_V2[str, None] | FixedLengthUTF32JSONV3 + The JSON representation of the data type. + """ if zarr_format == 2: return {"name": self.to_native_dtype().str, "object_codec_id": None} elif zarr_format == 3: @@ -124,6 +196,19 @@ def to_json( @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create a FixedLengthUTF32 from a JSON representation of a numpy U dtype. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + """ if cls._check_json_v2(data): # Construct the numpy dtype instead of string parsing. name = data["name"] @@ -134,27 +219,104 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create a FixedLengthUTF32 from a JSON representation of a numpy U dtype. 
+ + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + """ if cls._check_json_v3(data): return cls(length=data["configuration"]["length_bytes"] // cls.code_point_bytes) msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." raise DataTypeValidationError(msg) def default_scalar(self) -> np.str_: + """ + Return the default scalar value for this data type. + + Returns + ------- + np.str_ + The default scalar value. + """ return np.str_("") def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + """ + Convert the scalar value to a JSON representation. + + Parameters + ---------- + data : object + The scalar value. + zarr_format : ZarrFormat + The Zarr format to use. + + Returns + ------- + str + The JSON representation of the scalar value. + """ return str(data) def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: + """ + Convert the JSON representation of a scalar value to the native scalar value. + + Parameters + ---------- + data : JSON + The JSON data. + zarr_format : ZarrFormat + The Zarr format to use. + + Returns + ------- + np.str_ + The native scalar value. + """ if check_json_str(data): return self.to_native_dtype().type(data) raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover def _check_scalar(self, data: object) -> TypeGuard[str | np.str_ | bytes | int]: + """ + Check that the input is a valid scalar value for this data type. + + Parameters + ---------- + data : object + The scalar value. + + Returns + ------- + TypeGuard[str | np.str_ | bytes | int] + Whether the input is a valid scalar value for this data type. + """ # this is generous for backwards compatibility return isinstance(data, str | np.str_ | bytes | int) def cast_scalar(self, data: object) -> np.str_: + """ + Cast the scalar value to the native scalar value. + + Parameters + ---------- + data : object + The scalar value. + + Returns + ------- + np.str_ + The native scalar value. + """ if self._check_scalar(data): # We explicitly truncate before casting because of the following numpy behavior: # >>> x = np.dtype('U3').type('hello world') @@ -173,6 +335,14 @@ def cast_scalar(self, data: object) -> np.str_: @property def item_size(self) -> int: + """ + Return the size of each item in the data type. + + Returns + ------- + int + The size of each item in the data type. + """ return self.length * self.code_point_bytes @@ -193,6 +363,18 @@ class UTF8Base(ZDType[TDType_co, str], HasObjectCodec): """ A base class for the variable length UTF-8 string data type. This class should not be used as data type, but as a base class for other variable length string data types. + + This class is a generic implementation of a variable length UTF-8 string data type. It is + intended to be used as a base class for other variable length string data types. + + Attributes + ---------- + dtype_cls : TDType_co + The class of the underlying NumPy dtype. + _zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" + The name of this data type in Zarr V3. + object_codec_id : ClassVar[Literal["vlen-utf8"]] = "variable_length_utf8" + The object codec ID for this data type. 
""" _zarr_v3_name: ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" @@ -200,6 +382,19 @@ class UTF8Base(ZDType[TDType_co, str], HasObjectCodec): @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create an instance of this class from a numpy dtype. + + Parameters + ---------- + dtype : numpy.dtype + The numpy dtype to create an instance from. + + Returns + ------- + Self + An instance of this class. + """ if cls._check_native_dtype(dtype): return cls() raise DataTypeValidationError( @@ -214,6 +409,17 @@ def _check_json_v2( """ Check that the input is a valid JSON representation of a numpy O dtype, and that the object codec id is appropriate for variable-length UTF-8 strings. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + TypeGuard[DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]]] + Whether the input is a valid JSON representation of a numpy O dtype, and that the + object codec id is appropriate for variable-length UTF-8 strings. """ return ( check_dtype_spec_v2(data) @@ -223,10 +429,38 @@ def _check_json_v2( @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["variable_length_utf8"]]: + """ + Check that the input is a valid JSON representation of a variable length UTF-8 string + data type. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + TypeGuard[Literal["variable_length_utf8"]] + Whether the input is a valid JSON representation of a variable length UTF-8 string + data type. + """ return data == cls._zarr_v3_name @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this class from a JSON representation of a numpy O dtype. + + Parameters + ---------- + data : DTypeJSON + The JSON data to create an instance from. + + Returns + ------- + Self + An instance of this class. + """ if cls._check_json_v2(data): return cls() msg = ( @@ -236,6 +470,20 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this class from a JSON representation of a variable length UTF-8 + string data type. + + Parameters + ---------- + data : DTypeJSON + The JSON data to create an instance from. + + Returns + ------- + Self + An instance of this class. + """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." @@ -251,6 +499,19 @@ def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_utf8"]: . def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]] | Literal["variable_length_utf8"]: + """ + Convert this data type to a JSON representation. + + Parameters + ---------- + zarr_format : int + The zarr format to use for the JSON representation. + + Returns + ------- + DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]] | Literal["variable_length_utf8"] + The JSON representation of this data type. + """ if zarr_format == 2: return {"name": "|O", "object_codec_id": self.object_codec_id} elif zarr_format == 3: @@ -259,25 +520,102 @@ def to_json( raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_scalar(self) -> str: + """ + Return the default scalar value for this data type. + + Returns + ------- + str + The default scalar value. 
+ """ return "" def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + """ + Convert a scalar value to a JSON representation. + + Parameters + ---------- + data : object + The scalar value to convert. + zarr_format : int + The zarr format to use for the JSON representation. + + Returns + ------- + str + The JSON representation of the scalar value. + """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) raise TypeError(f"Invalid type: {data}. Expected a string.") def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + """ + Convert a JSON representation of a scalar value to the native scalar type. + + Parameters + ---------- + data : JSON + The JSON representation of the scalar value. + zarr_format : int + The zarr format to use for the JSON representation. + + Returns + ------- + str + The native scalar type of the scalar value. + """ if not check_vlen_string_json_scalar(data): raise TypeError(f"Invalid type: {data}. Expected a string or number.") return str(data) def _check_scalar(self, data: object) -> TypeGuard[SupportsStr]: + """ + Check that the input is a valid scalar value for this data type. + + Parameters + ---------- + data : object + The scalar value to check. + + Returns + ------- + TypeGuard[SupportsStr] + Whether the input is a valid scalar value for this data type. + """ return isinstance(data, SupportsStr) def _cast_scalar_unchecked(self, data: SupportsStr) -> str: + """ + Cast a scalar value to a string. + + Parameters + ---------- + data : object + The scalar value to cast. + + Returns + ------- + str + The string representation of the scalar value. + """ return str(data) def cast_scalar(self, data: object) -> str: + """ + Cast a scalar value to the native scalar type. + + Parameters + ---------- + data : object + The scalar value to cast. + + Returns + ------- + str + The native scalar type of the scalar value. + """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) raise TypeError(f"Cannot convert object with type {type(data)} to a python string.") @@ -287,16 +625,62 @@ def cast_scalar(self, data: object) -> str: @dataclass(frozen=True, kw_only=True) class VariableLengthUTF8(UTF8Base[np.dtypes.StringDType]): # type: ignore[type-var] + """ + A Zarr data type for arrays containing variable-length UTF-8 strings. This class wraps the + NumPy np.dtypes.StringDType data type. Scalars for this data type are python strings. + + + Attributes + ---------- + dtype_cls : Type[np.dtypes.StringDType] + The numpy dtype class for this data type. + _zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" + The name of this data type in Zarr V3. + object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8" + The object codec ID for this data type. + """ + dtype_cls = np.dtypes.StringDType def to_native_dtype(self) -> np.dtypes.StringDType: + """ + Create a NumPy string dtype from this VariableLengthUTF8 ZDType. + + Returns + ------- + np.dtypes.StringDType + The NumPy string dtype. + """ return self.dtype_cls() else: # Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead. @dataclass(frozen=True, kw_only=True) class VariableLengthUTF8(UTF8Base[np.dtypes.ObjectDType]): # type: ignore[no-redef] + """ + A Zarr data type for arrays containing variable-length UTF-8 strings. This class wraps the + NumPy np.dtypes.ObjecDType data type. Scalars for this data type are python strings. 
+ + + Attributes + ---------- + dtype_cls : Type[np.dtypes.ObjectDType] + The numpy dtype class for this data type. + _zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" + The name of this data type in Zarr V3. + object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8" + The object codec ID for this data type. + """ + dtype_cls = np.dtypes.ObjectDType def to_native_dtype(self) -> np.dtypes.ObjectDType: + """ + Create a NumPy object dtype from this VariableLengthUTF8 ZDType. + + Returns + ------- + np.dtypes.ObjectDType + The NumPy object dtype. + """ return self.dtype_cls() diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 1f9080475c..62efdf4106 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -111,6 +111,26 @@ class TimeConfig(TypedDict): @dataclass(frozen=True, kw_only=True, slots=True) class TimeDTypeBase(ZDType[BaseTimeDType_co, BaseTimeScalar_co], HasEndianness, HasItemSize): + """ + A base class for data types that represent time via the NumPy TimeDelta64 and DateTime64 data + types. + + Attributes + ---------- + scale_factor : int + The scale factor for the time unit. + unit : str + The unit of time. + item_size : int + The size of one item in bytes. + _zarr_v2_names : tuple + The names of this data type in Zarr V2. + _zarr_v3_name : str + The name of this data type in Zarr V3. + _numpy_name : str + The name of this data type in NumPy. + """ + _zarr_v2_names: ClassVar[tuple[str, ...]] # this attribute exists so that we can programmatically create a numpy dtype instance # because the particular numpy dtype we are wrapping does not allow direct construction via @@ -129,6 +149,26 @@ def __post_init__(self) -> None: @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create an instance of this class from a native NumPy data type. + + Parameters + ---------- + dtype : TBaseDType + The native NumPy dtype to convert. + + Returns + ------- + Self + An instance of this class configured with the unit, scale factor, and endianness + derived from the provided dtype. + + Raises + ------ + DataTypeValidationError + If the dtype is not a valid representation of a NumPy temporal data type. + """ + if cls._check_native_dtype(dtype): unit, scale_factor = np.datetime_data(dtype.name) unit = cast("DateTimeUnit", unit) @@ -145,6 +185,17 @@ def to_native_dtype(self) -> BaseTimeDType_co: # Numpy does not allow creating datetime64 or timedelta64 via # np.dtypes.{dtype_name}() # so we use np.dtype with a formatted string. + """ + Convert this data type to a NumPy temporal data type with the appropriate + unit and scale factor. + + Returns + ------- + BaseTimeDType_co + A NumPy data type object representing the time data type with + the specified unit, scale factor, and byte order. + """ + dtype_string = f"{self._numpy_name}[{self.scale_factor}{self.unit}]" return np.dtype(dtype_string).newbyteorder(endianness_to_numpy_str(self.endianness)) # type: ignore[return-value] @@ -156,6 +207,24 @@ def to_json(self, zarr_format: Literal[3]) -> DateTime64JSONV3 | TimeDelta64JSON def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[str, None] | DateTime64JSONV3 | TimeDelta64JSONV3: + """ + Serialize this data type to JSON. + + Parameters + ---------- + zarr_format : ZarrFormat + The Zarr format version (2 or 3). + + Returns + ------- + DTypeConfig_V2[str, None] | DateTime64JSONV3 | TimeDelta64JSONV3 + The JSON representation of the data type. 
+ + Raises + ------ + ValueError + If the zarr_format is not 2 or 3. + """ if zarr_format == 2: name = self.to_native_dtype().str return {"name": name, "object_codec_id": None} @@ -170,20 +239,54 @@ def to_json( raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: + """ + Convert a python object to a JSON representation of a datetime64 or timedelta64 scalar. + + Parameters + ---------- + data : object + The python object to convert. + zarr_format : ZarrFormat + The Zarr format version (2 or 3). + + Returns + ------- + int + The JSON representation of the scalar. + """ return datetimelike_to_int(data) # type: ignore[arg-type] @property def item_size(self) -> int: + """ + The size of each item in the data type, in bytes. + + Returns + ------- + int + The size of each item in the data type, in bytes. + """ return 8 @dataclass(frozen=True, kw_only=True, slots=True) class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], HasEndianness): """ - A wrapper for the ``TimeDelta64`` data type defined in numpy. - Scalars of this type can be created by performing arithmetic with ``DateTime64`` scalars. - Like ``DateTime64``, ``TimeDelta64`` is parametrized by a unit, but unlike ``DateTime64``, the - unit for ``TimeDelta64`` is optional. + A Zarr data type for arrays containing NumPy TimeDelta64 data. + + This class wraps the NumPy ``np.dtypesTimeDelta64DType`` data type. Scalars for this data type + are instances of ``np.timedelta64``. + + Attributes + ---------- + dtype_cls : Type[np.dtypes.TimeDelta64DType] + The NumPy dtype class for this data type. + _zarr_v3_name : ClassVar[Literal["numpy.timedelta64"]] + The name of this data type in Zarr V3. + _zarr_v2_names : tuple + The names of this data type in Zarr V2. + _numpy_name : ClassVar[Literal["timedelta64"]] = "timedelta64" + The literate NumPy name of this data type. """ # mypy infers the type of np.dtypes.TimeDelta64DType to be @@ -191,12 +294,28 @@ class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], Has dtype_cls = np.dtypes.TimeDelta64DType # type: ignore[assignment] _zarr_v3_name: ClassVar[Literal["numpy.timedelta64"]] = "numpy.timedelta64" _zarr_v2_names = (">m8", " TypeGuard[DTypeConfig_V2[str, None]]: + """ + Validate that the provided JSON input accurately represents a NumPy timedelta64 data type, + which could be in the form of strings like "m8[10s]". This method serves as a type + guard, helping to refine the type of unknown JSON input by confirming its adherence to the + expected format for NumPy timedelta64 data types. + + The JSON input should contain a "name" key with a value that matches the expected string + pattern for NumPy timedelta64 data types. The pattern includes an optional unit enclosed + within square brackets, following the base type identifier. + + Returns + ------- + bool + True if the JSON input is a valid representation of a NumPy timedelta64 data type, + otherwise False. + """ if not check_dtype_spec_v2(data): return False name = data["name"] @@ -215,6 +334,26 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSONV3]: + """ + Check that JSON input is a dict with a 'name' key with the value 'numpy.timedelta64', and a + 'configuration' key with a value of a dict with a 'unit' key and a 'scale_factor' key. 
The + 'unit' key should map to a string describing the unit of time, and the 'scale_factor' key + should map to an integer describing the scale factor. + + For example, the following is a valid JSON representation of a TimeDelta64 in Zarr V3: + + .. code-block:: json + + { + "name": "numpy.timedelta64", + "configuration": { + "unit": "generic", + "scale_factor": 1 + } + } + + This function can be used as a type guard to narrow the type of unknown JSON input. + """ return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} @@ -225,6 +364,24 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSONV3]: @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create a TimeDelta64 from a Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + TimeDelta64 + An instance of TimeDelta64. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of a TimeDelta64. + """ if cls._check_json_v2(data): name = data["name"] return cls.from_native_dtype(np.dtype(name)) @@ -236,6 +393,26 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create a TimeDelta64 from a Zarr V3-flavored JSON. + + The JSON representation of a TimeDelta64 in Zarr V3 is a dict with a 'name' key + with the value 'numpy.timedelta64', and a 'configuration' key with a value of a dict + with a 'unit' key and a 'scale_factor' key. + + For example: + + .. code-block:: json + + { + "name": "numpy.timedelta64", + "configuration": { + "unit": "generic", + "scale_factor": 1 + } + } + + """ if cls._check_json_v3(data): unit = data["configuration"]["unit"] scale_factor = data["configuration"]["scale_factor"] @@ -249,23 +426,83 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: raise DataTypeValidationError(msg) def _check_scalar(self, data: object) -> TypeGuard[TimeDeltaLike]: + """ + Check if the input is a scalar of this data type. + + Parameters + ---------- + data : object + The object to check. + + Returns + ------- + TypeGuard[TimeDeltaLike] + True if the input is a scalar of this data type, False otherwise. + """ if data is None: return True return isinstance(data, str | int | bytes | np.timedelta64 | timedelta) def _cast_scalar_unchecked(self, data: TimeDeltaLike) -> np.timedelta64: + """ + Cast the provided scalar input to a numpy timedelta64 without any type checking. + + This method assumes that the input data is already a valid scalar of this data type, + and does not perform any validation or type checks. It directly casts the input + to a numpy timedelta64 scalar using the unit and scale factor defined in the class. + + Parameters + ---------- + data : TimeDeltaLike + The scalar input data to cast. + + Returns + ------- + np.timedelta64 + The input data cast as a numpy timedelta64 scalar. + """ return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") def cast_scalar(self, data: object) -> np.timedelta64: + """ + Cast the input to a numpy timedelta64 scalar. If the input is not a scalar of this data type, + raise a TypeError. + """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) msg = f"Cannot convert object with type {type(data)} to a numpy timedelta64 scalar." raise TypeError(msg) def default_scalar(self) -> np.timedelta64: + """ + Return a default scalar of this data type. 
+ + This method provides a default value for the timedelta64 scalar, which is + a 'Not-a-Time' (NaT) value. + """ return np.timedelta64("NaT") def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: + """ + Create a scalar of this data type from JSON input. + + Parameters + ---------- + data : JSON + The JSON representation of the scalar value. + zarr_format : int + The zarr format to use for the JSON representation. + + Returns + ------- + np.timedelta64 + The scalar value of this data type. + + Raises + ------ + TypeError + If the input JSON is not a valid representation of a scalar of this data type. + """ if check_json_time(data): return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover @@ -273,19 +510,46 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedel @dataclass(frozen=True, kw_only=True, slots=True) class DateTime64(TimeDTypeBase[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): + """ + A Zarr data type for arrays containing NumPy Datetime64 data. + + This class wraps the NumPy ``np.dtypesTimeDelta64DType`` data type. Scalars for this data type + are instances of ``np.datetime64``. + + Attributes + ---------- + dtype_cls : Type[np.dtypesTimeDelta64DType] + The numpy dtype class for this data type. + _zarr_v3_name : ClassVar[Literal["numpy.timedelta64"]] + The name of this data type in Zarr V3. + _zarr_v2_names : ClassVar[tuple[Literal[">m8"], Literal["M8", " TypeGuard[DTypeConfig_V2[str, None]]: """ - Check that JSON input is a string representation of a NumPy datetime64 data type, like "M8[10s]". This function can be used as a type guard to narrow the type of unknown JSON - input. + Check that the input is a valid JSON representation of this data type. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + TypeGuard[DTypeConfig_V2[str, None]] + True if the input is a valid JSON representation of a NumPy datetime64 data type, + otherwise False. """ if not check_dtype_spec_v2(data): return False @@ -303,6 +567,30 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSONV3]: + """ + Check that the input is a valid JSON representation of this data type. + + The input must be a dictionary with the following structure: + + { + "name": "numpy.datetime64", + "configuration": { + "unit": , + "scale_factor": + } + } + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + TypeGuard[DateTime64JSONV3] + True if the input is a valid JSON representation of a numpy datetime64 data type in Zarr V3, False otherwise. + """ + return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} @@ -313,6 +601,29 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSONV3]: @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from a Zarr V2-flavored JSON representation. + + This method checks if the provided JSON data is a valid representation of the data type. + If valid, it creates an instance using the native NumPy dtype. Otherwise, it raises a + DataTypeValidationError. + + Parameters + ---------- + data : DTypeJSON + The JSON data to parse. + + Returns + ------- + Self + An instance of this data type. 
+ + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this data type. + """ + if cls._check_json_v2(data): name = data["name"] return cls.from_native_dtype(np.dtype(name)) @@ -324,6 +635,28 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from a Zarr V3-flavored JSON representation. + + This method checks if the provided JSON data is a valid representation of the data type. + If valid, it creates an instance using the native NumPy dtype. Otherwise, it raises a + DataTypeValidationError. + + Parameters + ---------- + data : DTypeJSON + The JSON data to parse. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this data type. + """ if cls._check_json_v3(data): unit = data["configuration"]["unit"] scale_factor = data["configuration"]["scale_factor"] @@ -337,23 +670,97 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: raise DataTypeValidationError(msg) def _check_scalar(self, data: object) -> TypeGuard[DateTimeLike]: + """ + Check if the input is a scalar of this data type. + + Parameters + ---------- + data : object + The object to check. + + Returns + ------- + TypeGuard[DateTimeLike] + True if the input is a scalar of this data type, False otherwise. + """ if data is None: return True return isinstance(data, str | int | bytes | np.datetime64 | datetime) def _cast_scalar_unchecked(self, data: DateTimeLike) -> np.datetime64: + """ + Cast the provided scalar data to np.datetime64 without checking. + + This method does not perform any type checking. + The input data must be a scalar of this data type. + + Parameters + ---------- + data : DateTimeLike + The scalar data to cast. + + Returns + ------- + np.datetime64 + The casted data as a numpy datetime scalar. + """ return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") def cast_scalar(self, data: object) -> np.datetime64: + """ + Cast a scalar value to a numpy datetime scalar. + + Parameters + ---------- + data : object + The scalar value to cast. + + Returns + ------- + np.datetime64 + The data cast as a numpy datetime scalar. + + Raises + ------ + TypeError + If the data cannot be converted to a numpy datetime scalar. + """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) msg = f"Cannot convert object with type {type(data)} to a numpy datetime scalar." raise TypeError(msg) def default_scalar(self) -> np.datetime64: + """ + Return a default scalar of this data type. + + This method provides a default value for the datetime64 scalar, which is + a 'Not-a-Time' (NaT) value. + """ + return np.datetime64("NaT") def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: + """ + Read a JSON-serializable value as a numpy datetime scalar. + + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + np.datetime64 + The numpy datetime scalar. + + Raises + ------ + TypeError + If the input is not a valid integer type. + """ if check_json_time(data): return self._cast_scalar_unchecked(data) raise TypeError(f"Invalid type: {data}. 
Expected an integer.") # pragma: no cover diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 4c8ced21f4..bad710ed43 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -2181,7 +2181,7 @@ def create_hierarchy( group already exists at path ``a``, then this function will leave the group at ``a`` as-is. Yields - ------- + ------ tuple[str, Array | Group]. Examples diff --git a/src/zarr/dtype.py b/src/zarr/dtype.py index 6e3789543b..1e6f322264 100644 --- a/src/zarr/dtype.py +++ b/src/zarr/dtype.py @@ -1,3 +1,59 @@ -from zarr.core.dtype import ZDType, data_type_registry +from zarr.core.dtype import ( + Bool, + Complex64, + Complex128, + DataTypeValidationError, + DateTime64, + FixedLengthUTF32, + Float16, + Float32, + Float64, + Int8, + Int16, + Int32, + Int64, + NullTerminatedBytes, + RawBytes, + Structured, + TimeDelta64, + UInt8, + UInt16, + UInt32, + UInt64, + VariableLengthBytes, + VariableLengthUTF8, + ZDType, + data_type_registry, + parse_data_type, +) -__all__ = ["ZDType", "data_type_registry"] +__all__ = [ + "Bool", + "Complex64", + "Complex128", + "DataTypeValidationError", + "DateTime64", + "FixedLengthUTF32", + "Float16", + "Float32", + "Float64", + "Int8", + "Int16", + "Int32", + "Int64", + "NullTerminatedBytes", + "RawBytes", + "Structured", + "TimeDelta64", + "TimeDelta64", + "UInt8", + "UInt16", + "UInt32", + "UInt64", + "VariableLengthBytes", + "VariableLengthUTF8", + "ZDType", + "data_type_registry", + "data_type_registry", + "parse_data_type", +] From 2e96ecaf643a0cf469f02a53d26ca912958dbdab Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 23 Jun 2025 14:12:47 +0200 Subject: [PATCH 08/42] changelog --- changes/3157.doc.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 changes/3157.doc.rst diff --git a/changes/3157.doc.rst b/changes/3157.doc.rst new file mode 100644 index 0000000000..6132b195ec --- /dev/null +++ b/changes/3157.doc.rst @@ -0,0 +1,2 @@ +Add a self-contained example of data type extension to the ``examples`` directory, and expanded +the documentation for data types. \ No newline at end of file From b9a510a8cad496457d9ba33e292e65b109069ec3 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Tue, 24 Jun 2025 10:15:59 +0200 Subject: [PATCH 09/42] docstring style --- src/zarr/core/dtype/npy/int.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index ee84d3537c..d71e938511 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -411,8 +411,9 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): """ - A Zarr data type for arrays containing 8-bit unsigned integers. Wraps the NumPy - np.dtypes.UInt8DType data type. Scalars for this data type are instances np.uint8. + A Zarr data type for arrays containing 8-bit unsigned integers. + + Wraps the NumPy ``np.dtypes.UInt8DType`` data type. Scalars for this data type are instances of ``np.uint8``. 
Attributes ---------- From e34d18e7c10e730e7f21ead7787f4ed9b8ecc243 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 11:00:58 +0200 Subject: [PATCH 10/42] add docstrings and polish interfaces --- docs/user-guide/data_types.rst | 215 ++++++++++++------------- src/zarr/core/dtype/common.py | 3 + src/zarr/core/dtype/npy/bool.py | 9 +- src/zarr/core/dtype/npy/bytes.py | 74 ++++----- src/zarr/core/dtype/npy/complex.py | 26 ++- src/zarr/core/dtype/npy/float.py | 93 +++++++---- src/zarr/core/dtype/npy/int.py | 212 +++++++++++++++--------- src/zarr/core/dtype/npy/string.py | 110 +++++++------ src/zarr/core/dtype/npy/structured.py | 221 +++++++++++++++++++++++++- src/zarr/core/dtype/npy/time.py | 17 +- src/zarr/core/dtype/registry.py | 130 ++++++++++++++- src/zarr/registry.py | 4 +- tests/test_dtype_registry.py | 2 +- 13 files changed, 781 insertions(+), 335 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index eed121a313..2ea31bf860 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -1,31 +1,31 @@ -Array Data types +Array Data Types ================ -Zarr's data type model +Zarr's Data Type Model ---------------------- -Zarr is designed for interperability with NumPy, so if you are familiar with NumPy or any other -N-dimensional array library then Zarr's model for array data types should seem familiar. But Zarr -data types have extra constraints that are important to be aware of. +Zarr is designed for interoperability with NumPy, so if you are familiar with NumPy or any other +N-dimensional array library, Zarr's model for array data types should seem familiar. However, Zarr +data types have some unique features that are described in this document. -Zarr arrays operate under an essential design constraint: Unlike NumPy arrays, Zarr arrays -are designed to stored and accessed by other Zarr implementations. This means that, among other things, +Zarr arrays operate under an essential design constraint: unlike NumPy arrays, Zarr arrays +are designed to be stored and accessed by other Zarr implementations. This means that, among other things, Zarr data types must be serializable to metadata documents in accordance with the Zarr specifications, -and this adds some unique texture to the Zarr data type model. +which adds some unique aspects to the Zarr data type model. -The following sections will explain Zarr's data type model in greater detail, and demonstrate the +The following sections explain Zarr's data type model in greater detail and demonstrate the Zarr Python APIs for working with Zarr data types. -Array data types +Array Data Types ^^^^^^^^^^^^^^^^ Every Zarr array has a data type, which defines the meaning of the array's elements. An array's data -type is encoded in the metadata for the array, which means that a declaration of the -data type of an array must be `JSON`-serializable. +type is encoded in the JSON metadata for the array. This means that the data type of an array must be +JSON-serializable. In Zarr V2, the data type of an array is stored in the ``dtype`` field in array metadata. -Zarr V3 changed the name of this field to ``data_type``, and also defined new rules for the values -assignable to the ``data_type`` field. +Zarr V3 changed the name of this field to ``data_type`` and also defined new rules for the values +that can be assigned to the ``data_type`` field. For example, in Zarr V2, the boolean array data type was represented in array metadata as the string ``"|b1"``. 
In Zarr V3, the same type is represented as the string ``"bool"``. @@ -33,23 +33,22 @@ string ``"|b1"``. In Zarr V3, the same type is represented as the string ``"bool Scalars ^^^^^^^ -Zarr also defines how array elements, i.e. scalars, are encoded in array metadata. This is necessary +Zarr also specifies how array elements, i.e., scalars, are encoded in array metadata. This is necessary because Zarr uses a field in array metadata to define a default value for chunks that are not stored. This field, called ``fill_value`` in both Zarr V2 and Zarr V3 metadata documents, contains a -``JSON`` value that can be decoded to a scalar value compatible with the array's data type. +JSON value that can be decoded to a scalar value compatible with the array's data type. -For the boolean data type, the scalar encoding is simple -- booleans are natively supported by -``JSON``, so zarr saves booleans as ``JSON`` booleans. Other scalars, like floats or raw bytes, have -more elaborate encoding schemes, and in some cases this scheme depends on the Zarr format version. +For the boolean data type, the scalar encoding is simple—booleans are natively supported by +JSON, so Zarr saves booleans as JSON booleans. Other scalars, like floats or raw bytes, have +more elaborate encoding schemes, and in some cases, this scheme depends on the Zarr format version. - -Data types in Zarr version 2 ------------------------------ +Data Types in Zarr Version 2 +---------------------------- Version 2 of the Zarr format defined its data types relative to `NumPy's data types `_, -and added a few non-NumPy data types as well. With one exception, the Zarr V2 JSON identifier for a data -type is just the NumPy ``str`` attribute of that data type: +and added a few non-NumPy data types as well. With one exception (`structured data types <#structured-data-type>`_), the Zarr +V2 JSON identifier for a data type is just the NumPy ``str`` attribute of that data type: .. code-block:: python @@ -70,78 +69,82 @@ type is just the NumPy ``str`` attribute of that data type: The ``<`` character in the data type metadata encodes the `endianness `_, - or "byte order", of the data type. Following NumPy's example, + or "byte order," of the data type. As per the NumPy model, in Zarr version 2 each data type has an endianness where applicable. However, Zarr version 3 data types do not store endianness information. There are two special cases to consider: `"structured" data types <#structured-data-type>`_, and `"object" <#object-data-type>` data types. -Structured data type +Structured Data Type ^^^^^^^^^^^^^^^^^^^^ -NumPy allows the construction of a so-called "structured" data type comprised of an ordered collection +NumPy allows the construction of a so-called "structured" data types comprised of ordered collections of named fields, where each field is itself a distinct NumPy data type. See the NumPy documentation -`here `_ . +`here `_. -Crucially, NumPy does not use a special data type for structured data types -- instead, NumPy -implemented structured data types as an option feature of the so-called "Void" data type, which models -arbitrary fixed-size byte strings. Since the ``str`` attribute of an unstructured NumPy void -data type is the same as the ``str`` of Numpy structured data type, Zarr V2 uses a special data type -encoding for structured data types that distinguishes the two. 
+Crucially, NumPy does not use a special data type for structured data types—instead, NumPy +implements structured data types as an optional feature of the so-called "Void" data type, which models +arbitrary fixed-size byte strings. The ``str`` attribute of a regular NumPy void +data type is the same as the ``str`` of a NumPy structured data type. This means that the ``str`` +attribute does not convey information about the fields contained in a structured data type. For +these reasons, Zarr V2 uses a special data type encoding for structured data types. They are stored +in JSON as lists of pairs, where the first element is a string, and the second element is a Zarr V2 +data type specification. This representation supports recursion. For example: .. code-block:: python >>> store = {} - >>> np_dtype = np.dtype([('field_a', '>i2'), ('field_b', '>f4')]) + >>> np_dtype = np.dtype([('field_a', '>i2'), ('field_b', [('subfield_c', '>f4'), ('subfield_d', 'i2')])]) >>> np_dtype.str '|V6' >>> z = zarr.create_array(store=store, shape=(1,), dtype=np_dtype, zarr_format=2) >>> dtype_meta = json.loads(store['.zarray'].to_bytes())["dtype"] >>> dtype_meta - [['field_a', '>i2'], ['field_b', '>f4']] + [['field_a', '>i2'], ['field_b', [['subfield_c', '>f4'], ['subfield_d', '`_. - -Data types in Zarr Python +Data Types in Zarr Python ------------------------- -The two Zarr formats that Zarr Python supports specify data types in two different ways: -data types in Zarr version 2 are encoded as NumPy-compatible strings -(or lists, in the case of structured data types), while data types in Zarr V3 are encoded as either -strings or ``JSON`` objects, and the Zarr V3 data types don't have any associated endianness -information, unlike Zarr V2 data types. +The two Zarr formats that Zarr Python supports specify data types in different ways: data types in +Zarr version 2 are encoded as NumPy-compatible strings (or lists, in the case of structured data +types), while data types in Zarr V3 are encoded as either strings or JSON objects. Zarr V3 data +types do not have any associated endianness information, unlike Zarr V2 data types. -Zarr Python needs to support both Zarr V2 and V3, which means we need to abstract over these differences +Zarr Python needs to support both Zarr V2 and V3, which means we need to abstract over these differences. We do this with an abstract Zarr data type class: `ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_, which provides Zarr V2 and Zarr V3 compatibility routines for "native" data types. In this context, a "native" data type is a Python class, typically defined in another library, that models an array's data type. For example, ``np.dtypes.UInt8DType`` is a native data type defined in NumPy. -Zarr Python wraps the NumPy ``uint8`` with a ``ZDType`` instance called `UInt8 <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_. +Zarr Python wraps the NumPy ``uint8`` with a ``ZDType`` instance called +`UInt8 <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_. -As of this writing the only native data types Zarr Python supports are NumPy data types. We could -avoid the "native data type" jargon and just say "NumPy data type", but we don't want to rule out the +As of this writing, the only native data types Zarr Python supports are NumPy data types. We could +avoid the "native data type" jargon and just say "NumPy data type," but we do not want to rule out the possibility of using non-NumPy array backends in the future. 
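+
+As a brief illustration of this wrapping relationship, the ``UInt8`` class records the native
+NumPy data type that it wraps in its ``dtype_cls`` attribute. The snippet below is a minimal
+sketch, assuming the public ``zarr.dtype`` exports described elsewhere in this document:
+
+.. code-block:: python
+
+    >>> from zarr.dtype import UInt8
+    >>> UInt8.dtype_cls
+    <class 'numpy.dtypes.UInt8DType'>
+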
-Each data type supported by Zarr Python is modeled by ``ZDType`` subclass, which provides an +Each data type supported by Zarr Python is modeled by a ``ZDType`` subclass, which provides an API for the following operations: -- Encoding / decoding a native data type -- Encoding / decoding a data type to / from Zarr V2 and Zarr V3 array metadata. -- Encoding / decoding a scalar value to / from Zarr V2 and Zarr V3 array metadata. +- Encoding and decoding a native data type +- Encoding and decoding a data type to and from Zarr V2 and Zarr V3 array metadata +- Encoding and decoding a scalar value to and from Zarr V2 and Zarr V3 array metadata - Casting a Python object to a scalar value consistent with the data type -The following section lists the data types built in to Zarr Python. +The following section lists the data types built into Zarr Python. -Boolean types +Boolean Types ^^^^^^^^^^^^^ - `Boolean <../api/zarr/dtype/index.html#zarr.dtype.Bool>`_ -Integral types +Integral Types ^^^^^^^^^^^^^^ - `Signed 8-bit integer <../api/zarr/dtype/index.html#zarr.dtype.Int8>`_ - `Signed 16-bit integer <../api/zarr/dtype/index.html#zarr.dtype.Int16>`_ @@ -209,7 +210,7 @@ Integral types - `Unsigned 32-bit integer <../api/zarr/dtype/index.html#zarr.dtype.UInt32>`_ - `Unsigned 64-bit integer <../api/zarr/dtype/index.html#zarr.dtype.UInt64>`_ -Floating-point types +Floating-Point Types ^^^^^^^^^^^^^^^^^^^^ - `16-bit floating-point <../api/zarr/dtype/index.html#zarr.dtype.Float16>`_ - `32-bit floating-point <../api/zarr/dtype/index.html#zarr.dtype.Float32>`_ @@ -217,18 +218,17 @@ Floating-point types - `64-bit complex floating-point <../api/zarr/dtype/index.html#zarr.dtype.Complex64>`_ - `128-bit complex floating-point <../api/zarr/dtype/index.html#zarr.dtype.Complex128>`_ -String types +String Types ^^^^^^^^^^^^ - `Fixed-length UTF-32 string <../api/zarr/dtype/index.html#zarr.dtype.FixedLengthUTF32>`_ - `Variable-length UTF-8 string <../api/zarr/dtype/index.html#zarr.dtype.VariableLengthUTF8>`_ -Byte string types +Byte String Types ^^^^^^^^^^^^^^^^^ - `Fixed-length null-terminated bytes <../api/zarr/dtype/index.html#zarr.dtype.NullTerminatedBytes>`_ - `Fixed-length raw bytes <../api/zarr/dtype/index.html#zarr.dtype.RawBytes>`_ - `Variable-length bytes <../api/zarr/dtype/index.html#zarr.dtype.VariableLengthBytes>`_ - Example Usage ~~~~~~~~~~~~~ @@ -240,7 +240,7 @@ Create a ``ZDType`` from a native data type: >>> import numpy as np >>> int8 = Int8.from_native_dtype(np.dtype('int8')) -Convert back to native data type: +Convert back to a native data type: .. code-block:: python @@ -254,8 +254,7 @@ Get the default scalar value for the data type: >>> default_value = int8.default_scalar() >>> assert default_value == np.int8(0) - -Serialize to JSON for Zarr V2 +Serialize to JSON for Zarr V2: .. code-block:: python @@ -266,13 +265,13 @@ Serialize to JSON for Zarr V2 .. note:: The representation returned by ``to_json`` is more abstract than the literal contents of Zarr V2 - array metadata, because the ``JSON`` representation used by the `ZDType` classes must be distinct across + array metadata, because the JSON representation used by the `ZDType` classes must be distinct across different data types. Zarr V2 identifies multiple distinct data types with the "object" data type identifier ``"|O"``, which means extra information is needed to disambiguate these data types from one another. That's the reason for the ``object_codec_id`` field you see here. 
See the `section <#object-data-type>`_ on the "object" data type for more information. -And V3: +And for V3: .. code-block:: python @@ -295,7 +294,7 @@ Deserialize a scalar value from JSON: >>> scalar_value = int8.from_json_scalar(42, zarr_format=3) >>> assert scalar_value == np.int8(42) -Adding new data types +Adding New Data Types ~~~~~~~~~~~~~~~~~~~~~ Each Zarr data type is a separate Python class that inherits from @@ -309,8 +308,7 @@ Python project directory. .. literalinclude:: ../../examples/custom_dtype.py :language: python - -Data type resolution +Data Type Resolution ~~~~~~~~~~~~~~~~~~~~ Although Zarr Python uses a different data type model from NumPy, you can still define a Zarr array @@ -347,14 +345,14 @@ But if we inspect the metadata for the array, we can see the Zarr data type obje >>> type(a.metadata.data_type) -This example illustrates a general problem Zarr Python has to solve -- how can we allow users to -specify a data type as a string, or a NumPy ``dtype`` object, and produce the right Zarr data type -from that input? We call this process "data type resolution". Zarr Python also performs data type -resolution when reading stored arrays, although in this case the input is a ``JSON`` value instead +This example illustrates a general problem Zarr Python has to solve: how can we allow users to +specify a data type as a string or a NumPy ``dtype`` object, and produce the right Zarr data type +from that input? We call this process "data type resolution." Zarr Python also performs data type +resolution when reading stored arrays, although in this case the input is a JSON value instead of a NumPy data type. -For simple data types like ``int`` the solution could be extremely simple: just -maintain a lookup table that relates a NumPy data type to the Zarr data type equivalent. But not all +For simple data types like ``int``, the solution could be extremely simple: just +maintain a lookup table that maps a NumPy data type to the Zarr data type equivalent. But not all data types are so simple. Consider this case: .. code-block:: python @@ -371,23 +369,22 @@ data types are so simple. Consider this case: In this example, we created a `NumPy structured data type `_. -This data type is a container that can contain any NumPy data type, which makes it recursive. It is +This data type is a container that can hold any NumPy data type, which makes it recursive. It is not possible to make a lookup table that relates all NumPy structured data types to their Zarr equivalents, as there is a nearly unbounded number of different structured data types. So instead of a static lookup table, Zarr Python relies on a dynamic approach to data type resolution. -Zarr Python defines a collection of Zarr data types. This collection, called a "data type registry", -is essentially a dict where the keys are strings (a canonical name for each data type), and the values are -the data type classes themselves. Dynamic data type resolution entails iterating over these data -type classes, invoking a special class constructor defined on each one, and returning a concrete -data type instance if and only if exactly 1 of those constructor invocations was successful. - -In plain language, we take some user input (a NumPy array), offer it to all the known data type -classes, and return an instance of the one data type class that could accept that user input. +Zarr Python defines a collection of Zarr data types. 
This collection, called a "data type registry," +is essentially a dictionary where the keys are strings (a canonical name for each data type), and the +values are the data type classes themselves. Dynamic data type resolution entails iterating over +these data type classes, invoking a special class constructor defined on each one, and returning a +concrete data type instance if and only if exactly one of those constructor invocations is successful. -We want to avoid a situation where the same native data type matches multiple Zarr data types. I.e., -a NumPy data type should uniquely specify a single Zarr data type. But data type resolution is -dynamic, so it's not possible to guarantee this uniqueness constraint. So we attempt data type -resolution against every data type class, and if for some reason a native data type matches multiple -Zarr data types, we treat this as an error and raise an exception. +In plain language, we take some user input, like a NumPy data type, offer it to all the +known data type classes, and return an instance of the one data type class that can accept that user input. +We want to avoid a situation where the same native data type matches multiple Zarr data types; that is, +a NumPy data type should *uniquely* specify a single Zarr data type. But data type resolution is +dynamic, so it's not possible to statically guarantee this uniqueness constraint. Therefore, we +attempt data type resolution against *every* data type class, and if, for some reason, a native data +type matches multiple Zarr data types, we treat this as an error and raise an exception. \ No newline at end of file diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 59ee500dc5..ba0ebba91b 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -87,6 +87,9 @@ def check_structured_dtype_v2_inner(data: object) -> TypeGuard[StructuredName_V2 def check_structured_dtype_name_v2(data: Sequence[object]) -> TypeGuard[StructuredName_V2]: + """ + Check that all the elements of a sequence are valid zarr v2 structured dtype identifiers + """ return all(check_structured_dtype_v2_inner(d) for d in data) diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index 62202e74de..cb500f3db6 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -21,7 +21,8 @@ @dataclass(frozen=True, kw_only=True, slots=True) class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): """ - The boolean data type. + A Zarr data type for arrays containing booleans. Wraps the NumPy + ``np.dtypes.BoolDType`` data type. Scalars for this data type are instances of ``np.bool_``. Attributes ---------- @@ -96,7 +97,7 @@ def _check_json_v2( Returns ------- - bool + TypeGuard[DTypeConfig_V2[Literal["|b1"], None]] True if the input is a valid JSON representation, False otherwise. """ return ( @@ -302,11 +303,11 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: @property def item_size(self) -> int: """ - Return the item size of the boolean dtype. + The size of a single scalar in bytes. Returns ------- int - The item size in bytes. + The size of a single scalar in bytes. 
""" return 1 diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index 470dbbc770..e259610489 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -35,7 +35,7 @@ class FixedLengthBytesConfig(TypedDict): @dataclass(frozen=True, kw_only=True) class NullTerminatedBytes(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize): """ - A Zarr data type for arrays containing null-terminated bytes. This class wraps the NumPy + A Zarr data type for arrays containing null-terminated bytes. Wraps the NumPy ``np.dtypes.BytesDType`` data type. Scalars for this data type are instances of ``np.bytes_``. This data type is parametrized by an integral length which specifies size in bytes of each @@ -202,8 +202,8 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Returns ------- - NullTerminatedBytes - An instance of this ZDType. + Self + An instance of this data type. Raises ------ @@ -233,8 +233,8 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: Returns ------- - NullTerminatedBytes - An instance of this NullTerminatedBytes. + Self + An instance of this data type. Raises ------ @@ -295,7 +295,7 @@ def _check_scalar(self, data: object) -> TypeGuard[BytesLike]: Check if the provided data is of type BytesLike. This method is used to verify if the input data can be considered as a - scalar of bytes-like type, which includes numpy bytes, strings, bytes, + scalar of bytes-like type, which includes NumPy bytes, strings, bytes, and integers. Parameters @@ -323,7 +323,7 @@ def _cast_scalar_unchecked(self, data: BytesLike) -> np.bytes_: Returns ------- np.bytes_ - The casted data as a numpy bytes scalar. + The casted data as a NumPy bytes scalar. Notes ----- @@ -337,31 +337,31 @@ def _cast_scalar_unchecked(self, data: BytesLike) -> np.bytes_: def cast_scalar(self, data: object) -> np.bytes_: """ - Attempt to cast a given object to a numpy bytes scalar. + Attempt to cast a given object to a NumPy bytes scalar. This method first checks if the provided data is a valid scalar that can be - converted to a numpy bytes scalar. If the check succeeds, the unchecked casting + converted to a NumPy bytes scalar. If the check succeeds, the unchecked casting operation is performed. If the data is not valid, a TypeError is raised. Parameters ---------- data : object - The data to be cast to a numpy bytes scalar. + The data to be cast to a NumPy bytes scalar. Returns ------- np.bytes_ - The data cast as a numpy bytes scalar. + The data cast as a NumPy bytes scalar. Raises ------ TypeError - If the data cannot be converted to a numpy bytes scalar. + If the data cannot be converted to a NumPy bytes scalar. """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a numpy bytes scalar." + msg = f"Cannot convert object with type {type(data)} to a NumPy bytes scalar." raise TypeError(msg) def default_scalar(self) -> np.bytes_: @@ -379,7 +379,7 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: """ Convert a scalar to a JSON-serializable string representation. - This method encodes the given scalar as a numpy bytes scalar and then + This method encodes the given scalar as a NumPy bytes scalar and then encodes the bytes as a base64-encoded string. Parameters @@ -411,7 +411,7 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: Returns ------- np.bytes_ - The numpy bytes scalar obtained from decoding the base64 string. 
+ The NumPy bytes scalar obtained from decoding the base64 string. Raises ------ @@ -428,24 +428,24 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: @property def item_size(self) -> int: """ - Get the item size of the raw bytes. + The size of a single scalar in bytes. Returns ------- int - The size of each item in bytes, equivalent to the length attribute. + The size of a single scalar in bytes. """ return self.length @dataclass(frozen=True, kw_only=True) class RawBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize): - # np.dtypes.VoidDType is specified in an odd way in numpy + # np.dtypes.VoidDType is specified in an odd way in NumPy # it cannot be used to create instances of the dtype # so we have to tell mypy to ignore this here """ - A Zarr data type for arrays containing raw bytes. This class wraps the NumPy ``void`` data type. + A Zarr data type for arrays containing raw bytes. Wraps the NumPy ``void`` data type. Scalars for this data type are instances of ``np.void``. This data type is parametrized by an integral length which specifies size in bytes of each @@ -643,8 +643,8 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Returns ------- - RawBytes - An instance of this ZDType. + Self + An instance of this data type. Raises ------ @@ -756,7 +756,7 @@ def _cast_scalar_unchecked(self, data: object) -> np.void: Returns ------- np.void - The casted data as a numpy void scalar. + The casted data as a NumPy void scalar. Notes ----- @@ -764,44 +764,44 @@ def _cast_scalar_unchecked(self, data: object) -> np.void: The input data must be castable to np.void. """ native_dtype = self.to_native_dtype() - # Without the second argument, numpy will return a void scalar for dtype V1. + # Without the second argument, NumPy will return a void scalar for dtype V1. # The second argument ensures that, if native_dtype is something like V10, # the result will actually be a V10 scalar. return native_dtype.type(data, native_dtype) def cast_scalar(self, data: object) -> np.void: """ - Attempt to cast a given object to a numpy void scalar. + Attempt to cast a given object to a NumPy void scalar. This method first checks if the provided data is a valid scalar that can be - converted to a numpy void scalar. If the check succeeds, the unchecked casting + converted to a NumPy void scalar. If the check succeeds, the unchecked casting operation is performed. If the data is not valid, a TypeError is raised. Parameters ---------- data : object - The data to be cast to a numpy void scalar. + The data to be cast to a NumPy void scalar. Returns ------- np.void - The data cast as a numpy void scalar. + The data cast as a NumPy void scalar. Raises ------ TypeError - If the data cannot be converted to a numpy void scalar. + If the data cannot be converted to a NumPy void scalar. """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a numpy void scalar." + msg = f"Cannot convert object with type {type(data)} to a NumPy void scalar." raise TypeError(msg) def default_scalar(self) -> np.void: """ Return the default scalar value for this data type. - The default scalar is a numpy void scalar of the same length as the data type, + The default scalar is a NumPy void scalar of the same length as the data type, filled with zero bytes. Returns @@ -847,7 +847,7 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: Returns ------- np.void - The numpy void scalar. 
+ The NumPy void scalar. Raises ------ @@ -861,12 +861,12 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: @property def item_size(self) -> int: """ - Get the item size of the raw bytes. + The size of a single scalar in bytes. Returns ------- int - The size of each item in bytes, equivalent to the length attribute. + The size of a single scalar in bytes. """ return self.length @@ -874,7 +874,7 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class VariableLengthBytes(ZDType[np.dtypes.ObjectDType, bytes], HasObjectCodec): """ - A Zarr data type for arrays containing variable-length bytes. This class wraps the NumPy + A Zarr data type for arrays containing variable-length bytes. Wraps the NumPy "object" data type. Scalars for this data type are instances of plain python bytes. Attributes @@ -1017,8 +1017,8 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Returns ------- - VariableLengthBytes - An instance of this ZDType. + Self + An instance of this data type. Raises ------ @@ -1164,7 +1164,7 @@ def _check_scalar(self, data: object) -> TypeGuard[BytesLike]: Check if the provided data is of type BytesLike. This method is used to verify if the input data can be considered as a - scalar of bytes-like type, which includes numpy bytes, strings, bytes, + scalar of bytes-like type, which includes NumPy bytes, strings, bytes, and integers. Parameters diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index 5dc316ed0a..88ec58bb75 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -61,7 +61,7 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: Returns ------- Self - An instance of this data type with the specified endianness. + An instance of this data type. Raises ------ @@ -145,7 +145,7 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Returns ------- Self - An instance of this complex data type. + An instance of this data type. Raises ------ @@ -173,7 +173,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: Returns ------- Self - An instance of this complex data type. + An instance of this data type. Raises ------ @@ -349,7 +349,7 @@ class Complex64(BaseComplex[np.dtypes.Complex64DType, np.complex64]): """ A Zarr data type for arrays containing 64 bit complex floats. - This class wraps the NumPy ``np.dtypes.Complex64DType`` data type. Scalars for this data type + Wraps the NumPy ``np.dtypes.Complex64DType`` data type. Scalars for this data type are instances of ``np.complex64``. Attributes @@ -368,6 +368,14 @@ class Complex64(BaseComplex[np.dtypes.Complex64DType, np.complex64]): @property def item_size(self) -> int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return 8 @@ -376,7 +384,7 @@ class Complex128(BaseComplex[np.dtypes.Complex128DType, np.complex128], HasEndia """ A Zarr data type for arrays containing 64 bit complex floats. - This class wraps the NumPy ``np.dtypes.Complex128DType`` data type. Scalars for this data type + Wraps the NumPy ``np.dtypes.Complex128DType`` data type. Scalars for this data type are instances of ``np.complex128``. Attributes @@ -395,4 +403,12 @@ class Complex128(BaseComplex[np.dtypes.Complex128DType, np.complex128], HasEndia @property def item_size(self) -> int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. 
+ """ return 16 diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index 3ea5076599..bbc3cf7e93 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -36,7 +36,7 @@ @dataclass(frozen=True) class BaseFloat(ZDType[TFloatDType_co, TFloatScalar_co], HasEndianness, HasItemSize): """ - Base class for numpy float data types. + Base class for NumPy float data types. Attributes ---------- @@ -50,17 +50,17 @@ class BaseFloat(ZDType[TFloatDType_co, TFloatScalar_co], HasEndianness, HasItemS @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: """ - Create an instance of this ZDType from a numpy data type. + Create an instance of this ZDType from a NumPy data type. Parameters ---------- dtype : TBaseDType - The numpy data type. + The NumPy data type. Returns ------- Self - An instance of this ZDType. + An instance of this data type. """ if cls._check_native_dtype(dtype): return cls(endianness=get_endianness_from_numpy_dtype(dtype)) @@ -70,12 +70,12 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: def to_native_dtype(self) -> TFloatDType_co: """ - Convert the wrapped data type to a numpy data type. + Convert the wrapped data type to a NumPy data type. Returns ------- TFloatDType_co - The numpy data type. + The NumPy data type. """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] @@ -131,10 +131,10 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Returns ------- Self - An instance of this ZDType. + An instance of this data type. """ if cls._check_json_v2(data): - # Going via numpy ensures that we get the endianness correct without + # Going via NumPy ensures that we get the endianness correct without # annoying string parsing. name = data["name"] return cls.from_native_dtype(np.dtype(name)) @@ -154,7 +154,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: Returns ------- Self - An instance of this ZDType. + An instance of this data type. """ if cls._check_json_v3(data): return cls() @@ -178,8 +178,13 @@ def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | str: Returns ------- - str - The JSON-serializable representation of the wrapped data type + DTypeConfig_V2[str, None] or str + The JSON-serializable representation of the wrapped data type. + + Raises + ------ + ValueError + If zarr_format is not 2 or 3. """ if zarr_format == 2: return {"name": self.to_native_dtype().str, "object_codec_id": None} @@ -205,7 +210,7 @@ def _check_scalar(self, data: object) -> TypeGuard[FloatLike]: def _cast_scalar_unchecked(self, data: FloatLike) -> TFloatScalar_co: """ - Cast a scalar value to a numpy float scalar. + Cast a scalar value to a NumPy float scalar. Parameters ---------- @@ -215,13 +220,13 @@ def _cast_scalar_unchecked(self, data: FloatLike) -> TFloatScalar_co: Returns ------- TFloatScalar_co - The numpy float scalar. + The NumPy float scalar. """ return self.to_native_dtype().type(data) # type: ignore[return-value] def cast_scalar(self, data: object) -> TFloatScalar_co: """ - Cast a scalar value to a numpy float scalar. + Cast a scalar value to a NumPy float scalar. Parameters ---------- @@ -231,27 +236,27 @@ def cast_scalar(self, data: object) -> TFloatScalar_co: Returns ------- TFloatScalar_co - The numpy float scalar. + The NumPy float scalar. """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a numpy float scalar." 
+ msg = f"Cannot convert object with type {type(data)} to a NumPy float scalar." raise ScalarTypeValidationError(msg) def default_scalar(self) -> TFloatScalar_co: """ - Get the default value, which is 0 cast to this dtype. + Get the default value, which is 0 cast to this zdtype. Returns ------- - Int scalar + TFloatScalar_co The default value. """ return self._cast_scalar_unchecked(0) def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScalar_co: """ - Read a JSON-serializable value as a numpy float scalar. + Read a JSON-serializable value as a NumPy float scalar. Parameters ---------- @@ -263,7 +268,7 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScal Returns ------- TFloatScalar_co - The numpy float scalar. + The NumPy float scalar. """ if zarr_format == 2: if check_json_float_v2(data): @@ -310,14 +315,15 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> float | st @dataclass(frozen=True, kw_only=True) class Float16(BaseFloat[np.dtypes.Float16DType, np.float16]): """ - A Zarr data type for arrays containing 16-bit floating point numbers. This class wraps the - NumPy np.dtypes.Float16DType data type. Scalars for this data type are instances of - ``np.float16``. + A Zarr data type for arrays containing 16-bit floating point numbers. + + Wraps the NumPy ``np.dtypes.Float16DType`` data type. Scalars for this data type are instances + of ``np.float16``. Attributes ---------- dtype_cls : Type[np.dtypes.Float16DType] - The numpy dtype class for this data type. + The NumPy dtype class for this data type. _zarr_v3_name : ClassVar[Literal["float16"]] The name of this data type in Zarr V3. _zarr_v2_names : ClassVar[tuple[Literal[">f2"], Literal[" int: """ - Return the item size of the data type in bytes. + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. """ return 2 @@ -339,14 +350,15 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class Float32(BaseFloat[np.dtypes.Float32DType, np.float32]): """ - A Zarr data type for arrays containing 32-bit floating point numbers. This class wraps the - NumPy np.dtypes.Float32DType data type. Scalars for this data type are instances of - ``np.float32``. + A Zarr data type for arrays containing 32-bit floating point numbers. + + Wraps the NumPy ``np.dtypes.Float32DType`` data type. Scalars for this data type are instances + of ``np.float32``. Attributes ---------- dtype_cls : Type[np.dtypes.Float32DType] - The numpy dtype class for this data type. + The NumPy dtype class for this data type. _zarr_v3_name : ClassVar[Literal["float32"]] The name of this data type in Zarr V3. _zarr_v2_names : ClassVar[tuple[Literal[">f4"], Literal[" int: """ - Return the item size of the data type in bytes. + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. """ return 4 @@ -368,14 +385,15 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class Float64(BaseFloat[np.dtypes.Float64DType, np.float64]): """ - A Zarr data type for arrays containing 64-bit floating point numbers. This class wraps the - NumPy np.dtypes.Float64DType data type. Scalars for this data type are instances of - ``np.float64``. + A Zarr data type for arrays containing 64-bit floating point numbers. + + Wraps the NumPy ``np.dtypes.Float64DType`` data type. Scalars for this data type are instances + of ``np.float64``. 
Attributes ---------- dtype_cls : Type[np.dtypes.Float64DType] - The numpy dtype class for this data type. + The NumPy dtype class for this data type. _zarr_v3_name : ClassVar[Literal["float64"]] The name of this data type in Zarr V3. _zarr_v2_names : ClassVar[tuple[Literal[">f8"], Literal[" int: """ - Return the item size of the data type in bytes. + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. """ return 8 diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index 4c44352e68..ee84d3537c 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -75,11 +75,11 @@ class BaseInt(ZDType[TIntDType_co, TIntScalar_co], HasItemSize): _cast_scalar_unchecked(data: IntLike) -> TIntScalar_co: Create an integer without type checking of the input. cast_scalar(data: object) -> TIntScalar_co: - Convert object to numpy integer, raising TypeError if invalid. + Convert object to NumPy integer, raising TypeError if invalid. default_scalar() -> TIntScalar_co: Get the default value, which is 0 cast to this dtype. from_json_scalar(data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar_co: - Read a JSON-serializable value as a numpy int scalar. + Read a JSON-serializable value as a NumPy int scalar. to_json_scalar(data: object, *, zarr_format: ZarrFormat) -> int: Convert an object to JSON-serializable scalar. """ @@ -174,27 +174,27 @@ def _cast_scalar_unchecked(self, data: IntLike) -> TIntScalar_co: def cast_scalar(self, data: object) -> TIntScalar_co: """ - Attempt to cast a given object to a numpy integer scalar. + Attempt to cast a given object to a NumPy integer scalar. Parameters ---------- data : object - The data to be cast to a numpy integer scalar. + The data to be cast to a NumPy integer scalar. Returns ------- TIntScalar_co - The data cast as a numpy integer scalar. + The data cast as a NumPy integer scalar. Raises ------ TypeError - If the data cannot be converted to a numpy integer scalar. + If the data cannot be converted to a NumPy integer scalar. """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a numpy integer." + msg = f"Cannot convert object with type {type(data)} to a NumPy integer." raise TypeError(msg) def default_scalar(self) -> TIntScalar_co: @@ -210,19 +210,19 @@ def default_scalar(self) -> TIntScalar_co: def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar_co: """ - Read a JSON-serializable value as a numpy int scalar. + Read a JSON-serializable value as a NumPy int scalar. Parameters ---------- data : JSON The JSON-serializable value. zarr_format : ZarrFormat - The zarr format version. + The Zarr format version. Returns ------- TIntScalar_co - The numpy int scalar. + The NumPy int scalar. Raises ------ @@ -243,7 +243,7 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: data : object The value to convert. zarr_format : ZarrFormat - The zarr format version. + The Zarr format version. Returns ------- @@ -258,13 +258,13 @@ class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): """ A Zarr data type for 8-bit signed integers. - This class wraps the NumPy np.dtypes.Int8DType data type. Scalars for this data type are - instances of np.int8. + Wraps the NumPy ``np.dtypes.Int8DType`` data type. Scalars for this data type are + instances of ``np.int8``. Attributes ---------- dtype_cls : np.dtypes.Int8DType - The class of the underlying numpy dtype. 
+ The class of the underlying NumPy dtype. _zarr_v3_name : ClassVar[Literal["int8"]] = "int8" The name of this data type in Zarr V3. _zarr_v2_names : ClassVar[tuple[Literal["|i1"]]] = ("|i1",) @@ -288,7 +288,7 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: Returns ------- Self - An instance of Int8. + An instance of this data type. Raises ------ @@ -325,7 +325,7 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Returns ------- Self - An instance of Int8. + An instance of this data type. Raises ------ @@ -350,7 +350,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: Returns ------- Self - An instance of Int8. + An instance of this data type. Raises ------ @@ -377,11 +377,11 @@ def to_json( Parameters ---------- zarr_format : ZarrFormat - The zarr format version. + The Zarr format version. Returns ------- - DTypeConfig_V2 or str + DTypeConfig_V2[Literal["|i1"], None] | Literal["int8"] The JSON-serializable representation of the data type. Raises @@ -398,12 +398,12 @@ def to_json( @property def item_size(self) -> int: """ - Return the size of the item in bytes. + The size of a single scalar in bytes. Returns ------- int - The size of the item in bytes. + The size of a single scalar in bytes. """ return 1 @@ -411,13 +411,13 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): """ - A Zarr data type for arrays containing 8-bit unsigned integers. This class wraps the NumPy + A Zarr data type for arrays containing 8-bit unsigned integers. Wraps the NumPy np.dtypes.UInt8DType data type. Scalars for this data type are instances np.uint8. Attributes ---------- dtype_cls : np.dtypes.UInt8DType - The class of the underlying numpy dtype. + The class of the underlying NumPy dtype. _zarr_v3_name : ClassVar[Literal["uint8"]] = "uint8" The name of this data type in Zarr V3. _zarr_v2_names : ClassVar[tuple[Literal["|u1"]]] = ("|u1",) @@ -517,7 +517,7 @@ def to_json( Parameters ---------- zarr_format : ZarrFormat - The zarr format version. Supported values are 2 and 3. + The Zarr format version. Supported values are 2 and 3. Returns ------- @@ -541,7 +541,12 @@ def to_json( @property def item_size(self) -> int: """ - The size of the item in bytes. For an unsigned 8-bit integer, this is always 1 byte. + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. """ return 1 @@ -549,13 +554,13 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): """ - A Zarr data type for arrays containing 16-bit signed integers. This class wraps the NumPy + A Zarr data type for arrays containing 16-bit signed integers. Wraps the NumPy np.dtypes.Int16DType data type. Scalars for this data type are instances np.int16. Attributes ---------- dtype_cls : np.dtypes.Int16DType - The class of the underlying numpy dtype. + The class of the underlying NumPy dtype. _zarr_v3_name : ClassVar[Literal["int16"]] = "int16" The name of this data type in Zarr V3. _zarr_v2_names : ClassVar[tuple[Literal[">i2"], Literal["i2", " Self: If the input JSON is not a valid representation of this data type. """ if cls._check_json_v2(data): - # Going via numpy ensures that we get the endianness correct without + # Going via NumPy ensures that we get the endianness correct without # annoying string parsing. 
name = data["name"] return cls.from_native_dtype(np.dtype(name)) @@ -704,7 +709,12 @@ def to_json( @property def item_size(self) -> int: """ - The size of the item in bytes. For an unsigned 16-bit integer, this is always 2 bytes. + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. """ return 2 @@ -714,13 +724,13 @@ class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): """ A Zarr data type for arrays containing 16-bit unsigned integers. - This class wraps the NumPy np.dtypes.UInt16DType data type. - Scalars for this data type are instances np.uint16. + Wraps the NumPy ``np.dtypes.UInt16DType`` data type. Scalars for this data type are instances of + ``np.uint16``. Attributes ---------- dtype_cls : np.dtypes.UInt16DType - The class of the underlying numpy dtype. + The class of the underlying NumPy dtype. _zarr_v3_name : ClassVar[Literal["uint16"]] = "uint16" The name of this data type in Zarr V3. _zarr_v2_names : ClassVar[tuple[Literal[">u2"], Literal["u2", " Self: Parameters ---------- dtype : np.dtype - The numpy data type. + The NumPy data type. Returns ------- @@ -790,7 +800,7 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: If the input JSON is not a valid representation of this data type. """ if cls._check_json_v2(data): - # Going via numpy ensures that we get the endianness correct without + # Going via NumPy ensures that we get the endianness correct without # annoying string parsing. name = data["name"] return cls.from_native_dtype(np.dtype(name)) @@ -869,7 +879,12 @@ def to_json( @property def item_size(self) -> int: """ - The size of the item in bytes. For an unsigned 16-bit integer, this is always 2 bytes. + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. """ return 2 @@ -877,13 +892,15 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): """ - A Zarr data type for arrays containing 32-bit signed integers. This class wraps the NumPy - np.dtypes.Int32DType data type. Scalars for this data type are instances np.int32. + A Zarr data type for arrays containing 32-bit signed integers. + + Wraps the NumPy ``np.dtypes.Int32DType`` data type. Scalars for this data type are instances of + ``np.int32``. Attributes ---------- dtype_cls : np.dtypes.Int32DType - The class of the underlying numpy dtype. + The class of the underlying NumPy dtype. _zarr_v3_name : ClassVar[Literal["int32"]] = "int32" The name of this data type in Zarr V3. _zarr_v2_names : ClassVar[tuple[Literal[">i4"], Literal["i4", " Self: Returns ------- - Int32 - The Int32 instance. + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of an Int32. """ if cls._check_native_dtype(dtype): return cls(endianness=get_endianness_from_numpy_dtype(dtype)) @@ -939,8 +961,8 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Returns ------- - Int32 - The Int32 instance. + Self + An instance of this data type. Raises ------ @@ -948,7 +970,7 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: If the input JSON is not a valid representation of an Int32. """ if cls._check_json_v2(data): - # Going via numpy ensures that we get the endianness correct without + # Going via NumPy ensures that we get the endianness correct without # annoying string parsing. 
name = data["name"] return cls.from_native_dtype(np.dtype(name)) @@ -967,8 +989,8 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: Returns ------- - Int32 - The Int32 instance. + Self + An instance of this data type. Raises ------ @@ -990,17 +1012,32 @@ def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[Literal[">i4", "i4" or "i4", " int: """ - The size of the item in bytes. For an signed 32-bit integer, this is always 4 bytes. + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. """ return 4 @@ -1022,13 +1064,13 @@ class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): """ A Zarr data type for arrays containing 32-bit unsigned integers. - This class wraps the NumPy np.dtypes.UInt32DType data type. Scalars for this data type are - instances np.uint32. + Wraps the NumPy ``np.dtypes.UInt32DType`` data type. Scalars for this data type are instances of + ``np.uint32``. Attributes ---------- dtype_cls : np.dtypes.UInt32DType - The class of the underlying numpy dtype. + The class of the underlying NumPy dtype. _zarr_v3_name : ClassVar[Literal["uint32"]] = "uint32" The name of this data type in Zarr V3. _zarr_v2_names : ClassVar[tuple[Literal[">u4"], Literal["u4", " Self: integer. """ if cls._check_json_v2(data): - # Going via numpy ensures that we get the endianness correct without + # Going via NumPy ensures that we get the endianness correct without # annoying string parsing. name = data["name"] return cls.from_native_dtype(np.dtype(name)) @@ -1146,11 +1188,11 @@ def to_json( Parameters ---------- zarr_format : ZarrFormat - The zarr format version. + The Zarr format version. Returns ------- - str + DTypeConfig_V2[Literal[">u4", " int: """ - The size of the item in bytes. For an unsigned 32-bit integer, this is always 4 bytes. + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. """ return 4 @@ -1173,13 +1220,13 @@ class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): """ A Zarr data type for arrays containing 64-bit signed integers. - This class wraps the NumPy np.dtypes.Int64DType data type. Scalars for this data type are - instances np.int64. + Wraps the NumPy ``np.dtypes.Int64DType`` data type. Scalars for this data type are instances of + ``np.int64``. Attributes ---------- dtype_cls : np.dtypes.Int64DType - The class of the underlying numpy dtype. + The class of the underlying NumPy dtype. _zarr_v3_name : ClassVar[Literal["int64"]] = "int64" The name of this data type in Zarr V3. _zarr_v2_names : ClassVar[tuple[Literal[">i8"], Literal["i8", " Self: integer. """ if cls._check_json_v2(data): - # Going via numpy ensures that we get the endianness correct without + # Going via NumPy ensures that we get the endianness correct without # annoying string parsing. name = data["name"] return cls.from_native_dtype(np.dtype(name)) @@ -1297,12 +1344,12 @@ def to_json( Parameters ---------- zarr_format : ZarrFormat - The zarr format version. + The Zarr format version. Returns ------- - str - The JSON-serializable representation of the data type + DTypeConfig_V2[Literal[">i8", " int: """ - The size of the item in bytes. For a signed 64-bit integer, this is always 8 bytes. + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. 
""" return 8 @@ -1324,14 +1376,13 @@ class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): """ A Zarr data type for arrays containing 64-bit unsigned integers. - This data type wraps the NumPy np.dtypes.UInt64DType data type. Scalars for this - data type are instances of np.uint64. - + This data type wraps the NumPy ``np.dtypes.UInt64DType`` data type. Scalars for this data type + are instances of ``np.uint64``. Attributes ---------- dtype_cls: np.dtypes.UInt64DType - The class of the underlying numpy dtype. + The class of the underlying NumPy dtype. _zarr_v3_name: ClassVar[Literal["uint64"]] = "uint64" The name of this data type in Zarr V3. _zarr_v2_names: ClassVar[tuple[Literal[">u8"], Literal["u8", " np.dtypes.UInt64DType: """ - Convert the data type to a native numpy dtype. + Convert the data type to a native NumPy dtype. Returns ------- np.dtypes.UInt64DType - The native numpy dtype. + The native NumPy dtype. """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @@ -1376,7 +1427,7 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: integer. """ if cls._check_json_v2(data): - # Going via numpy ensures that we get the endianness correct without + # Going via NumPy ensures that we get the endianness correct without # annoying string parsing. name = data["name"] return cls.from_native_dtype(np.dtype(name)) @@ -1423,12 +1474,12 @@ def to_json( Parameters ---------- zarr_format : ZarrFormat - The zarr format version. + The Zarr format version. Returns ------- - str - The JSON-serializable representation of the data type + DTypeConfig_V2[Literal[">u8", " Self: """ - Create an instance of this data type from a native numpy dtype. + Create an instance of this data type from a native NumPy dtype. Parameters ---------- dtype : TBaseDType - The native numpy dtype. + The native NumPy dtype. Returns ------- @@ -1467,6 +1518,11 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: @property def item_size(self) -> int: """ - The size of the item in bytes. For an unsigned 64-bit integer, this is always 8 bytes. + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. """ return 8 diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index 3399eb3ae4..e44da15af6 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -14,7 +14,7 @@ runtime_checkable, ) -import numpy as np +import NumPy as np from zarr.core.common import NamedConfig from zarr.core.dtype.common import ( @@ -62,12 +62,12 @@ class FixedLengthUTF32( """ A Zarr data type for arrays containing fixed-length UTF-32 strings. - This class wraps the NumPy np.dtypes.StrDType data type. Scalars for this data type are instances of np.str_. + Wraps the NumPy np.dtypes.StrDType data type. Scalars for this data type are instances of np.str_. Attributes ---------- dtype_cls : Type[np.dtypes.StrDType] - The numpy dtype class for this data type. + The NumPy dtype class for this data type. _zarr_v3_name : ClassVar[Literal["fixed_length_utf32"]] The name of this data type in Zarr V3. code_point_bytes : ClassVar[int] = 4 @@ -118,7 +118,7 @@ def to_native_dtype(self) -> np.dtypes.StrDType[int]: @classmethod def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: """ - Check that the input is a valid JSON representation of a numpy U dtype. + Check that the input is a valid JSON representation of a NumPy U dtype. 
Parameters ---------- @@ -128,7 +128,7 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] Returns ------- TypeGuard[DTypeConfig_V2[str, None]] - Whether the input is a valid JSON representation of a numpy U dtype. + Whether the input is a valid JSON representation of a NumPy U dtype. """ return ( check_dtype_spec_v2(data) @@ -140,7 +140,7 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSONV3]: """ - Check that the input is a valid JSON representation of a numpy U dtype. + Check that the input is a valid JSON representation of a NumPy U dtype. Parameters ---------- @@ -150,7 +150,7 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSONV3]: Returns ------- TypeGuard[FixedLengthUTF32JSONV3] - Whether the input is a valid JSON representation of a numpy U dtype. + Whether the input is a valid JSON representation of a NumPy U dtype. """ return ( isinstance(data, dict) @@ -197,7 +197,7 @@ def to_json( @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: """ - Create a FixedLengthUTF32 from a JSON representation of a numpy U dtype. + Create a FixedLengthUTF32 from a JSON representation of a NumPy U dtype. Parameters ---------- @@ -210,17 +210,17 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: An instance of this data type. """ if cls._check_json_v2(data): - # Construct the numpy dtype instead of string parsing. + # Construct the NumPy dtype instead of string parsing. name = data["name"] return cls.from_native_dtype(np.dtype(name)) raise DataTypeValidationError( - f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string representation of a numpy U dtype." + f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string representation of a NumPy U dtype." ) @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: """ - Create a FixedLengthUTF32 from a JSON representation of a numpy U dtype. + Create a FixedLengthUTF32 from a JSON representation of a NumPy U dtype. Parameters ---------- @@ -318,7 +318,7 @@ def cast_scalar(self, data: object) -> np.str_: The native scalar value. """ if self._check_scalar(data): - # We explicitly truncate before casting because of the following numpy behavior: + # We explicitly truncate before casting because of the following NumPy behavior: # >>> x = np.dtype('U3').type('hello world') # >>> x # np.str_('hello world') @@ -330,50 +330,58 @@ def cast_scalar(self, data: object) -> np.str_: else: return self.to_native_dtype().type(data[: self.length]) raise TypeError( - f"Cannot convert object with type {type(data)} to a numpy unicode string scalar." + f"Cannot convert object with type {type(data)} to a NumPy unicode string scalar." ) @property def item_size(self) -> int: """ - Return the size of each item in the data type. + The size of a single scalar in bytes. Returns ------- int - The size of each item in the data type. + The size of a single scalar in bytes. """ return self.length * self.code_point_bytes def check_vlen_string_json_scalar(data: object) -> TypeGuard[int | str | float]: """ - This function checks the type of JSON-encoded variable length strings. It is generous for - backwards compatibility, as zarr-python v2 would use ints for variable length strings - fill values + Check if the input is a valid JSON scalar for a variable-length string. 
+ + This function is generous for backwards compatibility, as Zarr Python v2 would use ints for + variable-length string fill values. + + Parameters + ---------- + data : object + The JSON value to check. + + Returns + ------- + TypeGuard[int | str | float] + True if the input is a valid scalar for a variable-length string. """ return isinstance(data, int | str | float) -# VariableLengthUTF8 is defined in two places, conditioned on the version of numpy. -# If numpy 2 is installed, then VariableLengthUTF8 is defined with the numpy variable length -# string dtype as the native dtype. Otherwise, VariableLengthUTF8 is defined with the numpy object +# VariableLengthUTF8 is defined in two places, conditioned on the version of NumPy. +# If NumPy 2 is installed, then VariableLengthUTF8 is defined with the NumPy variable length +# string dtype as the native dtype. Otherwise, VariableLengthUTF8 is defined with the NumPy object # dtype as the native dtype. class UTF8Base(ZDType[TDType_co, str], HasObjectCodec): """ - A base class for the variable length UTF-8 string data type. This class should not be used - as data type, but as a base class for other variable length string data types. - - This class is a generic implementation of a variable length UTF-8 string data type. It is - intended to be used as a base class for other variable length string data types. + Base class for variable-length UTF-8 string data types. Not intended for direct use, but as a + base for concrete implementations. Attributes ---------- dtype_cls : TDType_co The class of the underlying NumPy dtype. - _zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" + _zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] The name of this data type in Zarr V3. - object_codec_id : ClassVar[Literal["vlen-utf8"]] = "variable_length_utf8" + object_codec_id : ClassVar[Literal["vlen-utf8"]] The object codec ID for this data type. """ @@ -383,17 +391,23 @@ class UTF8Base(ZDType[TDType_co, str], HasObjectCodec): @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: """ - Create an instance of this class from a numpy dtype. + Create an instance of this data type from a compatible NumPy data type. + Parameters ---------- - dtype : numpy.dtype - The numpy dtype to create an instance from. + dtype : TBaseDType + The native data type. Returns ------- Self - An instance of this class. + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input is not compatible with this data type. """ if cls._check_native_dtype(dtype): return cls() @@ -407,8 +421,8 @@ def _check_json_v2( data: DTypeJSON, ) -> TypeGuard[DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]]]: """ - Check that the input is a valid JSON representation of a numpy O dtype, and that the - object codec id is appropriate for variable-length UTF-8 strings. + "Check if the input is a valid JSON representation of a variable-length UTF-8 string dtype + for Zarr v2." Parameters ---------- @@ -418,7 +432,7 @@ def _check_json_v2( Returns ------- TypeGuard[DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]]] - Whether the input is a valid JSON representation of a numpy O dtype, and that the + Whether the input is a valid JSON representation of a NumPy "object" data type, and that the object codec id is appropriate for variable-length UTF-8 strings. 
""" return ( @@ -449,7 +463,7 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["variable_length_u @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: """ - Create an instance of this class from a JSON representation of a numpy O dtype. + Create an instance of this class from a JSON representation of a NumPy "object" dtype. Parameters ---------- @@ -459,7 +473,7 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Returns ------- Self - An instance of this class. + An instance of this data type. """ if cls._check_json_v2(data): return cls() @@ -482,7 +496,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: Returns ------- Self - An instance of this class. + An instance of this data type. """ if cls._check_json_v3(data): return cls() @@ -604,21 +618,21 @@ def _cast_scalar_unchecked(self, data: SupportsStr) -> str: def cast_scalar(self, data: object) -> str: """ - Cast a scalar value to the native scalar type. + Cast an object to a string. Parameters ---------- data : object - The scalar value to cast. + The value to cast. Returns ------- str - The native scalar type of the scalar value. + The input cast to str. """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) - raise TypeError(f"Cannot convert object with type {type(data)} to a python string.") + raise TypeError(f"Cannot convert object with type {type(data)} to a Python string.") if _NUMPY_SUPPORTS_VLEN_STRING: @@ -626,14 +640,14 @@ def cast_scalar(self, data: object) -> str: @dataclass(frozen=True, kw_only=True) class VariableLengthUTF8(UTF8Base[np.dtypes.StringDType]): # type: ignore[type-var] """ - A Zarr data type for arrays containing variable-length UTF-8 strings. This class wraps the - NumPy np.dtypes.StringDType data type. Scalars for this data type are python strings. + A Zarr data type for arrays containing variable-length UTF-8 strings. Wraps the + NumPy np.dtypes.StringDType data type. Scalars for this data type are Python strings. Attributes ---------- dtype_cls : Type[np.dtypes.StringDType] - The numpy dtype class for this data type. + The NumPy dtype class for this data type. _zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" The name of this data type in Zarr V3. object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8" @@ -658,14 +672,14 @@ def to_native_dtype(self) -> np.dtypes.StringDType: @dataclass(frozen=True, kw_only=True) class VariableLengthUTF8(UTF8Base[np.dtypes.ObjectDType]): # type: ignore[no-redef] """ - A Zarr data type for arrays containing variable-length UTF-8 strings. This class wraps the - NumPy np.dtypes.ObjecDType data type. Scalars for this data type are python strings. + A Zarr data type for arrays containing variable-length UTF-8 strings. Wraps the + NumPy np.dtypes.ObjectDType data type. Scalars for this data type are Python strings. Attributes ---------- dtype_cls : Type[np.dtypes.ObjectDType] - The numpy dtype class for this data type. + The NumPy dtype class for this data type. _zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" The name of this data type in Zarr V3. 
object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8" diff --git a/src/zarr/core/dtype/npy/structured.py b/src/zarr/core/dtype/npy/structured.py index 07e3000826..2452baa6d9 100644 --- a/src/zarr/core/dtype/npy/structured.py +++ b/src/zarr/core/dtype/npy/structured.py @@ -1,7 +1,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, Literal, Self, TypeGuard, cast, overload +from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypeGuard, cast, overload import numpy as np @@ -33,8 +33,20 @@ @dataclass(frozen=True, kw_only=True) class Structured(ZDType[np.dtypes.VoidDType[int], np.void], HasItemSize): + """ + A Zarr data type for arrays containing structured scalars, AKA "record arrays". + + Wraps the NumPy `np.dtypes.VoidDType` if the data type has fields. Scalars for this data + type are instances of `np.void`, with a ``fields`` attribute. + + Attributes + ---------- + fields : Sequence[tuple[str, ZDType]] + The fields of the structured dtype. + """ + dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] - _zarr_v3_name = "structured" + _zarr_v3_name: ClassVar[Literal["structured"]] = "structured" fields: tuple[tuple[str, ZDType[TBaseDType, TBaseScalar]], ...] @classmethod @@ -56,6 +68,30 @@ def _check_native_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create a Structured ZDType from a native NumPy data type. + + Parameters + ---------- + dtype : TBaseDType + The native data type. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input data type is not an instance of np.dtypes.VoidDType with a non-null + ``fields`` attribute. + + Notes + ----- + This method attempts to resolve the fields of the structured dtype using the data type + registry. + """ from zarr.core.dtype import get_data_type_from_native_dtype fields: list[tuple[str, ZDType[TBaseDType, TBaseScalar]]] = [] @@ -72,6 +108,19 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self) -> np.dtypes.VoidDType[int]: + """ + Convert the structured Zarr data type to a native NumPy void dtype. + + This method constructs a NumPy dtype with fields corresponding to the + fields of the structured Zarr data type, by converting each field's + data type to its native dtype representation. + + Returns + ------- + np.dtypes.VoidDType[int] + The native NumPy void dtype representing the structured data type. + """ + return cast( "np.dtypes.VoidDType[int]", np.dtype([(key, dtype.to_native_dtype()) for (key, dtype) in self.fields]), @@ -82,6 +131,24 @@ def _check_json_v2( cls, data: DTypeJSON, ) -> TypeGuard[DTypeConfig_V2[StructuredName_V2, None]]: + """ + Check if the input is a valid JSON representation of a Structured data type + for Zarr V2. + + The input data must be a mapping that contains a "name" key that is not a str, + and an "object_codec_id" key that is None. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + TypeGuard[DTypeConfig_V2[StructuredName_V2, None]] + True if the input is a valid JSON representation of a Structured data type + for Zarr V2, False otherwise. 
+ """ return ( check_dtype_spec_v2(data) and not isinstance(data["name"], str) @@ -93,6 +160,26 @@ def _check_json_v2( def _check_json_v3( cls, data: DTypeJSON ) -> TypeGuard[NamedConfig[Literal["structured"], dict[str, Sequence[tuple[str, DTypeJSON]]]]]: + """ + Check if the input data is a valid JSON representation of a structured data type + for Zarr V3. + + The input must be a dictionary with a "name" key and a "configuration" key. The + "name" key must have the value "structured", and the "configuration" key must map + to a dictionary containing a "fields" key. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + TypeGuard[NamedConfig[Literal["structured"], dict[str, Sequence[tuple[str, DTypeJSON]]]]] + True if the input is a valid JSON representation of a structured data type for Zarr V3, + False otherwise. + """ + return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} @@ -150,6 +237,24 @@ def to_json(self, zarr_format: Literal[3]) -> DTypeSpec_V3: ... def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[StructuredName_V2, None] | DTypeSpec_V3: + """ + Convert the structured data type to a JSON-serializable form. + + Parameters + ---------- + zarr_format : ZarrFormat + The Zarr format version. Accepted values are 2 and 3. + + Returns + ------- + DTypeConfig_V2[StructuredName_V2, None] | DTypeSpec_V3 + The JSON representation of the structured data type. + + Raises + ------ + ValueError + If the zarr_format is not 2 or 3. + """ if zarr_format == 2: fields = [ [f_name, f_dtype.to_json(zarr_format=zarr_format)["name"]] @@ -171,9 +276,41 @@ def to_json( def _check_scalar(self, data: object) -> TypeGuard[StructuredScalarLike]: # TODO: implement something more precise here! + """ + Check that the input is a valid scalar value for this structured data type. + + Parameters + ---------- + data : object + The scalar value to check. + + Returns + ------- + TypeGuard[StructuredScalarLike] + Whether the input is a valid scalar value for this structured data type. + """ return isinstance(data, (bytes, list, tuple, int, np.void)) def _cast_scalar_unchecked(self, data: StructuredScalarLike) -> np.void: + """ + Cast a python object to a numpy structured scalar without type checking. + + Parameters + ---------- + data : StructuredScalarLike + The data to cast. + + Returns + ------- + np.void + The casted data as a numpy structured scalar. + + Notes + ----- + This method does not perform any type checking. + The input data must be castable to a numpy structured scalar. + + """ na_dtype = self.to_native_dtype() if isinstance(data, bytes): res = np.frombuffer(data, dtype=na_dtype)[0] @@ -184,15 +321,68 @@ def _cast_scalar_unchecked(self, data: StructuredScalarLike) -> np.void: return cast("np.void", res) def cast_scalar(self, data: object) -> np.void: + """ + Cast a Python object to a NumPy structured scalar. + + This function attempts to cast the provided data to a NumPy structured scalar. + If the data is compatible with the structured scalar type, it is cast without + type checking. Otherwise, a TypeError is raised. + + Parameters + ---------- + data : object + The data to be cast to a NumPy structured scalar. + + Returns + ------- + np.void + The data cast as a NumPy structured scalar. + + Raises + ------ + TypeError + If the data cannot be converted to a NumPy structured scalar. 
+ """ + if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a numpy structured scalar." + msg = f"Cannot convert object with type {type(data)} to a NumPy structured scalar." raise TypeError(msg) def default_scalar(self) -> np.void: + """ + Get the default scalar value for this structured data type. + + Returns + ------- + np.void + The default scalar value, which is the scalar representation of 0 + cast to this structured data type. + """ + return self._cast_scalar_unchecked(0) def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: + """ + Read a JSON-serializable value as a NumPy structured scalar. + + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + np.void + The NumPy structured scalar. + + Raises + ------ + TypeError + If the input is not a base64-encoded string. + """ if check_json_str(data): as_bytes = bytes_from_json(data, zarr_format=zarr_format) dtype = self.to_native_dtype() @@ -200,9 +390,32 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: raise TypeError(f"Invalid type: {data}. Expected a string.") def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + """ + Convert a scalar to a JSON-serializable string representation. + + Parameters + ---------- + data : object + The scalar to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + str + A string representation of the scalar, which is a base64-encoded + string of the bytes that make up the scalar. + """ return bytes_to_json(self.cast_scalar(data).tobytes(), zarr_format) @property def item_size(self) -> int: - # Lets have numpy do the arithmetic here + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return self.to_native_dtype().itemsize diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 62efdf4106..625bda5d2f 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -160,8 +160,7 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: Returns ------- Self - An instance of this class configured with the unit, scale factor, and endianness - derived from the provided dtype. + An instance of this data type. Raises ------ @@ -259,12 +258,12 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: @property def item_size(self) -> int: """ - The size of each item in the data type, in bytes. + The size of a single scalar in bytes. Returns ------- int - The size of each item in the data type, in bytes. + The size of a single scalar in bytes. """ return 8 @@ -274,7 +273,7 @@ class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], Has """ A Zarr data type for arrays containing NumPy TimeDelta64 data. - This class wraps the NumPy ``np.dtypesTimeDelta64DType`` data type. Scalars for this data type + Wraps the NumPy ``np.dtypesTimeDelta64DType`` data type. Scalars for this data type are instances of ``np.timedelta64``. Attributes @@ -353,6 +352,12 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSONV3]: } This function can be used as a type guard to narrow the type of unknown JSON input. + + Returns + ------- + TypeGuard[DateTime64JSONV3] + True if the JSON input is a valid representation of a TimeDelta64 in Zarr V3, + otherwise False. 
""" return ( isinstance(data, dict) @@ -513,7 +518,7 @@ class DateTime64(TimeDTypeBase[np.dtypes.DateTime64DType, np.datetime64], HasEnd """ A Zarr data type for arrays containing NumPy Datetime64 data. - This class wraps the NumPy ``np.dtypesTimeDelta64DType`` data type. Scalars for this data type + Wraps the NumPy ``np.dtypesTimeDelta64DType`` data type. Scalars for this data type are instances of ``np.datetime64``. Attributes diff --git a/src/zarr/core/dtype/registry.py b/src/zarr/core/dtype/registry.py index 1d2a97a90a..cb9ab50044 100644 --- a/src/zarr/core/dtype/registry.py +++ b/src/zarr/core/dtype/registry.py @@ -23,34 +23,131 @@ # have just 1 registry class in use. @dataclass(frozen=True, kw_only=True) class DataTypeRegistry: + """ + A registry for ZDType classes. + + This registry is a mapping from Zarr data type names to their + corresponding ZDType classes. + + Attributes + ---------- + contents : dict[str, type[ZDType[TBaseDType, TBaseScalar]]] + The mapping from Zarr data type names to their corresponding + ZDType classes. + """ + contents: dict[str, type[ZDType[TBaseDType, TBaseScalar]]] = field( default_factory=dict, init=False ) - lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) + _lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) - def lazy_load(self) -> None: - for e in self.lazy_load_list: + def _lazy_load(self) -> None: + """ + Load all data types from the lazy load list and register them with + the registry. After loading, clear the lazy load list. + """ + for e in self._lazy_load_list: self.register(e.load()._zarr_v3_name, e.load()) - self.lazy_load_list.clear() + self._lazy_load_list.clear() def register(self: Self, key: str, cls: type[ZDType[TBaseDType, TBaseScalar]]) -> None: - # don't register the same dtype twice + """ + Register a data type with the registry. + + Parameters + ---------- + key : str + The Zarr V3 name of the data type. + cls : type[ZDType[TBaseDType, TBaseScalar]] + The class of the data type to register. + + Notes + ----- + This method is idempotent. If the data type is already registered, this + method does nothing. + """ if key not in self.contents or self.contents[key] != cls: self.contents[key] = cls def unregister(self, key: str) -> None: - """Unregister a data type by its key.""" + """ + Unregister a data type from the registry. + + Parameters + ---------- + key : str + The key associated with the ZDType class to be unregistered. + + Returns + ------- + None + + Raises + ------ + KeyError + If the data type is not found in the registry. + """ if key in self.contents: del self.contents[key] else: raise KeyError(f"Data type '{key}' not found in registry.") def get(self, key: str) -> type[ZDType[TBaseDType, TBaseScalar]]: + """ + Retrieve a registered ZDType class by its key. + + Parameters + ---------- + key : str + The key associated with the desired ZDType class. + + Returns + ------- + type[ZDType[TBaseDType, TBaseScalar]] + The ZDType class registered under the given key. + + Raises + ------ + KeyError + If the key is not found in the registry. + """ + return self.contents[key] def match_dtype(self, dtype: TBaseDType) -> ZDType[TBaseDType, TBaseScalar]: + """ + Match a native data type, e.g. a NumPy data type, to a registered ZDType. + + Parameters + ---------- + dtype : TBaseDType + The native data type to match. + + Returns + ------- + ZDType[TBaseDType, TBaseScalar] + The matched ZDType corresponding to the provided NumPy data type. 
+ + Raises + ------ + ValueError + If the data type is a NumPy "Object" type, which is ambiguous, or if multiple + or no Zarr data types are found that match the provided dtype. + + Notes + ----- + This function attempts to resolve a Zarr data type from a given native data type. + If the dtype is a NumPy "Object" data type, it raises a ValueError, as this type + can represent multiple Zarr data types. In such cases, a specific Zarr data type + should be explicitly constructed instead of relying on dynamic resolution. + + If multiple matches are found, it will also raise a ValueError. In this case + conflicting data types must be unregistered, or the Zarr data type should be explicitly + constructed. + """ + if dtype == np.dtype("O"): msg = ( f"Zarr data type resolution from {dtype} failed. " @@ -82,6 +179,27 @@ def match_dtype(self, dtype: TBaseDType) -> ZDType[TBaseDType, TBaseScalar]: def match_json( self, data: DTypeJSON, *, zarr_format: ZarrFormat ) -> ZDType[TBaseDType, TBaseScalar]: + """ + Match a JSON representation of a data type to a registered ZDType. + + Parameters + ---------- + data : DTypeJSON + The JSON representation of a data type to match. + zarr_format : ZarrFormat + The Zarr format version to consider when matching data types. + + Returns + ------- + ZDType[TBaseDType, TBaseScalar] + The matched ZDType corresponding to the JSON representation. + + Raises + ------ + ValueError + If no matching Zarr data type is found for the given JSON data. + """ + for val in self.contents.values(): try: return val.from_json(data, zarr_format=zarr_format) diff --git a/src/zarr/registry.py b/src/zarr/registry.py index eb345b24b1..189d42abed 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -93,8 +93,8 @@ def _collect_entrypoints() -> list[Registry[Any]]: __ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr.ndbuffer")) __ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="ndbuffer")) - data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr.data_type")) - data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) + data_type_registry._lazy_load_list.extend(entry_points.select(group="zarr.data_type")) + data_type_registry._lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) __pipeline_registry.lazy_load_list.extend(entry_points.select(group="zarr.codec_pipeline")) __pipeline_registry.lazy_load_list.extend( diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py index c7d5f90065..0e3db059e1 100644 --- a/tests/test_dtype_registry.py +++ b/tests/test_dtype_registry.py @@ -166,7 +166,7 @@ def set_path() -> Generator[None, None, None]: def test_entrypoint_dtype(zarr_format: ZarrFormat) -> None: from package_with_entrypoint import TestDataType - data_type_registry.lazy_load() + data_type_registry._lazy_load() instance = TestDataType() dtype_json = instance.to_json(zarr_format=zarr_format) assert get_data_type_from_json(dtype_json, zarr_format=zarr_format) == instance From c4031dc1df5564b08c3c815b9c854dae4f189c82 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 11:05:46 +0200 Subject: [PATCH 11/42] fixup --- docs/user-guide/data_types.rst | 36 +++++++++++++++---------------- src/zarr/core/dtype/npy/string.py | 2 +- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 2ea31bf860..f1ba516ed0 100644 --- a/docs/user-guide/data_types.rst +++ 
b/docs/user-guide/data_types.rst @@ -20,7 +20,7 @@ Array Data Types ^^^^^^^^^^^^^^^^ Every Zarr array has a data type, which defines the meaning of the array's elements. An array's data -type is encoded in the JSON metadata for the array. This means that the data type of an array must be +type is encoded in the JSON metadata for the array. This means that the data type of an array must be JSON-serializable. In Zarr V2, the data type of an array is stored in the ``dtype`` field in array metadata. @@ -47,7 +47,7 @@ Data Types in Zarr Version 2 Version 2 of the Zarr format defined its data types relative to `NumPy's data types `_, -and added a few non-NumPy data types as well. With one exception (`structured data types <#structured-data-type>`_), the Zarr +and added a few non-NumPy data types as well. With one exception (`structured data types <#structured-data-type>`_), the Zarr V2 JSON identifier for a data type is just the NumPy ``str`` attribute of that data type: .. code-block:: python @@ -86,8 +86,8 @@ of named fields, where each field is itself a distinct NumPy data type. See the Crucially, NumPy does not use a special data type for structured data types—instead, NumPy implements structured data types as an optional feature of the so-called "Void" data type, which models arbitrary fixed-size byte strings. The ``str`` attribute of a regular NumPy void -data type is the same as the ``str`` of a NumPy structured data type. This means that the ``str`` -attribute does not convey information about the fields contained in a structured data type. For +data type is the same as the ``str`` of a NumPy structured data type. This means that the ``str`` +attribute does not convey information about the fields contained in a structured data type. For these reasons, Zarr V2 uses a special data type encoding for structured data types. They are stored in JSON as lists of pairs, where the first element is a string, and the second element is a Zarr V2 data type specification. This representation supports recursion. @@ -99,7 +99,7 @@ For example: >>> store = {} >>> np_dtype = np.dtype([('field_a', '>i2'), ('field_b', [('subfield_c', '>f4'), ('subfield_d', 'i2')])]) >>> np_dtype.str - '|V6' + '|V8' >>> z = zarr.create_array(store=store, shape=(1,), dtype=np_dtype, zarr_format=2) >>> dtype_meta = json.loads(store['.zarray'].to_bytes())["dtype"] >>> dtype_meta @@ -109,7 +109,7 @@ Object Data Type ^^^^^^^^^^^^^^^^ The NumPy "object" type is essentially an array of references to arbitrary Python objects. -It can model arrays of variable-length UTF-8 strings, arrays of variable-length byte strings, or +It can model arrays of variable-length UTF-8 strings, arrays of variable-length byte strings, or even arrays of variable-length arrays, each with their own distinct data type. This makes the "object" data type expressive, but also complicated for Zarr V2. Remember that, with @@ -127,10 +127,10 @@ If an array with data type "object" used the ``"vlen-utf8"`` codec, then it was array of variable-length strings. If an array with data type "object" used the ``"vlen-bytes"`` codec, then it was interpreted as an array of variable-length byte strings. -This means that the ``dtype`` field alone does not fully specify a data type in Zarr V2. The name of +This means that the ``dtype`` field alone does not fully specify a data type in Zarr V2. The name of the object codec used, if one was used, is also required. 
Although this fact can be ignored for many simple numeric data types, any comprehensive approach to Zarr V2 data types must either reject -the "object" data types or include the "object codec" identifier in the JSON form of the basic data +the "object" data types or include the "object codec" identifier in the JSON form of the basic data type model. Data Types in Zarr Version 3 @@ -167,9 +167,9 @@ For more about data types in Zarr V3, see the Data Types in Zarr Python ------------------------- -The two Zarr formats that Zarr Python supports specify data types in different ways: data types in -Zarr version 2 are encoded as NumPy-compatible strings (or lists, in the case of structured data -types), while data types in Zarr V3 are encoded as either strings or JSON objects. Zarr V3 data +The two Zarr formats that Zarr Python supports specify data types in different ways: data types in +Zarr version 2 are encoded as NumPy-compatible strings (or lists, in the case of structured data +types), while data types in Zarr V3 are encoded as either strings or JSON objects. Zarr V3 data types do not have any associated endianness information, unlike Zarr V2 data types. Zarr Python needs to support both Zarr V2 and V3, which means we need to abstract over these differences. @@ -178,7 +178,7 @@ which provides Zarr V2 and Zarr V3 compatibility routines for "native" data type In this context, a "native" data type is a Python class, typically defined in another library, that models an array's data type. For example, ``np.dtypes.UInt8DType`` is a native data type defined in NumPy. -Zarr Python wraps the NumPy ``uint8`` with a ``ZDType`` instance called +Zarr Python wraps the NumPy ``uint8`` with a ``ZDType`` instance called `UInt8 <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_. As of this writing, the only native data types Zarr Python supports are NumPy data types. We could @@ -375,16 +375,16 @@ equivalents, as there is a nearly unbounded number of different structured data a static lookup table, Zarr Python relies on a dynamic approach to data type resolution. Zarr Python defines a collection of Zarr data types. This collection, called a "data type registry," -is essentially a dictionary where the keys are strings (a canonical name for each data type), and the -values are the data type classes themselves. Dynamic data type resolution entails iterating over -these data type classes, invoking a special class constructor defined on each one, and returning a +is essentially a dictionary where the keys are strings (a canonical name for each data type), and the +values are the data type classes themselves. Dynamic data type resolution entails iterating over +these data type classes, invoking a special class constructor defined on each one, and returning a concrete data type instance if and only if exactly one of those constructor invocations is successful. -In plain language, we take some user input, like a NumPy data type, offer it to all the +In plain language, we take some user input, like a NumPy data type, offer it to all the known data type classes, and return an instance of the one data type class that can accept that user input. We want to avoid a situation where the same native data type matches multiple Zarr data types; that is, a NumPy data type should *uniquely* specify a single Zarr data type. But data type resolution is -dynamic, so it's not possible to statically guarantee this uniqueness constraint. 
Therefore, we -attempt data type resolution against *every* data type class, and if, for some reason, a native data +dynamic, so it's not possible to statically guarantee this uniqueness constraint. Therefore, we +attempt data type resolution against *every* data type class, and if, for some reason, a native data type matches multiple Zarr data types, we treat this as an error and raise an exception. \ No newline at end of file diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index e44da15af6..824915437a 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -14,7 +14,7 @@ runtime_checkable, ) -import NumPy as np +import numpy as np from zarr.core.common import NamedConfig from zarr.core.dtype.common import ( From ae268b92200b53da3eb585a4997c32be366b9201 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 11:35:35 +0200 Subject: [PATCH 12/42] prose --- docs/user-guide/data_types.rst | 38 +++++++++++++++++--------------- src/zarr/core/dtype/npy/float.py | 2 +- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index f1ba516ed0..8a2ecd0d9b 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -87,10 +87,10 @@ Crucially, NumPy does not use a special data type for structured data types—in implements structured data types as an optional feature of the so-called "Void" data type, which models arbitrary fixed-size byte strings. The ``str`` attribute of a regular NumPy void data type is the same as the ``str`` of a NumPy structured data type. This means that the ``str`` -attribute does not convey information about the fields contained in a structured data type. For -these reasons, Zarr V2 uses a special data type encoding for structured data types. They are stored -in JSON as lists of pairs, where the first element is a string, and the second element is a Zarr V2 -data type specification. This representation supports recursion. +attribute does not convey information about the fields contained in a structured data type. +For these reasons, Zarr V2 uses a special data type encoding for structured data types. +They are stored in JSON as lists of pairs, where the first element is a string, and the second +element is a Zarr V2 data type specification. This representation supports recursion. For example: @@ -110,28 +110,30 @@ Object Data Type The NumPy "object" type is essentially an array of references to arbitrary Python objects. It can model arrays of variable-length UTF-8 strings, arrays of variable-length byte strings, or -even arrays of variable-length arrays, each with their own distinct data type. +even arrays of variable-length arrays, each with a distinct data type. This makes the "object" data +type expressive, but also complicated to store. -This makes the "object" data type expressive, but also complicated for Zarr V2. Remember that, with -the exception of "structured" data types, Zarr V2 uses the NumPy string representation of a data type -to identify it in metadata. +Zarr Python cannot persistently store references to arbitrary Python objects. But if each of those Python +objects has a consistent type, then we can use a special encoding procedure to store the array. This +is how Zarr Python stores variable-length UTF-8 strings, or variable-length byte strings. 
-An "object" array of variable-length UTF-8 strings and an "object" array of variable-length byte strings -have logically separate data types, but in NumPy they would both have the same array data type: "object", -and thus the same string representation. +Although these are separate data types in this library, they are both "object" arrays in NumPy, which means +they have the same Zarr V2 string representation: ``"|O"``. Clearly in this case the string +representation of the data type is ambiguous in this case. -So, Zarr V2 disambiguated different "object" data type arrays on the basis of their chunk encoding, -i.e., the codecs declared in the ``filters`` and ``compressor`` attributes of array metadata. +So for Zarr V2 we have to disambiguate different "object" data type arrays on the basis of their +encoding procedure, i.e., the codecs declared in the ``filters`` and ``compressor`` attributes of array +metadata. If an array with data type "object" used the ``"vlen-utf8"`` codec, then it was interpreted as an array of variable-length strings. If an array with data type "object" used the ``"vlen-bytes"`` codec, then it was interpreted as an array of variable-length byte strings. -This means that the ``dtype`` field alone does not fully specify a data type in Zarr V2. The name of -the object codec used, if one was used, is also required. Although this fact can be ignored for many -simple numeric data types, any comprehensive approach to Zarr V2 data types must either reject -the "object" data types or include the "object codec" identifier in the JSON form of the basic data -type model. +This all means that the ``dtype`` field alone does not fully specify a data type in Zarr V2. +The name of the object codec used, if one was used, is also required. +Although this fact can be ignored for many simple numeric data types, any comprehensive approach to +Zarr V2 data types must either reject the "object" data types or include the "object codec" +identifier in the JSON form of the basic data type model. Data Types in Zarr Version 3 ---------------------------- diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index bbc3cf7e93..c2a5449f49 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -36,7 +36,7 @@ @dataclass(frozen=True) class BaseFloat(ZDType[TFloatDType_co, TFloatScalar_co], HasEndianness, HasItemSize): """ - Base class for NumPy float data types. + Base class for float data types. 
Attributes ---------- From eec3ec39c87ed9e16fd1b094529209a740622ccb Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 12:44:52 +0200 Subject: [PATCH 13/42] gamble on a new pytest version fixing windows CI failure --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f34b220c9c..5e6374900c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,7 +72,7 @@ gpu = [ test = [ "coverage", # Pin possibly due to https://github.com/pytest-dev/pytest-cov/issues/693 - "pytest<8.4", + "pytest==8.4.0", "pytest-asyncio", "pytest-cov", "pytest-accept", From f942508c667eb7a0e899a554bf15feeb847ed595 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 12:45:01 +0200 Subject: [PATCH 14/42] gamble on a new pytest version fixing windows CI failure --- src/zarr/core/dtype/npy/int.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index d71e938511..f2d06843ab 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -411,8 +411,8 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): """ - A Zarr data type for arrays containing 8-bit unsigned integers. - + A Zarr data type for arrays containing 8-bit unsigned integers. + Wraps the NumPy ``np.dtypes.UInt8DType`` data type. Scalars for this data type are instances of ``np.uint8``. Attributes From 9d3dc48ca40b6be41828fe22167e020fb1532b76 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 12:52:45 +0200 Subject: [PATCH 15/42] revert change to pytest dep --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5e6374900c..f34b220c9c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,7 +72,7 @@ gpu = [ test = [ "coverage", # Pin possibly due to https://github.com/pytest-dev/pytest-cov/issues/693 - "pytest==8.4.0", + "pytest<8.4", "pytest-asyncio", "pytest-cov", "pytest-accept", From 532ae1ee4c9462ea1895f9f992bee0a8961290f7 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 12:55:06 +0200 Subject: [PATCH 16/42] skip example tests on windows --- tests/test_examples.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_examples.py b/tests/test_examples.py index 8e26785c46..c97766364b 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -2,6 +2,7 @@ import re import subprocess +import sys from pathlib import Path from typing import Final @@ -61,6 +62,9 @@ def resave_script(source_path: Path, dest_path: Path) -> None: dest_path.write_text(dest_text) +@pytest.mark.skipif( + sys.platform in ("win32",), reason="This test fails due for unknown reasons on Windows in CI." 
+) @pytest.mark.parametrize("script_path", script_paths) def test_scripts_can_run(script_path: Path, tmp_path: Path) -> None: dest_path = tmp_path / script_path.name From 620749b784f51d99fc19330ae4a4c3be2b52fdda Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 14:27:52 +0200 Subject: [PATCH 17/42] unexclude api from exclude_patterns --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index c565c97b54..efbf708c25 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -124,7 +124,7 @@ def skip_submodules( # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "talks", "api"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "talks"] # The reST default role (used for this markup: `text`) to use for all # documents. From 45aab29e0e8ce460b997a584d769f078aaca45e5 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 14:28:45 +0200 Subject: [PATCH 18/42] harmonize docstrings --- src/zarr/core/dtype/npy/bool.py | 6 ++-- src/zarr/core/dtype/npy/bytes.py | 60 ++++++++----------------------- src/zarr/core/dtype/npy/int.py | 40 ++++----------------- src/zarr/core/dtype/npy/string.py | 15 +++++--- 4 files changed, 35 insertions(+), 86 deletions(-) diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index cb500f3db6..2697086f36 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -21,8 +21,10 @@ @dataclass(frozen=True, kw_only=True, slots=True) class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): """ - A Zarr data type for arrays containing booleans. Wraps the NumPy - ``np.dtypes.BoolDType`` data type. Scalars for this data type are instances of ``np.bool_``. + A Zarr data type for arrays containing booleans. + + Wraps the NumPy ``np.dtypes.BoolDType`` data type. Scalars for this data type are instances of + ``np.bool_``. Attributes ---------- diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index e259610489..52f9f5563e 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -35,8 +35,10 @@ class FixedLengthBytesConfig(TypedDict): @dataclass(frozen=True, kw_only=True) class NullTerminatedBytes(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize): """ - A Zarr data type for arrays containing null-terminated bytes. Wraps the NumPy - ``np.dtypes.BytesDType`` data type. Scalars for this data type are instances of ``np.bytes_``. + A Zarr data type for arrays containing fixed-length null-terminated byte sequences. + + Wraps the NumPy ``np.dtypes.BytesDType`` data type. Scalars for this data type are instances of + ``np.bytes_``. This data type is parametrized by an integral length which specifies size in bytes of each scalar. Because this data type uses null-terminated semantics, indexing into @@ -158,15 +160,6 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[NullTerminatedBytesJSONV3] """ Check that the input is a valid representation of NullTerminatedBytes in Zarr V3. 
- The input must be a mapping with the following structure: - - { - "name": "null_terminated_bytes", - "configuration": { - "length_bytes": - } - } - Parameters ---------- data : DTypeJSON @@ -256,19 +249,7 @@ def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[str, None] | NullTerminatedBytesJSONV3: """ - Generate a JSON representation of NullTerminatedBytes. - - If zarr_format is 2, the return value will be a dictionary with the form - { - "name": "|S", - "object_codec_id": None - } - - If zarr_format is 3, the resulting JSON will be a dictionary with the form - { - "name": "null_terminated_bytes", - "configuration": {"length_bytes": self.length} - } + Generate a JSON representation of this data type. Parameters ---------- @@ -440,13 +421,10 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class RawBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize): - # np.dtypes.VoidDType is specified in an odd way in NumPy - # it cannot be used to create instances of the dtype - # so we have to tell mypy to ignore this here - """ - A Zarr data type for arrays containing raw bytes. Wraps the NumPy ``void`` data type. - Scalars for this data type are instances of ``np.void``. + A Zarr data type for arrays containing fixed-length sequences of raw bytes. + + Wraps the NumPy ``void`` data type. Scalars for this data type are instances of ``np.void``. This data type is parametrized by an integral length which specifies size in bytes of each scalar belonging to this data type. @@ -491,6 +469,9 @@ class does not support structured data types. """ + # np.dtypes.VoidDType is specified in an odd way in NumPy + # it cannot be used to create instances of the dtype + # so we have to tell mypy to ignore this here dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] _zarr_v3_name: ClassVar[Literal["raw_bytes"]] = "raw_bytes" @@ -694,19 +675,7 @@ def to_json(self, zarr_format: Literal[3]) -> RawBytesJSONV3: ... def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | RawBytesJSONV3: """ - Generate a JSON representation of RawBytes. - - If zarr_format is 2, the return value will be a dictionary with the form - { - "name": "|V", - "object_codec_id": None - } - - If zarr_format is 3, the resulting JSON will be a dictionary with the form - { - "name": "raw_bytes", - "configuration": {"length_bytes": self.length} - } + Generate a JSON representation of this data type. Parameters ---------- @@ -874,8 +843,9 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class VariableLengthBytes(ZDType[np.dtypes.ObjectDType, bytes], HasObjectCodec): """ - A Zarr data type for arrays containing variable-length bytes. Wraps the NumPy - "object" data type. Scalars for this data type are instances of plain python bytes. + A Zarr data type for arrays containing variable-length sequences of bytes. + + Wraps the NumPy "object" data type. Scalars for this data type are instances of ``bytes``. Attributes ---------- diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index f2d06843ab..75bbebef18 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -256,7 +256,7 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: @dataclass(frozen=True, kw_only=True) class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): """ - A Zarr data type for 8-bit signed integers. + A Zarr data type for arrays containing 8-bit signed integers. Wraps the NumPy ``np.dtypes.Int8DType`` data type. 
Scalars for this data type are instances of ``np.int8``. @@ -555,8 +555,10 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): """ - A Zarr data type for arrays containing 16-bit signed integers. Wraps the NumPy - np.dtypes.Int16DType data type. Scalars for this data type are instances np.int16. + A Zarr data type for arrays containing 16-bit signed integers. + + Wraps the NumPy ``np.dtypes.Int16DType`` data type. Scalars for this data type are instances of + ``np.int16``. Attributes ---------- @@ -675,16 +677,6 @@ def to_json( """ Serialize this ZDType to v2- or v3-flavored JSON - If the zarr_format is 2, then return a dict like this: - .. code-block:: json - - { - "name": ">i2" or "u2" or "i4" or " str: @dataclass(frozen=True, kw_only=True) class VariableLengthUTF8(UTF8Base[np.dtypes.StringDType]): # type: ignore[type-var] """ - A Zarr data type for arrays containing variable-length UTF-8 strings. Wraps the - NumPy np.dtypes.StringDType data type. Scalars for this data type are Python strings. + A Zarr data type for arrays containing variable-length UTF-8 strings. + + Wraps the NumPy ``np.dtypes.StringDType`` data type. Scalars for this data type are instances + of ``str``. Attributes @@ -672,8 +675,10 @@ def to_native_dtype(self) -> np.dtypes.StringDType: @dataclass(frozen=True, kw_only=True) class VariableLengthUTF8(UTF8Base[np.dtypes.ObjectDType]): # type: ignore[no-redef] """ - A Zarr data type for arrays containing variable-length UTF-8 strings. Wraps the - NumPy np.dtypes.ObjectDType data type. Scalars for this data type are Python strings. + A Zarr data type for arrays containing variable-length UTF-8 strings. + + Wraps the NumPy ``np.dtypes.ObjectDType`` data type. Scalars for this data type are instances + of ``str``. Attributes From bf15d713a7f8b8db08181ad0f24102c2477df5c1 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 16:16:30 +0200 Subject: [PATCH 19/42] numpy -> np --- docs/user-guide/data_types.rst | 2 +- src/zarr/core/dtype/npy/bool.py | 2 +- src/zarr/core/dtype/npy/bytes.py | 10 +++++----- src/zarr/core/dtype/npy/complex.py | 4 ++-- src/zarr/core/dtype/npy/float.py | 6 +++--- src/zarr/core/dtype/npy/int.py | 16 ++++++++-------- src/zarr/core/dtype/npy/string.py | 6 +++--- src/zarr/core/dtype/npy/structured.py | 8 ++++---- src/zarr/core/dtype/npy/time.py | 18 +++++++++--------- src/zarr/core/indexing.py | 2 +- 10 files changed, 37 insertions(+), 37 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 8a2ecd0d9b..e8583a3fdc 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -179,7 +179,7 @@ We do this with an abstract Zarr data type class: `ZDType <../api/zarr/dtype/ind which provides Zarr V2 and Zarr V3 compatibility routines for "native" data types. In this context, a "native" data type is a Python class, typically defined in another library, that -models an array's data type. For example, ``np.dtypes.UInt8DType`` is a native data type defined in NumPy. +models an array's data type. For example, ``numpy.dtypes.UInt8DType`` is a native data type defined in NumPy. Zarr Python wraps the NumPy ``uint8`` with a ``ZDType`` instance called `UInt8 <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_. 
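A minimal doctest-style sketch of the wrapping described above, assuming ``zarr.dtype.UInt8`` is
importable as documented and exposes the ``from_native_dtype`` and ``to_native_dtype`` methods
described elsewhere in this series:

.. code-block:: python

    >>> import numpy as np
    >>> from zarr.dtype import UInt8
    >>> # wrap the native NumPy data type in its Zarr-level equivalent
    >>> zdtype = UInt8.from_native_dtype(np.dtype('uint8'))
    >>> # round-trip back to the native NumPy data type
    >>> assert zdtype.to_native_dtype() == np.dtype('uint8')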
diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index 2697086f36..a3f1653c95 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -23,7 +23,7 @@ class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): """ A Zarr data type for arrays containing booleans. - Wraps the NumPy ``np.dtypes.BoolDType`` data type. Scalars for this data type are instances of + Wraps the ``np.dtypes.BoolDType`` data type. Scalars for this data type are instances of ``np.bool_``. Attributes diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index 52f9f5563e..ea59af44eb 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -37,7 +37,7 @@ class NullTerminatedBytes(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLengt """ A Zarr data type for arrays containing fixed-length null-terminated byte sequences. - Wraps the NumPy ``np.dtypes.BytesDType`` data type. Scalars for this data type are instances of + Wraps the ``np.dtypes.BytesDType`` data type. Scalars for this data type are instances of ``np.bytes_``. This data type is parametrized by an integral length which specifies size in bytes of each @@ -724,7 +724,7 @@ def _cast_scalar_unchecked(self, data: object) -> np.void: Returns ------- - np.void + numpy.void The casted data as a NumPy void scalar. Notes @@ -753,7 +753,7 @@ def cast_scalar(self, data: object) -> np.void: Returns ------- - np.void + numpy.void The data cast as a NumPy void scalar. Raises @@ -775,7 +775,7 @@ def default_scalar(self) -> np.void: Returns ------- - np.void + numpy.void The default scalar value. """ return self.to_native_dtype().type(("\x00" * self.length).encode("ascii")) @@ -815,7 +815,7 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: Returns ------- - np.void + numpy.void The NumPy void scalar. Raises diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index 88ec58bb75..74742feab4 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -349,7 +349,7 @@ class Complex64(BaseComplex[np.dtypes.Complex64DType, np.complex64]): """ A Zarr data type for arrays containing 64 bit complex floats. - Wraps the NumPy ``np.dtypes.Complex64DType`` data type. Scalars for this data type + Wraps the ``np.dtypes.Complex64DType`` data type. Scalars for this data type are instances of ``np.complex64``. Attributes @@ -384,7 +384,7 @@ class Complex128(BaseComplex[np.dtypes.Complex128DType, np.complex128], HasEndia """ A Zarr data type for arrays containing 64 bit complex floats. - Wraps the NumPy ``np.dtypes.Complex128DType`` data type. Scalars for this data type + Wraps the ``np.dtypes.Complex128DType`` data type. Scalars for this data type are instances of ``np.complex128``. Attributes diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index c2a5449f49..06ef0cc4c2 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -317,7 +317,7 @@ class Float16(BaseFloat[np.dtypes.Float16DType, np.float16]): """ A Zarr data type for arrays containing 16-bit floating point numbers. - Wraps the NumPy ``np.dtypes.Float16DType`` data type. Scalars for this data type are instances + Wraps the ``np.dtypes.Float16DType`` data type. Scalars for this data type are instances of ``np.float16``. 
Attributes @@ -352,7 +352,7 @@ class Float32(BaseFloat[np.dtypes.Float32DType, np.float32]): """ A Zarr data type for arrays containing 32-bit floating point numbers. - Wraps the NumPy ``np.dtypes.Float32DType`` data type. Scalars for this data type are instances + Wraps the ``np.dtypes.Float32DType`` data type. Scalars for this data type are instances of ``np.float32``. Attributes @@ -387,7 +387,7 @@ class Float64(BaseFloat[np.dtypes.Float64DType, np.float64]): """ A Zarr data type for arrays containing 64-bit floating point numbers. - Wraps the NumPy ``np.dtypes.Float64DType`` data type. Scalars for this data type are instances + Wraps the ``np.dtypes.Float64DType`` data type. Scalars for this data type are instances of ``np.float64``. Attributes diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index 75bbebef18..3e729a426c 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -258,7 +258,7 @@ class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): """ A Zarr data type for arrays containing 8-bit signed integers. - Wraps the NumPy ``np.dtypes.Int8DType`` data type. Scalars for this data type are + Wraps the ``np.dtypes.Int8DType`` data type. Scalars for this data type are instances of ``np.int8``. Attributes @@ -413,7 +413,7 @@ class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): """ A Zarr data type for arrays containing 8-bit unsigned integers. - Wraps the NumPy ``np.dtypes.UInt8DType`` data type. Scalars for this data type are instances of ``np.uint8``. + Wraps the ``np.dtypes.UInt8DType`` data type. Scalars for this data type are instances of ``np.uint8``. Attributes ---------- @@ -557,7 +557,7 @@ class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): """ A Zarr data type for arrays containing 16-bit signed integers. - Wraps the NumPy ``np.dtypes.Int16DType`` data type. Scalars for this data type are instances of + Wraps the ``np.dtypes.Int16DType`` data type. Scalars for this data type are instances of ``np.int16``. Attributes @@ -717,7 +717,7 @@ class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): """ A Zarr data type for arrays containing 16-bit unsigned integers. - Wraps the NumPy ``np.dtypes.UInt16DType`` data type. Scalars for this data type are instances of + Wraps the ``np.dtypes.UInt16DType`` data type. Scalars for this data type are instances of ``np.uint16``. Attributes @@ -877,7 +877,7 @@ class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): """ A Zarr data type for arrays containing 32-bit signed integers. - Wraps the NumPy ``np.dtypes.Int32DType`` data type. Scalars for this data type are instances of + Wraps the ``np.dtypes.Int32DType`` data type. Scalars for this data type are instances of ``np.int32``. Attributes @@ -1037,7 +1037,7 @@ class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): """ A Zarr data type for arrays containing 32-bit unsigned integers. - Wraps the NumPy ``np.dtypes.UInt32DType`` data type. Scalars for this data type are instances of + Wraps the ``np.dtypes.UInt32DType`` data type. Scalars for this data type are instances of ``np.uint32``. Attributes @@ -1193,7 +1193,7 @@ class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): """ A Zarr data type for arrays containing 64-bit signed integers. - Wraps the NumPy ``np.dtypes.Int64DType`` data type. Scalars for this data type are instances of + Wraps the ``np.dtypes.Int64DType`` data type. Scalars for this data type are instances of ``np.int64``. 
Attributes @@ -1349,7 +1349,7 @@ class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): """ A Zarr data type for arrays containing 64-bit unsigned integers. - Wraps the NumPy ``np.dtypes.UInt64DType`` data type. Scalars for this data type + Wraps the ``np.dtypes.UInt64DType`` data type. Scalars for this data type are instances of ``np.uint64``. Attributes diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index fe0d87cfe2..0ef2d94e48 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -62,7 +62,7 @@ class FixedLengthUTF32( """ A Zarr data type for arrays containing fixed-length UTF-32 strings. - Wraps the NumPy ``np.dtypes.StrDType`` data type. Scalars for this data type are instances of + Wraps the ``np.dtypes.StrDType`` data type. Scalars for this data type are instances of ``np.str_``. Attributes @@ -643,7 +643,7 @@ class VariableLengthUTF8(UTF8Base[np.dtypes.StringDType]): # type: ignore[type- """ A Zarr data type for arrays containing variable-length UTF-8 strings. - Wraps the NumPy ``np.dtypes.StringDType`` data type. Scalars for this data type are instances + Wraps the ``np.dtypes.StringDType`` data type. Scalars for this data type are instances of ``str``. @@ -677,7 +677,7 @@ class VariableLengthUTF8(UTF8Base[np.dtypes.ObjectDType]): # type: ignore[no-re """ A Zarr data type for arrays containing variable-length UTF-8 strings. - Wraps the NumPy ``np.dtypes.ObjectDType`` data type. Scalars for this data type are instances + Wraps the ``np.dtypes.ObjectDType`` data type. Scalars for this data type are instances of ``str``. diff --git a/src/zarr/core/dtype/npy/structured.py b/src/zarr/core/dtype/npy/structured.py index 2452baa6d9..5d009a9c4f 100644 --- a/src/zarr/core/dtype/npy/structured.py +++ b/src/zarr/core/dtype/npy/structured.py @@ -302,7 +302,7 @@ def _cast_scalar_unchecked(self, data: StructuredScalarLike) -> np.void: Returns ------- - np.void + numpy.void The casted data as a numpy structured scalar. Notes @@ -335,7 +335,7 @@ def cast_scalar(self, data: object) -> np.void: Returns ------- - np.void + numpy.void The data cast as a NumPy structured scalar. Raises @@ -355,7 +355,7 @@ def default_scalar(self) -> np.void: Returns ------- - np.void + numpy.void The default scalar value, which is the scalar representation of 0 cast to this structured data type. """ @@ -375,7 +375,7 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: Returns ------- - np.void + numpy.void The NumPy structured scalar. Raises diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 625bda5d2f..03304ec625 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -58,7 +58,7 @@ def datetime_from_int(data: int, *, unit: DateTimeUnit, scale_factor: int) -> np Returns ------- - np.datetime64 + numpy.datetime64 The datetime64 value. """ dtype_name = f"datetime64[{scale_factor}{unit}]" @@ -71,7 +71,7 @@ def datetimelike_to_int(data: np.datetime64 | np.timedelta64) -> int: Parameters ---------- - data : np.datetime64 | np.timedelta64 + data : np.datetime64 | numpy.timedelta64 The value to convert. Returns @@ -273,7 +273,7 @@ class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], Has """ A Zarr data type for arrays containing NumPy TimeDelta64 data. - Wraps the NumPy ``np.dtypesTimeDelta64DType`` data type. Scalars for this data type + Wraps the ``np.dtypesTimeDelta64DType`` data type. 
Scalars for this data type are instances of ``np.timedelta64``. Attributes @@ -463,7 +463,7 @@ def _cast_scalar_unchecked(self, data: TimeDeltaLike) -> np.timedelta64: Returns ------- - np.timedelta64 + numpy.timedelta64 The input data cast as a numpy timedelta64 scalar. """ return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") @@ -500,7 +500,7 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedel Returns ------- - np.timedelta64 + numpy.timedelta64 The scalar value of this data type. Raises @@ -518,7 +518,7 @@ class DateTime64(TimeDTypeBase[np.dtypes.DateTime64DType, np.datetime64], HasEnd """ A Zarr data type for arrays containing NumPy Datetime64 data. - Wraps the NumPy ``np.dtypesTimeDelta64DType`` data type. Scalars for this data type + Wraps the ``np.dtypesTimeDelta64DType`` data type. Scalars for this data type are instances of ``np.datetime64``. Attributes @@ -706,7 +706,7 @@ def _cast_scalar_unchecked(self, data: DateTimeLike) -> np.datetime64: Returns ------- - np.datetime64 + numpy.datetime64 The casted data as a numpy datetime scalar. """ return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") @@ -722,7 +722,7 @@ def cast_scalar(self, data: object) -> np.datetime64: Returns ------- - np.datetime64 + numpy.datetime64 The data cast as a numpy datetime scalar. Raises @@ -758,7 +758,7 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetim Returns ------- - np.datetime64 + numpy.datetime64 The numpy datetime scalar. Raises diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index c11889f7f4..a41699b99e 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -799,7 +799,7 @@ def slice_to_range(s: slice, length: int) -> range: def ix_(selection: Any, shape: ChunkCoords) -> npt.NDArray[np.intp]: - """Convert an orthogonal selection to a numpy advanced (fancy) selection, like ``numpy.ix_`` + """Convert an orthogonal selection to a numpy advanced (fancy) selection, like ``np.ix_`` but with support for slices and single ints.""" # normalisation From 27cccdd4ae2c2f7cf07fbe2ef00552e1bf2bf17b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 16:17:32 +0200 Subject: [PATCH 20/42] restructure list of dtypes --- docs/user-guide/data_types.rst | 57 ++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index e8583a3fdc..ccde32c4a7 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -1,4 +1,4 @@ -Array Data Types +Array data types ================ Zarr's Data Type Model @@ -179,7 +179,7 @@ We do this with an abstract Zarr data type class: `ZDType <../api/zarr/dtype/ind which provides Zarr V2 and Zarr V3 compatibility routines for "native" data types. In this context, a "native" data type is a Python class, typically defined in another library, that -models an array's data type. For example, ``numpy.dtypes.UInt8DType`` is a native data type defined in NumPy. +models an array's data type. For example, ``np.dtypes.UInt8DType`` is a native data type defined in NumPy. Zarr Python wraps the NumPy ``uint8`` with a ``ZDType`` instance called `UInt8 <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_. 
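A minimal sketch of how a native NumPy data type is mapped to its ``ZDType`` wrapper, assuming the
``get_data_type_from_native_dtype`` helper from ``zarr.core.dtype`` (imported elsewhere in this
series) is the resolution entry point:

.. code-block:: python

    >>> import numpy as np
    >>> from zarr.core.dtype import get_data_type_from_native_dtype
    >>> # resolve a native NumPy data type to its registered Zarr data type
    >>> get_data_type_from_native_dtype(np.dtype('float64'))
    Float64(endianness='little')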
@@ -195,14 +195,19 @@ API for the following operations: - Encoding and decoding a scalar value to and from Zarr V2 and Zarr V3 array metadata - Casting a Python object to a scalar value consistent with the data type -The following section lists the data types built into Zarr Python. +List of data types +^^^^^^^^^^^^^^^^^^ -Boolean Types -^^^^^^^^^^^^^ +The following section lists the data types built in to Zarr Python. With a few exceptions, Zarr +Python supports nearly all of the data types in NumPy. If you need a data type that is not listed +here, it's possible to create it yourself: see :ref:`adding-new-data-types`. + +Boolean +""""""" - `Boolean <../api/zarr/dtype/index.html#zarr.dtype.Bool>`_ -Integral Types -^^^^^^^^^^^^^^ +Integral +"""""""" - `Signed 8-bit integer <../api/zarr/dtype/index.html#zarr.dtype.Int8>`_ - `Signed 16-bit integer <../api/zarr/dtype/index.html#zarr.dtype.Int16>`_ - `Signed 32-bit integer <../api/zarr/dtype/index.html#zarr.dtype.Int32>`_ @@ -212,27 +217,38 @@ Integral Types - `Unsigned 32-bit integer <../api/zarr/dtype/index.html#zarr.dtype.UInt32>`_ - `Unsigned 64-bit integer <../api/zarr/dtype/index.html#zarr.dtype.UInt64>`_ -Floating-Point Types -^^^^^^^^^^^^^^^^^^^^ +Floating-point +"""""""""""""" - `16-bit floating-point <../api/zarr/dtype/index.html#zarr.dtype.Float16>`_ - `32-bit floating-point <../api/zarr/dtype/index.html#zarr.dtype.Float32>`_ - `64-bit floating-point <../api/zarr/dtype/index.html#zarr.dtype.Float64>`_ - `64-bit complex floating-point <../api/zarr/dtype/index.html#zarr.dtype.Complex64>`_ - `128-bit complex floating-point <../api/zarr/dtype/index.html#zarr.dtype.Complex128>`_ -String Types -^^^^^^^^^^^^ +String +"""""" - `Fixed-length UTF-32 string <../api/zarr/dtype/index.html#zarr.dtype.FixedLengthUTF32>`_ - `Variable-length UTF-8 string <../api/zarr/dtype/index.html#zarr.dtype.VariableLengthUTF8>`_ -Byte String Types -^^^^^^^^^^^^^^^^^ +Bytes +""""" - `Fixed-length null-terminated bytes <../api/zarr/dtype/index.html#zarr.dtype.NullTerminatedBytes>`_ - `Fixed-length raw bytes <../api/zarr/dtype/index.html#zarr.dtype.RawBytes>`_ - `Variable-length bytes <../api/zarr/dtype/index.html#zarr.dtype.VariableLengthBytes>`_ +Temporal +"""""""" +- `DateTime64 <../api/zarr/dtype/index.html#zarr.dtype.DateTime64>`_ +- `TimeDelta64 <../api/zarr/dtype/index.html#zarr.dtype.TimeDelta64>`_ + +Struct-like +""""""""""" +- `Structured <../api/zarr/dtype/index.html#zarr.dtype.Structured>`_ + Example Usage -~~~~~~~~~~~~~ +^^^^^^^^^^^^^ + +This section will demonstrates the basic usage of Zarr data types. Create a ``ZDType`` from a native data type: @@ -296,8 +312,10 @@ Deserialize a scalar value from JSON: >>> scalar_value = int8.from_json_scalar(42, zarr_format=3) >>> assert scalar_value == np.int8(42) +.. _adding-new-data-types: + Adding New Data Types -~~~~~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^^^^^ Each Zarr data type is a separate Python class that inherits from `ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_. You can define a custom data type by @@ -311,7 +329,7 @@ Python project directory. :language: python Data Type Resolution -~~~~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^^^^ Although Zarr Python uses a different data type model from NumPy, you can still define a Zarr array with a NumPy data type object: @@ -338,7 +356,7 @@ data type as its ``dtype`` attribute: .. 
code-block:: python >>> type(a.dtype) - + But if we inspect the metadata for the array, we can see the Zarr data type object: @@ -379,8 +397,9 @@ a static lookup table, Zarr Python relies on a dynamic approach to data type res Zarr Python defines a collection of Zarr data types. This collection, called a "data type registry," is essentially a dictionary where the keys are strings (a canonical name for each data type), and the values are the data type classes themselves. Dynamic data type resolution entails iterating over -these data type classes, invoking a special class constructor defined on each one, and returning a -concrete data type instance if and only if exactly one of those constructor invocations is successful. +these data type classes, invoking that class' `from_native_dtype <#api/dtype/ZDType.from_native_dtype>`_ +method, and returning a concrete data type instance if and only if exactly one of those constructor +invocations is successful. In plain language, we take some user input, like a NumPy data type, offer it to all the known data type classes, and return an instance of the one data type class that can accept that user input. From a68751a212bb286dcc147f49f25a181c55094c79 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 16:18:39 +0200 Subject: [PATCH 21/42] code block --- docs/user-guide/data_types.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index ccde32c4a7..901b637069 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -283,7 +283,7 @@ Serialize to JSON for Zarr V2: .. note:: The representation returned by ``to_json`` is more abstract than the literal contents of Zarr V2 - array metadata, because the JSON representation used by the `ZDType` classes must be distinct across + array metadata, because the JSON representation used by the ``ZDType`` classes must be distinct across different data types. Zarr V2 identifies multiple distinct data types with the "object" data type identifier ``"|O"``, which means extra information is needed to disambiguate these data types from one another. That's the reason for the ``object_codec_id`` field you see here. See the From f3c44dba35372084f41cca8976fc0b2e4a683aca Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 16:28:40 +0200 Subject: [PATCH 22/42] prose --- docs/user-guide/data_types.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 901b637069..4a6761c4f2 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -118,8 +118,7 @@ objects has a consistent type, then we can use a special encoding procedure to s is how Zarr Python stores variable-length UTF-8 strings, or variable-length byte strings. Although these are separate data types in this library, they are both "object" arrays in NumPy, which means -they have the same Zarr V2 string representation: ``"|O"``. Clearly in this case the string -representation of the data type is ambiguous in this case. +they have the *same* Zarr V2 string representation: ``"|O"``. 
So for Zarr V2 we have to disambiguate different "object" data type arrays on the basis of their encoding procedure, i.e., the codecs declared in the ``filters`` and ``compressor`` attributes of array From 3045e9a2ebe1b3891aa12aa6ccb7c184291493db Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 16:42:32 +0200 Subject: [PATCH 23/42] revert ectopic change --- src/zarr/core/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index a41699b99e..c11889f7f4 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -799,7 +799,7 @@ def slice_to_range(s: slice, length: int) -> range: def ix_(selection: Any, shape: ChunkCoords) -> npt.NDArray[np.intp]: - """Convert an orthogonal selection to a numpy advanced (fancy) selection, like ``np.ix_`` + """Convert an orthogonal selection to a numpy advanced (fancy) selection, like ``numpy.ix_`` but with support for slices and single ints.""" # normalisation From 84b572e1c97bf2fe59e17b4b938d6e7b61dfe432 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 17:15:32 +0200 Subject: [PATCH 24/42] remove trailing underscore from np.void --- src/zarr/core/dtype/npy/bytes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index ea59af44eb..948db9813e 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -445,10 +445,10 @@ class RawBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize from_json(data, zarr_format) : NullTerminatedBytes Create RawBytes from JSON data. - cast_scalar(data) : np.void_ + cast_scalar(data) : np.void Cast a python object to np.void. - default_scalar() : np.void_ + default_scalar() : np.void Return the default scalar value. to_json_scalar(data, zarr_format) : str From f35d4c14de699269dd34b64ca8b5afd26f069807 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 17:24:58 +0200 Subject: [PATCH 25/42] remove methods section, correct attributes --- src/zarr/core/dtype/npy/bytes.py | 84 +++++--------------------------- src/zarr/core/dtype/npy/int.py | 19 -------- 2 files changed, 11 insertions(+), 92 deletions(-) diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index 948db9813e..d504dffe94 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -46,34 +46,11 @@ class NullTerminatedBytes(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLengt Attributes ---------- - length : int - The length of the bytes. - dtype_cls: ClassVar[type[np.dtypes.BytesDType[int]]] = np.dtypes.BytesDType The NumPy data type wrapped by this ZDType. - - Methods - ------- - to_json(zarr_format) : dict - Convert the NullTerminatedBytes to JSON data. - - from_json(data, zarr_format) : NullTerminatedBytes - Create NullTerminatedBytes from JSON data. - - cast_scalar(data) : np.bytes_ - Cast a python object to np.bytes_. - - default_scalar() : np.bytes_ - Return the default scalar value. - - to_json_scalar(data, zarr_format) : str - Convert input to a scalar and return as JSON data. - - from_json_scalar(data, zarr_format) : np.bytes_ - Create np.bytes_ from JSON data. - - item_size : int - Return the item size, in bytes, of the data type. + _zarr_v3_name : ClassVar[Literal["null_terminated_bytes"]] + length : int + The length of the bytes. 
Notes ----- @@ -380,7 +357,7 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: """ - Read a JSON-serializable value as np.bytes_. + Read a JSON-serializable value as ``np.bytes_``. Parameters ---------- @@ -431,34 +408,11 @@ class RawBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize Attributes ---------- - length : int - The length of the bytes. - dtype_cls: ClassVar[type[np.dtypes.VoidDType[int]]] = np.dtypes.VoidDtype The NumPy data type wrapped by this ZDType. - - Methods - ------- - to_json(zarr_format) : dict - Convert RawBytes to JSON data. - - from_json(data, zarr_format) : NullTerminatedBytes - Create RawBytes from JSON data. - - cast_scalar(data) : np.void - Cast a python object to np.void. - - default_scalar() : np.void - Return the default scalar value. - - to_json_scalar(data, zarr_format) : str - Convert input to a scalar and return as JSON data. - - from_json_scalar(data, zarr_format) : np.bytes_ - Create a np.void from JSON data. - - item_size : int - Return the item size, in bytes, of the data type. + _zarr_v3_name : ClassVar[Literal["raw_bytes"]] + length : int + The length of the bytes. Notes ----- @@ -851,26 +805,10 @@ class VariableLengthBytes(ZDType[np.dtypes.ObjectDType, bytes], HasObjectCodec): ---------- dtype_cls: ClassVar[type[np.dtypes.ObjectDType]] = np.dtypes.ObjectDType The NumPy data type wrapped by this ZDType. - - Methods - ------- - to_json(zarr_format) : dict - Convert the VariableLengthBytes to JSON data. - - from_json(data, zarr_format) : VariableLengthBytes - Create VariableLengthBytes from JSON data. - - cast_scalar(data) : bytes - Cast a python object to bytes. - - default_scalar() : bytes - Return the default scalar value. - - to_json_scalar(data, zarr_format) : str - Convert input to a scalar and return as JSON data. - - from_json_scalar(data, zarr_format) : bytes - Create bytes from JSON data. + _zarr_v3_name: ClassVar[Literal["variable_length_bytes"]] = "variable_length_bytes" + The name of this data type in Zarr V3. + object_codec_id: ClassVar[Literal["vlen-bytes"]] = "vlen-bytes" + The object codec ID for this data type. Notes ----- diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index 3e729a426c..4e66e62315 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -63,25 +63,6 @@ class BaseInt(ZDType[TIntDType_co, TIntScalar_co], HasItemSize): ---------- _zarr_v2_names : ClassVar[tuple[str, ...]] Possible Zarr V2 JSON names for the data type. - - Methods - ------- - _check_json_v2(data: object) -> TypeGuard[DTypeConfig_V2[str, None]]: - Check if input is a valid JSON representation for Zarr v2. - _check_json_v3(data: object) -> TypeGuard[str]: - Check if JSON value is consistent with Zarr v3 for this data type. - _check_scalar(data: object) -> TypeGuard[IntLike]: - Check if a Python object is IntLike. - _cast_scalar_unchecked(data: IntLike) -> TIntScalar_co: - Create an integer without type checking of the input. - cast_scalar(data: object) -> TIntScalar_co: - Convert object to NumPy integer, raising TypeError if invalid. - default_scalar() -> TIntScalar_co: - Get the default value, which is 0 cast to this dtype. - from_json_scalar(data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar_co: - Read a JSON-serializable value as a NumPy int scalar. - to_json_scalar(data: object, *, zarr_format: ZarrFormat) -> int: - Convert an object to JSON-serializable scalar. 
""" _zarr_v2_names: ClassVar[tuple[str, ...]] From 669afd3b0cb7fb7b9abfbc6d81795de724396e89 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 18:10:01 +0200 Subject: [PATCH 26/42] resolve docs build error my re-ordering plugins. great stuff, sphinx --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index efbf708c25..61d83ef819 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -38,7 +38,6 @@ extensions = [ "sphinx.ext.autodoc", "sphinx.ext.autosummary", - "sphinx.ext.viewcode", "sphinx.ext.intersphinx", 'autoapi.extension', "numpydoc", @@ -46,6 +45,7 @@ "sphinx_copybutton", "sphinx_design", 'sphinx_reredirects', + "sphinx.ext.viewcode", ] issues_github_path = "zarr-developers/zarr-python" From 9af24edb6c43fca20d19e5f97fe778e3782eca1c Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 18:23:32 +0200 Subject: [PATCH 27/42] numpy -> np --- src/zarr/core/dtype/npy/bytes.py | 8 ++++---- src/zarr/core/dtype/npy/structured.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index d504dffe94..0d07737373 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -678,7 +678,7 @@ def _cast_scalar_unchecked(self, data: object) -> np.void: Returns ------- - numpy.void + np.void The casted data as a NumPy void scalar. Notes @@ -707,7 +707,7 @@ def cast_scalar(self, data: object) -> np.void: Returns ------- - numpy.void + np.void The data cast as a NumPy void scalar. Raises @@ -729,7 +729,7 @@ def default_scalar(self) -> np.void: Returns ------- - numpy.void + np.void The default scalar value. """ return self.to_native_dtype().type(("\x00" * self.length).encode("ascii")) @@ -769,7 +769,7 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: Returns ------- - numpy.void + np.void The NumPy void scalar. Raises diff --git a/src/zarr/core/dtype/npy/structured.py b/src/zarr/core/dtype/npy/structured.py index 5d009a9c4f..2452baa6d9 100644 --- a/src/zarr/core/dtype/npy/structured.py +++ b/src/zarr/core/dtype/npy/structured.py @@ -302,7 +302,7 @@ def _cast_scalar_unchecked(self, data: StructuredScalarLike) -> np.void: Returns ------- - numpy.void + np.void The casted data as a numpy structured scalar. Notes @@ -335,7 +335,7 @@ def cast_scalar(self, data: object) -> np.void: Returns ------- - numpy.void + np.void The data cast as a NumPy structured scalar. Raises @@ -355,7 +355,7 @@ def default_scalar(self) -> np.void: Returns ------- - numpy.void + np.void The default scalar value, which is the scalar representation of 0 cast to this structured data type. """ @@ -375,7 +375,7 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: Returns ------- - numpy.void + np.void The NumPy structured scalar. Raises From 3453af2e0433f197458de4d6e562335979b9d682 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 18:27:14 +0200 Subject: [PATCH 28/42] fix doctests --- docs/user-guide/data_types.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 4a6761c4f2..38f755bbc1 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -355,7 +355,7 @@ data type as its ``dtype`` attribute: .. 
code-block:: python >>> type(a.dtype) - + But if we inspect the metadata for the array, we can see the Zarr data type object: From efb767f009c1bbdafcae3205033238fc83c5f37f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 18:38:39 +0200 Subject: [PATCH 29/42] add pytest to docs env, because this resolves a warning about a missing pytest import? --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f34b220c9c..53b37d6c6d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -109,7 +109,8 @@ docs = [ 'numcodecs[msgpack]', 'rich', 's3fs>=2023.10.0', - 'astroid<4' + 'astroid<4', + 'pytest' ] From 6e6d337f30ee7f5c12b8431d96e3e93ab74a5a94 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 18:58:14 +0200 Subject: [PATCH 30/42] put return types in double backticks --- src/zarr/core/dtype/npy/bool.py | 6 +++--- src/zarr/core/dtype/npy/bytes.py | 6 +++--- src/zarr/core/dtype/npy/string.py | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index a3f1653c95..cb67303f5f 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -235,7 +235,7 @@ def cast_scalar(self, data: object) -> np.bool_: Returns ------- - np.bool_ + ``np.bool_`` The numpy boolean scalar. Raises @@ -254,7 +254,7 @@ def default_scalar(self) -> np.bool_: Returns ------- - np.bool_ + ``np.bool_`` The default value. """ return np.False_ @@ -290,7 +290,7 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: Returns ------- - np.bool_ + ``np.bool_`` The numpy boolean scalar. Raises diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index 0d07737373..10771c48a8 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -308,7 +308,7 @@ def cast_scalar(self, data: object) -> np.bytes_: Returns ------- - np.bytes_ + ``np.bytes_`` The data cast as a NumPy bytes scalar. Raises @@ -328,7 +328,7 @@ def default_scalar(self) -> np.bytes_: Returns ------- - np.bytes_ + ``np.bytes_`` The default scalar value. """ return np.bytes_(b"") @@ -368,7 +368,7 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: Returns ------- - np.bytes_ + ``np.bytes_`` The NumPy bytes scalar obtained from decoding the base64 string. Raises diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index 0ef2d94e48..dcfa4e89cf 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -244,7 +244,7 @@ def default_scalar(self) -> np.str_: Returns ------- - np.str_ + ``np.str_`` The default scalar value. """ return np.str_("") @@ -280,7 +280,7 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: Returns ------- - np.str_ + ``np.str_`` The native scalar value. """ if check_json_str(data): @@ -315,7 +315,7 @@ def cast_scalar(self, data: object) -> np.str_: Returns ------- - np.str_ + ``np.str_`` The native scalar value. 
""" if self._check_scalar(data): From cee23aa2ba89dc51d028f5f4da10da3bc8ee9ec4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 19:14:59 +0200 Subject: [PATCH 31/42] escape piped return types --- src/zarr/core/dtype/npy/bool.py | 7 ++++--- src/zarr/core/dtype/npy/bytes.py | 2 +- src/zarr/core/dtype/npy/int.py | 8 ++++---- src/zarr/core/dtype/npy/string.py | 4 ++-- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index cb67303f5f..87c1fe6ac9 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -28,9 +28,10 @@ class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): Attributes ---------- + _zarr_v3_name : Literal["bool"] = "bool" The Zarr v3 name of the dtype. - _zarr_v2_name : Literal["|b1"] = "|b1" + _zarr_v2_name : ``Literal["|b1"]`` = ``"|b1"`` The Zarr v2 name of the dtype, which is also a string representation of the boolean dtype used by NumPy. dtype_cls : ClassVar[type[np.dtypes.BoolDType]] = np.dtypes.BoolDType @@ -99,7 +100,7 @@ def _check_json_v2( Returns ------- - TypeGuard[DTypeConfig_V2[Literal["|b1"], None]] + ``TypeGuard[DTypeConfig_V2[Literal["|b1"], None]]`` True if the input is a valid JSON representation, False otherwise. """ return ( @@ -194,7 +195,7 @@ def to_json( Returns ------- - DTypeConfig_V2[Literal["|b1"], None] or Literal["bool"] + ``DTypeConfig_V2[Literal["|b1"], None] | Literal["bool"]`` The JSON representation of the Bool instance. Raises diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index 10771c48a8..a89247d7d9 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -990,7 +990,7 @@ def to_json( Returns ------- - DTypeConfig_V2[Literal["|O"], Literal["vlen-bytes"]] | Literal["variable_length_bytes"] + ``DTypeConfig_V2[Literal["|O"], Literal["vlen-bytes"]] | Literal["variable_length_bytes"]`` The JSON-serializable representation of the variable-length bytes data type. For zarr_format 2, returns a dictionary with "name" and "object_codec_id". For zarr_format 3, returns a string identifier "variable_length_bytes". diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index 4e66e62315..6badbaf3b0 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -248,7 +248,7 @@ class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): The class of the underlying NumPy dtype. _zarr_v3_name : ClassVar[Literal["int8"]] = "int8" The name of this data type in Zarr V3. - _zarr_v2_names : ClassVar[tuple[Literal["|i1"]]] = ("|i1",) + _zarr_v2_names : ``ClassVar[tuple[Literal["|i1"]]]`` = ``("|i1",)`` The names of this data type in Zarr V2. """ @@ -362,7 +362,7 @@ def to_json( Returns ------- - DTypeConfig_V2[Literal["|i1"], None] | Literal["int8"] + ``DTypeConfig_V2[Literal["|i1"], None] | Literal["int8"]`` The JSON-serializable representation of the data type. Raises @@ -402,7 +402,7 @@ class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): The class of the underlying NumPy dtype. _zarr_v3_name : ClassVar[Literal["uint8"]] = "uint8" The name of this data type in Zarr V3. - _zarr_v2_names : ClassVar[tuple[Literal["|u1"]]] = ("|u1",) + _zarr_v2_names : ``ClassVar[tuple[Literal["|u1"]]]`` = ``("|u1",)`` The names of this data type in Zarr V2. 
""" @@ -503,7 +503,7 @@ def to_json( Returns ------- - DTypeConfig_V2[Literal["|u1"], None] | Literal["uint8"] + ``DTypeConfig_V2[Literal["|u1"], None] | Literal["uint8"]`` The JSON-serializable representation of the data type. Raises diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index dcfa4e89cf..ae120b0ea6 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -432,7 +432,7 @@ def _check_json_v2( Returns ------- - TypeGuard[DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]]] + ``TypeGuard[DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]]]`` Whether the input is a valid JSON representation of a NumPy "object" data type, and that the object codec id is appropriate for variable-length UTF-8 strings. """ @@ -524,7 +524,7 @@ def to_json( Returns ------- - DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]] | Literal["variable_length_utf8"] + ``DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]] | Literal["variable_length_utf8"]`` The JSON representation of this data type. """ if zarr_format == 2: From 31720953767d4235c997b19bc61669567404904d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 24 Jun 2025 20:59:16 +0200 Subject: [PATCH 32/42] fix internal link --- docs/user-guide/data_types.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 38f755bbc1..f7b4da844d 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -74,7 +74,7 @@ V2 JSON identifier for a data type is just the NumPy ``str`` attribute of that d However, Zarr version 3 data types do not store endianness information. There are two special cases to consider: `"structured" data types <#structured-data-type>`_, and -`"object" <#object-data-type>` data types. +`"object" <#object-data-type>`_ data types. Structured Data Type ^^^^^^^^^^^^^^^^^^^^ From a785e355a3a0b1bac3b22d7c2211e84f6cbb0a7f Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Fri, 27 Jun 2025 16:32:10 +0200 Subject: [PATCH 33/42] Update examples/custom_dtype.py --- examples/custom_dtype.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/custom_dtype.py b/examples/custom_dtype.py index 0534f38685..5670b3821b 100644 --- a/examples/custom_dtype.py +++ b/examples/custom_dtype.py @@ -7,9 +7,6 @@ # ] # /// # -# Note: the zarr version must be changed in order to run this outside of the -# zarr source tree. 
For example, to make this script truly stand-alone, specify the zarr -# dependency as just "zarr" """ Demonstrate how to extend Zarr Python by defining a new data type From d408b9d9fee1863a803ba5e7e4ad7e59bc8d2f4d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 1 Jul 2025 10:50:20 +0200 Subject: [PATCH 34/42] make datatype configuration typeddict readonly --- src/zarr/core/common.py | 6 ++++-- src/zarr/core/dtype/common.py | 6 ++++-- src/zarr/core/dtype/npy/bool.py | 2 +- src/zarr/core/dtype/npy/bytes.py | 6 +++--- src/zarr/core/dtype/npy/complex.py | 2 +- src/zarr/core/dtype/npy/float.py | 2 +- src/zarr/core/dtype/npy/int.py | 16 ++++++++-------- src/zarr/core/dtype/npy/string.py | 4 ++-- src/zarr/core/dtype/npy/structured.py | 2 +- src/zarr/core/dtype/npy/time.py | 2 +- 10 files changed, 26 insertions(+), 22 deletions(-) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 13ee8bc5a0..2655afbe3d 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -19,6 +19,8 @@ overload, ) +from typing_extensions import ReadOnly + from zarr.core.config import config as zarr_config if TYPE_CHECKING: @@ -48,8 +50,8 @@ class NamedConfig(TypedDict, Generic[TName, TConfig]): - name: TName - configuration: TConfig + name: ReadOnly[TName] + configuration: ReadOnly[TConfig] def product(tup: ChunkCoords) -> int: diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index ba0ebba91b..156928b6eb 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -13,6 +13,8 @@ TypeVar, ) +from typing_extensions import ReadOnly + from zarr.core.common import NamedConfig EndiannessStr = Literal["little", "big"] @@ -55,8 +57,8 @@ class DTypeConfig_V2(TypedDict, Generic[TDTypeNameV2_co, TObjectCodecID_co]): - name: TDTypeNameV2_co - object_codec_id: TObjectCodecID_co + name: ReadOnly[TDTypeNameV2_co] + object_codec_id: ReadOnly[TObjectCodecID_co] DTypeSpec_V2 = DTypeConfig_V2[DTypeName_V2, None | str] diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index 87c1fe6ac9..b385a3012d 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -176,7 +176,7 @@ def _from_json_v3(cls: type[Self], data: DTypeJSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["|b1"], None]: ... @overload diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index cc2d3ab1f4..8868645a04 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -224,7 +224,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... @overload @@ -637,7 +637,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... 
@overload @@ -985,7 +985,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json( self, zarr_format: Literal[2] ) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-bytes"]]: ... diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index 74742feab4..10466de4a3 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -185,7 +185,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... @overload diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index 06ef0cc4c2..6c9dcc4019 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -161,7 +161,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... @overload diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index 96b3f570e1..54a2405524 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -343,7 +343,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["|i1"], None]: ... @overload @@ -484,7 +484,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["|u1"], None]: ... @overload @@ -646,7 +646,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">i2", " Self: msg = f"Invalid JSON representation of UInt16. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">u2", " Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">i4", " Self: msg = f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">u4", " Literal["uint32"]: ... @@ -1306,7 +1306,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">i8", " Literal["int64"]: ... @@ -1435,7 +1435,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">u8", " Literal["uint64"]: ... diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index 7ea20f5a29..34e988e3c2 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -171,7 +171,7 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSONV3]: and isinstance(data["configuration"]["length_bytes"], int) ) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... @overload @@ -512,7 +512,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json( self, zarr_format: Literal[2] ) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]]: ... diff --git a/src/zarr/core/dtype/npy/structured.py b/src/zarr/core/dtype/npy/structured.py index 9767c30089..0e9e20a381 100644 --- a/src/zarr/core/dtype/npy/structured.py +++ b/src/zarr/core/dtype/npy/structured.py @@ -232,7 +232,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a JSON object with the key {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[StructuredName_V2, None]: ... @overload diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 9e59ab13fd..8a86cbe720 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -198,7 +198,7 @@ def to_native_dtype(self) -> BaseTimeDType_co: dtype_string = f"{self._numpy_name}[{self.scale_factor}{self.unit}]" return np.dtype(dtype_string).newbyteorder(endianness_to_numpy_str(self.endianness)) # type: ignore[return-value] - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... @overload def to_json(self, zarr_format: Literal[3]) -> DateTime64JSONV3 | TimeDelta64JSONV3: ... 
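The practical effect of the ``ReadOnly`` annotations introduced in this patch is purely static: type checkers reject assignments to the ``name``, ``configuration``, and ``object_codec_id`` keys of these TypedDicts, while the runtime objects remain ordinary dictionaries. Below is a minimal sketch of that behavior; it assumes a ``typing_extensions`` release that provides ``ReadOnly``, and the ``ExampleConfig`` class is hypothetical, mirroring only the shape of ``NamedConfig``:

.. code-block:: python

    from typing import Literal

    from typing_extensions import ReadOnly, TypedDict


    class ExampleConfig(TypedDict):
        # Same shape as NamedConfig: a name plus a configuration mapping.
        name: ReadOnly[Literal["example"]]
        configuration: ReadOnly[dict[str, int]]


    cfg: ExampleConfig = {"name": "example", "configuration": {"length_bytes": 4}}

    # Reading keys is unchanged.
    print(cfg["name"])

    # A type checker such as mypy rejects the assignment below because the key is
    # marked ReadOnly; uncommenting it would still run, since ReadOnly is not
    # enforced at runtime.
    # cfg["name"] = "other"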
From b4a114de5ee9ebd7f3db1e7a4415fd611db00a81 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 1 Jul 2025 12:06:57 +0200 Subject: [PATCH 35/42] document namedconfig --- src/zarr/core/common.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 2655afbe3d..a5ef7aeb7a 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -50,8 +50,19 @@ class NamedConfig(TypedDict, Generic[TName, TConfig]): + """ + A typed dictionary representing an object with a name and configuration, where the configuration + is a mapping of string keys to values, e.g. another typed dictionary or a JSON object. + + This class is generic with two type parameters: the type of the name (``TName``) and the type of + the configuration (``TConfig``). + """ + name: ReadOnly[TName] + """The name of the object.""" + configuration: ReadOnly[TConfig] + """The configuration of the object.""" def product(tup: ChunkCoords) -> int: From 063969695be35dc8e7d449ee1bd98d68183f3cd1 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 2 Jul 2025 13:17:52 +0200 Subject: [PATCH 36/42] document and export typeddicts, move dtype docs to an advanced section --- docs/user-guide/arrays.rst | 3 +- docs/user-guide/data_types.rst | 2 + docs/user-guide/index.rst | 2 +- src/zarr/core/dtype/__init__.py | 39 ++- src/zarr/core/dtype/npy/bool.py | 14 +- src/zarr/core/dtype/npy/bytes.py | 237 +++++++++++----- src/zarr/core/dtype/npy/complex.py | 25 +- src/zarr/core/dtype/npy/float.py | 41 +-- src/zarr/core/dtype/npy/int.py | 139 +++++----- src/zarr/core/dtype/npy/string.py | 125 +++++++-- src/zarr/core/dtype/npy/structured.py | 100 +++++-- src/zarr/core/dtype/npy/time.py | 374 +++++++++++++++++--------- src/zarr/core/dtype/wrapper.py | 28 +- src/zarr/dtype.py | 20 ++ 14 files changed, 778 insertions(+), 371 deletions(-) diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index c27f1296b9..d203b20844 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -23,7 +23,8 @@ The code above creates a 2-dimensional array of 32-bit integers with 10000 rows and 10000 columns, divided into chunks where each chunk has 1000 rows and 1000 columns (and so there will be 100 chunks in total). The data is written to a :class:`zarr.storage.MemoryStore` (e.g. an in-memory dict). See -:ref:`user-guide-persist` for details on storing arrays in other stores. +:ref:`user-guide-persist` for details on storing arrays in other stores, and see +:ref:`user-guide-data-types` for an in-depth look at the data types supported by Zarr. For a complete list of array creation routines see the :mod:`zarr` module documentation. diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index f7b4da844d..cefbfb4a39 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -1,3 +1,5 @@ +.. _user-guide-data-types: + Array data types ================ diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst index ea34ac2561..f92c576f32 100644 --- a/docs/user-guide/index.rst +++ b/docs/user-guide/index.rst @@ -8,7 +8,6 @@ User guide installation arrays - data_types groups attributes storage @@ -21,6 +20,7 @@ Advanced Topics .. 
toctree:: :maxdepth: 1 + data_types performance consolidated_metadata extending diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index d174c5294e..1d36689ec8 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -7,14 +7,28 @@ DTypeJSON, ) from zarr.core.dtype.npy.bool import Bool -from zarr.core.dtype.npy.bytes import NullTerminatedBytes, RawBytes, VariableLengthBytes +from zarr.core.dtype.npy.bytes import ( + NullTerminatedBytes, + NullterminatedBytesJSON_V2, + NullTerminatedBytesJSON_V3, + RawBytes, + RawBytesJSON_V2, + RawBytesJSON_V3, + VariableLengthBytes, + VariableLengthBytesJSON_V2, +) from zarr.core.dtype.npy.complex import Complex64, Complex128 from zarr.core.dtype.npy.float import Float16, Float32, Float64 from zarr.core.dtype.npy.int import Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 -from zarr.core.dtype.npy.structured import ( - Structured, +from zarr.core.dtype.npy.structured import Structured, StructuredJSON_V2, StructuredJSON_V3 +from zarr.core.dtype.npy.time import ( + DateTime64, + DateTime64JSON_V2, + DateTime64JSON_V3, + TimeDelta64, + TimeDelta64JSON_V2, + TimeDelta64JSON_V3, ) -from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 if TYPE_CHECKING: from zarr.core.common import ZarrFormat @@ -27,7 +41,10 @@ from zarr.core.common import JSON from zarr.core.dtype.npy.string import ( FixedLengthUTF32, + FixedLengthUTF32JSON_V2, + FixedLengthUTF32JSON_V3, VariableLengthUTF8, + VariableLengthUTF8JSON_V2, ) from zarr.core.dtype.registry import DataTypeRegistry from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType @@ -39,7 +56,11 @@ "DataTypeRegistry", "DataTypeValidationError", "DateTime64", + "DateTime64JSON_V2", + "DateTime64JSON_V3", "FixedLengthUTF32", + "FixedLengthUTF32JSON_V2", + "FixedLengthUTF32JSON_V3", "Float16", "Float32", "Float64", @@ -48,18 +69,28 @@ "Int32", "Int64", "NullTerminatedBytes", + "NullTerminatedBytesJSON_V3", + "NullterminatedBytesJSON_V2", "RawBytes", + "RawBytesJSON_V2", + "RawBytesJSON_V3", "Structured", + "StructuredJSON_V2", + "StructuredJSON_V3", "TBaseDType", "TBaseScalar", "TimeDelta64", "TimeDelta64", + "TimeDelta64JSON_V2", + "TimeDelta64JSON_V3", "UInt8", "UInt16", "UInt32", "UInt64", "VariableLengthBytes", + "VariableLengthBytesJSON_V2", "VariableLengthUTF8", + "VariableLengthUTF8JSON_V2", "ZDType", "data_type_registry", "parse_data_type", diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index b385a3012d..15eaa61b12 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -37,11 +37,11 @@ class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): dtype_cls : ClassVar[type[np.dtypes.BoolDType]] = np.dtypes.BoolDType The NumPy dtype class. - Notes - ----- + References + ---------- This class implements the boolean data type defined in Zarr V2 and V3. - You can read the formal specification of that data type in the respective - `specification document `_ + + See the `Zarr V2 `_ and `Zarr V3 `_ specification documents for details. """ _zarr_v3_name: ClassVar[Literal["bool"]] = "bool" @@ -112,7 +112,7 @@ def _check_json_v2( @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["bool"]]: """ - Check that the input is a valid JSON representation of a Bool in Zarr V3 format. + Check that the input is a valid JSON representation of this class in Zarr V3. 
Parameters ---------- @@ -144,7 +144,7 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of a Bool. + If the input JSON is not a valid representation of this class. """ if cls._check_json_v2(data): return cls() @@ -169,7 +169,7 @@ def _from_json_v3(cls: type[Self], data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of a Bool. + If the input JSON is not a valid representation of this class. """ if cls._check_json_v3(data): return cls() diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index 8868645a04..e148ca31b4 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -25,11 +25,141 @@ class FixedLengthBytesConfig(TypedDict): + """ + A configuration for a data type that takes a ``length_bytes`` parameter. + + Attributes + ---------- + + length_bytes : int + The length in bytes of the data associated with this configuration. + + Examples + -------- + .. code-block:: python + + { + "length_bytes": 12 + } + """ + length_bytes: int -NullTerminatedBytesJSONV3 = NamedConfig[Literal["null_terminated_bytes"], FixedLengthBytesConfig] -RawBytesJSONV3 = NamedConfig[Literal["raw_bytes"], FixedLengthBytesConfig] +class NullterminatedBytesJSON_V2(DTypeConfig_V2[str, None]): + """ + A wrapper around the JSON representation of the ``NullTerminatedBytes`` data type in Zarr V2. + + The ``name`` field of this class contains the value that would appear under the + ``dtype`` field in Zarr V2 array metadata. + + References + ---------- + The structure of the ``name`` field is defined in the Zarr V2 + `specification document `_. + + + Examples + -------- + .. code-block:: python + + { + "name": "|S10", + "object_codec_id": None + } + """ + + +class NullTerminatedBytesJSON_V3( + NamedConfig[Literal["null_terminated_bytes"], FixedLengthBytesConfig] +): + """ + The JSON representation of the ``NullTerminatedBytes`` data type in Zarr V3. + + References + ---------- + This representation is not currently defined in an external specification. + + + Examples + -------- + .. code-block:: python + + { + "name": "null_terminated_bytes", + "configuration": { + "length_bytes": 12 + } + } + + """ + + +class RawBytesJSON_V2(DTypeConfig_V2[str, None]): + """ + A wrapper around the JSON representation of the ``RawBytes`` data type in Zarr V2. + + The ``name`` field of this class contains the value that would appear under the + ``dtype`` field in Zarr V2 array metadata. + + References + ---------- + The structure of the ``name`` field is defined in the Zarr V2 + `specification document `_. + + + Examples + -------- + .. code-block:: python + + { + "name": "|V10", + "object_codec_id": None + } + """ + + +class RawBytesJSON_V3(NamedConfig[Literal["raw_bytes"], FixedLengthBytesConfig]): + """ + The JSON representation of the ``RawBytes`` data type in Zarr V3. + + References + ---------- + This representation is not currently defined in an external specification. + + + Examples + -------- + .. code-block:: python + + { + "name": "raw_bytes", + "configuration": { + "length_bytes": 12 + """ + + +class VariableLengthBytesJSON_V2(DTypeConfig_V2[Literal["|O"], Literal["vlen-bytes"]]): + """ + A wrapper around the JSON representation of the ``VariableLengthBytes`` data type in Zarr V2. + + The ``name`` field of this class contains the value that would appear under the + ``dtype`` field in Zarr V2 array metadata. 
The ``object_codec_id`` field is always ``"vlen-bytes"`` + + References + ---------- + The structure of the ``name`` field is defined in the Zarr V2 + `specification document `_. + + Examples + -------- + .. code-block:: python + + { + "name": "|O", + "object_codec_id": "vlen-bytes" + } + """ @dataclass(frozen=True, kw_only=True) @@ -115,7 +245,7 @@ def to_native_dtype(self) -> np.dtypes.BytesDType[int]: return self.dtype_cls(self.length) @classmethod - def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[NullterminatedBytesJSON_V2]: """ Check that the input is a valid JSON representation of NullTerminatedBytes in Zarr V2. @@ -141,9 +271,9 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] ) @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[NullTerminatedBytesJSONV3]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[NullTerminatedBytesJSON_V3]: """ - Check that the input is a valid representation of NullTerminatedBytes in Zarr V3. + Check that the input is a valid JSON representation of this class in Zarr V3. Parameters ---------- @@ -152,8 +282,9 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[NullTerminatedBytesJSONV3] Returns ------- - True if the input is a valid representation of NullTerminatedBytes in Zarr V3, False - otherwise. + TypeGuard[NullTerminatedBytesJSON_V3] + True if the input is a valid representation of this class in Zarr V3, False + otherwise. """ return ( isinstance(data, dict) @@ -167,11 +298,11 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[NullTerminatedBytesJSONV3] @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: """ - Create an instance of NullTerminatedBytes from Zarr V2-flavored JSON. + Create an instance of this class from Zarr V2-flavored JSON. This method checks if the input data is a valid representation of - NullTerminatedBytes in Zarr V2. If so, it returns a new instance of - NullTerminatedBytes with a ``length`` as specified in the input data. + this class in Zarr V2. If so, it returns a new instance of + this class with a ``length`` as specified in the input data. Parameters ---------- @@ -186,7 +317,7 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input data is not a valid representation of NullTerminatedBytes. + If the input data is not a valid representation of this class. """ if cls._check_json_v2(data): @@ -198,11 +329,11 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: """ - Create an instance of NullTerminatedBytes from Zarr V3-flavored JSON. + Create an instance of this class from Zarr V3-flavored JSON. This method checks if the input data is a valid representation of - NullTerminatedBytes in Zarr V3. If so, it returns a new instance of - NullTerminatedBytes with a ``length`` as specified in the input data. + this class in Zarr V3. If so, it returns a new instance of + this class with a ``length`` as specified in the input data. Parameters ---------- @@ -217,7 +348,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input data is not a valid representation of NullTerminatedBytes. + If the input data is not a valid representation of this class. 
""" if cls._check_json_v3(data): return cls(length=data["configuration"]["length_bytes"]) @@ -225,14 +356,14 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: raise DataTypeValidationError(msg) @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... + def to_json(self, zarr_format: Literal[2]) -> NullterminatedBytesJSON_V2: ... @overload - def to_json(self, zarr_format: Literal[3]) -> NullTerminatedBytesJSONV3: ... + def to_json(self, zarr_format: Literal[3]) -> NullTerminatedBytesJSON_V3: ... def to_json( self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[str, None] | NullTerminatedBytesJSONV3: + ) -> DTypeConfig_V2[str, None] | NullTerminatedBytesJSON_V3: """ Generate a JSON representation of this data type. @@ -243,7 +374,7 @@ def to_json( Returns ------- - DTypeConfig_V2[str, None] | NullTerminatedBytesJSONV3 + NullterminatedBytesJSON_V2 | NullTerminatedBytesJSON_V3 The JSON-serializable representation of the data type """ if zarr_format == 2: @@ -517,16 +648,9 @@ def to_native_dtype(self) -> np.dtypes.VoidDType[int]: return cast("np.dtypes.VoidDType[int]", np.dtype(f"V{self.length}")) @classmethod - def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[RawBytesJSON_V2]: """ - Check that the input is a valid representation of RawBytes in Zarr V2. - - The input data must be a mapping with the following structure: - - { - "name": "|V", - "object_codec_id": None - } + Check that the input is a valid representation of this class in Zarr V2. Parameters ---------- @@ -535,7 +659,7 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] Returns ------- - True if the input is a valid representation of RawBytes in Zarr V3, False otherwise. + True if the input is a valid representation of this class in Zarr V3, False otherwise. """ return ( @@ -546,18 +670,9 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] ) @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[RawBytesJSONV3]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[RawBytesJSON_V3]: """ - Check that the input is a valid representation of RawBytes in Zarr V3. - - The input must be a mapping with the following structure: - - { - "name": "raw_bytes", - "configuration": { - "length_bytes": - } - } + Check that the input is a valid JSON representation of this class in Zarr V3. Parameters ---------- @@ -566,8 +681,9 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[RawBytesJSONV3]: Returns ------- - True if the input is a valid representation of RawBytes in Zarr V3, False - otherwise. + TypeGuard[RawBytesJSON_V3] + True if the input is a valid representation of this class in Zarr V3, False + otherwise. """ return ( isinstance(data, dict) @@ -600,7 +716,7 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input data is not a valid representation of this RawBytes. + If the input data is not a valid representation of this class. """ if cls._check_json_v2(data): name = data["name"] @@ -630,7 +746,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input data is not a valid representation of RawBytes. + If the input data is not a valid representation of this class. 
""" if cls._check_json_v3(data): return cls(length=data["configuration"]["length_bytes"]) @@ -638,12 +754,12 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: raise DataTypeValidationError(msg) @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... + def to_json(self, zarr_format: Literal[2]) -> RawBytesJSON_V2: ... @overload - def to_json(self, zarr_format: Literal[3]) -> RawBytesJSONV3: ... + def to_json(self, zarr_format: Literal[3]) -> RawBytesJSON_V3: ... - def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | RawBytesJSONV3: + def to_json(self, zarr_format: ZarrFormat) -> RawBytesJSON_V2 | RawBytesJSON_V3: """ Generate a JSON representation of this data type. @@ -654,7 +770,7 @@ def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | RawByt Returns ------- - DTypeConfig_V2[str, None] | RawBytesJSONV3 + RawBytesJSON_V2 | RawBytesJSON_V3 The JSON-serializable representation of the data type. """ if zarr_format == 2: @@ -881,7 +997,7 @@ def to_native_dtype(self) -> np.dtypes.ObjectDType: def _check_json_v2( cls, data: DTypeJSON, - ) -> TypeGuard[DTypeConfig_V2[Literal["|O"], Literal["vlen-bytes"]]]: + ) -> TypeGuard[VariableLengthBytesJSON_V2]: """ Check that the input is a valid JSON representation of a NumPy O dtype, and that the object codec id is appropriate for variable-length bytes strings. @@ -893,7 +1009,7 @@ def _check_json_v2( Returns ------- - True if the input is a valid representation of VariableLengthBytes in Zarr V2, False + True if the input is a valid representation of this class in Zarr V2, False otherwise. """ # Check that the input is a valid JSON representation of a Zarr v2 data type spec. @@ -908,10 +1024,7 @@ def _check_json_v2( @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["variable_length_bytes"]]: """ - Check that the input is a valid representation of VariableLengthBytes in Zarr V3. - - This method verifies that the provided data matches the expected Zarr V3 representation - for VariableLengthBytes, which is the string "variable_length_bytes". + Check that the input is a valid JSON representation of this class in Zarr V3. Parameters ---------- @@ -921,7 +1034,7 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["variable_length_b Returns ------- TypeGuard[Literal["variable_length_bytes"]] - True if the input is a valid representation of VariableLengthBytes in Zarr V3, False otherwise. + True if the input is a valid representation of this class in Zarr V3, False otherwise. """ return data == cls._zarr_v3_name @@ -931,8 +1044,8 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: """ Create an instance of this VariableLengthBytes from Zarr V2-flavored JSON. - This method checks if the input data is a valid representation of this VariableLengthBytes - in Zarr V2. If so, it returns a new instance VariableLengthBytes. + This method checks if the input data is a valid representation of this class + in Zarr V2. If so, it returns a new instance this class. Parameters ---------- @@ -947,7 +1060,7 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input data is not a valid representation of this class. + If the input data is not a valid representation of this class class. """ if cls._check_json_v2(data): @@ -977,7 +1090,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input data is not a valid representation of VariableLengthBytes. 
+ If the input data is not a valid representation of this class. """ if cls._check_json_v3(data): @@ -986,16 +1099,14 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: raise DataTypeValidationError(msg) @overload - def to_json( - self, zarr_format: Literal[2] - ) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-bytes"]]: ... + def to_json(self, zarr_format: Literal[2]) -> VariableLengthBytesJSON_V2: ... @overload def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_bytes"]: ... def to_json( self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-bytes"]] | Literal["variable_length_bytes"]: + ) -> VariableLengthBytesJSON_V2 | Literal["variable_length_bytes"]: """ Convert the variable-length bytes data type to a JSON-serializable form. diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index 10466de4a3..5de2018f3a 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -42,7 +42,7 @@ @dataclass(frozen=True) class BaseComplex(ZDType[TComplexDType_co, TComplexScalar_co], HasEndianness, HasItemSize): """ - A base class for complex data types + A base class for Zarr data types that wrap NumPy complex float data types. """ # This attribute holds the possible zarr v2 JSON names for the data type @@ -76,7 +76,7 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: def to_native_dtype(self) -> TComplexDType_co: """ - Convert this complex data type to a NumPy complex dtype with the appropriate byte order. + Convert this class to a NumPy complex dtype with the appropriate byte order. Returns ------- @@ -127,7 +127,7 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[str]: Returns ------- TypeGuard[str] - True if the input is a valid representation of this data type in Zarr V3, False otherwise. + True if the input is a valid representation of this class in Zarr V3, False otherwise. """ return data == cls._zarr_v3_name @@ -135,7 +135,7 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[str]: @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: """ - Create an instance of this complex data type from Zarr V2-flavored JSON. + Create an instance of this class from Zarr V2-flavored JSON. Parameters ---------- @@ -145,12 +145,12 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Returns ------- Self - An instance of this data type. + An instance of this class. Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of this complex data type. + If the input JSON is not a valid representation of this class. """ if cls._check_json_v2(data): # Going via numpy ensures that we get the endianness correct without @@ -163,7 +163,7 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: """ - Create an instance of this complex data type from Zarr V3-flavored JSON. + Create an instance of this class from Zarr V3-flavored JSON. Parameters ---------- @@ -178,7 +178,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of this complex data type. + If the input JSON is not a valid representation of this class. """ if cls._check_json_v3(data): return cls() @@ -193,7 +193,7 @@ def to_json(self, zarr_format: Literal[3]) -> str: ... def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | str: """ - Serialize this complex data type to a JSON-compatible representation. 
+ Serialize this object to a JSON-serializable representation. Parameters ---------- @@ -203,8 +203,9 @@ def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | str: Returns ------- DTypeConfig_V2[str, None] | str - If `zarr_format` is 2, a dictionary with "name" and "object_codec_id" is returned. - If `zarr_format` is 3, a string representation of the complex data type is returned. + If ``zarr_format`` is 2, a dictionary with ``"name"`` and ``"object_codec_id"`` keys is + returned. + If ``zarr_format`` is 3, a string representation of the complex data type is returned. Raises ------ @@ -236,7 +237,7 @@ def _check_scalar(self, data: object) -> TypeGuard[ComplexLike]: def _cast_scalar_unchecked(self, data: ComplexLike) -> TComplexScalar_co: """ - Cast the provided scalar data to the native scalar type of this complex data type. + Cast the provided scalar data to the native scalar type of this class. Parameters ---------- diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index 6c9dcc4019..26dde5d980 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -36,12 +36,7 @@ @dataclass(frozen=True) class BaseFloat(ZDType[TFloatDType_co, TFloatScalar_co], HasEndianness, HasItemSize): """ - Base class for float data types. - - Attributes - ---------- - _zarr_v2_names : ClassVar[tuple[str, ...]] - The possible Zarr V2 JSON names for the data type. + A base class for Zarr data types that wrap NumPy float data types. """ # This attribute holds the possible zarr v2 JSON names for the data type @@ -104,7 +99,7 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[str]: """ - Check that the input is a valid JSON representation of this data type. + Check that the input is a valid JSON representation of this class in Zarr V3. Parameters ---------- @@ -114,7 +109,7 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[str]: Returns ------- TypeGuard[str] - True if the input is a valid JSON representation of this data type, False otherwise. + True if the input is a valid JSON representation of this class, False otherwise. """ return data == cls._zarr_v3_name @@ -324,10 +319,12 @@ class Float16(BaseFloat[np.dtypes.Float16DType, np.float16]): ---------- dtype_cls : Type[np.dtypes.Float16DType] The NumPy dtype class for this data type. - _zarr_v3_name : ClassVar[Literal["float16"]] - The name of this data type in Zarr V3. - _zarr_v2_names : ClassVar[tuple[Literal[">f2"], Literal["`_ and `Zarr V3 `_ specification documents for details. """ dtype_cls = np.dtypes.Float16DType @@ -359,10 +356,12 @@ class Float32(BaseFloat[np.dtypes.Float32DType, np.float32]): ---------- dtype_cls : Type[np.dtypes.Float32DType] The NumPy dtype class for this data type. - _zarr_v3_name : ClassVar[Literal["float32"]] - The name of this data type in Zarr V3. - _zarr_v2_names : ClassVar[tuple[Literal[">f4"], Literal["`_ and `Zarr V3 `_ specification documents for details. """ dtype_cls = np.dtypes.Float32DType @@ -394,10 +393,12 @@ class Float64(BaseFloat[np.dtypes.Float64DType, np.float64]): ---------- dtype_cls : Type[np.dtypes.Float64DType] The NumPy dtype class for this data type. - _zarr_v3_name : ClassVar[Literal["float64"]] - The name of this data type in Zarr V3. - _zarr_v2_names : ClassVar[tuple[Literal[">f8"], Literal["`_ and `Zarr V3 `_ specification documents for details. 
""" dtype_cls = np.dtypes.Float64DType diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index 54a2405524..cbba824872 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -58,11 +58,6 @@ class BaseInt(ZDType[TIntDType_co, TIntScalar_co], HasItemSize): This class provides methods for serialization and deserialization of integer types in both Zarr v2 and v3 formats, as well as methods for checking and casting scalars. - - Attributes - ---------- - _zarr_v2_names : ClassVar[tuple[str, ...]] - Possible Zarr V2 JSON names for the data type. """ _zarr_v2_names: ClassVar[tuple[str, ...]] @@ -84,7 +79,7 @@ def _check_json_v2(cls, data: object) -> TypeGuard[DTypeConfig_V2[str, None]]: Returns ------- TypeGuard[DTypeConfig_V2[str, None]] - True if the input is a valid representation of this integer data type in Zarr V2, + True if the input is a valid representation of this class in Zarr V2, False otherwise. """ @@ -97,11 +92,7 @@ def _check_json_v2(cls, data: object) -> TypeGuard[DTypeConfig_V2[str, None]]: @classmethod def _check_json_v3(cls, data: object) -> TypeGuard[str]: """ - Check if JSON value is consistent with Zarr v3 for this data type. - - This method verifies whether the provided data matches the expected Zarr v3 - representation for this data type, which is the string specified by the - class-level attribute _zarr_v3_name. + Check that the input is a valid JSON representation of this class in Zarr V3. Parameters ---------- @@ -111,7 +102,7 @@ def _check_json_v3(cls, data: object) -> TypeGuard[str]: Returns ------- TypeGuard[str] - True if the input is a valid representation of this data type in Zarr v3, + True if the input is a valid representation of this class in Zarr v3, False otherwise. """ return data == cls._zarr_v3_name @@ -246,10 +237,12 @@ class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): ---------- dtype_cls : np.dtypes.Int8DType The class of the underlying NumPy dtype. - _zarr_v3_name : ClassVar[Literal["int8"]] = "int8" - The name of this data type in Zarr V3. - _zarr_v2_names : ``ClassVar[tuple[Literal["|i1"]]]`` = ``("|i1",)`` - The names of this data type in Zarr V2. + + References + ---------- + This class implements the 8-bit signed integer data type defined in Zarr V2 and V3. + + See the `Zarr V2 `_ and `Zarr V3 `_ specification documents for details. """ dtype_cls = np.dtypes.Int8DType @@ -274,7 +267,7 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: Raises ------ DataTypeValidationError - If the input data type is not a valid representation of an Int8. + If the input data type is not a valid representation of this class Int8. """ if cls._check_native_dtype(dtype): return cls() @@ -311,7 +304,7 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of an Int8. + If the input JSON is not a valid representation of this class Int8. """ if cls._check_json_v2(data): return cls() @@ -336,7 +329,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of an Int8. + If the input JSON is not a valid representation of this class Int8. """ if cls._check_json_v3(data): return cls() @@ -400,10 +393,12 @@ class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): ---------- dtype_cls : np.dtypes.UInt8DType The class of the underlying NumPy dtype. - _zarr_v3_name : ClassVar[Literal["uint8"]] = "uint8" - The name of this data type in Zarr V3. 
- _zarr_v2_names : ``ClassVar[tuple[Literal["|u1"]]]`` = ``("|u1",)`` - The names of this data type in Zarr V2. + + References + ---------- + This class implements the 8-bit unsigned integer data type defined in Zarr V2 and V3. + + See the `Zarr V2 `_ and `Zarr V3 `_ specification documents for details. """ dtype_cls = np.dtypes.UInt8DType @@ -451,7 +446,7 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of this data type. + If the input JSON is not a valid representation of this class. """ if cls._check_json_v2(data): @@ -477,7 +472,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of this data type. + If the input JSON is not a valid representation of this class. """ if cls._check_json_v3(data): return cls() @@ -545,10 +540,12 @@ class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): ---------- dtype_cls : np.dtypes.Int16DType The class of the underlying NumPy dtype. - _zarr_v3_name : ClassVar[Literal["int16"]] = "int16" - The name of this data type in Zarr V3. - _zarr_v2_names : ClassVar[tuple[Literal[">i2"], Literal["i2", "`_ and `Zarr V3 `_ specification documents for details. """ dtype_cls = np.dtypes.Int16DType @@ -611,7 +608,7 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of this data type. + If the input JSON is not a valid representation of this class. """ if cls._check_json_v2(data): # Going via NumPy ensures that we get the endianness correct without @@ -639,7 +636,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of this data type. + If the input JSON is not a valid representation of this class. """ if cls._check_json_v3(data): return cls() @@ -705,10 +702,12 @@ class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): ---------- dtype_cls : np.dtypes.UInt16DType The class of the underlying NumPy dtype. - _zarr_v3_name : ClassVar[Literal["uint16"]] = "uint16" - The name of this data type in Zarr V3. - _zarr_v2_names : ClassVar[tuple[Literal[">u2"], Literal["u2", "`_ and `Zarr V3 `_ specification documents for details. """ dtype_cls = np.dtypes.UInt16DType @@ -771,7 +770,7 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of this data type. + If the input JSON is not a valid representation of this class. """ if cls._check_json_v2(data): # Going via NumPy ensures that we get the endianness correct without @@ -799,7 +798,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of this data type. + If the input JSON is not a valid representation of this class. """ if cls._check_json_v3(data): return cls() @@ -865,10 +864,12 @@ class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): ---------- dtype_cls : np.dtypes.Int32DType The class of the underlying NumPy dtype. - _zarr_v3_name : ClassVar[Literal["int32"]] = "int32" - The name of this data type in Zarr V3. - _zarr_v2_names : ClassVar[tuple[Literal[">i4"], Literal["i4", "`_ and `Zarr V3 `_ specification documents for details. 
""" dtype_cls = np.dtypes.Int32DType @@ -914,7 +915,7 @@ def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of an Int32. + If the input JSON is not a valid representation of this class Int32. """ if cls._check_native_dtype(dtype): return cls(endianness=get_endianness_from_numpy_dtype(dtype)) @@ -952,7 +953,7 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of an Int32. + If the input JSON is not a valid representation of this class Int32. """ if cls._check_json_v2(data): # Going via NumPy ensures that we get the endianness correct without @@ -980,7 +981,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of an Int32. + If the input JSON is not a valid representation of this class Int32. """ if cls._check_json_v3(data): return cls() @@ -1046,10 +1047,12 @@ class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): ---------- dtype_cls : np.dtypes.UInt32DType The class of the underlying NumPy dtype. - _zarr_v3_name : ClassVar[Literal["uint32"]] = "uint32" - The name of this data type in Zarr V3. - _zarr_v2_names : ClassVar[tuple[Literal[">u4"], Literal["u4", "`_ and `Zarr V3 `_ specification documents for details. """ dtype_cls = np.dtypes.UInt32DType @@ -1074,7 +1077,7 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: Raises ------ DataTypeValidationError - If the input data type is not a valid representation of a 32-bit unsigned + If the input data type is not a valid representation of this class 32-bit unsigned integer. """ if cls._check_native_dtype(dtype): @@ -1113,7 +1116,7 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of a 32-bit unsigned + If the input JSON is not a valid representation of this class 32-bit unsigned integer. """ if cls._check_json_v2(data): @@ -1142,7 +1145,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of a 32-bit unsigned + If the input JSON is not a valid representation of this class 32-bit unsigned integer. """ if cls._check_json_v3(data): @@ -1202,10 +1205,12 @@ class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): ---------- dtype_cls : np.dtypes.Int64DType The class of the underlying NumPy dtype. - _zarr_v3_name : ClassVar[Literal["int64"]] = "int64" - The name of this data type in Zarr V3. - _zarr_v2_names : ClassVar[tuple[Literal[">i8"], Literal["i8", "`_ and `Zarr V3 `_ specification documents for details. """ dtype_cls = np.dtypes.Int64DType @@ -1230,7 +1235,7 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: Raises ------ DataTypeValidationError - If the input data type is not a valid representation of a 64-bit signed + If the input data type is not a valid representation of this class 64-bit signed integer. """ if cls._check_native_dtype(dtype): @@ -1269,7 +1274,7 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of a 64-bit signed + If the input JSON is not a valid representation of this class 64-bit signed integer. 
""" if cls._check_json_v2(data): @@ -1298,7 +1303,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of a 64-bit signed + If the input JSON is not a valid representation of this class 64-bit signed integer. """ if cls._check_json_v3(data): @@ -1358,10 +1363,12 @@ class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): ---------- dtype_cls: np.dtypes.UInt64DType The class of the underlying NumPy dtype. - _zarr_v3_name: ClassVar[Literal["uint64"]] = "uint64" - The name of this data type in Zarr V3. - _zarr_v2_names: ClassVar[tuple[Literal[">u8"], Literal["u8", "`_ and `Zarr V3 `_ specification documents for details. """ dtype_cls = np.dtypes.UInt64DType @@ -1375,7 +1382,7 @@ def to_native_dtype(self) -> np.dtypes.UInt64DType: Returns ------- np.dtypes.UInt64DType - The native NumPy dtype. + The native NumPy dtype.eeeeeeeeeeeeeeeee """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @@ -1398,7 +1405,7 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of an unsigned 64-bit + If the input JSON is not a valid representation of this class unsigned 64-bit integer. """ if cls._check_json_v2(data): @@ -1427,7 +1434,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of an unsigned 64-bit + If the input JSON is not a valid representation of this class unsigned 64-bit integer. """ if cls._check_json_v3(data): @@ -1481,7 +1488,7 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: Raises ------ DataTypeValidationError - If the input dtype is not a valid representation of an unsigned 64-bit + If the input dtype is not a valid representation of this class unsigned 64-bit integer. """ if cls._check_native_dtype(dtype): diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index 34e988e3c2..cdf2e4fbfc 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -48,11 +48,60 @@ def __str__(self) -> str: ... class LengthBytesConfig(TypedDict): + """ + Configuration for a fixed-length string data type in Zarr V3. + + Attributes + ---------- + length_bytes : int + The length in bytes of the data associated with this configuration. + """ + length_bytes: int -# TODO: Fix this terrible name -FixedLengthUTF32JSONV3 = NamedConfig[Literal["fixed_length_utf32"], LengthBytesConfig] +class FixedLengthUTF32JSON_V2(DTypeConfig_V2[str, None]): + """ + A wrapper around the JSON representation of the ``FixedLengthUTF32`` data type in Zarr V2. + + The ``name`` field of this class contains the value that would appear under the + ``dtype`` field in Zarr V2 array metadata. + + References + ---------- + The structure of the ``name`` field is defined in the Zarr V2 + `specification document `_. + + Examples + -------- + + .. code-block:: python + + { + "name": " np.dtypes.StrDType[int]: return self.dtype_cls(self.length).newbyteorder(byte_order) @classmethod - def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSON_V2]: """ Check that the input is a valid JSON representation of a NumPy U dtype. 
@@ -136,7 +185,7 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] Returns ------- - TypeGuard[DTypeConfig_V2[str, None]] + TypeGuard[FixedLengthUTF32JSON_V2] Whether the input is a valid JSON representation of a NumPy U dtype. """ return ( @@ -147,9 +196,9 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] ) @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSONV3]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSON_V3]: """ - Check that the input is a valid JSON representation of a NumPy U dtype. + Check that the input is a valid JSON representation of this class in Zarr V3. Parameters ---------- @@ -175,11 +224,11 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSONV3]: def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... @overload - def to_json(self, zarr_format: Literal[3]) -> FixedLengthUTF32JSONV3: ... + def to_json(self, zarr_format: Literal[3]) -> FixedLengthUTF32JSON_V3: ... def to_json( self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[str, None] | FixedLengthUTF32JSONV3: + ) -> DTypeConfig_V2[str, None] | FixedLengthUTF32JSON_V3: """ Convert the FixedLengthUTF32 instance to a JSON representation. @@ -190,7 +239,7 @@ def to_json( Returns ------- - DTypeConfig_V2[str, None] | FixedLengthUTF32JSONV3 + DTypeConfig_V2[str, None] | FixedLengthUTF32JSON_V3 The JSON representation of the data type. """ if zarr_format == 2: @@ -375,23 +424,50 @@ def check_vlen_string_json_scalar(data: object) -> TypeGuard[int | str | float]: return isinstance(data, int | str | float) +class VariableLengthUTF8JSON_V2(DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]]): + """ + A wrapper around the JSON representation of the ``VariableLengthUTF8`` data type in Zarr V2. + + The ``name`` field of this class contains the value that would appear under the + ``dtype`` field in Zarr V2 array metadata. The ``object_codec_id`` field is always ``"vlen-utf8"``. + + References + ---------- + The structure of the ``name`` field is defined in the Zarr V2 + `specification document `_. + + + Examples + -------- + .. code-block:: python + + { + "name": "|O", + "object_codec_id": "vlen-utf8" + } + """ + + # VariableLengthUTF8 is defined in two places, conditioned on the version of NumPy. # If NumPy 2 is installed, then VariableLengthUTF8 is defined with the NumPy variable length # string dtype as the native dtype. Otherwise, VariableLengthUTF8 is defined with the NumPy object # dtype as the native dtype. class UTF8Base(ZDType[TDType_co, str], HasObjectCodec): """ - Base class for variable-length UTF-8 string data types. Not intended for direct use, but as a - base for concrete implementations. + A base class for variable-length UTF-8 string data types. + + Not intended for direct use, but as a base for concrete implementations. Attributes ---------- - dtype_cls : TDType_co - The class of the underlying NumPy dtype. - _zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] - The name of this data type in Zarr V3. object_codec_id : ClassVar[Literal["vlen-utf8"]] The object codec ID for this data type. + + References + ---------- + This data type does not have a Zarr V3 specification. + + The Zarr V2 data type specification can be found `here `_. 
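# --- Illustrative example (not part of the patch) --------------------------------------
# A minimal sketch of the two JSON forms of the variable-length UTF-8 string data type
# described above: in Zarr V2 it is identified by the object dtype "|O" together with the
# "vlen-utf8" object codec id, while in Zarr V3 it is named "string". These values are
# taken from the docstrings above and are illustrative, not normative.
from zarr.dtype import VariableLengthUTF8

vlen_utf8 = VariableLengthUTF8()
assert vlen_utf8.to_json(zarr_format=2) == {"name": "|O", "object_codec_id": "vlen-utf8"}
assert vlen_utf8.to_json(zarr_format=3) == "string"
# Scalars are plain Python strings and survive a JSON round trip unchanged.
assert vlen_utf8.from_json_scalar("hello", zarr_format=3) == "hello"
# ----------------------------------------------------------------------------------------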
""" _zarr_v3_name: ClassVar[Literal["string"]] = "string" @@ -428,7 +504,7 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: def _check_json_v2( cls, data: DTypeJSON, - ) -> TypeGuard[DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]]]: + ) -> TypeGuard[VariableLengthUTF8JSON_V2]: """ "Check if the input is a valid JSON representation of a variable-length UTF-8 string dtype for Zarr v2." @@ -440,7 +516,7 @@ def _check_json_v2( Returns ------- - ``TypeGuard[DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]]]`` + ``TypeGuard[VariableLengthUTF8JSON_V2]`` Whether the input is a valid JSON representation of a NumPy "object" data type, and that the object codec id is appropriate for variable-length UTF-8 strings. """ @@ -453,8 +529,7 @@ def _check_json_v2( @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["variable_length_utf8"]]: """ - Check that the input is a valid JSON representation of a variable length UTF-8 string - data type. + Check that the input is a valid JSON representation of this class in Zarr V3. Parameters ---------- @@ -513,15 +588,11 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: raise DataTypeValidationError(msg) @overload - def to_json( - self, zarr_format: Literal[2] - ) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]]: ... + def to_json(self, zarr_format: Literal[2]) -> VariableLengthUTF8JSON_V2: ... @overload def to_json(self, zarr_format: Literal[3]) -> Literal["string"]: ... - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]] | Literal["string"]: + def to_json(self, zarr_format: ZarrFormat) -> VariableLengthUTF8JSON_V2 | Literal["string"]: """ Convert this data type to a JSON representation. @@ -532,7 +603,7 @@ def to_json( Returns ------- - ``DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]] | Literal["string"]`` + ``VariableLengthUTF8JSON_V2 | Literal["string"]`` The JSON representation of this data type. """ if zarr_format == 2: diff --git a/src/zarr/core/dtype/npy/structured.py b/src/zarr/core/dtype/npy/structured.py index 0e9e20a381..3faf70a475 100644 --- a/src/zarr/core/dtype/npy/structured.py +++ b/src/zarr/core/dtype/npy/structured.py @@ -1,15 +1,16 @@ from __future__ import annotations +from collections.abc import Sequence from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypeGuard, cast, overload +from typing import TYPE_CHECKING, Literal, Self, TypeGuard, cast, overload import numpy as np +from zarr.core.common import NamedConfig from zarr.core.dtype.common import ( DataTypeValidationError, DTypeConfig_V2, DTypeJSON, - DTypeSpec_V3, HasItemSize, StructuredName_V2, check_dtype_spec_v2, @@ -24,13 +25,64 @@ from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType if TYPE_CHECKING: - from collections.abc import Sequence - - from zarr.core.common import JSON, NamedConfig, ZarrFormat + from zarr.core.common import JSON, ZarrFormat StructuredScalarLike = list[object] | tuple[object, ...] | bytes | int +class StructuredJSON_V2(DTypeConfig_V2[StructuredName_V2, None]): + """ + A wrapper around the JSON representation of the ``Structured`` data type in Zarr V2. + + The ``name`` field is a sequence of sequences, where each inner sequence has two values: + the field name and the data type name for that field (which could be another sequence). + The data type names are strings, and the object codec ID is always None. 
+ + References + ---------- + The structure of the ``name`` field is defined in the Zarr V2 + `specification document `_. + + Examples + -------- + .. code-block:: python + + { + "name": [ + ["f0", "`_. """ dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] - _zarr_v3_name: ClassVar[Literal["structured"]] = "structured" fields: tuple[tuple[str, ZDType[TBaseDType, TBaseScalar]], ...] def __post_init__(self) -> None: @@ -134,7 +191,7 @@ def to_native_dtype(self) -> np.dtypes.VoidDType[int]: def _check_json_v2( cls, data: DTypeJSON, - ) -> TypeGuard[DTypeConfig_V2[StructuredName_V2, None]]: + ) -> TypeGuard[StructuredJSON_V2]: """ Check if the input is a valid JSON representation of a Structured data type for Zarr V2. @@ -149,7 +206,7 @@ def _check_json_v2( Returns ------- - TypeGuard[DTypeConfig_V2[StructuredName_V2, None]] + TypeGuard[StructuredJSON_V2] True if the input is a valid JSON representation of a Structured data type for Zarr V2, False otherwise. """ @@ -161,16 +218,9 @@ def _check_json_v2( ) @classmethod - def _check_json_v3( - cls, data: DTypeJSON - ) -> TypeGuard[NamedConfig[Literal["structured"], dict[str, Sequence[tuple[str, DTypeJSON]]]]]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[StructuredJSON_V3]: """ - Check if the input data is a valid JSON representation of a structured data type - for Zarr V3. - - The input must be a dictionary with a "name" key and a "configuration" key. The - "name" key must have the value "structured", and the "configuration" key must map - to a dictionary containing a "fields" key. + Check that the input is a valid JSON representation of this class in Zarr V3. Parameters ---------- @@ -179,7 +229,7 @@ def _check_json_v3( Returns ------- - TypeGuard[NamedConfig[Literal["structured"], dict[str, Sequence[tuple[str, DTypeJSON]]]]] + TypeGuard[StructuredJSON_V3] True if the input is a valid JSON representation of a structured data type for Zarr V3, False otherwise. """ @@ -225,7 +275,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: meta_fields = config["fields"] return cls( fields=tuple( - (f_name, get_data_type_from_json(f_dtype, zarr_format=3)) + (f_name, get_data_type_from_json(f_dtype, zarr_format=3)) # type: ignore[misc] for f_name, f_dtype in meta_fields ) ) @@ -233,14 +283,12 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: raise DataTypeValidationError(msg) @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[StructuredName_V2, None]: ... + def to_json(self, zarr_format: Literal[2]) -> StructuredJSON_V2: ... @overload - def to_json(self, zarr_format: Literal[3]) -> DTypeSpec_V3: ... + def to_json(self, zarr_format: Literal[3]) -> StructuredJSON_V3: ... - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[StructuredName_V2, None] | DTypeSpec_V3: + def to_json(self, zarr_format: ZarrFormat) -> StructuredJSON_V2 | StructuredJSON_V3: """ Convert the structured data type to a JSON-serializable form. @@ -251,7 +299,7 @@ def to_json( Returns ------- - DTypeConfig_V2[StructuredName_V2, None] | DTypeSpec_V3 + StructuredJSON_V2 | StructuredJSON_V3 The JSON representation of the structured data type. 
Raises @@ -275,7 +323,7 @@ def to_json( "name": self._zarr_v3_name, "configuration": {"fields": fields}, } - return cast("DTypeSpec_V3", base_dict) + return cast("StructuredJSON_V3", base_dict) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def _check_scalar(self, data: object) -> TypeGuard[StructuredScalarLike]: diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 8a86cbe720..84c01c235f 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -16,6 +16,7 @@ ) import numpy as np +from typing_extensions import ReadOnly from zarr.core.common import NamedConfig from zarr.core.dtype.common import ( @@ -38,7 +39,6 @@ if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat -_DTypeName = Literal["datetime64", "timedelta64"] TimeDeltaLike = str | int | bytes | np.timedelta64 | timedelta | None DateTimeLike = str | int | bytes | np.datetime64 | datetime | None @@ -101,12 +101,119 @@ def check_json_time(data: JSON) -> TypeGuard[Literal["NaT"] | int]: class TimeConfig(TypedDict): - unit: DateTimeUnit - scale_factor: int + """ + The configuration for the numpy.timedelta64 or numpy.datetime64 data type in Zarr V3. + + Attributes + ---------- + unit : ReadOnly[DateTimeUnit] + A string encoding a unit of time. + scale_factor : ReadOnly[int] + A scale factor. + + Examples + -------- + .. code-block:: python + + {"unit": "ms", "scale_factor": 1} + """ + + unit: ReadOnly[DateTimeUnit] + scale_factor: ReadOnly[int] + + +class DateTime64JSON_V3(NamedConfig[Literal["numpy.datetime64"], TimeConfig]): + """ + The JSON representation of the ``numpy.datetime64`` data type in Zarr V3. + + References + ---------- + This representation is defined in the ``numpy.datetime64`` + `specification document `_. + + Examples + -------- + .. code-block:: python + + { + "name": "numpy.datetime64", + "configuration": { + "unit": "ms", + "scale_factor": 1 + } + } + """ -DateTime64JSONV3 = NamedConfig[Literal["numpy.datetime64"], TimeConfig] -TimeDelta64JSONV3 = NamedConfig[Literal["numpy.timedelta64"], TimeConfig] +class TimeDelta64JSON_V3(NamedConfig[Literal["numpy.timedelta64"], TimeConfig]): + """ + The JSON representation of the ``TimeDelta64`` data type in Zarr V3. + + References + ---------- + This representation is defined in the numpy.timedelta64 + `specification document `_. + + Examples + -------- + .. code-block:: python + + { + "name": "numpy.timedelta64", + "configuration": { + "unit": "ms", + "scale_factor": 1 + } + } + """ + + +class TimeDelta64JSON_V2(DTypeConfig_V2[str, None]): + """ + A wrapper around the JSON representation of the ``TimeDelta64`` data type in Zarr V2. + + The ``name`` field of this class contains the value that would appear under the + ``dtype`` field in Zarr V2 array metadata. + + References + ---------- + The structure of the ``name`` field is defined in the Zarr V2 + `specification document `_. + + + Examples + -------- + .. code-block:: python + + { + "name": "`_. + + + Examples + -------- + .. code-block:: python + + { + "name": " Self: Raises ------ DataTypeValidationError - If the dtype is not a valid representation of a NumPy temporal data type. + If the dtype is not a valid representation of this class. 
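# --- Illustrative example (not part of the patch) --------------------------------------
# A minimal sketch of ``from_native_dtype`` for the temporal data types documented above:
# the unit and scale factor are read off the NumPy dtype, and a non-temporal dtype is
# rejected with DataTypeValidationError. The attribute names (unit, scale_factor) are
# taken from the surrounding code.
import numpy as np
from zarr.dtype import DataTypeValidationError, DateTime64, TimeDelta64

dt = DateTime64.from_native_dtype(np.dtype("datetime64[ms]"))
assert (dt.unit, dt.scale_factor) == ("ms", 1)

td = TimeDelta64.from_native_dtype(np.dtype("timedelta64[10s]"))
assert (td.unit, td.scale_factor) == ("s", 10)

try:
    DateTime64.from_native_dtype(np.dtype("int64"))
except DataTypeValidationError:
    pass  # expected: an int64 dtype is not a valid datetime64 representation
# ----------------------------------------------------------------------------------------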
""" if cls._check_native_dtype(dtype): @@ -198,45 +293,6 @@ def to_native_dtype(self) -> BaseTimeDType_co: dtype_string = f"{self._numpy_name}[{self.scale_factor}{self.unit}]" return np.dtype(dtype_string).newbyteorder(endianness_to_numpy_str(self.endianness)) # type: ignore[return-value] - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... - @overload - def to_json(self, zarr_format: Literal[3]) -> DateTime64JSONV3 | TimeDelta64JSONV3: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[str, None] | DateTime64JSONV3 | TimeDelta64JSONV3: - """ - Serialize this data type to JSON. - - Parameters - ---------- - zarr_format : ZarrFormat - The Zarr format version (2 or 3). - - Returns - ------- - DTypeConfig_V2[str, None] | DateTime64JSONV3 | TimeDelta64JSONV3 - The JSON representation of the data type. - - Raises - ------ - ValueError - If the zarr_format is not 2 or 3. - """ - if zarr_format == 2: - name = self.to_native_dtype().str - return {"name": name, "object_codec_id": None} - elif zarr_format == 3: - return cast( - "DateTime64JSONV3 | TimeDelta64JSONV3", - { - "name": self._zarr_v3_name, - "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, - }, - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: """ Convert a python object to a JSON representation of a datetime64 or timedelta64 scalar. @@ -274,31 +330,37 @@ class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], Has A Zarr data type for arrays containing NumPy TimeDelta64 data. Wraps the ``np.dtypesTimeDelta64DType`` data type. Scalars for this data type - are instances of ``np.timedelta64``. + are instances of `np.timedelta64`. Attributes ---------- - dtype_cls : Type[np.dtypes.TimeDelta64DType] + dtype_cls : Type[np.dtypesTimeDelta64DType] The NumPy dtype class for this data type. - _zarr_v3_name : ClassVar[Literal["numpy.timedelta64"]] - The name of this data type in Zarr V3. - _zarr_v2_names : tuple - The names of this data type in Zarr V2. - _numpy_name : ClassVar[Literal["timedelta64"]] = "timedelta64" - The literate NumPy name of this data type. + scale_factor : int + The scale factor for this data type. + unit : DateTimeUnit + The unit for this data type. + + References + ---------- + The Zarr V2 representation of this data type is defined in the Zarr V2 + `specification document `_. + + The Zarr V3 representation of this data type is defined in the ``numpy.timedelta64`` + `specification document `_ """ # mypy infers the type of np.dtypes.TimeDelta64DType to be # "Callable[[Literal['Y', 'M', 'W', 'D'] | Literal['h', 'm', 's', 'ms', 'us', 'ns', 'ps', 'fs', 'as']], Never]" dtype_cls = np.dtypes.TimeDelta64DType # type: ignore[assignment] + unit: DateTimeUnit = "generic" + scale_factor: int = 1 _zarr_v3_name: ClassVar[Literal["numpy.timedelta64"]] = "numpy.timedelta64" - _zarr_v2_names = (">m8", "m8"], Literal["m8", " TypeGuard[DTypeConfig_V2[str, None]]: + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[TimeDelta64JSON_V2]: """ Validate that the provided JSON input accurately represents a NumPy timedelta64 data type, which could be in the form of strings like "m8[10s]". 
This method serves as a type @@ -312,7 +374,7 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] Returns ------- bool - True if the JSON input is a valid representation of a NumPy timedelta64 data type, + True if the JSON input is a valid representation of this class, otherwise False. """ if not check_dtype_spec_v2(data): @@ -332,31 +394,14 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] return name[4:-1].endswith(DATETIME_UNIT) and name[-1] == "]" @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSONV3]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSON_V3]: """ - Check that JSON input is a dict with a 'name' key with the value 'numpy.timedelta64', and a - 'configuration' key with a value of a dict with a 'unit' key and a 'scale_factor' key. The - 'unit' key should map to a string describing the unit of time, and the 'scale_factor' key - should map to an integer describing the scale factor. - - For example, the following is a valid JSON representation of a TimeDelta64 in Zarr V3: - - .. code-block:: json - - { - "name": "numpy.timedelta64", - "configuration": { - "unit": "generic", - "scale_factor": 1 - } - } - - This function can be used as a type guard to narrow the type of unknown JSON input. + Check that the input is a valid JSON representation of this class in Zarr V3. Returns ------- - TypeGuard[DateTime64JSONV3] - True if the JSON input is a valid representation of a TimeDelta64 in Zarr V3, + TypeGuard[DateTime64JSON_V3] + True if the JSON input is a valid representation of this class, otherwise False. """ return ( @@ -385,7 +430,7 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of a TimeDelta64. + If the input JSON is not a valid representation of this class. """ if cls._check_json_v2(data): name = data["name"] @@ -430,6 +475,40 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: ) raise DataTypeValidationError(msg) + @overload + def to_json(self, zarr_format: Literal[2]) -> TimeDelta64JSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> TimeDelta64JSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> TimeDelta64JSON_V2 | TimeDelta64JSON_V3: + """ + Serialize this data type to JSON. + + Parameters + ---------- + zarr_format : ZarrFormat + The Zarr format version (2 or 3). + + Returns + ------- + TimeDelta64JSON_V2 | TimeDelta64JSON_V3 + The JSON representation of the data type. + + Raises + ------ + ValueError + If the zarr_format is not 2 or 3. + """ + if zarr_format == 2: + name = self.to_native_dtype().str + return {"name": name, "object_codec_id": None} + elif zarr_format == 3: + return { + "name": self._zarr_v3_name, + "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, + } + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _check_scalar(self, data: object) -> TypeGuard[TimeDeltaLike]: """ Check if the input is a scalar of this data type. @@ -506,7 +585,7 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedel Raises ------ TypeError - If the input JSON is not a valid representation of a scalar of this data type. + If the input JSON is not a valid representation of a scalar for this data type. 
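# --- Illustrative example (not part of the patch) --------------------------------------
# A minimal sketch of the scalar round trip implied by ``to_json_scalar`` and
# ``from_json_scalar`` above: temporal scalars are encoded in JSON as integer offsets in
# the data type's unit, with the string "NaT" reserved for the missing value. The exact
# encoded value (30) is an assumption that follows from the unit and scale factor chosen.
import numpy as np
from zarr.dtype import TimeDelta64

td = TimeDelta64(unit="s", scale_factor=1)
scalar = td.cast_scalar(30)                          # np.timedelta64(30, "s")
encoded = td.to_json_scalar(scalar, zarr_format=3)   # expected: 30
assert td.from_json_scalar(encoded, zarr_format=3) == scalar
assert np.isnat(td.from_json_scalar("NaT", zarr_format=3))
# ----------------------------------------------------------------------------------------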
""" if check_json_time(data): return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") @@ -518,30 +597,36 @@ class DateTime64(TimeDTypeBase[np.dtypes.DateTime64DType, np.datetime64], HasEnd """ A Zarr data type for arrays containing NumPy Datetime64 data. - Wraps the ``np.dtypesTimeDelta64DType`` data type. Scalars for this data type + Wraps the ``np.dtypes.TimeDelta64DType`` data type. Scalars for this data type are instances of ``np.datetime64``. Attributes ---------- dtype_cls : Type[np.dtypesTimeDelta64DType] The numpy dtype class for this data type. - _zarr_v3_name : ClassVar[Literal["numpy.timedelta64"]] - The name of this data type in Zarr V3. - _zarr_v2_names : ClassVar[tuple[Literal[">m8"], Literal["`_. + + The Zarr V3 representation of this data type is defined in the ``numpy.datetime64`` + `specification document `_ """ dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] _zarr_v3_name: ClassVar[Literal["numpy.datetime64"]] = "numpy.datetime64" - _zarr_v2_names = (">M8", "M8"], Literal["M8", " TypeGuard[DTypeConfig_V2[str, None]]: + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSON_V2]: """ Check that the input is a valid JSON representation of this data type. @@ -552,7 +637,7 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] Returns ------- - TypeGuard[DTypeConfig_V2[str, None]] + TypeGuard[DateTime64JSON_V2] True if the input is a valid JSON representation of a NumPy datetime64 data type, otherwise False. """ @@ -571,19 +656,9 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] return name[4:-1].endswith(DATETIME_UNIT) and name[-1] == "]" @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSONV3]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSON_V3]: """ - Check that the input is a valid JSON representation of this data type. - - The input must be a dictionary with the following structure: - - { - "name": "numpy.datetime64", - "configuration": { - "unit": , - "scale_factor": - } - } + Check that the input is a valid JSON representation of this class in Zarr V3. Parameters ---------- @@ -592,7 +667,7 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSONV3]: Returns ------- - TypeGuard[DateTime64JSONV3] + TypeGuard[DateTime64JSON_V3] True if the input is a valid JSON representation of a numpy datetime64 data type in Zarr V3, False otherwise. """ @@ -609,7 +684,7 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: """ Create an instance of this data type from a Zarr V2-flavored JSON representation. - This method checks if the provided JSON data is a valid representation of the data type. + This method checks if the provided JSON data is a valid representation of this class. If valid, it creates an instance using the native NumPy dtype. Otherwise, it raises a DataTypeValidationError. @@ -626,7 +701,7 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of this data type. + If the input JSON is not a valid representation of this class. """ if cls._check_json_v2(data): @@ -643,7 +718,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: """ Create an instance of this data type from a Zarr V3-flavored JSON representation. - This method checks if the provided JSON data is a valid representation of the data type. + This method checks if the provided JSON data is a valid representation of this class. 
If valid, it creates an instance using the native NumPy dtype. Otherwise, it raises a DataTypeValidationError. @@ -660,7 +735,7 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: Raises ------ DataTypeValidationError - If the input JSON is not a valid representation of this data type. + If the input JSON is not a valid representation of this class. """ if cls._check_json_v3(data): unit = data["configuration"]["unit"] @@ -674,9 +749,43 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: ) raise DataTypeValidationError(msg) + @overload + def to_json(self, zarr_format: Literal[2]) -> DateTime64JSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> DateTime64JSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> DateTime64JSON_V2 | DateTime64JSON_V3: + """ + Serialize this data type to JSON. + + Parameters + ---------- + zarr_format : ZarrFormat + The Zarr format version (2 or 3). + + Returns + ------- + DateTime64JSON_V2 | DateTime64JSON_V3 + The JSON representation of the data type. + + Raises + ------ + ValueError + If the zarr_format is not 2 or 3. + """ + if zarr_format == 2: + name = self.to_native_dtype().str + return {"name": name, "object_codec_id": None} + elif zarr_format == 3: + return { + "name": self._zarr_v3_name, + "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, + } + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _check_scalar(self, data: object) -> TypeGuard[DateTimeLike]: """ - Check if the input is a scalar of this data type. + Check if the input is convertible to a scalar of this data type. Parameters ---------- @@ -694,10 +803,7 @@ def _check_scalar(self, data: object) -> TypeGuard[DateTimeLike]: def _cast_scalar_unchecked(self, data: DateTimeLike) -> np.datetime64: """ - Cast the provided scalar data to np.datetime64 without checking. - - This method does not perform any type checking. - The input data must be a scalar of this data type. + Cast the input to a scalar of this data type without any type checking. Parameters ---------- @@ -707,13 +813,13 @@ def _cast_scalar_unchecked(self, data: DateTimeLike) -> np.datetime64: Returns ------- numpy.datetime64 - The casted data as a numpy datetime scalar. + The input cast to a NumPy datetime scalar. """ return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") def cast_scalar(self, data: object) -> np.datetime64: """ - Cast a scalar value to a numpy datetime scalar. + Cast the input to a scalar of this data type after a type check. Parameters ---------- @@ -723,7 +829,7 @@ def cast_scalar(self, data: object) -> np.datetime64: Returns ------- numpy.datetime64 - The data cast as a numpy datetime scalar. + The input cast to a NumPy datetime scalar. Raises ------ @@ -737,17 +843,19 @@ def cast_scalar(self, data: object) -> np.datetime64: def default_scalar(self) -> np.datetime64: """ - Return a default scalar of this data type. + Return the default scalar value for this data type. - This method provides a default value for the datetime64 scalar, which is - a 'Not-a-Time' (NaT) value. + Returns + ------- + numpy.datetime64 + The default scalar value, which is a 'Not-a-Time' (NaT) value """ return np.datetime64("NaT") def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: """ - Read a JSON-serializable value as a numpy datetime scalar. + Read a JSON-serializable value as a scalar. 
Parameters ---------- diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 7be97fa4b4..b53018c137 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -80,7 +80,9 @@ class variable, and it should generally be unique across different data types. @classmethod def _check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[TDType_co]: """ - Check that a native data type matches the dtype_cls class attribute. Used as a type guard. + Check that a native data type matches the dtype_cls class attribute. + + Used as a type guard. Parameters ---------- @@ -98,9 +100,10 @@ def _check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[TDType_ @abstractmethod def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self: """ - Create a ZDType instance from a native data type. The default implementation first performs - a type check via ``cls._check_native_dtype``. If that type check succeeds, the ZDType class - instance is created. + Create a ZDType instance from a native data type. + + The base implementation first performs a type check via ``cls._check_native_dtype``. + If that type check succeeds, the ZDType class instance is created. This method is used when taking a user-provided native data type, like a NumPy data type, and creating the corresponding ZDType instance from them. @@ -150,8 +153,7 @@ def from_json(cls: type[Self], data: DTypeJSON, *, zarr_format: ZarrFormat) -> S Parameters ---------- data : DTypeJSON - The JSON representation of the data type. The type annotation includes - Mapping[str, object] to accommodate typed dictionaries. + The JSON representation of the data type. zarr_format : ZarrFormat The zarr format version. @@ -159,7 +161,7 @@ def from_json(cls: type[Self], data: DTypeJSON, *, zarr_format: ZarrFormat) -> S Returns ------- Self - The wrapped data type. + An instance of this data type. """ if zarr_format == 2: return cls._from_json_v2(data) @@ -211,6 +213,7 @@ def _check_scalar(self, data: object) -> bool: def cast_scalar(self, data: object) -> TScalar_co: """ Cast a python object to the wrapped scalar type. + The type of the provided scalar is first checked for compatibility. If it's incompatible with the associated scalar type, a ``TypeError`` will be raised. @@ -228,8 +231,9 @@ def cast_scalar(self, data: object) -> TScalar_co: @abstractmethod def default_scalar(self) -> TScalar_co: """ - Get the default scalar value for the wrapped data type. This is a method, rather than an - attribute, because the default value for some data types depends on parameters that are + Get the default scalar value for the wrapped data type. + + This is a method, rather than an attribute, because the default value for some data types depends on parameters that are not known until a concrete data type is wrapped. For example, data types parametrized by a length like fixed-length strings or bytes will generate scalars consistent with that length. @@ -263,8 +267,10 @@ def from_json_scalar(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TSca @abstractmethod def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> JSON: """ - Serialize a python object to the JSON representation of a scalar. The value will first be - cast to the scalar type associated with this ZDType, then serialized to JSON. + Serialize a python object to the JSON representation of a scalar. + + The value will first be cast to the scalar type associated with this ZDType, then serialized + to JSON. 
Parameters ---------- diff --git a/src/zarr/dtype.py b/src/zarr/dtype.py index 1e6f322264..98485c7d65 100644 --- a/src/zarr/dtype.py +++ b/src/zarr/dtype.py @@ -4,6 +4,8 @@ Complex128, DataTypeValidationError, DateTime64, + DateTime64JSON_V2, + DateTime64JSON_V3, FixedLengthUTF32, Float16, Float32, @@ -13,15 +15,23 @@ Int32, Int64, NullTerminatedBytes, + NullTerminatedBytesJSON_V3, RawBytes, + RawBytesJSON_V3, Structured, + StructuredJSON_V2, + StructuredJSON_V3, TimeDelta64, + TimeDelta64JSON_V2, + TimeDelta64JSON_V3, UInt8, UInt16, UInt32, UInt64, VariableLengthBytes, + VariableLengthBytesJSON_V2, VariableLengthUTF8, + VariableLengthUTF8JSON_V2, ZDType, data_type_registry, parse_data_type, @@ -33,6 +43,8 @@ "Complex128", "DataTypeValidationError", "DateTime64", + "DateTime64JSON_V2", + "DateTime64JSON_V3", "FixedLengthUTF32", "Float16", "Float32", @@ -42,16 +54,24 @@ "Int32", "Int64", "NullTerminatedBytes", + "NullTerminatedBytesJSON_V3", "RawBytes", + "RawBytesJSON_V3", "Structured", + "StructuredJSON_V2", + "StructuredJSON_V3", "TimeDelta64", "TimeDelta64", + "TimeDelta64JSON_V2", + "TimeDelta64JSON_V3", "UInt8", "UInt16", "UInt32", "UInt64", "VariableLengthBytes", + "VariableLengthBytesJSON_V2", "VariableLengthUTF8", + "VariableLengthUTF8JSON_V2", "ZDType", "data_type_registry", "data_type_registry", From f608c121d6d3ab3f76613bbf91207d25363b868d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 2 Jul 2025 13:19:37 +0200 Subject: [PATCH 37/42] remove added features from list of missing features --- docs/user-guide/arrays.rst | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index d203b20844..baaf544e44 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -630,29 +630,6 @@ Missing features in 3.0 The following features have not been ported to 3.0 yet. -.. _user-guide-objects: - -Object arrays -~~~~~~~~~~~~~ - -See the Zarr-Python 2 documentation on `Object arrays `_ for more details. - -.. _user-guide-strings: - -Fixed-length string arrays -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -See the Zarr-Python 2 documentation on `Fixed-length string arrays `_ for more details. - -.. _user-guide-datetime: - -Datetime and Timedelta arrays -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -See the Zarr-Python 2 documentation on `Datetime and Timedelta `_ for more details. - -.. _user-guide-copy: - Copying and migrating data ~~~~~~~~~~~~~~~~~~~~~~~~~~ From 80bd0975edaa27ba9e20ce7fbea5316a73aaff91 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 2 Jul 2025 13:27:39 +0200 Subject: [PATCH 38/42] fix accidental copy + paste breakage --- src/zarr/core/dtype/npy/structured.py | 3 ++- src/zarr/core/dtype/npy/time.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/dtype/npy/structured.py b/src/zarr/core/dtype/npy/structured.py index 3faf70a475..7a0f01db01 100644 --- a/src/zarr/core/dtype/npy/structured.py +++ b/src/zarr/core/dtype/npy/structured.py @@ -2,7 +2,7 @@ from collections.abc import Sequence from dataclasses import dataclass -from typing import TYPE_CHECKING, Literal, Self, TypeGuard, cast, overload +from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypeGuard, cast, overload import numpy as np @@ -103,6 +103,7 @@ class Structured(ZDType[np.dtypes.VoidDType[int], np.void], HasItemSize): The Zarr V2 data type specification can be found `here `_. 
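# --- Illustrative example (not part of the patch) --------------------------------------
# A minimal sketch of the structured data type documented above, built from a native
# NumPy structured dtype. The JSON shapes shown in the comments are assumptions based on
# the StructuredJSON_V2 / StructuredJSON_V3 docstrings: a nested name list in Zarr V2 and
# a "structured" named configuration in Zarr V3.
import numpy as np
from zarr.dtype import Structured

zdt = Structured.from_native_dtype(np.dtype([("a", "<f8"), ("b", "<i8")]))
print(zdt.to_json(zarr_format=2))
# assumed shape: {"name": [["a", "<f8"], ["b", "<i8"]], "object_codec_id": None}
print(zdt.to_json(zarr_format=3))
# assumed shape: {"name": "structured",
#                 "configuration": {"fields": [["a", "float64"], ["b", "int64"]]}}
# ----------------------------------------------------------------------------------------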
""" + _zarr_v3_name: ClassVar[Literal["structured"]] = "structured" dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] fields: tuple[tuple[str, ZDType[TBaseDType, TBaseScalar]], ...] diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 84c01c235f..de8814bcd4 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -356,7 +356,7 @@ class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], Has unit: DateTimeUnit = "generic" scale_factor: int = 1 _zarr_v3_name: ClassVar[Literal["numpy.timedelta64"]] = "numpy.timedelta64" - _zarr_v2_names: tuple[Literal[">m8"], Literal["m8", "m8"], Literal["m8", "M8"], Literal["M8", "M8"], Literal["M8", " Date: Wed, 2 Jul 2025 14:27:38 +0200 Subject: [PATCH 39/42] use anonymous rst links --- src/zarr/core/dtype/npy/bool.py | 2 +- src/zarr/core/dtype/npy/bytes.py | 6 +++--- src/zarr/core/dtype/npy/float.py | 6 +++--- src/zarr/core/dtype/npy/int.py | 16 ++++++++-------- src/zarr/core/dtype/npy/string.py | 6 +++--- src/zarr/core/dtype/npy/structured.py | 4 ++-- src/zarr/core/dtype/npy/time.py | 16 ++++++++-------- 7 files changed, 28 insertions(+), 28 deletions(-) diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index 15eaa61b12..2292807a8e 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -41,7 +41,7 @@ class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): ---------- This class implements the boolean data type defined in Zarr V2 and V3. - See the `Zarr V2 `_ and `Zarr V3 `_ specification documents for details. + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. """ _zarr_v3_name: ClassVar[Literal["bool"]] = "bool" diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index e148ca31b4..ad6d9c2195 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -56,7 +56,7 @@ class NullterminatedBytesJSON_V2(DTypeConfig_V2[str, None]): References ---------- The structure of the ``name`` field is defined in the Zarr V2 - `specification document `_. + `specification document `__. Examples @@ -105,7 +105,7 @@ class RawBytesJSON_V2(DTypeConfig_V2[str, None]): References ---------- The structure of the ``name`` field is defined in the Zarr V2 - `specification document `_. + `specification document `__. Examples @@ -149,7 +149,7 @@ class VariableLengthBytesJSON_V2(DTypeConfig_V2[Literal["|O"], Literal["vlen-byt References ---------- The structure of the ``name`` field is defined in the Zarr V2 - `specification document `_. + `specification document `__. Examples -------- diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index 26dde5d980..23623d54ae 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -324,7 +324,7 @@ class Float16(BaseFloat[np.dtypes.Float16DType, np.float16]): ---------- This class implements the float16 data type defined in Zarr V2 and V3. - See the `Zarr V2 `_ and `Zarr V3 `_ specification documents for details. + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. """ dtype_cls = np.dtypes.Float16DType @@ -361,7 +361,7 @@ class Float32(BaseFloat[np.dtypes.Float32DType, np.float32]): ---------- This class implements the float32 data type defined in Zarr V2 and V3. - See the `Zarr V2 `_ and `Zarr V3 `_ specification documents for details. + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. 
""" dtype_cls = np.dtypes.Float32DType @@ -398,7 +398,7 @@ class Float64(BaseFloat[np.dtypes.Float64DType, np.float64]): ---------- This class implements the float64 data type defined in Zarr V2 and V3. - See the `Zarr V2 `_ and `Zarr V3 `_ specification documents for details. + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. """ dtype_cls = np.dtypes.Float64DType diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index cbba824872..15e37d275c 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -242,7 +242,7 @@ class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): ---------- This class implements the 8-bit signed integer data type defined in Zarr V2 and V3. - See the `Zarr V2 `_ and `Zarr V3 `_ specification documents for details. + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. """ dtype_cls = np.dtypes.Int8DType @@ -398,7 +398,7 @@ class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): ---------- This class implements the 8-bit unsigned integer data type defined in Zarr V2 and V3. - See the `Zarr V2 `_ and `Zarr V3 `_ specification documents for details. + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. """ dtype_cls = np.dtypes.UInt8DType @@ -545,7 +545,7 @@ class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): ---------- This class implements the 16-bit signed integer data type defined in Zarr V2 and V3. - See the `Zarr V2 `_ and `Zarr V3 `_ specification documents for details. + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. """ dtype_cls = np.dtypes.Int16DType @@ -707,7 +707,7 @@ class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): ---------- This class implements the unsigned 16-bit unsigned integer data type defined in Zarr V2 and V3. - See the `Zarr V2 `_ and `Zarr V3 `_ specification documents for details. + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. """ dtype_cls = np.dtypes.UInt16DType @@ -869,7 +869,7 @@ class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): ---------- This class implements the 32-bit signed integer data type defined in Zarr V2 and V3. - See the `Zarr V2 `_ and `Zarr V3 `_ specification documents for details. + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. """ dtype_cls = np.dtypes.Int32DType @@ -1052,7 +1052,7 @@ class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): ---------- This class implements the 32-bit unsigned integer data type defined in Zarr V2 and V3. - See the `Zarr V2 `_ and `Zarr V3 `_ specification documents for details. + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. """ dtype_cls = np.dtypes.UInt32DType @@ -1210,7 +1210,7 @@ class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): ---------- This class implements the 64-bit signed integer data type defined in Zarr V2 and V3. - See the `Zarr V2 `_ and `Zarr V3 `_ specification documents for details. + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. """ dtype_cls = np.dtypes.Int64DType @@ -1368,7 +1368,7 @@ class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): ---------- This class implements the unsigned 64-bit integer data type defined in Zarr V2 and V3. - See the `Zarr V2 `_ and `Zarr V3 `_ specification documents for details. + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. 
""" dtype_cls = np.dtypes.UInt64DType diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index cdf2e4fbfc..64384b2d8b 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -70,7 +70,7 @@ class FixedLengthUTF32JSON_V2(DTypeConfig_V2[str, None]): References ---------- The structure of the ``name`` field is defined in the Zarr V2 - `specification document `_. + `specification document `__. Examples -------- @@ -434,7 +434,7 @@ class VariableLengthUTF8JSON_V2(DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8 References ---------- The structure of the ``name`` field is defined in the Zarr V2 - `specification document `_. + `specification document `__. Examples @@ -467,7 +467,7 @@ class UTF8Base(ZDType[TDType_co, str], HasObjectCodec): ---------- This data type does not have a Zarr V3 specification. - The Zarr V2 data type specification can be found `here `_. + The Zarr V2 data type specification can be found `here `__. """ _zarr_v3_name: ClassVar[Literal["string"]] = "string" diff --git a/src/zarr/core/dtype/npy/structured.py b/src/zarr/core/dtype/npy/structured.py index 7a0f01db01..1484b2b1ee 100644 --- a/src/zarr/core/dtype/npy/structured.py +++ b/src/zarr/core/dtype/npy/structured.py @@ -41,7 +41,7 @@ class StructuredJSON_V2(DTypeConfig_V2[StructuredName_V2, None]): References ---------- The structure of the ``name`` field is defined in the Zarr V2 - `specification document `_. + `specification document `__. Examples -------- @@ -100,7 +100,7 @@ class Structured(ZDType[np.dtypes.VoidDType[int], np.void], HasItemSize): ---------- This data type does not have a Zarr V3 specification. - The Zarr V2 data type specification can be found `here `_. + The Zarr V2 data type specification can be found `here `__. """ _zarr_v3_name: ClassVar[Literal["structured"]] = "structured" diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index de8814bcd4..88c3ce6c39 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -129,7 +129,7 @@ class DateTime64JSON_V3(NamedConfig[Literal["numpy.datetime64"], TimeConfig]): References ---------- This representation is defined in the ``numpy.datetime64`` - `specification document `_. + `specification document `__. Examples -------- @@ -152,7 +152,7 @@ class TimeDelta64JSON_V3(NamedConfig[Literal["numpy.timedelta64"], TimeConfig]): References ---------- This representation is defined in the numpy.timedelta64 - `specification document `_. + `specification document `__. Examples -------- @@ -178,7 +178,7 @@ class TimeDelta64JSON_V2(DTypeConfig_V2[str, None]): References ---------- The structure of the ``name`` field is defined in the Zarr V2 - `specification document `_. + `specification document `__. Examples @@ -202,7 +202,7 @@ class DateTime64JSON_V2(DTypeConfig_V2[str, None]): References ---------- The structure of the ``name`` field is defined in the Zarr V2 - `specification document `_. + `specification document `__. Examples @@ -344,10 +344,10 @@ class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], Has References ---------- The Zarr V2 representation of this data type is defined in the Zarr V2 - `specification document `_. + `specification document `__. 
The Zarr V3 representation of this data type is defined in the ``numpy.timedelta64`` - `specification document `_ + `specification document `__ """ # mypy infers the type of np.dtypes.TimeDelta64DType to be @@ -612,10 +612,10 @@ class DateTime64(TimeDTypeBase[np.dtypes.DateTime64DType, np.datetime64], HasEnd References ---------- The Zarr V2 representation of this data type is defined in the Zarr V2 - `specification document `_. + `specification document `__. The Zarr V3 representation of this data type is defined in the ``numpy.datetime64`` - `specification document `_ + `specification document `__ """ dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] From 368145f0184b0b707e08576143d74199ab66fd5b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 3 Jul 2025 16:24:08 +0200 Subject: [PATCH 40/42] normalize typerror when check_scalar fails, and add tests for it --- examples/custom_dtype.py | 5 ++- src/zarr/core/dtype/npy/bool.py | 7 ++-- src/zarr/core/dtype/npy/bytes.py | 15 ++++++-- src/zarr/core/dtype/npy/complex.py | 5 ++- src/zarr/core/dtype/npy/float.py | 8 +++-- src/zarr/core/dtype/npy/int.py | 5 ++- src/zarr/core/dtype/npy/string.py | 24 +++++++------ src/zarr/core/dtype/npy/structured.py | 5 ++- src/zarr/core/dtype/npy/time.py | 10 ++++-- tests/test_dtype/conftest.py | 5 +-- tests/test_dtype/test_npy/test_bool.py | 1 + tests/test_dtype/test_npy/test_bytes.py | 9 +++-- tests/test_dtype/test_npy/test_complex.py | 3 +- tests/test_dtype/test_npy/test_float.py | 6 ++-- tests/test_dtype/test_npy/test_int.py | 9 ++++- tests/test_dtype/test_npy/test_string.py | 7 +++- tests/test_dtype/test_npy/test_structured.py | 15 +++++--- tests/test_dtype/test_npy/test_time.py | 8 +++++ tests/test_dtype/test_wrapper.py | 37 +++++++++++++++++--- 19 files changed, 136 insertions(+), 48 deletions(-) diff --git a/examples/custom_dtype.py b/examples/custom_dtype.py index 5670b3821b..a98f3414f6 100644 --- a/examples/custom_dtype.py +++ b/examples/custom_dtype.py @@ -169,7 +169,10 @@ def cast_scalar(self, data: object) -> ml_dtypes.int2: """ if self._check_scalar(data): return ml_dtypes.int2(data) - msg = f"Cannot convert object with type {type(data)} to a 2-bit integer." + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) raise TypeError(msg) def default_scalar(self) -> ml_dtypes.int2: diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index 2292807a8e..aaf8445f0a 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -246,8 +246,11 @@ def cast_scalar(self, data: object) -> np.bool_: """ if self._check_scalar(data): return np.bool_(data) - msg = f"Cannot convert object with type {type(data)} to a numpy boolean." - raise TypeError(msg) + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) + raise TypeError(msg) # pragma: no cover def default_scalar(self) -> np.bool_: """ diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index ad6d9c2195..b7c764dcd9 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -458,7 +458,10 @@ def cast_scalar(self, data: object) -> np.bytes_: if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a NumPy bytes scalar." 
+ msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) raise TypeError(msg) def default_scalar(self) -> np.bytes_: @@ -849,7 +852,10 @@ def cast_scalar(self, data: object) -> np.void: """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a NumPy void scalar." + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) raise TypeError(msg) def default_scalar(self) -> np.void: @@ -1263,5 +1269,8 @@ def cast_scalar(self, data: object) -> bytes: if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to bytes." + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) raise TypeError(msg) diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index 5de2018f3a..2f432a9e0a 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -277,7 +277,10 @@ def cast_scalar(self, data: object) -> TComplexScalar_co: """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a numpy float scalar." + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) raise TypeError(msg) def default_scalar(self) -> TComplexScalar_co: diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index 23623d54ae..3113bc5b61 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -11,7 +11,6 @@ DTypeJSON, HasEndianness, HasItemSize, - ScalarTypeValidationError, check_dtype_spec_v2, ) from zarr.core.dtype.npy.common import ( @@ -235,8 +234,11 @@ def cast_scalar(self, data: object) -> TFloatScalar_co: """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a NumPy float scalar." - raise ScalarTypeValidationError(msg) + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) + raise TypeError(msg) def default_scalar(self) -> TFloatScalar_co: """ diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index 15e37d275c..01a79142a3 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -166,7 +166,10 @@ def cast_scalar(self, data: object) -> TIntScalar_co: if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a NumPy integer." + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) raise TypeError(msg) def default_scalar(self) -> TIntScalar_co: diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index 64384b2d8b..8cd04e7814 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -344,7 +344,7 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: return self.to_native_dtype().type(data) raise TypeError(f"Invalid type: {data}. 
Expected a string.") # pragma: no cover - def _check_scalar(self, data: object) -> TypeGuard[str | np.str_ | bytes | int]: + def _check_scalar(self, data: object) -> TypeGuard[SupportsStr]: """ Check that the input is a valid scalar value for this data type. @@ -355,11 +355,11 @@ def _check_scalar(self, data: object) -> TypeGuard[str | np.str_ | bytes | int] Returns ------- - TypeGuard[str | np.str_ | bytes | int] + TypeGuard[SupportsStr] Whether the input is a valid scalar value for this data type. """ # this is generous for backwards compatibility - return isinstance(data, str | np.str_ | bytes | int) + return isinstance(data, SupportsStr) def cast_scalar(self, data: object) -> np.str_: """ @@ -383,13 +383,13 @@ def cast_scalar(self, data: object) -> np.str_: # >>> x.dtype # dtype('U11') - if isinstance(data, int): - return self.to_native_dtype().type(str(data)[: self.length]) - else: - return self.to_native_dtype().type(data[: self.length]) - raise TypeError( - f"Cannot convert object with type {type(data)} to a NumPy unicode string scalar." + return self.to_native_dtype().type(str(data)[: self.length]) + + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." ) + raise TypeError(msg) # pragma: no cover @property def item_size(self) -> int: @@ -711,7 +711,11 @@ def cast_scalar(self, data: object) -> str: """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) - raise TypeError(f"Cannot convert object with type {type(data)} to a Python string.") + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) + raise TypeError(msg) if _NUMPY_SUPPORTS_VLEN_STRING: diff --git a/src/zarr/core/dtype/npy/structured.py b/src/zarr/core/dtype/npy/structured.py index 1484b2b1ee..a0e3b0fbd4 100644 --- a/src/zarr/core/dtype/npy/structured.py +++ b/src/zarr/core/dtype/npy/structured.py @@ -399,7 +399,10 @@ def cast_scalar(self, data: object) -> np.void: if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a NumPy structured scalar." + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) raise TypeError(msg) def default_scalar(self) -> np.void: diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 88c3ce6c39..d523e16940 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -554,7 +554,10 @@ def cast_scalar(self, data: object) -> np.timedelta64: """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a numpy timedelta64 scalar." + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) raise TypeError(msg) def default_scalar(self) -> np.timedelta64: @@ -838,7 +841,10 @@ def cast_scalar(self, data: object) -> np.datetime64: """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a numpy datetime scalar." + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}."
+ ) raise TypeError(msg) def default_scalar(self) -> np.datetime64: diff --git a/tests/test_dtype/conftest.py b/tests/test_dtype/conftest.py index 0be1c60088..0650d143c6 100644 --- a/tests/test_dtype/conftest.py +++ b/tests/test_dtype/conftest.py @@ -65,7 +65,4 @@ class TestB(TestExample): for fixture_name in metafunc.fixturenames: if hasattr(metafunc.cls, fixture_name): params = getattr(metafunc.cls, fixture_name) - if len(params) == 0: - msg = f"{metafunc.cls}.{fixture_name} is empty. Please provide a non-empty sequence of values." - raise ValueError(msg) - metafunc.parametrize(fixture_name, params, scope="class") + metafunc.parametrize(fixture_name, params, scope="class", ids=str) diff --git a/tests/test_dtype/test_npy/test_bool.py b/tests/test_dtype/test_npy/test_bool.py index 010dec2e47..da30214b3b 100644 --- a/tests/test_dtype/test_npy/test_bool.py +++ b/tests/test_dtype/test_npy/test_bool.py @@ -38,4 +38,5 @@ class TestBool(BaseTestZDType): (Bool(), np.True_, np.True_), (Bool(), np.False_, np.False_), ) + invalid_scalar_params = (None,) item_size_params = (Bool(),) diff --git a/tests/test_dtype/test_npy/test_bytes.py b/tests/test_dtype/test_npy/test_bytes.py index 3f1ba9315e..8772f1a380 100644 --- a/tests/test_dtype/test_npy/test_bytes.py +++ b/tests/test_dtype/test_npy/test_bytes.py @@ -45,6 +45,7 @@ class TestNullTerminatedBytes(BaseTestZDType): (NullTerminatedBytes(length=2), "ab", np.bytes_("ab")), (NullTerminatedBytes(length=4), "abcdefg", np.bytes_("abcd")), ) + invalid_scalar_params = ((NullTerminatedBytes(length=1), 1.0),) item_size_params = ( NullTerminatedBytes(length=1), NullTerminatedBytes(length=4), @@ -91,6 +92,7 @@ class TestRawBytes(BaseTestZDType): (RawBytes(length=2), b"ab", np.void(b"ab")), (RawBytes(length=4), b"abcd", np.void(b"abcd")), ) + invalid_scalar_params = ((RawBytes(length=1), 1.0),) item_size_params = ( RawBytes(length=1), RawBytes(length=4), @@ -133,11 +135,8 @@ class TestVariableLengthBytes(BaseTestZDType): (VariableLengthBytes(), "ab", b"ab"), (VariableLengthBytes(), "abcdefg", b"abcdefg"), ) - item_size_params = ( - VariableLengthBytes(), - VariableLengthBytes(), - VariableLengthBytes(), - ) + invalid_scalar_params = ((VariableLengthBytes(), 1.0),) + item_size_params = (VariableLengthBytes(),) @pytest.mark.parametrize( diff --git a/tests/test_dtype/test_npy/test_complex.py b/tests/test_dtype/test_npy/test_complex.py index b6a1e799eb..b4ce42be58 100644 --- a/tests/test_dtype/test_npy/test_complex.py +++ b/tests/test_dtype/test_npy/test_complex.py @@ -54,7 +54,7 @@ class TestComplex64(_BaseTestFloat): (Complex64(), complex(-1.0, math.inf), np.complex64(complex(-1.0, math.inf))), (Complex64(), complex(0, math.nan), np.complex64(complex(0, math.nan))), ) - + invalid_scalar_params = ((Complex64(), {"type": "dict"}),) item_size_params = (Complex64(),) @@ -97,4 +97,5 @@ class TestComplex128(_BaseTestFloat): (Complex128(), complex(-1.0, math.inf), np.complex128(complex(-1.0, math.inf))), (Complex128(), complex(0, math.nan), np.complex128(complex(0, math.nan))), ) + invalid_scalar_params = ((Complex128(), {"type": "dict"}),) item_size_params = (Complex128(),) diff --git a/tests/test_dtype/test_npy/test_float.py b/tests/test_dtype/test_npy/test_float.py index e875dc87e3..90fa27c9cf 100644 --- a/tests/test_dtype/test_npy/test_float.py +++ b/tests/test_dtype/test_npy/test_float.py @@ -65,7 +65,7 @@ class TestFloat16(_BaseTestFloat): (Float16(), -1.0, np.float16(-1.0)), (Float16(), "NaN", np.float16("NaN")), ) - + invalid_scalar_params = ((Float16(), 
{"set!"}),) hex_string_params = (("0x7fc0", np.nan), ("0x7fc1", np.nan), ("0x3c00", 1.0)) item_size_params = (Float16(),) @@ -113,7 +113,7 @@ class TestFloat32(_BaseTestFloat): (Float32(), -1.0, np.float32(-1.0)), (Float32(), "NaN", np.float32("NaN")), ) - + invalid_scalar_params = ((Float32(), {"set!"}),) hex_string_params = (("0x7fc00000", np.nan), ("0x7fc00001", np.nan), ("0x3f800000", 1.0)) item_size_params = (Float32(),) @@ -160,7 +160,7 @@ class TestFloat64(_BaseTestFloat): (Float64(), -1.0, np.float64(-1.0)), (Float64(), "NaN", np.float64("NaN")), ) - + invalid_scalar_params = ((Float64(), {"set!"}),) hex_string_params = ( ("0x7ff8000000000000", np.nan), ("0x7ff8000000000001", np.nan), diff --git a/tests/test_dtype/test_npy/test_int.py b/tests/test_dtype/test_npy/test_int.py index 71257907d5..efc4fae496 100644 --- a/tests/test_dtype/test_npy/test_int.py +++ b/tests/test_dtype/test_npy/test_int.py @@ -34,6 +34,7 @@ class TestInt8(BaseTestZDType): (Int8(), 1, np.int8(1)), (Int8(), -1, np.int8(-1)), ) + invalid_scalar_params = ((Int8(), {"set!"}), (Int8(), ("tuple",))) item_size_params = (Int8(),) @@ -68,7 +69,7 @@ class TestInt16(BaseTestZDType): (Int16(), 1, np.int16(1)), (Int16(), -1, np.int16(-1)), ) - + invalid_scalar_params = ((Int16(), {"set!"}), (Int16(), ("tuple",))) item_size_params = (Int16(),) @@ -106,6 +107,7 @@ class TestInt32(BaseTestZDType): (Int32(), 1, np.int32(1)), (Int32(), -1, np.int32(-1)), ) + invalid_scalar_params = ((Int32(), {"set!"}), (Int32(), ("tuple",))) item_size_params = (Int32(),) @@ -140,6 +142,7 @@ class TestInt64(BaseTestZDType): (Int64(), 1, np.int64(1)), (Int64(), -1, np.int64(-1)), ) + invalid_scalar_params = ((Int64(), {"set!"}), (Int64(), ("tuple",))) item_size_params = (Int64(),) @@ -171,6 +174,7 @@ class TestUInt8(BaseTestZDType): (UInt8(), 1, np.uint8(1)), (UInt8(), 0, np.uint8(0)), ) + invalid_scalar_params = ((UInt8(), {"set!"}), (UInt8(), ("tuple",))) item_size_params = (UInt8(),) @@ -205,6 +209,7 @@ class TestUInt16(BaseTestZDType): (UInt16(), 1, np.uint16(1)), (UInt16(), 0, np.uint16(0)), ) + invalid_scalar_params = ((UInt16(), {"set!"}), (UInt16(), ("tuple",))) item_size_params = (UInt16(),) @@ -239,6 +244,7 @@ class TestUInt32(BaseTestZDType): (UInt32(), 1, np.uint32(1)), (UInt32(), 0, np.uint32(0)), ) + invalid_scalar_params = ((UInt32(), {"set!"}), (UInt32(), ("tuple",))) item_size_params = (UInt32(),) @@ -273,4 +279,5 @@ class TestUInt64(BaseTestZDType): (UInt64(), 1, np.uint64(1)), (UInt64(), 0, np.uint64(0)), ) + invalid_scalar_params = ((UInt64(), {"set!"}), (UInt64(), ("tuple",))) item_size_params = (UInt64(),) diff --git a/tests/test_dtype/test_npy/test_string.py b/tests/test_dtype/test_npy/test_string.py index 7c3c6a8cd4..2cde6a1ac1 100644 --- a/tests/test_dtype/test_npy/test_string.py +++ b/tests/test_dtype/test_npy/test_string.py @@ -40,6 +40,8 @@ class TestVariableLengthString(BaseTestZDType): (VariableLengthUTF8(), "", np.str_("")), (VariableLengthUTF8(), "hi", np.str_("hi")), ) + # anything can become a string + invalid_scalar_params = (None,) item_size_params = (VariableLengthUTF8(),) else: @@ -74,7 +76,8 @@ class TestVariableLengthString(BaseTestZDType): # type: ignore[no-redef] (VariableLengthUTF8(), "", np.str_("")), (VariableLengthUTF8(), "hi", np.str_("hi")), ) - + # anything can become a string + invalid_scalar_params = (None,) item_size_params = (VariableLengthUTF8(),) @@ -118,6 +121,8 @@ class TestFixedLengthUTF32(BaseTestZDType): FixedLengthUTF32(length=4), FixedLengthUTF32(length=10), ) + # anything can become 
a string + invalid_scalar_params = (None,) @pytest.mark.parametrize( diff --git a/tests/test_dtype/test_npy/test_structured.py b/tests/test_dtype/test_npy/test_structured.py index c51aa73ff3..e2cd2a6dfe 100644 --- a/tests/test_dtype/test_npy/test_structured.py +++ b/tests/test_dtype/test_npy/test_structured.py @@ -98,16 +98,21 @@ class TestStructured(BaseTestZDType): ), ) - def scalar_equals(self, scalar1: Any, scalar2: Any) -> bool: - if hasattr(scalar1, "shape") and hasattr(scalar2, "shape"): - return np.array_equal(scalar1, scalar2) - return super().scalar_equals(scalar1, scalar2) - item_size_params = ( Structured(fields=(("field1", Int32()), ("field2", Float64()))), Structured(fields=(("field1", Int64()), ("field2", Int32()))), ) + invalid_scalar_params = ( + (Structured(fields=(("field1", Int32()), ("field2", Float64()))), "i am a string"), + (Structured(fields=(("field1", Int32()), ("field2", Float64()))), {"type": "dict"}), + ) + + def scalar_equals(self, scalar1: Any, scalar2: Any) -> bool: + if hasattr(scalar1, "shape") and hasattr(scalar2, "shape"): + return np.array_equal(scalar1, scalar2) + return super().scalar_equals(scalar1, scalar2) + def test_invalid_size() -> None: """ diff --git a/tests/test_dtype/test_npy/test_time.py b/tests/test_dtype/test_npy/test_time.py index e201be5cf6..b94b600cbf 100644 --- a/tests/test_dtype/test_npy/test_time.py +++ b/tests/test_dtype/test_npy/test_time.py @@ -68,6 +68,10 @@ class TestDateTime64(_TestTimeBase): (DateTime64(unit="s", scale_factor=1), "2005-02-25", np.datetime64("2005-02-25", "s")), (DateTime64(unit="ns", scale_factor=1), "NaT", np.datetime64("NaT")), ) + invalid_scalar_params = ( + (DateTime64(unit="Y", scale_factor=1), 1.3), + (DateTime64(unit="Y", scale_factor=1), [1.3]), + ) item_size_params = (DateTime64(unit="ns", scale_factor=1),) @@ -113,6 +117,10 @@ class TestTimeDelta64(_TestTimeBase): (TimeDelta64(unit="ns", scale_factor=1), "1", np.timedelta64(1, "ns")), (TimeDelta64(unit="ns", scale_factor=1), "NaT", np.timedelta64("NaT")), ) + invalid_scalar_params = ( + (TimeDelta64(unit="Y", scale_factor=1), 1.3), + (TimeDelta64(unit="Y", scale_factor=1), [1.3]), + ) item_size_params = (TimeDelta64(unit="ns", scale_factor=1),) diff --git a/tests/test_dtype/test_wrapper.py b/tests/test_dtype/test_wrapper.py index 8f461f1a77..cc365e86d4 100644 --- a/tests/test_dtype/test_wrapper.py +++ b/tests/test_dtype/test_wrapper.py @@ -1,5 +1,6 @@ from __future__ import annotations +import re from typing import TYPE_CHECKING, Any, ClassVar import pytest @@ -59,6 +60,16 @@ class BaseTestZDType: A tuple of invalid JSON representations for Zarr format version 3. cast_value_params : ClassVar[tuple[tuple[Any, Any, Any], ...]] A tuple of (dtype, value, expected) tuples for testing ZDType.cast_value. + scalar_v2_params : ClassVar[tuple[Any, ...]] + A tuple of (dtype, scalar json) tuples for testing + ZDType.from_json_scalar / ZDType.to_json_scalar for Zarr V2. + scalar_v3_params : ClassVar[tuple[Any, ...]] + A tuple of (dtype, scalar json) tuples for testing + ZDType.from_json_scalar / ZDType.to_json_scalar for Zarr V3. + invalid_scalar_params : ClassVar[tuple[Any, ...]] + A tuple of (dtype, value) tuples, where each value is expected to be rejected by ZDType.cast_scalar. + item_size_params : ClassVar[tuple[Any, ...]] + A tuple of dtype instances for testing ZDType.item_size. + """ test_cls: type[ZDType[TBaseDType, TBaseScalar]] @@ -76,10 +87,13 @@ class BaseTestZDType: # pairs. 
the first element of the pair is used to create a dtype instance, and the second # element is the json serialization of the scalar that we want to round-trip. - scalar_v2_params: ClassVar[tuple[tuple[Any, Any], ...]] = () + scalar_v2_params: ClassVar[tuple[tuple[ZDType[Any, Any], Any], ...]] = () scalar_v3_params: ClassVar[tuple[tuple[Any, Any], ...]] = () - cast_value_params: ClassVar[tuple[tuple[Any, Any, Any], ...]] - item_size_params: ClassVar[tuple[ZDType[Any, Any], ...]] + cast_value_params: ClassVar[tuple[tuple[ZDType[Any, Any], Any, Any], ...]] = () + # Some data types, like bool and string, can consume any Python object as a scalar. + # So we allow passing None to this test to indicate that it should be skipped. + invalid_scalar_params: ClassVar[tuple[tuple[ZDType[Any, Any], Any], ...] | tuple[None]] = () + item_size_params: ClassVar[tuple[ZDType[Any, Any], ...]] = () def json_scalar_equals(self, scalar1: object, scalar2: object) -> bool: # An equality check for json-encoded scalars. This defaults to regular equality, @@ -124,6 +138,21 @@ def test_cast_value(self, cast_value_params: tuple[ZDType[Any, Any], Any, Any]) zdtype, value, expected = cast_value_params observed = zdtype.cast_scalar(value) assert self.scalar_equals(expected, observed) + # check that casting is idempotent + assert self.scalar_equals(zdtype.cast_scalar(observed), observed) + + def test_invalid_scalar( + self, invalid_scalar_params: tuple[ZDType[Any, Any], Any] | None + ) -> None: + if invalid_scalar_params is None: + pytest.skip(f"No invalid scalar data provided for {type(self).__name__}") + zdtype, data = invalid_scalar_params + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {zdtype}." + ) + with pytest.raises(TypeError, match=re.escape(msg)): + zdtype.cast_scalar(data) def test_item_size(self, item_size_params: ZDType[Any, Any]) -> None: """ @@ -133,4 +162,4 @@ def test_item_size(self, item_size_params: ZDType[Any, Any]) -> None: if isinstance(item_size_params, HasItemSize): assert item_size_params.item_size == item_size_params.to_native_dtype().itemsize else: - pytest.skip(f"Dtype {item_size_params} does not implement HasItemSize") + pytest.skip(f"Data type {item_size_params} does not implement HasItemSize") From 87c71faaf33289860e0d46dfd78a69bc423c6690 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 3 Jul 2025 16:41:22 +0200 Subject: [PATCH 41/42] prose --- docs/user-guide/data_types.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index cefbfb4a39..dc29874b3b 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -283,12 +283,12 @@ Serialize to JSON for Zarr V2: .. note:: - The representation returned by ``to_json`` is more abstract than the literal contents of Zarr V2 - array metadata, because the JSON representation used by the ``ZDType`` classes must be distinct across - different data types. Zarr V2 identifies multiple distinct data types with the "object" data type - identifier ``"|O"``, which means extra information is needed to disambiguate these data types from - one another. That's the reason for the ``object_codec_id`` field you see here. See the - `section <#object-data-type>`_ on the "object" data type for more information. 
+ The representation returned by ``to_json(zarr_format=2)`` is more abstract than the literal contents + of Zarr V2 array metadata, because the JSON representation used by the ``ZDType`` classes must be + distinct across different data types. As noted `earlier <#object-data-type>`_, Zarr V2 identifies + multiple distinct data types with the "object" data type identifier ``"|O"``. Extra information + is needed to disambiguate these data types from one another. That's the reason for the + ``object_codec_id`` field you see here. And for V3: From 48000fcb526d8941fbaef0042d512aef391a462f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 3 Jul 2025 16:55:37 +0200 Subject: [PATCH 42/42] improve coverage the hard way and the easy way --- src/zarr/core/dtype/npy/bool.py | 2 +- src/zarr/core/dtype/npy/string.py | 6 +++--- tests/test_dtype/test_npy/test_bytes.py | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index aaf8445f0a..37371cd0cd 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -246,7 +246,7 @@ def cast_scalar(self, data: object) -> np.bool_: """ if self._check_scalar(data): return np.bool_(data) - msg = ( + msg = ( # pragma: no cover f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " f"data type {self}." ) diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index 8cd04e7814..32375a1c71 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -385,7 +385,7 @@ def cast_scalar(self, data: object) -> np.str_: return self.to_native_dtype().type(str(data)[: self.length]) - msg = ( + msg = ( # pragma: no cover f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " f"data type {self}." ) @@ -711,11 +711,11 @@ def cast_scalar(self, data: object) -> str: """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = ( + msg = ( # pragma: no cover f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " f"data type {self}." ) - raise TypeError(msg) + raise TypeError(msg) # pragma: no cover if _NUMPY_SUPPORTS_VLEN_STRING: diff --git a/tests/test_dtype/test_npy/test_bytes.py b/tests/test_dtype/test_npy/test_bytes.py index 8772f1a380..78980f7809 100644 --- a/tests/test_dtype/test_npy/test_bytes.py +++ b/tests/test_dtype/test_npy/test_bytes.py @@ -24,6 +24,7 @@ class TestNullTerminatedBytes(BaseTestZDType): "|S", "|U10", "|f8", + {"name": "|S4", "object_codec_id": "vlen-bytes"}, ) invalid_json_v3 = ( {"name": "fixed_length_ascii", "configuration": {"length_bits": 0}},
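The ``object_codec_id`` behaviour described in the reworked docs note is easiest to see by serializing one of the "object" data types directly. The snippet below is a minimal sketch, assuming ``VariableLengthBytes`` is importable from ``zarr.core.dtype`` and uses the ``"vlen-bytes"`` object codec id that appears in the test data above; the exact JSON returned by ``to_json(zarr_format=2)`` is whatever the installed version produces:

.. code-block:: python

    >>> from zarr.core.dtype import VariableLengthBytes
    >>> v2_json = VariableLengthBytes().to_json(zarr_format=2)
    >>> v2_json["name"]  # the "object" identifier shared by several data types
    '|O'
    >>> v2_json["object_codec_id"]  # the extra field that disambiguates them
    'vlen-bytes'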