diff --git a/changes/3157.doc.rst b/changes/3157.doc.rst new file mode 100644 index 0000000000..6132b195ec --- /dev/null +++ b/changes/3157.doc.rst @@ -0,0 +1,2 @@ +Add a self-contained example of data type extension to the ``examples`` directory, and expand +the documentation for data types. \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index 68bf003ad5..61d83ef819 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -38,7 +38,6 @@ extensions = [ "sphinx.ext.autodoc", "sphinx.ext.autosummary", - "sphinx.ext.viewcode", "sphinx.ext.intersphinx", 'autoapi.extension', "numpydoc", @@ -46,6 +45,7 @@ "sphinx_copybutton", "sphinx_design", 'sphinx_reredirects', + "sphinx.ext.viewcode", ] issues_github_path = "zarr-developers/zarr-python" @@ -56,7 +56,7 @@ autoapi_member_order = "groupwise" autoapi_root = "api" autoapi_keep_files = True -autoapi_options = [ 'members', 'undoc-members', 'show-inheritance', 'show-module-summary', 'imported-members', ] +autoapi_options = [ 'members', 'undoc-members', 'show-inheritance', 'show-module-summary', 'imported-members', 'inherited-members'] def skip_submodules( app: sphinx.application.Sphinx, diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index c27f1296b9..baaf544e44 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -23,7 +23,8 @@ The code above creates a 2-dimensional array of 32-bit integers with 10000 rows and 10000 columns, divided into chunks where each chunk has 1000 rows and 1000 columns (and so there will be 100 chunks in total). The data is written to a :class:`zarr.storage.MemoryStore` (e.g. an in-memory dict). See -:ref:`user-guide-persist` for details on storing arrays in other stores. +:ref:`user-guide-persist` for details on storing arrays in other stores, and see +:ref:`user-guide-data-types` for an in-depth look at the data types supported by Zarr. For a complete list of array creation routines see the :mod:`zarr` module documentation. 
@@ -629,29 +630,6 @@ Missing features in 3.0 The following features have not been ported to 3.0 yet. -.. _user-guide-objects: - -Object arrays -~~~~~~~~~~~~~ - -See the Zarr-Python 2 documentation on `Object arrays `_ for more details. - -.. _user-guide-strings: - -Fixed-length string arrays -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -See the Zarr-Python 2 documentation on `Fixed-length string arrays `_ for more details. - -.. _user-guide-datetime: - -Datetime and Timedelta arrays -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -See the Zarr-Python 2 documentation on `Datetime and Timedelta `_ for more details. - -.. _user-guide-copy: - Copying and migrating data ~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst index 87c8efc1f5..dc29874b3b 100644 --- a/docs/user-guide/data_types.rst +++ b/docs/user-guide/data_types.rst @@ -1,34 +1,56 @@ -Data types -========== +.. _user-guide-data-types: -Zarr's data type model +Array data types +================ + +Zarr's Data Type Model ---------------------- -Every Zarr array has a "data type", which defines the meaning and physical layout of the -array's elements. As Zarr Python is tightly integrated with `NumPy `_, -it's easy to create arrays with NumPy data types: +Zarr is designed for interoperability with NumPy, so if you are familiar with NumPy or any other +N-dimensional array library, Zarr's model for array data types should seem familiar. However, Zarr +data types have some unique features that are described in this document. -.. code-block:: python +Zarr arrays operate under an essential design constraint: unlike NumPy arrays, Zarr arrays +are designed to be stored and accessed by other Zarr implementations. This means that, among other things, +Zarr data types must be serializable to metadata documents in accordance with the Zarr specifications, +which adds some unique aspects to the Zarr data type model. 
- >>> import zarr - >>> import numpy as np - >>> z = zarr.create_array(store={}, shape=(10,), dtype=np.dtype('uint8')) - >>> z - +The following sections explain Zarr's data type model in greater detail and demonstrate the +Zarr Python APIs for working with Zarr data types. + +Array Data Types +^^^^^^^^^^^^^^^^ + +Every Zarr array has a data type, which defines the meaning of the array's elements. An array's data +type is encoded in the JSON metadata for the array. This means that the data type of an array must be +JSON-serializable. + +In Zarr V2, the data type of an array is stored in the ``dtype`` field in array metadata. +Zarr V3 changed the name of this field to ``data_type`` and also defined new rules for the values +that can be assigned to the ``data_type`` field. + +For example, in Zarr V2, the boolean array data type was represented in array metadata as the +string ``"|b1"``. In Zarr V3, the same type is represented as the string ``"bool"``. + +Scalars +^^^^^^^ -Unlike NumPy arrays, Zarr arrays are designed to accessed by Zarr -implementations in different programming languages. This means Zarr data types must be interpreted -correctly when clients read an array. Each Zarr data type defines procedures for -encoding and decoding both the data type itself, and scalars from that data type to and from Zarr array metadata. And these serialization procedures -depend on the Zarr format. +Zarr also specifies how array elements, i.e., scalars, are encoded in array metadata. This is necessary +because Zarr uses a field in array metadata to define a default value for chunks that are not stored. +This field, called ``fill_value`` in both Zarr V2 and Zarr V3 metadata documents, contains a +JSON value that can be decoded to a scalar value compatible with the array's data type. 
-Data types in Zarr version 2 ------------------------------ +For the boolean data type, the scalar encoding is simple—booleans are natively supported by +JSON, so Zarr saves booleans as JSON booleans. Other scalars, like floats or raw bytes, have +more elaborate encoding schemes, and in some cases, this scheme depends on the Zarr format version. + +Data Types in Zarr Version 2 +---------------------------- Version 2 of the Zarr format defined its data types relative to `NumPy's data types `_, -and added a few non-NumPy data types as well. Thus the JSON identifier for a NumPy-compatible data -type is just the NumPy ``str`` attribute of that data type: +and added a few non-NumPy data types as well. With one exception (`structured data types <#structured-data-type>`_), the Zarr +V2 JSON identifier for a data type is just the NumPy ``str`` attribute of that data type: .. code-block:: python @@ -38,45 +60,94 @@ type is just the NumPy ``str`` attribute of that data type: >>> >>> store = {} >>> np_dtype = np.dtype('int64') + >>> np_dtype.str + '>> z = zarr.create_array(store=store, shape=(1,), dtype=np_dtype, zarr_format=2) >>> dtype_meta = json.loads(store['.zarray'].to_bytes())["dtype"] >>> dtype_meta '>> assert dtype_meta == np_dtype.str .. note:: + The ``<`` character in the data type metadata encodes the `endianness `_, - or "byte order", of the data type. Following NumPy's example, + or "byte order," of the data type. As per the NumPy model, in Zarr version 2 each data type has an endianness where applicable. However, Zarr version 3 data types do not store endianness information. -In addition to defining a representation of the data type itself (which in the example above was -just a simple string ``"`_, and +`"object" <#object-data-type>`_ data types. -More broadly, each Zarr data type defines its own rules for how scalars of that type are stored in -``JSON``. 
+Structured Data Type +^^^^^^^^^^^^^^^^^^^^ +NumPy allows the construction of a so-called "structured" data types comprised of ordered collections +of named fields, where each field is itself a distinct NumPy data type. See the NumPy documentation +`here `_. -Data types in Zarr version 3 ------------------------------ +Crucially, NumPy does not use a special data type for structured data types—instead, NumPy +implements structured data types as an optional feature of the so-called "Void" data type, which models +arbitrary fixed-size byte strings. The ``str`` attribute of a regular NumPy void +data type is the same as the ``str`` of a NumPy structured data type. This means that the ``str`` +attribute does not convey information about the fields contained in a structured data type. +For these reasons, Zarr V2 uses a special data type encoding for structured data types. +They are stored in JSON as lists of pairs, where the first element is a string, and the second +element is a Zarr V2 data type specification. This representation supports recursion. -Zarr V3 brings several key changes to how data types are represented: +For example: + +.. code-block:: python -- Zarr V3 identifies the basic data types as strings like ``"int8"``, ``"int16"``, etc. + >>> store = {} + >>> np_dtype = np.dtype([('field_a', '>i2'), ('field_b', [('subfield_c', '>f4'), ('subfield_d', 'i2')])]) + >>> np_dtype.str + '|V8' + >>> z = zarr.create_array(store=store, shape=(1,), dtype=np_dtype, zarr_format=2) + >>> dtype_meta = json.loads(store['.zarray'].to_bytes())["dtype"] + >>> dtype_meta + [['field_a', '>i2'], ['field_b', [['subfield_c', '>f4'], ['subfield_d', 'M[10s]"`` in - Zarr V2. This is more compact, but can be harder to parse. +- Zarr V3 data types do not have endianness. This is a departure from Zarr V2, where multi-byte + data types are defined with endianness information. 
Instead, Zarr V3 requires that the endianness + of encoded array chunks is specified in the ``codecs`` attribute of array metadata. The Zarr + V3 specification leaves the in-memory endianness of decoded array chunks as an implementation detail. For more about data types in Zarr V3, see the `V3 specification `_. -Data types in Zarr Python +Data Types in Zarr Python ------------------------- -The two Zarr formats that Zarr Python supports specify data types in two different ways: -data types in Zarr version 2 are encoded as NumPy-compatible strings, while data types in Zarr version -3 are encoded as either strings or ``JSON`` objects, -and the Zarr V3 data types don't have any associated endianness information, unlike Zarr V2 data types. +The two Zarr formats that Zarr Python supports specify data types in different ways: data types in +Zarr version 2 are encoded as NumPy-compatible strings (or lists, in the case of structured data +types), while data types in Zarr V3 are encoded as either strings or JSON objects. Zarr V3 data +types do not have any associated endianness information, unlike Zarr V2 data types. -To abstract over these syntactical and semantic differences, Zarr Python uses a class called -`ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_ provide Zarr V2 and Zarr V3 compatibility -routines for ""native" data types. In this context, a "native" data type is a Python class, -typically defined in another library, that models an array's data type. For example, ``np.uint8`` is a native -data type defined in NumPy, which Zarr Python wraps with a ``ZDType`` instance called +Zarr Python needs to support both Zarr V2 and V3, which means we need to abstract over these differences. +We do this with an abstract Zarr data type class: `ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_, +which provides Zarr V2 and Zarr V3 compatibility routines for "native" data types. 
+ +In this context, a "native" data type is a Python class, typically defined in another library, that +models an array's data type. For example, ``np.dtypes.UInt8DType`` is a native data type defined in NumPy. +Zarr Python wraps the NumPy ``uint8`` with a ``ZDType`` instance called `UInt8 <../api/zarr/dtype/index.html#zarr.dtype.UInt8>`_. -Each data type supported by Zarr Python is modeled by ``ZDType`` subclass, which provides an -API for the following operations: +As of this writing, the only native data types Zarr Python supports are NumPy data types. We could +avoid the "native data type" jargon and just say "NumPy data type," but we do not want to rule out the +possibility of using non-NumPy array backends in the future. -- Wrapping / unwrapping a native data type -- Encoding / decoding a data type to / from Zarr V2 and Zarr V3 array metadata. -- Encoding / decoding a scalar value to / from Zarr V2 and Zarr V3 array metadata. +Each data type supported by Zarr Python is modeled by a ``ZDType`` subclass, which provides an +API for the following operations: +- Encoding and decoding a native data type +- Encoding and decoding a data type to and from Zarr V2 and Zarr V3 array metadata +- Encoding and decoding a scalar value to and from Zarr V2 and Zarr V3 array metadata +- Casting a Python object to a scalar value consistent with the data type + +List of data types +^^^^^^^^^^^^^^^^^^ + +The following section lists the data types built in to Zarr Python. With a few exceptions, Zarr +Python supports nearly all of the data types in NumPy. If you need a data type that is not listed +here, it's possible to create it yourself: see :ref:`adding-new-data-types`. 
+ +Boolean +""""""" +- `Boolean <../api/zarr/dtype/index.html#zarr.dtype.Bool>`_ + +Integral +"""""""" +- `Signed 8-bit integer <../api/zarr/dtype/index.html#zarr.dtype.Int8>`_ +- `Signed 16-bit integer <../api/zarr/dtype/index.html#zarr.dtype.Int16>`_ +- `Signed 32-bit integer <../api/zarr/dtype/index.html#zarr.dtype.Int32>`_ +- `Signed 64-bit integer <../api/zarr/dtype/index.html#zarr.dtype.Int64>`_ +- `Unsigned 8-bit integer <../api/zarr/dtype/index.html#zarr.dtype.UInt8>`_ +- `Unsigned 16-bit integer <../api/zarr/dtype/index.html#zarr.dtype.UInt16>`_ +- `Unsigned 32-bit integer <../api/zarr/dtype/index.html#zarr.dtype.UInt32>`_ +- `Unsigned 64-bit integer <../api/zarr/dtype/index.html#zarr.dtype.UInt64>`_ + +Floating-point +"""""""""""""" +- `16-bit floating-point <../api/zarr/dtype/index.html#zarr.dtype.Float16>`_ +- `32-bit floating-point <../api/zarr/dtype/index.html#zarr.dtype.Float32>`_ +- `64-bit floating-point <../api/zarr/dtype/index.html#zarr.dtype.Float64>`_ +- `64-bit complex floating-point <../api/zarr/dtype/index.html#zarr.dtype.Complex64>`_ +- `128-bit complex floating-point <../api/zarr/dtype/index.html#zarr.dtype.Complex128>`_ + +String +"""""" +- `Fixed-length UTF-32 string <../api/zarr/dtype/index.html#zarr.dtype.FixedLengthUTF32>`_ +- `Variable-length UTF-8 string <../api/zarr/dtype/index.html#zarr.dtype.VariableLengthUTF8>`_ + +Bytes +""""" +- `Fixed-length null-terminated bytes <../api/zarr/dtype/index.html#zarr.dtype.NullTerminatedBytes>`_ +- `Fixed-length raw bytes <../api/zarr/dtype/index.html#zarr.dtype.RawBytes>`_ +- `Variable-length bytes <../api/zarr/dtype/index.html#zarr.dtype.VariableLengthBytes>`_ + +Temporal +"""""""" +- `DateTime64 <../api/zarr/dtype/index.html#zarr.dtype.DateTime64>`_ +- `TimeDelta64 <../api/zarr/dtype/index.html#zarr.dtype.TimeDelta64>`_ + +Struct-like +""""""""""" +- `Structured <../api/zarr/dtype/index.html#zarr.dtype.Structured>`_ Example Usage -~~~~~~~~~~~~~ +^^^^^^^^^^^^^ + +This section demonstrates 
the basic usage of Zarr data types. Create a ``ZDType`` from a native data type: @@ -130,7 +259,7 @@ Create a ``ZDType`` from a native data type: >>> import numpy as np >>> int8 = Int8.from_native_dtype(np.dtype('int8')) -Convert back to native data type: +Convert back to a native data type: .. code-block:: python @@ -144,14 +273,27 @@ Get the default scalar value for the data type: >>> default_value = int8.default_scalar() >>> assert default_value == np.int8(0) - -Serialize to JSON for Zarr V2 and V3 +Serialize to JSON for Zarr V2: .. code-block:: python >>> json_v2 = int8.to_json(zarr_format=2) >>> json_v2 {'name': '|i1', 'object_codec_id': None} + +.. note:: + + The representation returned by ``to_json(zarr_format=2)`` is more abstract than the literal contents + of Zarr V2 array metadata, because the JSON representation used by the ``ZDType`` classes must be + distinct across different data types. As noted `earlier <#object-data-type>`_, Zarr V2 identifies + multiple distinct data types with the "object" data type identifier ``"|O"``. Extra information + is needed to disambiguate these data types from one another. That's the reason for the + ``object_codec_id`` field you see here. + +And for V3: + +.. code-block:: python + >>> json_v3 = int8.to_json(zarr_format=3) >>> json_v3 'int8' @@ -170,3 +312,101 @@ Deserialize a scalar value from JSON: >>> scalar_value = int8.from_json_scalar(42, zarr_format=3) >>> assert scalar_value == np.int8(42) + +.. _adding-new-data-types: + +Adding New Data Types +^^^^^^^^^^^^^^^^^^^^^ + +Each Zarr data type is a separate Python class that inherits from +`ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_. You can define a custom data type by +writing your own subclass of `ZDType <../api/zarr/dtype/index.html#zarr.dtype.ZDType>`_ and adding +your data type to the data type registry. A complete example of this process is included below. 
+ +The source code for this example can be found in the ``examples/custom_dtype.py`` file in the Zarr +Python project directory. + +.. literalinclude:: ../../examples/custom_dtype.py + :language: python + +Data Type Resolution +^^^^^^^^^^^^^^^^^^^^ + +Although Zarr Python uses a different data type model from NumPy, you can still define a Zarr array +with a NumPy data type object: + +.. code-block:: python + + >>> from zarr import create_array + >>> import numpy as np + >>> a = create_array({}, shape=(10,), dtype=np.dtype('int')) + >>> a + + +Or a string representation of a NumPy data type: + +.. code-block:: python + + >>> a = create_array({}, shape=(10,), dtype='>> a + + +The ``Array`` object presents itself like a NumPy array, including exposing a NumPy +data type as its ``dtype`` attribute: + +.. code-block:: python + + >>> type(a.dtype) + + +But if we inspect the metadata for the array, we can see the Zarr data type object: + +.. code-block:: python + + >>> type(a.metadata.data_type) + + +This example illustrates a general problem Zarr Python has to solve: how can we allow users to +specify a data type as a string or a NumPy ``dtype`` object, and produce the right Zarr data type +from that input? We call this process "data type resolution." Zarr Python also performs data type +resolution when reading stored arrays, although in this case the input is a JSON value instead +of a NumPy data type. + +For simple data types like ``int``, the solution could be extremely simple: just +maintain a lookup table that maps a NumPy data type to the Zarr data type equivalent. But not all +data types are so simple. Consider this case: + +.. 
code-block:: python + + >>> from zarr import create_array + >>> import warnings + >>> import numpy as np + >>> warnings.simplefilter("ignore", category=FutureWarning) + >>> a = create_array({}, shape=(10,), dtype=[('a', 'f8'), ('b', 'i8')]) + >>> a.dtype # this is the NumPy data type + dtype([('a', '>> a.metadata.data_type # this is the Zarr data type + Structured(fields=(('a', Float64(endianness='little')), ('b', Int64(endianness='little')))) + +In this example, we created a +`NumPy structured data type `_. +This data type is a container that can hold any NumPy data type, which makes it recursive. It is +not possible to make a lookup table that relates all NumPy structured data types to their Zarr +equivalents, as there is a nearly unbounded number of different structured data types. So instead of +a static lookup table, Zarr Python relies on a dynamic approach to data type resolution. + +Zarr Python defines a collection of Zarr data types. This collection, called a "data type registry," +is essentially a dictionary where the keys are strings (a canonical name for each data type), and the +values are the data type classes themselves. Dynamic data type resolution entails iterating over +these data type classes, invoking that class' `from_native_dtype <#api/dtype/ZDType.from_native_dtype>`_ +method, and returning a concrete data type instance if and only if exactly one of those constructor +invocations is successful. + +In plain language, we take some user input, like a NumPy data type, offer it to all the +known data type classes, and return an instance of the one data type class that can accept that user input. + +We want to avoid a situation where the same native data type matches multiple Zarr data types; that is, +a NumPy data type should *uniquely* specify a single Zarr data type. But data type resolution is +dynamic, so it's not possible to statically guarantee this uniqueness constraint. 
Therefore, we +attempt data type resolution against *every* data type class, and if, for some reason, a native data +type matches multiple Zarr data types, we treat this as an error and raise an exception. \ No newline at end of file diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst index ea34ac2561..f92c576f32 100644 --- a/docs/user-guide/index.rst +++ b/docs/user-guide/index.rst @@ -8,7 +8,6 @@ User guide installation arrays - data_types groups attributes storage @@ -21,6 +20,7 @@ Advanced Topics .. toctree:: :maxdepth: 1 + data_types performance consolidated_metadata extending diff --git a/examples/custom_dtype.py b/examples/custom_dtype.py new file mode 100644 index 0000000000..a98f3414f6 --- /dev/null +++ b/examples/custom_dtype.py @@ -0,0 +1,245 @@ +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "zarr @ git+https://github.com/zarr-developers/zarr-python.git@main", +# "ml_dtypes==0.5.1", +# "pytest==8.4.1" +# ] +# /// +# + +""" +Demonstrate how to extend Zarr Python by defining a new data type +""" + +import json +import sys +from pathlib import Path +from typing import ClassVar, Literal, Self, TypeGuard, overload + +import ml_dtypes # necessary to add extra dtypes to NumPy +import numpy as np +import pytest + +import zarr +from zarr.core.common import JSON, ZarrFormat +from zarr.core.dtype import ZDType, data_type_registry +from zarr.core.dtype.common import ( + DataTypeValidationError, + DTypeConfig_V2, + DTypeJSON, + check_dtype_spec_v2, +) + +# This is the int2 array data type +int2_dtype_cls = type(np.dtype("int2")) + +# This is the int2 scalar type +int2_scalar_cls = ml_dtypes.int2 + + +class Int2(ZDType[int2_dtype_cls, int2_scalar_cls]): + """ + This class provides a Zarr compatibility layer around the int2 data type (the ``dtype`` of a + NumPy array of type int2) and the int2 scalar type (the ``dtype`` of the scalar value inside an int2 array). 
+ """ + + # This field is as the key for the data type in the internal data type registry, and also + # as the identifier for the data type when serializaing the data type to disk for zarr v3 + _zarr_v3_name: ClassVar[Literal["int2"]] = "int2" + # this field will be used internally + _zarr_v2_name: ClassVar[Literal["int2"]] = "int2" + + # we bind a class variable to the native data type class so we can create instances of it + dtype_cls = int2_dtype_cls + + @classmethod + def from_native_dtype(cls, dtype: np.dtype) -> Self: + """Create an instance of this ZDType from a native dtype.""" + if cls._check_native_dtype(dtype): + return cls() + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" + ) + + def to_native_dtype(self: Self) -> int2_dtype_cls: + """Create an int2 dtype instance from this ZDType""" + return self.dtype_cls() + + @classmethod + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[Literal["|b1"], None]]: + """ + Type check for Zarr v2-flavored JSON. + + This will check that the input is a dict like this: + .. code-block:: json + + { + "name": "int2", + "object_codec_id": None + } + + Note that this representation differs from the ``dtype`` field looks like in zarr v2 metadata. + Specifically, whatever goes into the ``dtype`` field in metadata is assigned to the ``name`` field here. + + See the Zarr docs for more information about the JSON encoding for data types. + """ + return ( + check_dtype_spec_v2(data) and data["name"] == "int2" and data["object_codec_id"] is None + ) + + @classmethod + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["int2"]]: + """ + Type check for Zarr V3-flavored JSON. + + Checks that the input is the string "int2". + """ + return data == cls._zarr_v3_name + + @classmethod + def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this ZDType from Zarr V3-flavored JSON. 
+ """ + if cls._check_json_v2(data): + return cls() + # This first does a type check on the input, and if that passes we create an instance of the ZDType. + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_name!r}" + raise DataTypeValidationError(msg) + + @classmethod + def _from_json_v3(cls: type[Self], data: DTypeJSON) -> Self: + """ + Create an instance of this ZDType from Zarr V3-flavored JSON. + + This first does a type check on the input, and if that passes we create an instance of the ZDType. + """ + if cls._check_json_v3(data): + return cls() + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) + + @overload # type: ignore[override] + def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["int2"], None]: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> Literal["int2"]: ... + + def to_json( + self, zarr_format: ZarrFormat + ) -> DTypeConfig_V2[Literal["int2"], None] | Literal["int2"]: + """ + Serialize this ZDType to v2- or v3-flavored JSON + + If the zarr_format is 2, then return a dict like this: + .. code-block:: json + + { + "name": "int2", + "object_codec_id": None + } + + If the zarr_format is 3, then return the string "int2" + + """ + if zarr_format == 2: + return {"name": "int2", "object_codec_id": None} + elif zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def _check_scalar(self, data: object) -> TypeGuard[int | ml_dtypes.int2]: + """ + Check if a python object is a valid int2-compatible scalar + + The strictness of this type check is an implementation degree of freedom. + You could be strict here, and only accept int2 values, or be open and accept any integer + or any object and rely on exceptions from the int2 constructor that will be called in + cast_scalar. 
+ """ + return isinstance(data, (int, int2_scalar_cls)) + + def cast_scalar(self, data: object) -> ml_dtypes.int2: + """ + Attempt to cast a python object to an int2. + + We first perform a type check to ensure that the input type is appropriate, and if that + passes we call the int2 scalar constructor. + """ + if self._check_scalar(data): + return ml_dtypes.int2(data) + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) + raise TypeError(msg) + + def default_scalar(self) -> ml_dtypes.int2: + """ + Get the default scalar value. This will be used when automatically selecting a fill value. + """ + return ml_dtypes.int2(0) + + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: + """ + Convert a python object to a JSON representation of an int2 scalar. + This is necessary for taking user input for the ``fill_value`` attribute in array metadata. + + In this implementation, we optimistically convert the input to an int, + and then check that it lies in the acceptable range for this data type. + """ + # We could add a type check here, but we don't need to for this example + val: int = int(data) # type: ignore[call-overload] + if val not in (-2, -1, 0, 1): + raise ValueError("Invalid value. Expected -2, -1, 0, or 1.") + return val + + def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> ml_dtypes.int2: + """ + Read a JSON-serializable value as an int2 scalar. + + We first perform a type check to ensure that the JSON value is well-formed, then call the + int2 scalar constructor. + + The base definition of this method requires that it take a zarr_format parameter because + other data types serialize scalars differently in zarr v2 and v3, but we don't use this here. + + """ + if self._check_scalar(data): + return ml_dtypes.int2(data) + raise TypeError(f"Invalid type: {data}. 
Expected an int.") + + +# after defining dtype class, it must be registered with the data type registry so zarr can use it +data_type_registry.register(Int2._zarr_v3_name, Int2) + + +# this parametrized function will create arrays in zarr v2 and v3 using our new data type +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_custom_dtype(tmp_path: Path, zarr_format: Literal[2, 3]) -> None: + # create array and write values + z_w = zarr.create_array( + store=tmp_path, shape=(4,), dtype="int2", zarr_format=zarr_format, compressors=None + ) + z_w[:] = [-1, -2, 0, 1] + + # open the array + z_r = zarr.open_array(tmp_path, mode="r") + + print(z_r.info_complete()) + + # look at the array metadata + if zarr_format == 2: + meta_file = tmp_path / ".zarray" + else: + meta_file = tmp_path / "zarr.json" + print(json.dumps(json.loads(meta_file.read_text()), indent=2)) + + +if __name__ == "__main__": + # Run the example with printed output, and a dummy pytest configuration file specified. + # Without the dummy configuration file, at test time pytest will attempt to use the + # configuration file in the project root, which will error because Zarr is using some + # plugins that are not installed in this example. 
+ sys.exit(pytest.main(["-s", __file__, f"-c {__file__}"])) diff --git a/pyproject.toml b/pyproject.toml index 6c18563a1f..53b37d6c6d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,6 +80,9 @@ test = [ "mypy", "hypothesis", "pytest-xdist", + "packaging", + "tomlkit", + "uv" ] remote_tests = [ 'zarr[remote]', @@ -106,7 +109,8 @@ docs = [ 'numcodecs[msgpack]', 'rich', 's3fs>=2023.10.0', - 'astroid<4' + 'astroid<4', + 'pytest' ] diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 13ee8bc5a0..a5ef7aeb7a 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -19,6 +19,8 @@ overload, ) +from typing_extensions import ReadOnly + from zarr.core.config import config as zarr_config if TYPE_CHECKING: @@ -48,8 +50,19 @@ class NamedConfig(TypedDict, Generic[TName, TConfig]): - name: TName - configuration: TConfig + """ + A typed dictionary representing an object with a name and configuration, where the configuration + is a mapping of string keys to values, e.g. another typed dictionary or a JSON object. + + This class is generic with two type parameters: the type of the name (``TName``) and the type of + the configuration (``TConfig``). 
+ """ + + name: ReadOnly[TName] + """The name of the object.""" + + configuration: ReadOnly[TConfig] + """The configuration of the object.""" def product(tup: ChunkCoords) -> int: diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 2f875ec491..1d36689ec8 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -7,14 +7,28 @@ DTypeJSON, ) from zarr.core.dtype.npy.bool import Bool -from zarr.core.dtype.npy.bytes import NullTerminatedBytes, RawBytes, VariableLengthBytes +from zarr.core.dtype.npy.bytes import ( + NullTerminatedBytes, + NullterminatedBytesJSON_V2, + NullTerminatedBytesJSON_V3, + RawBytes, + RawBytesJSON_V2, + RawBytesJSON_V3, + VariableLengthBytes, + VariableLengthBytesJSON_V2, +) from zarr.core.dtype.npy.complex import Complex64, Complex128 from zarr.core.dtype.npy.float import Float16, Float32, Float64 from zarr.core.dtype.npy.int import Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 -from zarr.core.dtype.npy.structured import ( - Structured, +from zarr.core.dtype.npy.structured import Structured, StructuredJSON_V2, StructuredJSON_V3 +from zarr.core.dtype.npy.time import ( + DateTime64, + DateTime64JSON_V2, + DateTime64JSON_V3, + TimeDelta64, + TimeDelta64JSON_V2, + TimeDelta64JSON_V3, ) -from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 if TYPE_CHECKING: from zarr.core.common import ZarrFormat @@ -27,7 +41,10 @@ from zarr.core.common import JSON from zarr.core.dtype.npy.string import ( FixedLengthUTF32, + FixedLengthUTF32JSON_V2, + FixedLengthUTF32JSON_V3, VariableLengthUTF8, + VariableLengthUTF8JSON_V2, ) from zarr.core.dtype.registry import DataTypeRegistry from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType @@ -39,7 +56,11 @@ "DataTypeRegistry", "DataTypeValidationError", "DateTime64", + "DateTime64JSON_V2", + "DateTime64JSON_V3", "FixedLengthUTF32", + "FixedLengthUTF32JSON_V2", + "FixedLengthUTF32JSON_V3", "Float16", "Float32", "Float64", @@ -48,17 
+69,28 @@ "Int32", "Int64", "NullTerminatedBytes", + "NullTerminatedBytesJSON_V3", + "NullterminatedBytesJSON_V2", "RawBytes", + "RawBytesJSON_V2", + "RawBytesJSON_V3", "Structured", + "StructuredJSON_V2", + "StructuredJSON_V3", "TBaseDType", "TBaseScalar", "TimeDelta64", "TimeDelta64", + "TimeDelta64JSON_V2", + "TimeDelta64JSON_V3", "UInt8", "UInt16", "UInt32", "UInt64", + "VariableLengthBytes", + "VariableLengthBytesJSON_V2", "VariableLengthUTF8", + "VariableLengthUTF8JSON_V2", "ZDType", "data_type_registry", "parse_data_type", diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 6f61b6775e..156928b6eb 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -13,6 +13,8 @@ TypeVar, ) +from typing_extensions import ReadOnly + from zarr.core.common import NamedConfig EndiannessStr = Literal["little", "big"] @@ -55,8 +57,8 @@ class DTypeConfig_V2(TypedDict, Generic[TDTypeNameV2_co, TObjectCodecID_co]): - name: TDTypeNameV2_co - object_codec_id: TObjectCodecID_co + name: ReadOnly[TDTypeNameV2_co] + object_codec_id: ReadOnly[TObjectCodecID_co] DTypeSpec_V2 = DTypeConfig_V2[DTypeName_V2, None | str] @@ -87,6 +89,9 @@ def check_structured_dtype_v2_inner(data: object) -> TypeGuard[StructuredName_V2 def check_structured_dtype_name_v2(data: Sequence[object]) -> TypeGuard[StructuredName_V2]: + """ + Check that all the elements of a sequence are valid zarr v2 structured dtype identifiers + """ return all(check_structured_dtype_v2_inner(d) for d in data) @@ -151,17 +156,23 @@ class DataTypeValidationError(ValueError): ... class ScalarTypeValidationError(ValueError): ... -@dataclass(frozen=True) +@dataclass(frozen=True, kw_only=True) class HasLength: """ A mix-in class for data types with a length attribute, such as fixed-size collections of unicode strings, or bytes. + + Attributes + ---------- + length : int + The length of the scalars belonging to this data type. 
Note that this class does not assign + a unit to the length. Child classes may assign units. """ length: int -@dataclass(frozen=True) +@dataclass(frozen=True, kw_only=True) class HasEndianness: """ A mix-in class for data types with an endianness attribute @@ -170,7 +181,7 @@ class HasEndianness: endianness: EndiannessStr = "little" -@dataclass(frozen=True) +@dataclass(frozen=True, kw_only=True) class HasItemSize: """ A mix-in class for data types with an item size attribute. @@ -183,7 +194,7 @@ def item_size(self) -> int: raise NotImplementedError -@dataclass(frozen=True) +@dataclass(frozen=True, kw_only=True) class HasObjectCodec: """ A mix-in class for data types that require an object codec id. diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index d8d52468bf..37371cd0cd 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -21,14 +21,27 @@ @dataclass(frozen=True, kw_only=True, slots=True) class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): """ - Wrapper for numpy boolean dtype. + A Zarr data type for arrays containing booleans. + + Wraps the ``np.dtypes.BoolDType`` data type. Scalars for this data type are instances of + ``np.bool_``. Attributes ---------- - name : str - The name of the dtype. - dtype_cls : ClassVar[type[np.dtypes.BoolDType]] - The numpy dtype class. + + _zarr_v3_name : Literal["bool"] = "bool" + The Zarr v3 name of the dtype. + _zarr_v2_name : ``Literal["|b1"]`` = ``"|b1"`` + The Zarr v2 name of the dtype, which is also a string representation + of the boolean dtype used by NumPy. + dtype_cls : ClassVar[type[np.dtypes.BoolDType]] = np.dtypes.BoolDType + The NumPy dtype class. + + References + ---------- + This class implements the boolean data type defined in Zarr V2 and V3. + + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. 
""" _zarr_v3_name: ClassVar[Literal["bool"]] = "bool" @@ -38,7 +51,22 @@ class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: """ - Create a Bool from a np.dtype('bool') instance. + Create an instance of Bool from an instance of np.dtypes.BoolDType. + + Parameters + ---------- + dtype : TBaseDType + The NumPy boolean dtype instance to convert. + + Returns + ------- + Bool + An instance of Bool. + + Raises + ------ + DataTypeValidationError + If the provided dtype is not compatible with this ZDType. """ if cls._check_native_dtype(dtype): return cls() @@ -48,7 +76,12 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: def to_native_dtype(self: Self) -> np.dtypes.BoolDType: """ - Create a NumPy boolean dtype instance from this ZDType + Create a NumPy boolean dtype instance from this ZDType. + + Returns + ------- + np.dtypes.BoolDType + The NumPy boolean dtype. """ return self.dtype_cls() @@ -59,6 +92,16 @@ def _check_json_v2( ) -> TypeGuard[DTypeConfig_V2[Literal["|b1"], None]]: """ Check that the input is a valid JSON representation of a Bool. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + ``TypeGuard[DTypeConfig_V2[Literal["|b1"], None]]`` + True if the input is a valid JSON representation, False otherwise. """ return ( check_dtype_spec_v2(data) @@ -68,10 +111,41 @@ def _check_json_v2( @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["bool"]]: + """ + Check that the input is a valid JSON representation of this class in Zarr V3. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + bool + True if the input is a valid JSON representation, False otherwise. + """ return data == cls._zarr_v3_name @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of Bool from Zarr V2-flavored JSON. 
+ + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Bool + An instance of Bool. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. + """ if cls._check_json_v2(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_name!r}" @@ -79,12 +153,30 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls: type[Self], data: DTypeJSON) -> Self: + """ + Create an instance of Bool from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Bool + An instance of Bool. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. + """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["|b1"], None]: ... @overload @@ -93,6 +185,24 @@ def to_json(self, zarr_format: Literal[3]) -> Literal["bool"]: ... def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[Literal["|b1"], None] | Literal["bool"]: + """ + Serialize this Bool instance to JSON. + + Parameters + ---------- + zarr_format : ZarrFormat + The Zarr format version (2 or 3). + + Returns + ------- + ``DTypeConfig_V2[Literal["|b1"], None] | Literal["bool"]`` + The JSON representation of the Bool instance. + + Raises + ------ + ValueError + If the zarr_format is not 2 or 3. 
+ """ if zarr_format == 2: return {"name": self._zarr_v2_name, "object_codec_id": None} elif zarr_format == 3: @@ -100,14 +210,47 @@ def to_json( raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def _check_scalar(self, data: object) -> bool: - # Anything can become a bool + """ + Check if the input can be cast to a boolean scalar. + + Parameters + ---------- + data : object + The data to check. + + Returns + ------- + bool + True if the input can be cast to a boolean scalar, False otherwise. + """ return True def cast_scalar(self, data: object) -> np.bool_: + """ + Cast the input to a numpy boolean scalar. + + Parameters + ---------- + data : object + The data to cast. + + Returns + ------- + ``np.bool_`` + The numpy boolean scalar. + + Raises + ------ + TypeError + If the input cannot be converted to a numpy boolean. + """ if self._check_scalar(data): return np.bool_(data) - msg = f"Cannot convert object with type {type(data)} to a numpy boolean." - raise TypeError(msg) + msg = ( # pragma: no cover + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) + raise TypeError(msg) # pragma: no cover def default_scalar(self) -> np.bool_: """ @@ -115,7 +258,7 @@ def default_scalar(self) -> np.bool_: Returns ------- - np.bool_ + ``np.bool_`` The default value. """ return np.False_ @@ -151,8 +294,13 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: Returns ------- - np.bool_ + ``np.bool_`` The numpy boolean scalar. + + Raises + ------ + TypeError + If the input is not a valid boolean type. """ if self._check_scalar(data): return np.bool_(data) @@ -160,4 +308,12 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bool_: @property def item_size(self) -> int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. 
+ """ return 1 diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index b2f184b2fa..b7c764dcd9 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -3,7 +3,7 @@ import base64 import re from dataclasses import dataclass -from typing import Any, ClassVar, Literal, Self, TypedDict, TypeGuard, cast, overload +from typing import ClassVar, Literal, Self, TypedDict, TypeGuard, cast, overload import numpy as np @@ -25,15 +25,171 @@ class FixedLengthBytesConfig(TypedDict): + """ + A configuration for a data type that takes a ``length_bytes`` parameter. + + Attributes + ---------- + + length_bytes : int + The length in bytes of the data associated with this configuration. + + Examples + -------- + .. code-block:: python + + { + "length_bytes": 12 + } + """ + length_bytes: int -NullTerminatedBytesJSONV3 = NamedConfig[Literal["null_terminated_bytes"], FixedLengthBytesConfig] -RawBytesJSONV3 = NamedConfig[Literal["raw_bytes"], FixedLengthBytesConfig] +class NullterminatedBytesJSON_V2(DTypeConfig_V2[str, None]): + """ + A wrapper around the JSON representation of the ``NullTerminatedBytes`` data type in Zarr V2. + + The ``name`` field of this class contains the value that would appear under the + ``dtype`` field in Zarr V2 array metadata. + + References + ---------- + The structure of the ``name`` field is defined in the Zarr V2 + `specification document `__. + + + Examples + -------- + .. code-block:: python + + { + "name": "|S10", + "object_codec_id": None + } + """ + + +class NullTerminatedBytesJSON_V3( + NamedConfig[Literal["null_terminated_bytes"], FixedLengthBytesConfig] +): + """ + The JSON representation of the ``NullTerminatedBytes`` data type in Zarr V3. + + References + ---------- + This representation is not currently defined in an external specification. + + + Examples + -------- + .. 
code-block:: python + + { + "name": "null_terminated_bytes", + "configuration": { + "length_bytes": 12 + } + } + + """ + + +class RawBytesJSON_V2(DTypeConfig_V2[str, None]): + """ + A wrapper around the JSON representation of the ``RawBytes`` data type in Zarr V2. + + The ``name`` field of this class contains the value that would appear under the + ``dtype`` field in Zarr V2 array metadata. + + References + ---------- + The structure of the ``name`` field is defined in the Zarr V2 + `specification document `__. + + + Examples + -------- + .. code-block:: python + + { + "name": "|V10", + "object_codec_id": None + } + """ + + +class RawBytesJSON_V3(NamedConfig[Literal["raw_bytes"], FixedLengthBytesConfig]): + """ + The JSON representation of the ``RawBytes`` data type in Zarr V3. + + References + ---------- + This representation is not currently defined in an external specification. + + + Examples + -------- + .. code-block:: python + + { + "name": "raw_bytes", + "configuration": { + "length_bytes": 12 + """ + + +class VariableLengthBytesJSON_V2(DTypeConfig_V2[Literal["|O"], Literal["vlen-bytes"]]): + """ + A wrapper around the JSON representation of the ``VariableLengthBytes`` data type in Zarr V2. + + The ``name`` field of this class contains the value that would appear under the + ``dtype`` field in Zarr V2 array metadata. The ``object_codec_id`` field is always ``"vlen-bytes"`` + + References + ---------- + The structure of the ``name`` field is defined in the Zarr V2 + `specification document `__. + + Examples + -------- + .. code-block:: python + + { + "name": "|O", + "object_codec_id": "vlen-bytes" + } + """ @dataclass(frozen=True, kw_only=True) class NullTerminatedBytes(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize): + """ + A Zarr data type for arrays containing fixed-length null-terminated byte sequences. + + Wraps the ``np.dtypes.BytesDType`` data type. Scalars for this data type are instances of + ``np.bytes_``. 
+ + This data type is parametrized by an integral length which specifies size in bytes of each + scalar. Because this data type uses null-terminated semantics, indexing into + NumPy arrays with this data type may return fewer than ``length`` bytes. + + Attributes + ---------- + dtype_cls: ClassVar[type[np.dtypes.BytesDType[int]]] = np.dtypes.BytesDType + The NumPy data type wrapped by this ZDType. + _zarr_v3_name : ClassVar[Literal["null_terminated_bytes"]] + length : int + The length of the bytes. + + Notes + ----- + This data type is designed for compatibility with NumPy arrays that use the NumPy ``bytes`` data type. + It may not be desirable for usage outside of that context. If compatibility + with the NumPy ``bytes`` data type is not essential, consider using the ``RawBytes`` + or ``VariableLengthBytes`` data types instead. + """ + dtype_cls = np.dtypes.BytesDType _zarr_v3_name: ClassVar[Literal["null_terminated_bytes"]] = "null_terminated_bytes" @@ -47,6 +203,29 @@ def __post_init__(self) -> None: @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create an instance of NullTerminatedBytes from an instance of np.dtypes.BytesDType. + + This method checks if the provided data type is an instance of np.dtypes.BytesDType. + If so, it returns a new instance of NullTerminatedBytes with a length equal to the + length of input data type. + + Parameters + ---------- + dtype : TBaseDType + The native dtype to convert. + + Returns + ------- + NullTerminatedBytes + An instance of NullTerminatedBytes with the specified length. + + Raises + ------ + DataTypeValidationError + If the dtype is not compatible with NullTerminatedBytes. + """ + if cls._check_native_dtype(dtype): return cls(length=dtype.itemsize) raise DataTypeValidationError( @@ -54,14 +233,36 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self) -> np.dtypes.BytesDType[int]: + """ + Create a NumPy bytes dtype from this NullTerminatedBytes ZDType. 
+ + Returns + ------- + np.dtypes.BytesDType[int] + A NumPy data type object representing null-terminated bytes with a specified length. + """ + return self.dtype_cls(self.length) @classmethod - def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[NullterminatedBytesJSON_V2]: """ - Check that the input is a valid representation of a numpy S dtype. We expect - something like ``{"name": "|S10", "object_codec_id": None}`` + Check that the input is a valid JSON representation of NullTerminatedBytes in Zarr V2. + + The input data must be a mapping that contains a "name" key that matches the pattern + "|S" and an "object_codec_id" key that is None. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + bool + True if the input data is a valid representation, False otherwise. """ + return ( check_dtype_spec_v2(data) and isinstance(data["name"], str) @@ -70,17 +271,55 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] ) @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[NullTerminatedBytesJSONV3]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[NullTerminatedBytesJSON_V3]: + """ + Check that the input is a valid JSON representation of this class in Zarr V3. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + TypeGuard[NullTerminatedBytesJSON_V3] + True if the input is a valid representation of this class in Zarr V3, False + otherwise. + """ return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} and data["name"] == cls._zarr_v3_name and isinstance(data["configuration"], dict) and "length_bytes" in data["configuration"] + and isinstance(data["configuration"]["length_bytes"], int) ) @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this class from Zarr V2-flavored JSON. 
+ + This method checks if the input data is a valid representation of + this class in Zarr V2. If so, it returns a new instance of + this class with a ``length`` as specified in the input data. + + Parameters + ---------- + data : DTypeJSON + The JSON data to parse. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input data is not a valid representation of this class. + """ + if cls._check_json_v2(data): name = data["name"] return cls(length=int(name[2:])) @@ -89,20 +328,55 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this class from Zarr V3-flavored JSON. + + This method checks if the input data is a valid representation of + this class in Zarr V3. If so, it returns a new instance of + this class with a ``length`` as specified in the input data. + + Parameters + ---------- + data : DTypeJSON + The JSON data to parse. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input data is not a valid representation of this class. + """ if cls._check_json_v3(data): return cls(length=data["configuration"]["length_bytes"]) msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... + @overload + def to_json(self, zarr_format: Literal[2]) -> NullterminatedBytesJSON_V2: ... @overload - def to_json(self, zarr_format: Literal[3]) -> NullTerminatedBytesJSONV3: ... + def to_json(self, zarr_format: Literal[3]) -> NullTerminatedBytesJSON_V3: ... 
def to_json( self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[str, None] | NullTerminatedBytesJSONV3: + ) -> DTypeConfig_V2[str, None] | NullTerminatedBytesJSON_V3: + """ + Generate a JSON representation of this data type. + + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + NullterminatedBytesJSON_V2 | NullTerminatedBytesJSON_V3 + The JSON-serializable representation of the data type + """ if zarr_format == 2: return {"name": self.to_native_dtype().str, "object_codec_id": None} elif zarr_format == 3: @@ -114,36 +388,137 @@ def to_json( raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def _check_scalar(self, data: object) -> TypeGuard[BytesLike]: - # this is generous for backwards compatibility + """ + Check if the provided data is of type BytesLike. + + This method is used to verify if the input data can be considered as a + scalar of bytes-like type, which includes NumPy bytes, strings, bytes, + and integers. + + Parameters + ---------- + data : object + The data to check. + + Returns + ------- + TypeGuard[BytesLike] + True if the data is bytes-like, False otherwise. + """ + return isinstance(data, BytesLike) def _cast_scalar_unchecked(self, data: BytesLike) -> np.bytes_: - # We explicitly truncate the result because of the following numpy behavior: - # >>> x = np.dtype('S3').type('hello world') - # >>> x - # np.bytes_(b'hello world') - # >>> x.dtype - # dtype('S11') + """ + Cast the provided scalar data to ``np.bytes_``, truncating if necessary. + + Parameters + ---------- + data : BytesLike + The data to cast. + + Returns + ------- + np.bytes_ + The casted data as a NumPy bytes scalar. + Notes + ----- + This method does not perform any type checking. + The input data must be bytes-like. 
+ """ if isinstance(data, int): return self.to_native_dtype().type(str(data)[: self.length]) else: return self.to_native_dtype().type(data[: self.length]) def cast_scalar(self, data: object) -> np.bytes_: + """ + Attempt to cast a given object to a NumPy bytes scalar. + + This method first checks if the provided data is a valid scalar that can be + converted to a NumPy bytes scalar. If the check succeeds, the unchecked casting + operation is performed. If the data is not valid, a TypeError is raised. + + Parameters + ---------- + data : object + The data to be cast to a NumPy bytes scalar. + + Returns + ------- + ``np.bytes_`` + The data cast as a NumPy bytes scalar. + + Raises + ------ + TypeError + If the data cannot be converted to a NumPy bytes scalar. + """ + if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a numpy bytes scalar." + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) raise TypeError(msg) def default_scalar(self) -> np.bytes_: + """ + Return a default scalar value, which for this data type is an empty byte string. + + Returns + ------- + ``np.bytes_`` + The default scalar value. + """ return np.bytes_(b"") def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + """ + Convert a scalar to a JSON-serializable string representation. + + This method encodes the given scalar as a NumPy bytes scalar and then + encodes the bytes as a base64-encoded string. + + Parameters + ---------- + data : object + The scalar to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + str + A string representation of the scalar. + """ as_bytes = self.cast_scalar(data) return base64.standard_b64encode(as_bytes).decode("ascii") def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: + """ + Read a JSON-serializable value as ``np.bytes_``. 
+ + Parameters + ---------- + data : JSON + The JSON-serializable base64-encoded string. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + ``np.bytes_`` + The NumPy bytes scalar obtained from decoding the base64 string. + + Raises + ------ + TypeError + If the input data is not a base64-encoded string. + """ + if check_json_str(data): return self.to_native_dtype().type(base64.standard_b64decode(data.encode("ascii"))) raise TypeError( @@ -152,12 +527,45 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_: @property def item_size(self) -> int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return self.length @dataclass(frozen=True, kw_only=True) class RawBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize): - # np.dtypes.VoidDType is specified in an odd way in numpy + """ + A Zarr data type for arrays containing fixed-length sequences of raw bytes. + + Wraps the NumPy ``void`` data type. Scalars for this data type are instances of ``np.void``. + + This data type is parametrized by an integral length which specifies size in bytes of each + scalar belonging to this data type. + + Attributes + ---------- + dtype_cls: ClassVar[type[np.dtypes.VoidDType[int]]] = np.dtypes.VoidDtype + The NumPy data type wrapped by this ZDType. + _zarr_v3_name : ClassVar[Literal["raw_bytes"]] + length : int + The length of the bytes. + + Notes + ----- + Although the NumPy "Void" data type is used to create "structured" data types in NumPy, this + class does not support structured data types. + + See the ``Structured`` data type for this functionality. 
+ + """ + + # np.dtypes.VoidDType is specified in an odd way in NumPy # it cannot be used to create instances of the dtype # so we have to tell mypy to ignore this here dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] @@ -174,8 +582,11 @@ def __post_init__(self) -> None: @classmethod def _check_native_dtype( cls: type[Self], dtype: TBaseDType - ) -> TypeGuard[np.dtypes.VoidDType[Any]]: + ) -> TypeGuard[np.dtypes.VoidDType[int]]: """ + Check that the input is a NumPy void dtype with no fields. + + Numpy void dtype comes in two forms: * If the ``fields`` attribute is ``None``, then the dtype represents N raw bytes. * If the ``fields`` attribute is not ``None``, then the dtype represents a structured dtype, @@ -184,18 +595,42 @@ def _check_native_dtype( Parameters ---------- - dtype : TDType + dtype : TDBaseDType The dtype to check. Returns ------- Bool - True if the dtype matches, False otherwise. + True if the dtype is an instance of np.dtypes.VoidDType with no fields, False otherwise. """ return cls.dtype_cls is type(dtype) and dtype.fields is None @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create an instance of RawBytes from an instance of np.dtypes.VoidDType. + + This method checks if the provided data type is compatible with RawBytes. The input + must be an instance of np.dtypes.VoidDType, and have no fields. If the input is compatible, + this method returns an instance of RawBytes with the specified length. + + + Parameters + ---------- + dtype : TBaseDType + The native dtype to convert. + + Returns + ------- + RawBytes + An instance of RawBytes with the specified length. + + Raises + ------ + DataTypeValidationError + If the dtype is not compatible with RawBytes. 
+ """ + if cls._check_native_dtype(dtype): return cls(length=dtype.itemsize) raise DataTypeValidationError( @@ -203,15 +638,32 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self) -> np.dtypes.VoidDType[int]: + """ + Create a NumPy void dtype from this RawBytes ZDType. + + Returns + ------- + np.dtypes.VoidDType[int] + A NumPy data type object representing raw bytes with a specified length. + """ # Numpy does not allow creating a void type # by invoking np.dtypes.VoidDType directly return cast("np.dtypes.VoidDType[int]", np.dtype(f"V{self.length}")) @classmethod - def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[RawBytesJSON_V2]: """ - Check that the input is a valid representation of a numpy S dtype. We expect - something like ``{"name": "|V10", "object_codec_id": None}`` + Check that the input is a valid representation of this class in Zarr V2. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + True if the input is a valid representation of this class in Zarr V3, False otherwise. + """ return ( check_dtype_spec_v2(data) @@ -221,17 +673,54 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] ) @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[RawBytesJSONV3]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[RawBytesJSON_V3]: + """ + Check that the input is a valid JSON representation of this class in Zarr V3. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + TypeGuard[RawBytesJSON_V3] + True if the input is a valid representation of this class in Zarr V3, False + otherwise. 
+ """ return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} and data["name"] == cls._zarr_v3_name and isinstance(data["configuration"], dict) and set(data["configuration"].keys()) == {"length_bytes"} + and isinstance(data["configuration"]["length_bytes"], int) ) @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of RawBytes from Zarr V2-flavored JSON. + + This method checks if the input data is a valid representation of + RawBytes in Zarr V2. If so, it returns a new instance of + RawBytes with a ``length`` as specified in the input data. + + Parameters + ---------- + data : DTypeJSON + The JSON data to parse. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input data is not a valid representation of this class. + """ if cls._check_json_v2(data): name = data["name"] return cls(length=int(name[2:])) @@ -240,18 +729,53 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of RawBytes from Zarr V3-flavored JSON. + + This method checks if the input data is a valid representation of + RawBytes in Zarr V3. If so, it returns a new instance of + RawBytes with a ``length`` as specified in the input data. + + Parameters + ---------- + data : DTypeJSON + The JSON data to parse. + + Returns + ------- + RawBytes + An instance of RawBytes. + + Raises + ------ + DataTypeValidationError + If the input data is not a valid representation of this class. + """ if cls._check_json_v3(data): return cls(length=data["configuration"]["length_bytes"]) msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... 
+ @overload + def to_json(self, zarr_format: Literal[2]) -> RawBytesJSON_V2: ... @overload - def to_json(self, zarr_format: Literal[3]) -> RawBytesJSONV3: ... + def to_json(self, zarr_format: Literal[3]) -> RawBytesJSON_V3: ... - def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | RawBytesJSONV3: + def to_json(self, zarr_format: ZarrFormat) -> RawBytesJSON_V2 | RawBytesJSON_V3: + """ + Generate a JSON representation of this data type. + + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + RawBytesJSON_V2 | RawBytesJSON_V3 + The JSON-serializable representation of the data type. + """ if zarr_format == 2: return {"name": self.to_native_dtype().str, "object_codec_id": None} elif zarr_format == 3: @@ -259,46 +783,205 @@ def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | RawByt return {"name": self._zarr_v3_name, "configuration": {"length_bytes": self.length}} raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def _check_scalar(self, data: object) -> bool: + def _check_scalar(self, data: object) -> TypeGuard[np.bytes_ | str | bytes | np.void]: + """ + Check if the provided data can be cast to np.void. + + This method is used to verify if the input data can be considered as a + scalar of bytes-like type, which includes np.bytes_, np.void, strings, and bytes objects. + + Parameters + ---------- + data : object + The data to check. + + Returns + ------- + TypeGuard[np.bytes_ | str | bytes | np.void] + True if the data is void-scalar-like, False otherwise. + """ return isinstance(data, np.bytes_ | str | bytes | np.void) def _cast_scalar_unchecked(self, data: object) -> np.void: + """ + Cast the provided scalar data to np.void. + + Parameters + ---------- + data : BytesLike + The data to cast. + + Returns + ------- + np.void + The casted data as a NumPy void scalar. + + Notes + ----- + This method does not perform any type checking. 
+ The input data must be castable to np.void. + """ native_dtype = self.to_native_dtype() - # Without the second argument, numpy will return a void scalar for dtype V1. + # Without the second argument, NumPy will return a void scalar for dtype V1. # The second argument ensures that, if native_dtype is something like V10, # the result will actually be a V10 scalar. return native_dtype.type(data, native_dtype) def cast_scalar(self, data: object) -> np.void: + """ + Attempt to cast a given object to a NumPy void scalar. + + This method first checks if the provided data is a valid scalar that can be + converted to a NumPy void scalar. If the check succeeds, the unchecked casting + operation is performed. If the data is not valid, a TypeError is raised. + + Parameters + ---------- + data : object + The data to be cast to a NumPy void scalar. + + Returns + ------- + np.void + The data cast as a NumPy void scalar. + + Raises + ------ + TypeError + If the data cannot be converted to a NumPy void scalar. + """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a numpy void scalar." + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) raise TypeError(msg) def default_scalar(self) -> np.void: + """ + Return the default scalar value for this data type. + + The default scalar is a NumPy void scalar of the same length as the data type, + filled with zero bytes. + + Returns + ------- + np.void + The default scalar value. + """ return self.to_native_dtype().type(("\x00" * self.length).encode("ascii")) def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: - return base64.standard_b64encode(self.cast_scalar(data).tobytes()).decode("ascii") + """ + Convert a scalar to a JSON-serializable string representation. 
+ + This method converts the given scalar to bytes and then + encodes the bytes as a base64-encoded string. + + Parameters + ---------- + data : object + The scalar to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + str + A string representation of the scalar. + """ + as_bytes = self.cast_scalar(data) + return base64.standard_b64encode(as_bytes.tobytes()).decode("ascii") def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: + """ + Read a JSON-serializable value as a np.void. + + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + np.void + The NumPy void scalar. + + Raises + ------ + TypeError + If the data is not a string, or if the string is not a valid base64 encoding. + """ if check_json_str(data): return self.to_native_dtype().type(base64.standard_b64decode(data)) raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover @property def item_size(self) -> int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return self.length @dataclass(frozen=True, kw_only=True) class VariableLengthBytes(ZDType[np.dtypes.ObjectDType, bytes], HasObjectCodec): + """ + A Zarr data type for arrays containing variable-length sequences of bytes. + + Wraps the NumPy "object" data type. Scalars for this data type are instances of ``bytes``. + + Attributes + ---------- + dtype_cls: ClassVar[type[np.dtypes.ObjectDType]] = np.dtypes.ObjectDType + The NumPy data type wrapped by this ZDType. + _zarr_v3_name: ClassVar[Literal["variable_length_bytes"]] = "variable_length_bytes" + The name of this data type in Zarr V3. + object_codec_id: ClassVar[Literal["vlen-bytes"]] = "vlen-bytes" + The object codec ID for this data type. 
+ + Notes + ----- + Because this data type uses the NumPy "object" data type, it does not guarantee a compact memory + representation of array data. Therefore a "vlen-bytes" codec is needed to ensure that the array + data can be persisted to storage. + """ + dtype_cls = np.dtypes.ObjectDType _zarr_v3_name: ClassVar[Literal["variable_length_bytes"]] = "variable_length_bytes" object_codec_id: ClassVar[Literal["vlen-bytes"]] = "vlen-bytes" @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create an instance of VariableLengthBytes from an instance of np.dtypes.ObjectDType. + + This method checks if the provided data type is an instance of np.dtypes.ObjectDType. + If so, it returns an instance of VariableLengthBytes. + + Parameters + ---------- + dtype : TBaseDType + The native dtype to convert. + + Returns + ------- + VariableLengthBytes + An instance of VariableLengthBytes. + + Raises + ------ + DataTypeValidationError + If the dtype is not compatible with VariableLengthBytes. + """ if cls._check_native_dtype(dtype): return cls() raise DataTypeValidationError( @@ -306,29 +989,86 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self) -> np.dtypes.ObjectDType: + """ + Create a NumPy object dtype from this VariableLengthBytes ZDType. + + Returns + ------- + np.dtypes.ObjectDType + A NumPy data type object representing variable-length bytes. + """ return self.dtype_cls() @classmethod def _check_json_v2( cls, data: DTypeJSON, - ) -> TypeGuard[DTypeConfig_V2[Literal["|O"], Literal["vlen-bytes"]]]: + ) -> TypeGuard[VariableLengthBytesJSON_V2]: """ - Check that the input is a valid JSON representation of a numpy O dtype, and that the - object codec id is appropriate for variable-length UTF-8 strings. + Check that the input is a valid JSON representation of a NumPy O dtype, and that the + object codec id is appropriate for variable-length bytes strings. 
+ + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + True if the input is a valid representation of this class in Zarr V2, False + otherwise. """ - return ( - check_dtype_spec_v2(data) - and data["name"] == "|O" - and data["object_codec_id"] == cls.object_codec_id - ) + # Check that the input is a valid JSON representation of a Zarr v2 data type spec. + if not check_dtype_spec_v2(data): + return False + + # Check that the object codec id is appropriate for variable-length bytes strings. + if data["name"] != "|O": + return False + return data["object_codec_id"] == cls.object_codec_id @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["variable_length_bytes"]]: + """ + Check that the input is a valid JSON representation of this class in Zarr V3. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + TypeGuard[Literal["variable_length_bytes"]] + True if the input is a valid representation of this class in Zarr V3, False otherwise. + """ + return data == cls._zarr_v3_name @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this VariableLengthBytes from Zarr V2-flavored JSON. + + This method checks if the input data is a valid representation of this class + in Zarr V2. If so, it returns a new instance this class. + + Parameters + ---------- + data : DTypeJSON + The JSON data to parse. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input data is not a valid representation of this class class. + """ + if cls._check_json_v2(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected the string '|O' and an object_codec_id of {cls.object_codec_id}" @@ -336,22 +1076,64 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of VariableLengthBytes from Zarr V3-flavored JSON. + + This method checks if the input data is a valid representation of + VariableLengthBytes in Zarr V3. If so, it returns a new instance of + VariableLengthBytes. + + Parameters + ---------- + data : DTypeJSON + The JSON data to parse. + + Returns + ------- + VariableLengthBytes + An instance of VariableLengthBytes. + + Raises + ------ + DataTypeValidationError + If the input data is not a valid representation of this class. + """ + if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] - def to_json( - self, zarr_format: Literal[2] - ) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-bytes"]]: ... + @overload + def to_json(self, zarr_format: Literal[2]) -> VariableLengthBytesJSON_V2: ... @overload def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_bytes"]: ... def to_json( self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-bytes"]] | Literal["variable_length_bytes"]: + ) -> VariableLengthBytesJSON_V2 | Literal["variable_length_bytes"]: + """ + Convert the variable-length bytes data type to a JSON-serializable form. + + Parameters + ---------- + zarr_format : ZarrFormat + The zarr format version. Accepted values are 2 and 3. + + Returns + ------- + ``DTypeConfig_V2[Literal["|O"], Literal["vlen-bytes"]] | Literal["variable_length_bytes"]`` + The JSON-serializable representation of the variable-length bytes data type. + For zarr_format 2, returns a dictionary with "name" and "object_codec_id". 
+ For zarr_format 3, returns a string identifier "variable_length_bytes". + + Raises + ------ + ValueError + If zarr_format is not 2 or 3. + """ + if zarr_format == 2: return {"name": "|O", "object_codec_id": self.object_codec_id} elif zarr_format == 3: @@ -360,26 +1142,135 @@ def to_json( raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_scalar(self) -> bytes: + """ + Return the default scalar value for the variable-length bytes data type. + + Returns + ------- + bytes + The default scalar value, which is an empty byte string. + """ + return b"" def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + """ + Convert a scalar to a JSON-serializable string representation. + + This method encodes the given scalar as bytes and then + encodes the bytes as a base64-encoded string. + + Parameters + ---------- + data : object + The scalar to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + str + A string representation of the scalar. + """ return base64.standard_b64encode(data).decode("ascii") # type: ignore[arg-type] def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> bytes: + """ + Decode a base64-encoded JSON string to bytes. + + Parameters + ---------- + data : JSON + The JSON-serializable base64-encoded string. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + bytes + The decoded bytes from the base64 string. + + Raises + ------ + TypeError + If the input data is not a base64-encoded string. + """ + if check_json_str(data): return base64.standard_b64decode(data.encode("ascii")) raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover def _check_scalar(self, data: object) -> TypeGuard[BytesLike]: + """ + Check if the provided data is of type BytesLike. 
+ + This method is used to verify if the input data can be considered as a + scalar of bytes-like type, which includes NumPy bytes, strings, bytes, + and integers. + + Parameters + ---------- + data : object + The data to check. + + Returns + ------- + TypeGuard[BytesLike] + True if the data is bytes-like, False otherwise. + """ return isinstance(data, BytesLike) def _cast_scalar_unchecked(self, data: BytesLike) -> bytes: + """ + Cast the provided scalar data to bytes. + + Parameters + ---------- + data : BytesLike + The data to cast. + + Returns + ------- + bytes + The casted data as bytes. + + Notes + ----- + This method does not perform any type checking. + The input data must be bytes-like. + """ if isinstance(data, str): return bytes(data, encoding="utf-8") return bytes(data) def cast_scalar(self, data: object) -> bytes: + """ + Attempt to cast a given object to a bytes scalar. + + This method first checks if the provided data is a valid scalar that can be + converted to a bytes scalar. If the check succeeds, the unchecked casting + operation is performed. If the data is not valid, a TypeError is raised. + + Parameters + ---------- + data : object + The data to be cast to a bytes scalar. + + Returns + ------- + bytes + The data cast as a bytes scalar. + + Raises + ------ + TypeError + If the data cannot be converted to a bytes scalar. + """ + if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to bytes." + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." 
+ ) raise TypeError(msg) diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index 38e506f1bc..2f432a9e0a 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -41,11 +41,33 @@ @dataclass(frozen=True) class BaseComplex(ZDType[TComplexDType_co, TComplexScalar_co], HasEndianness, HasItemSize): + """ + A base class for Zarr data types that wrap NumPy complex float data types. + """ + # This attribute holds the possible zarr v2 JSON names for the data type _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create an instance of this data type from a NumPy complex dtype. + + Parameters + ---------- + dtype : TBaseDType + The native dtype to convert. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the dtype is not compatible with this data type. + """ if cls._check_native_dtype(dtype): return cls(endianness=get_endianness_from_numpy_dtype(dtype)) raise DataTypeValidationError( @@ -53,6 +75,15 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self) -> TComplexDType_co: + """ + Convert this class to a NumPy complex dtype with the appropriate byte order. + + Returns + ------- + TComplexDType_co + A NumPy data type object representing the complex data type with the specified byte order. + """ + byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] @@ -60,6 +91,19 @@ def to_native_dtype(self) -> TComplexDType_co: def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: """ Check that the input is a valid JSON representation of this data type. + + The input data must be a mapping that contains a "name" key that is one of + the strings from cls._zarr_v2_names and an "object_codec_id" key that is None. 
+ + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + bool + True if the input is a valid JSON representation, False otherwise. """ return ( check_dtype_spec_v2(data) @@ -69,10 +113,45 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[str]: + """ + Check that the input is a valid JSON representation of this data type in Zarr V3. + + This method verifies that the provided data matches the expected Zarr V3 + representation, which is the string specified by the class-level attribute _zarr_v3_name. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + TypeGuard[str] + True if the input is a valid representation of this class in Zarr V3, False otherwise. + """ + return data == cls._zarr_v3_name @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this class from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this class. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. + """ if cls._check_json_v2(data): # Going via numpy ensures that we get the endianness correct without # annoying string parsing. @@ -83,12 +162,30 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this class from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. + """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." 
raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... @overload @@ -96,18 +193,26 @@ def to_json(self, zarr_format: Literal[3]) -> str: ... def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | str: """ - Convert the wrapped data type to a JSON-serializable form. + Serialize this object to a JSON-serializable representation. Parameters ---------- zarr_format : ZarrFormat - The zarr format version. + The Zarr format version. Supported values are 2 and 3. Returns ------- - str - The JSON-serializable representation of the wrapped data type + DTypeConfig_V2[str, None] | str + If ``zarr_format`` is 2, a dictionary with ``"name"`` and ``"object_codec_id"`` keys is + returned. + If ``zarr_format`` is 3, a string representation of the complex data type is returned. + + Raises + ------ + ValueError + If `zarr_format` is not 2 or 3. """ + if zarr_format == 2: return {"name": self.to_native_dtype().str, "object_codec_id": None} elif zarr_format == 3: @@ -115,15 +220,67 @@ def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | str: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def _check_scalar(self, data: object) -> TypeGuard[ComplexLike]: + """ + Check that the input is a scalar complex value. + + Parameters + ---------- + data : object + The value to check. + + Returns + ------- + TypeGuard[ComplexLike] + True if the input is a scalar complex value, False otherwise. + """ return isinstance(data, ComplexLike) def _cast_scalar_unchecked(self, data: ComplexLike) -> TComplexScalar_co: + """ + Cast the provided scalar data to the native scalar type of this class. + + Parameters + ---------- + data : ComplexLike + The data to cast. + + Returns + ------- + TComplexScalar_co + The casted data as a numpy complex scalar. + + Notes + ----- + This method does not perform any type checking. 
+ The input data must be a scalar complex value. + """ return self.to_native_dtype().type(data) # type: ignore[return-value] def cast_scalar(self, data: object) -> TComplexScalar_co: + """ + Attempt to cast a given object to a numpy complex scalar. + + Parameters + ---------- + data : object + The data to be cast to a numpy complex scalar. + + Returns + ------- + TComplexScalar_co + The data cast as a numpy complex scalar. + + Raises + ------ + TypeError + If the data cannot be converted to a numpy complex scalar. + """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a numpy float scalar." + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) raise TypeError(msg) def default_scalar(self) -> TComplexScalar_co: @@ -193,21 +350,69 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> JSON: @dataclass(frozen=True, kw_only=True) class Complex64(BaseComplex[np.dtypes.Complex64DType, np.complex64]): + """ + A Zarr data type for arrays containing 64 bit complex floats. + + Wraps the ``np.dtypes.Complex64DType`` data type. Scalars for this data type + are instances of ``np.complex64``. + + Attributes + ---------- + dtype_cls : Type[np.dtypes.Complex64DType] + The numpy dtype class for this data type. + _zarr_v3_name : ClassVar[Literal["complex64"]] + The name of this data type in Zarr V3. + _zarr_v2_names : ClassVar[tuple[Literal[">c8"], Literal["c8", "c8"], Literal["c8", " int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return 8 @dataclass(frozen=True, kw_only=True) class Complex128(BaseComplex[np.dtypes.Complex128DType, np.complex128], HasEndianness): + """ + A Zarr data type for arrays containing 64 bit complex floats. + + Wraps the ``np.dtypes.Complex128DType`` data type. 
Scalars for this data type + are instances of ``np.complex128``. + + Attributes + ---------- + dtype_cls : Type[np.dtypes.Complex128DType] + The numpy dtype class for this data type. + _zarr_v3_name : ClassVar[Literal["complex128"]] + The name of this data type in Zarr V3. + _zarr_v2_names : ClassVar[tuple[Literal[">c16"], Literal["c16", "c16"], Literal["c16", " int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return 16 diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index 7b7243993f..3113bc5b61 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -11,7 +11,6 @@ DTypeJSON, HasEndianness, HasItemSize, - ScalarTypeValidationError, check_dtype_spec_v2, ) from zarr.core.dtype.npy.common import ( @@ -35,11 +34,28 @@ @dataclass(frozen=True) class BaseFloat(ZDType[TFloatDType_co, TFloatScalar_co], HasEndianness, HasItemSize): + """ + A base class for Zarr data types that wrap NumPy float data types. + """ + # This attribute holds the possible zarr v2 JSON names for the data type _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create an instance of this ZDType from a NumPy data type. + + Parameters + ---------- + dtype : TBaseDType + The NumPy data type. + + Returns + ------- + Self + An instance of this data type. + """ if cls._check_native_dtype(dtype): return cls(endianness=get_endianness_from_numpy_dtype(dtype)) raise DataTypeValidationError( @@ -47,6 +63,14 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self) -> TFloatDType_co: + """ + Convert the wrapped data type to a NumPy data type. + + Returns + ------- + TFloatDType_co + The NumPy data type. 
+ """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] @@ -54,6 +78,16 @@ def to_native_dtype(self) -> TFloatDType_co: def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: """ Check that the input is a valid JSON representation of this data type. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + TypeGuard[DTypeConfig_V2[str, None]] + True if the input is a valid JSON representation of this data type, False otherwise. """ return ( check_dtype_spec_v2(data) @@ -63,12 +97,38 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[str]: + """ + Check that the input is a valid JSON representation of this class in Zarr V3. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + TypeGuard[str] + True if the input is a valid JSON representation of this class, False otherwise. + """ return data == cls._zarr_v3_name @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this ZDType from Zarr v2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + """ if cls._check_json_v2(data): - # Going via numpy ensures that we get the endianness correct without + # Going via NumPy ensures that we get the endianness correct without # annoying string parsing. name = data["name"] return cls.from_native_dtype(np.dtype(name)) @@ -77,12 +137,25 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this ZDType from Zarr v3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. 
+ """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... @overload @@ -99,8 +172,13 @@ def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | str: Returns ------- - str - The JSON-serializable representation of the wrapped data type + DTypeConfig_V2[str, None] or str + The JSON-serializable representation of the wrapped data type. + + Raises + ------ + ValueError + If zarr_format is not 2 or 3. """ if zarr_format == 2: return {"name": self.to_native_dtype().str, "object_codec_id": None} @@ -109,31 +187,73 @@ def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | str: raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def _check_scalar(self, data: object) -> TypeGuard[FloatLike]: + """ + Check that the input is a valid scalar value. + + Parameters + ---------- + data : object + The input to check. + + Returns + ------- + TypeGuard[FloatLike] + True if the input is a valid scalar value, False otherwise. + """ return isinstance(data, FloatLike) def _cast_scalar_unchecked(self, data: FloatLike) -> TFloatScalar_co: + """ + Cast a scalar value to a NumPy float scalar. + + Parameters + ---------- + data : FloatLike + The scalar value to cast. + + Returns + ------- + TFloatScalar_co + The NumPy float scalar. + """ return self.to_native_dtype().type(data) # type: ignore[return-value] def cast_scalar(self, data: object) -> TFloatScalar_co: + """ + Cast a scalar value to a NumPy float scalar. + + Parameters + ---------- + data : object + The scalar value to cast. + + Returns + ------- + TFloatScalar_co + The NumPy float scalar. 
+ """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a numpy float scalar." - raise ScalarTypeValidationError(msg) + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) + raise TypeError(msg) def default_scalar(self) -> TFloatScalar_co: """ - Get the default value, which is 0 cast to this dtype + Get the default value, which is 0 cast to this zdtype. Returns ------- - Int scalar + TFloatScalar_co The default value. """ return self._cast_scalar_unchecked(0) def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScalar_co: """ - Read a JSON-serializable value as a numpy float. + Read a JSON-serializable value as a NumPy float scalar. Parameters ---------- @@ -144,8 +264,8 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TFloatScal Returns ------- - TScalar_co - The numpy float. + TFloatScalar_co + The NumPy float scalar. """ if zarr_format == 2: if check_json_float_v2(data): @@ -191,32 +311,110 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> float | st @dataclass(frozen=True, kw_only=True) class Float16(BaseFloat[np.dtypes.Float16DType, np.float16]): + """ + A Zarr data type for arrays containing 16-bit floating point numbers. + + Wraps the ``np.dtypes.Float16DType`` data type. Scalars for this data type are instances + of ``np.float16``. + + Attributes + ---------- + dtype_cls : Type[np.dtypes.Float16DType] + The NumPy dtype class for this data type. + + References + ---------- + This class implements the float16 data type defined in Zarr V2 and V3. + + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. + """ + dtype_cls = np.dtypes.Float16DType _zarr_v3_name = "float16" _zarr_v2_names: ClassVar[tuple[Literal[">f2"], Literal["f2", " int: + """ + The size of a single scalar in bytes. 
+ + Returns + ------- + int + The size of a single scalar in bytes. + """ return 2 @dataclass(frozen=True, kw_only=True) class Float32(BaseFloat[np.dtypes.Float32DType, np.float32]): + """ + A Zarr data type for arrays containing 32-bit floating point numbers. + + Wraps the ``np.dtypes.Float32DType`` data type. Scalars for this data type are instances + of ``np.float32``. + + Attributes + ---------- + dtype_cls : Type[np.dtypes.Float32DType] + The NumPy dtype class for this data type. + + References + ---------- + This class implements the float32 data type defined in Zarr V2 and V3. + + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. + """ + dtype_cls = np.dtypes.Float32DType _zarr_v3_name = "float32" _zarr_v2_names: ClassVar[tuple[Literal[">f4"], Literal["f4", " int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return 4 @dataclass(frozen=True, kw_only=True) class Float64(BaseFloat[np.dtypes.Float64DType, np.float64]): + """ + A Zarr data type for arrays containing 64-bit floating point numbers. + + Wraps the ``np.dtypes.Float64DType`` data type. Scalars for this data type are instances + of ``np.float64``. + + Attributes + ---------- + dtype_cls : Type[np.dtypes.Float64DType] + The NumPy dtype class for this data type. + + References + ---------- + This class implements the float64 data type defined in Zarr V2 and V3. + + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. + """ + dtype_cls = np.dtypes.Float64DType _zarr_v3_name = "float64" _zarr_v2_names: ClassVar[tuple[Literal[">f8"], Literal["f8", " int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. 
+ """ return 8 diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index d3423ed61a..01a79142a3 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -53,14 +53,36 @@ @dataclass(frozen=True) class BaseInt(ZDType[TIntDType_co, TIntScalar_co], HasItemSize): - # This attribute holds the possible zarr V2 JSON names for the data type + """ + A base class for integer data types in Zarr. + + This class provides methods for serialization and deserialization of integer types + in both Zarr v2 and v3 formats, as well as methods for checking and casting scalars. + """ + _zarr_v2_names: ClassVar[tuple[str, ...]] @classmethod def _check_json_v2(cls, data: object) -> TypeGuard[DTypeConfig_V2[str, None]]: """ - Check that the input is a valid JSON representation of this data type. + Check that the input is a valid JSON representation of this integer data type in Zarr V2. + + This method verifies that the provided data matches the expected Zarr V2 representation + for this data type. The input data must be a mapping that contains a "name" key that is + one of the strings from cls._zarr_v2_names and an "object_codec_id" key that is None. + + Parameters + ---------- + data : object + The JSON data to check. + + Returns + ------- + TypeGuard[DTypeConfig_V2[str, None]] + True if the input is a valid representation of this class in Zarr V2, + False otherwise. """ + return ( check_dtype_spec_v2(data) and data["name"] in cls._zarr_v2_names @@ -70,54 +92,117 @@ def _check_json_v2(cls, data: object) -> TypeGuard[DTypeConfig_V2[str, None]]: @classmethod def _check_json_v3(cls, data: object) -> TypeGuard[str]: """ - Check that a JSON value is consistent with the zarr v3 spec for this data type. + Check that the input is a valid JSON representation of this class in Zarr V3. + + Parameters + ---------- + data : object + The JSON data to check. 
+ + Returns + ------- + TypeGuard[str] + True if the input is a valid representation of this class in Zarr v3, + False otherwise. """ return data == cls._zarr_v3_name def _check_scalar(self, data: object) -> TypeGuard[IntLike]: """ - Check that a python object is IntLike + Check if the input object is of an IntLike type. + + This method verifies whether the provided data can be considered as an integer-like + value, which includes objects supporting integer conversion. + + Parameters + ---------- + data : object + The data to check. + + Returns + ------- + TypeGuard[IntLike] + True if the data is IntLike, False otherwise. """ + return isinstance(data, IntLike) def _cast_scalar_unchecked(self, data: IntLike) -> TIntScalar_co: """ - Create an integer without any type checking of the input. + Casts a given scalar value to the native integer scalar type without type checking. + + Parameters + ---------- + data : IntLike + The scalar value to cast. + + Returns + ------- + TIntScalar_co + The casted integer scalar of the native dtype. """ + return self.to_native_dtype().type(data) # type: ignore[return-value] def cast_scalar(self, data: object) -> TIntScalar_co: + """ + Attempt to cast a given object to a NumPy integer scalar. + + Parameters + ---------- + data : object + The data to be cast to a NumPy integer scalar. + + Returns + ------- + TIntScalar_co + The data cast as a NumPy integer scalar. + + Raises + ------ + TypeError + If the data cannot be converted to a NumPy integer scalar. + """ + if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a numpy integer." + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) raise TypeError(msg) def default_scalar(self) -> TIntScalar_co: """ - Get the default value, which is 0 cast to this dtype + Get the default value, which is 0 cast to this dtype. 
Returns ------- - Int scalar + TIntScalar_co The default value. """ return self._cast_scalar_unchecked(0) def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar_co: """ - Read a JSON-serializable value as a numpy int scalar. + Read a JSON-serializable value as a NumPy int scalar. Parameters ---------- data : JSON The JSON-serializable value. zarr_format : ZarrFormat - The zarr format version. + The Zarr format version. Returns ------- - TScalar_co - The numpy scalar. + TIntScalar_co + The NumPy int scalar. + + Raises + ------ + TypeError + If the input is not a valid integer type. """ if check_json_int(data): return self._cast_scalar_unchecked(data) @@ -125,14 +210,15 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> TIntScalar def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: """ - Convert an object to JSON-serializable scalar. + Convert an object to a JSON serializable scalar. For the integer data types, + the JSON form is a plain integer. Parameters ---------- - data : _BaseScalar + data : object The value to convert. zarr_format : ZarrFormat - The zarr format version. + The Zarr format version. Returns ------- @@ -144,6 +230,24 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: @dataclass(frozen=True, kw_only=True) class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): + """ + A Zarr data type for arrays containing 8-bit signed integers. + + Wraps the ``np.dtypes.Int8DType`` data type. Scalars for this data type are + instances of ``np.int8``. + + Attributes + ---------- + dtype_cls : np.dtypes.Int8DType + The class of the underlying NumPy dtype. + + References + ---------- + This class implements the 8-bit signed integer data type defined in Zarr V2 and V3. + + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. 
+ """ + dtype_cls = np.dtypes.Int8DType _zarr_v3_name: ClassVar[Literal["int8"]] = "int8" _zarr_v2_names: ClassVar[tuple[Literal["|i1"]]] = ("|i1",) @@ -151,7 +255,22 @@ class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: """ - Create a Int8 from a np.dtype('int8') instance. + Create an Int8 from a np.dtype('int8') instance. + + Parameters + ---------- + dtype : TBaseDType + The np.dtype('int8') instance. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input data type is not a valid representation of this class Int8. """ if cls._check_native_dtype(dtype): return cls() @@ -160,10 +279,36 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self: Self) -> np.dtypes.Int8DType: + """ + Convert the Int8 instance to a np.dtype('int8') instance. + + Returns + ------- + np.dtypes.Int8DType + The np.dtype('int8') instance. + """ return self.dtype_cls() @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an Int8 from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class Int8. + """ if cls._check_json_v2(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_names[0]!r}" @@ -171,12 +316,30 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an Int8 from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class Int8. 
+ """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["|i1"], None]: ... @overload @@ -186,17 +349,22 @@ def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[Literal["|i1"], None] | Literal["int8"]: """ - Convert the wrapped data type to a JSON-serializable form. + Convert the data type to a JSON-serializable form. Parameters ---------- zarr_format : ZarrFormat - The zarr format version. + The Zarr format version. Returns ------- - str - The JSON-serializable representation of the wrapped data type + ``DTypeConfig_V2[Literal["|i1"], None] | Literal["int8"]`` + The JSON-serializable representation of the data type. + + Raises + ------ + ValueError + If the zarr_format is not 2 or 3. """ if zarr_format == 2: return {"name": self._zarr_v2_names[0], "object_codec_id": None} @@ -206,11 +374,36 @@ def to_json( @property def item_size(self) -> int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return 1 @dataclass(frozen=True, kw_only=True) class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): + """ + A Zarr data type for arrays containing 8-bit unsigned integers. + + Wraps the ``np.dtypes.UInt8DType`` data type. Scalars for this data type are instances of ``np.uint8``. + + Attributes + ---------- + dtype_cls : np.dtypes.UInt8DType + The class of the underlying NumPy dtype. + + References + ---------- + This class implements the 8-bit unsigned integer data type defined in Zarr V2 and V3. + + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. 
+ """ + dtype_cls = np.dtypes.UInt8DType _zarr_v3_name: ClassVar[Literal["uint8"]] = "uint8" _zarr_v2_names: ClassVar[tuple[Literal["|u1"]]] = ("|u1",) @@ -218,7 +411,7 @@ class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: """ - Create a Bool from a np.dtype('uint8') instance. + Create a UInt8 from a np.dtype('uint8') instance. """ if cls._check_native_dtype(dtype): return cls() @@ -227,10 +420,38 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self: Self) -> np.dtypes.UInt8DType: + """ + Create a NumPy unsigned 8-bit integer dtype instance from this UInt8 ZDType. + + Returns + ------- + np.dtypes.UInt8DType + The NumPy unsigned 8-bit integer dtype. + """ + return self.dtype_cls() @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. + """ + if cls._check_json_v2(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_names[0]!r}" @@ -238,12 +459,30 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. + """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["|u1"], None]: ... @overload @@ -253,37 +492,89 @@ def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[Literal["|u1"], None] | Literal["uint8"]: """ - Convert the wrapped data type to a JSON-serializable form. + Convert the data type to a JSON-serializable form. Parameters ---------- zarr_format : ZarrFormat - The zarr format version. + The Zarr format version. Supported values are 2 and 3. Returns ------- - str - The JSON-serializable representation of the wrapped data type + ``DTypeConfig_V2[Literal["|u1"], None] | Literal["uint8"]`` + The JSON-serializable representation of the data type. + + Raises + ------ + ValueError + If `zarr_format` is not 2 or 3. """ if zarr_format == 2: + # For Zarr format version 2, return a dictionary with the name and object codec ID. return {"name": self._zarr_v2_names[0], "object_codec_id": None} elif zarr_format == 3: + # For Zarr format version 3, return the v3 name as a string. return self._zarr_v3_name + # Raise an error if the zarr_format is neither 2 nor 3. raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @property def item_size(self) -> int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return 1 @dataclass(frozen=True, kw_only=True) class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): + """ + A Zarr data type for arrays containing 16-bit signed integers. + + Wraps the ``np.dtypes.Int16DType`` data type. Scalars for this data type are instances of + ``np.int16``. + + Attributes + ---------- + dtype_cls : np.dtypes.Int16DType + The class of the underlying NumPy dtype. + + References + ---------- + This class implements the 16-bit signed integer data type defined in Zarr V2 and V3. 
+ + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. + """ + dtype_cls = np.dtypes.Int16DType _zarr_v3_name: ClassVar[Literal["int16"]] = "int16" _zarr_v2_names: ClassVar[tuple[Literal[">i2"], Literal["i2", " Self: + """ + Create an instance of this data type from a np.dtype('int16') instance. + + Parameters + ---------- + dtype : np.dtype + The instance of np.dtype('int16') to create from. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input data type is not an instance of np.dtype('int16'). + """ if cls._check_native_dtype(dtype): return cls(endianness=get_endianness_from_numpy_dtype(dtype)) raise DataTypeValidationError( @@ -291,13 +582,39 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self) -> np.dtypes.Int16DType: + """ + Convert the data type to a np.dtype('int16') instance. + + Returns + ------- + np.dtype + The np.dtype('int16') instance. + """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. + """ if cls._check_json_v2(data): - # Going via numpy ensures that we get the endianness correct without + # Going via NumPy ensures that we get the endianness correct without # annoying string parsing. name = data["name"] return cls.from_native_dtype(np.dtype(name)) @@ -306,12 +623,30 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V3-flavored JSON. 
+ + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. + """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">i2", " DTypeConfig_V2[Literal[">i2", "i2", " int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return 2 @dataclass(frozen=True, kw_only=True) class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): + """ + A Zarr data type for arrays containing 16-bit unsigned integers. + + Wraps the ``np.dtypes.UInt16DType`` data type. Scalars for this data type are instances of + ``np.uint16``. + + Attributes + ---------- + dtype_cls : np.dtypes.UInt16DType + The class of the underlying NumPy dtype. + + References + ---------- + This class implements the 16-bit unsigned integer data type defined in Zarr V2 and V3. + + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. + """ + dtype_cls = np.dtypes.UInt16DType _zarr_v3_name: ClassVar[Literal["uint16"]] = "uint16" _zarr_v2_names: ClassVar[tuple[Literal[">u2"], Literal["u2", " Self: + """ + Create an instance of this data type from a np.dtype('uint16') instance. + + Parameters + ---------- + dtype : np.dtype + The NumPy data type. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input data type is not an instance of np.dtype('uint16').
+ """ if cls._check_native_dtype(dtype): return cls(endianness=get_endianness_from_numpy_dtype(dtype)) raise DataTypeValidationError( @@ -360,13 +744,39 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self) -> np.dtypes.UInt16DType: + """ + Convert the data type to a np.dtype('uint16') instance. + + Returns + ------- + np.dtype + The np.dtype('uint16') instance. + """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. + """ if cls._check_json_v2(data): - # Going via numpy ensures that we get the endianness correct without + # Going via NumPy ensures that we get the endianness correct without # annoying string parsing. name = data["name"] return cls.from_native_dtype(np.dtype(name)) @@ -375,12 +785,30 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. + """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of UInt16. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">u2", " DTypeConfig_V2[Literal[">u2", "u2", " int: + """ + The size of a single scalar in bytes. 
+ + Returns + ------- + int + The size of a single scalar in bytes. + """ return 2 @dataclass(frozen=True, kw_only=True) class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): + """ + A Zarr data type for arrays containing 32-bit signed integers. + + Wraps the ``np.dtypes.Int32DType`` data type. Scalars for this data type are instances of + ``np.int32``. + + Attributes + ---------- + dtype_cls : np.dtypes.Int32DType + The class of the underlying NumPy dtype. + + References + ---------- + This class implements the 32-bit signed integer data type defined in Zarr V2 and V3. + + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. + """ + dtype_cls = np.dtypes.Int32DType _zarr_v3_name: ClassVar[Literal["int32"]] = "int32" _zarr_v2_names: ClassVar[tuple[Literal[">i4"], Literal["i4", " TypeGuard[np.dtyp """ A type guard that checks if the input is assignable to the type of ``cls.dtype_class`` - This method is overridden for this particular data type because of a windows-specific issue where - np.dtype('i') is an instance of ``np.dtypes.IntDType``, not an instance of ``np.dtypes.Int32DType``. + This method is overridden for this particular data type because of a Windows-specific issue + where np.dtype('i') creates an instance of ``np.dtypes.IntDType``, rather than an + instance of ``np.dtypes.Int32DType``, even though both represent 32-bit signed integers. Parameters ---------- @@ -442,34 +902,96 @@ def _check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[np.dtyp @classmethod def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self: + """ + Create an Int32 from a np.dtype('int32') instance. + + Parameters + ---------- + dtype : TBaseDType + The np.dtype('int32') instance. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class Int32. 
+ """ if cls._check_native_dtype(dtype): return cls(endianness=get_endianness_from_numpy_dtype(dtype)) raise DataTypeValidationError( f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" ) - def to_native_dtype(self) -> np.dtypes.Int32DType: + def to_native_dtype(self: Self) -> np.dtypes.Int32DType: + """ + Convert the Int32 instance to a np.dtype('int32') instance. + + Returns + ------- + np.dtypes.Int32DType + The np.dtype('int32') instance. + """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an Int32 from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class Int32. + """ if cls._check_json_v2(data): - # Going via numpy ensures that we get the endianness correct without + # Going via NumPy ensures that we get the endianness correct without # annoying string parsing. name = data["name"] return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names!r}." raise DataTypeValidationError(msg) @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an Int32 from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class Int32. + """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">i4", " DTypeConfig_V2[Literal[">i4", "i4", " int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return 4 @dataclass(frozen=True, kw_only=True) class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): + """ + A Zarr data type for arrays containing 32-bit unsigned integers. + + Wraps the ``np.dtypes.UInt32DType`` data type. Scalars for this data type are instances of + ``np.uint32``. + + Attributes + ---------- + dtype_cls : np.dtypes.UInt32DType + The class of the underlying NumPy dtype. + + References + ---------- + This class implements the 32-bit unsigned integer data type defined in Zarr V2 and V3. + + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. + """ + dtype_cls = np.dtypes.UInt32DType _zarr_v3_name: ClassVar[Literal["uint32"]] = "uint32" _zarr_v2_names: ClassVar[tuple[Literal[">u4"], Literal["u4", " Self: + """ + Create a UInt32 from a np.dtype('uint32') instance. + + Parameters + ---------- + dtype : TBaseDType + The NumPy data type. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input data type is not a valid representation of this class 32-bit unsigned + integer. + """ if cls._check_native_dtype(dtype): return cls(endianness=get_endianness_from_numpy_dtype(dtype)) raise DataTypeValidationError( @@ -518,13 +1090,40 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self) -> np.dtypes.UInt32DType: + """ + Create a NumPy unsigned 32-bit integer dtype instance from this UInt32 ZDType. + + Returns + ------- + np.dtypes.UInt32DType + The NumPy unsigned 32-bit integer dtype. 
+ """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class 32-bit unsigned + integer. + """ if cls._check_json_v2(data): - # Going via numpy ensures that we get the endianness correct without + # Going via NumPy ensures that we get the endianness correct without # annoying string parsing. name = data["name"] return cls.from_native_dtype(np.dtype(name)) @@ -533,12 +1132,31 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class 32-bit unsigned + integer. + """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">u4", " Literal["uint32"]: ... @@ -546,17 +1164,17 @@ def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[Literal[">u4", "u4", " int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. 
+ """ return 4 @dataclass(frozen=True, kw_only=True) class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): + """ + A Zarr data type for arrays containing 64-bit signed integers. + + Wraps the ``np.dtypes.Int64DType`` data type. Scalars for this data type are instances of + ``np.int64``. + + Attributes + ---------- + dtype_cls : np.dtypes.Int64DType + The class of the underlying NumPy dtype. + + References + ---------- + This class implements the 64-bit signed integer data type defined in Zarr V2 and V3. + + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. + """ + dtype_cls = np.dtypes.Int64DType _zarr_v3_name: ClassVar[Literal["int64"]] = "int64" _zarr_v2_names: ClassVar[tuple[Literal[">i8"], Literal["i8", " Self: + """ + Create an Int64 from a np.dtype('int64') instance. + + Parameters + ---------- + dtype : TBaseDType + The NumPy data type. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input data type is not a valid representation of this class 64-bit signed + integer. + """ if cls._check_native_dtype(dtype): return cls(endianness=get_endianness_from_numpy_dtype(dtype)) raise DataTypeValidationError( @@ -585,13 +1248,40 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self) -> np.dtypes.Int64DType: + """ + Create a NumPy signed 64-bit integer dtype instance from this Int64 ZDType. + + Returns + ------- + np.dtypes.Int64DType + The NumPy signed 64-bit integer dtype. + """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. 
+ + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class 64-bit signed + integer. + """ if cls._check_json_v2(data): - # Going via numpy ensures that we get the endianness correct without + # Going via NumPy ensures that we get the endianness correct without # annoying string parsing. name = data["name"] return cls.from_native_dtype(np.dtype(name)) @@ -600,12 +1290,31 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class 64-bit signed + integer. + """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">i8", " Literal["int64"]: ... @@ -613,17 +1322,17 @@ def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[Literal[">i8", "i8", " int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return 8 @dataclass(frozen=True, kw_only=True) class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): + """ + A Zarr data type for arrays containing 64-bit unsigned integers. + + Wraps the ``np.dtypes.UInt64DType`` data type. Scalars for this data type + are instances of ``np.uint64``. + + Attributes + ---------- + dtype_cls: np.dtypes.UInt64DType + The class of the underlying NumPy dtype. + + References + ---------- + This class implements the unsigned 64-bit integer data type defined in Zarr V2 and V3. 
+ + See the `Zarr V2 `__ and `Zarr V3 `__ specification documents for details. + """ + dtype_cls = np.dtypes.UInt64DType _zarr_v3_name: ClassVar[Literal["uint64"]] = "uint64" _zarr_v2_names: ClassVar[tuple[Literal[">u8"], Literal["u8", " np.dtypes.UInt64DType: + """ + Convert the data type to a native NumPy dtype. + + Returns + ------- + np.dtypes.UInt64DType + The native NumPy dtype. + """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class unsigned 64-bit + integer. + """ if cls._check_json_v2(data): - # Going via numpy ensures that we get the endianness correct without + # Going via NumPy ensures that we get the endianness correct without # annoying string parsing. name = data["name"] return cls.from_native_dtype(np.dtype(name)) @@ -659,14 +1421,32 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from Zarr V3-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class unsigned 64-bit + integer. + """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">u8", " Literal["uint64"]: ...
@@ -674,17 +1454,17 @@ def to_json( self, zarr_format: ZarrFormat ) -> DTypeConfig_V2[Literal[">u8", "u8", " Self: + """ + Create an instance of this data type from a native NumPy dtype. + + Parameters + ---------- + dtype : TBaseDType + The native NumPy dtype. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input dtype is not a valid representation of this class, an unsigned 64-bit + integer. + """ if cls._check_native_dtype(dtype): return cls(endianness=get_endianness_from_numpy_dtype(dtype)) raise DataTypeValidationError( @@ -703,4 +1502,12 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: @property def item_size(self) -> int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return 8 diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index 3fb26cf366..32375a1c71 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -48,17 +48,82 @@ def __str__(self) -> str: ... class LengthBytesConfig(TypedDict): + """ + Configuration for a fixed-length string data type in Zarr V3. + + Attributes + ---------- + length_bytes : int + The length in bytes of the data associated with this configuration. + """ + length_bytes: int -# TODO: Fix this terrible name -FixedLengthUTF32JSONV3 = NamedConfig[Literal["fixed_length_utf32"], LengthBytesConfig] +class FixedLengthUTF32JSON_V2(DTypeConfig_V2[str, None]): + """ + A wrapper around the JSON representation of the ``FixedLengthUTF32`` data type in Zarr V2. + + The ``name`` field of this class contains the value that would appear under the + ``dtype`` field in Zarr V2 array metadata. + + References + ---------- + The structure of the ``name`` field is defined in the Zarr V2 + `specification document `__. + + Examples + -------- + + ..
code-block:: python + + { + "name": " None: @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create a FixedLengthUTF32 from a NumPy data type. + + Parameters + ---------- + dtype : TBaseDType + The NumPy data type. + + Returns + ------- + Self + An instance of this data type. + """ if cls._check_native_dtype(dtype): endianness = get_endianness_from_numpy_dtype(dtype) return cls( @@ -84,13 +162,31 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self) -> np.dtypes.StrDType[int]: + """ + Convert the FixedLengthUTF32 instance to a NumPy data type. + + Returns + ------- + np.dtypes.StrDType[int] + The NumPy data type. + """ byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls(self.length).newbyteorder(byte_order) @classmethod - def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSON_V2]: """ - Check that the input is a valid JSON representation of a numpy U dtype. + Check that the input is a valid JSON representation of a NumPy U dtype. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + TypeGuard[FixedLengthUTF32JSON_V2] + Whether the input is a valid JSON representation of a NumPy U dtype. """ return ( check_dtype_spec_v2(data) @@ -100,7 +196,20 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] ) @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSONV3]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSON_V3]: + """ + Check that the input is a valid JSON representation of this class in Zarr V3. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + TypeGuard[FixedLengthUTF32JSONV3] + Whether the input is a valid JSON representation of a NumPy U dtype. 
+ """ return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} @@ -111,15 +220,28 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSONV3]: and isinstance(data["configuration"]["length_bytes"], int) ) - @overload # type: ignore[override] + @overload def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... @overload - def to_json(self, zarr_format: Literal[3]) -> FixedLengthUTF32JSONV3: ... + def to_json(self, zarr_format: Literal[3]) -> FixedLengthUTF32JSON_V3: ... def to_json( self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[str, None] | FixedLengthUTF32JSONV3: + ) -> DTypeConfig_V2[str, None] | FixedLengthUTF32JSON_V3: + """ + Convert the FixedLengthUTF32 instance to a JSON representation. + + Parameters + ---------- + zarr_format : ZarrFormat + The Zarr format to use. + + Returns + ------- + DTypeConfig_V2[str, None] | FixedLengthUTF32JSON_V3 + The JSON representation of the data type. + """ if zarr_format == 2: return {"name": self.to_native_dtype().str, "object_codec_id": None} elif zarr_format == 3: @@ -132,75 +254,220 @@ def to_json( @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create a FixedLengthUTF32 from a JSON representation of a NumPy U dtype. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + """ if cls._check_json_v2(data): - # Construct the numpy dtype instead of string parsing. + # Construct the NumPy dtype instead of string parsing. name = data["name"] return cls.from_native_dtype(np.dtype(name)) raise DataTypeValidationError( - f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string representation of a numpy U dtype." + f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string representation of a NumPy U dtype." 
) @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create a FixedLengthUTF32 from a JSON representation of a NumPy U dtype. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + Self + An instance of this data type. + """ if cls._check_json_v3(data): return cls(length=data["configuration"]["length_bytes"] // cls.code_point_bytes) msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." raise DataTypeValidationError(msg) def default_scalar(self) -> np.str_: + """ + Return the default scalar value for this data type. + + Returns + ------- + ``np.str_`` + The default scalar value. + """ return np.str_("") def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + """ + Convert the scalar value to a JSON representation. + + Parameters + ---------- + data : object + The scalar value. + zarr_format : ZarrFormat + The Zarr format to use. + + Returns + ------- + str + The JSON representation of the scalar value. + """ return str(data) def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_: + """ + Convert the JSON representation of a scalar value to the native scalar value. + + Parameters + ---------- + data : JSON + The JSON data. + zarr_format : ZarrFormat + The Zarr format to use. + + Returns + ------- + ``np.str_`` + The native scalar value. + """ if check_json_str(data): return self.to_native_dtype().type(data) raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover - def _check_scalar(self, data: object) -> TypeGuard[str | np.str_ | bytes | int]: + def _check_scalar(self, data: object) -> TypeGuard[SupportsStr]: + """ + Check that the input is a valid scalar value for this data type. + + Parameters + ---------- + data : object + The scalar value. + + Returns + ------- + TypeGuard[SupportsStr] + Whether the input is a valid scalar value for this data type. 
+ """ # this is generous for backwards compatibility - return isinstance(data, str | np.str_ | bytes | int) + return isinstance(data, SupportsStr) def cast_scalar(self, data: object) -> np.str_: + """ + Cast the scalar value to the native scalar value. + + Parameters + ---------- + data : object + The scalar value. + + Returns + ------- + ``np.str_`` + The native scalar value. + """ if self._check_scalar(data): - # We explicitly truncate before casting because of the following numpy behavior: + # We explicitly truncate before casting because of the following NumPy behavior: # >>> x = np.dtype('U3').type('hello world') # >>> x # np.str_('hello world') # >>> x.dtype # dtype('U11') - if isinstance(data, int): - return self.to_native_dtype().type(str(data)[: self.length]) - else: - return self.to_native_dtype().type(data[: self.length]) - raise TypeError( - f"Cannot convert object with type {type(data)} to a numpy unicode string scalar." + return self.to_native_dtype().type(str(data)[: self.length]) + + msg = ( # pragma: no cover + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." ) + raise TypeError(msg) # pragma: no-cover @property def item_size(self) -> int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. + """ return self.length * self.code_point_bytes def check_vlen_string_json_scalar(data: object) -> TypeGuard[int | str | float]: """ - This function checks the type of JSON-encoded variable length strings. It is generous for - backwards compatibility, as zarr-python v2 would use ints for variable length strings - fill values + Check if the input is a valid JSON scalar for a variable-length string. + + This function is generous for backwards compatibility, as Zarr Python v2 would use ints for + variable-length string fill values. + + Parameters + ---------- + data : object + The JSON value to check. 
+ + Returns + ------- + TypeGuard[int | str | float] + True if the input is a valid scalar for a variable-length string. """ return isinstance(data, int | str | float) -# VariableLengthUTF8 is defined in two places, conditioned on the version of numpy. -# If numpy 2 is installed, then VariableLengthUTF8 is defined with the numpy variable length -# string dtype as the native dtype. Otherwise, VariableLengthUTF8 is defined with the numpy object +class VariableLengthUTF8JSON_V2(DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]]): + """ + A wrapper around the JSON representation of the ``VariableLengthUTF8`` data type in Zarr V2. + + The ``name`` field of this class contains the value that would appear under the + ``dtype`` field in Zarr V2 array metadata. The ``object_codec_id`` field is always ``"vlen-utf8"``. + + References + ---------- + The structure of the ``name`` field is defined in the Zarr V2 + `specification document `__. + + + Examples + -------- + .. code-block:: python + + { + "name": "|O", + "object_codec_id": "vlen-utf8" + } + """ + + +# VariableLengthUTF8 is defined in two places, conditioned on the version of NumPy. +# If NumPy 2 is installed, then VariableLengthUTF8 is defined with the NumPy variable length +# string dtype as the native dtype. Otherwise, VariableLengthUTF8 is defined with the NumPy object # dtype as the native dtype. class UTF8Base(ZDType[TDType_co, str], HasObjectCodec): """ - A base class for the variable length UTF-8 string data type. This class should not be used - as data type, but as a base class for other variable length string data types. + A base class for variable-length UTF-8 string data types. + + Not intended for direct use, but as a base for concrete implementations. + + Attributes + ---------- + object_codec_id : ClassVar[Literal["vlen-utf8"]] + The object codec ID for this data type. + + References + ---------- + This data type does not have a Zarr V3 specification. 
+ + The Zarr V2 data type specification can be found `here `__. """ _zarr_v3_name: ClassVar[Literal["string"]] = "string" @@ -208,6 +475,25 @@ class UTF8Base(ZDType[TDType_co, str], HasObjectCodec): @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create an instance of this data type from a compatible NumPy data type. + + Parameters + ---------- + dtype : TBaseDType + The native data type. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input is not compatible with this data type. + """ if cls._check_native_dtype(dtype): return cls() raise DataTypeValidationError( @@ -218,10 +504,21 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: def _check_json_v2( cls, data: DTypeJSON, - ) -> TypeGuard[DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]]]: + ) -> TypeGuard[VariableLengthUTF8JSON_V2]: """ - Check that the input is a valid JSON representation of a numpy O dtype, and that the - object codec id is appropriate for variable-length UTF-8 strings. + Check if the input is a valid JSON representation of a variable-length UTF-8 string dtype + for Zarr v2. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + ``TypeGuard[VariableLengthUTF8JSON_V2]`` + Whether the input is a valid JSON representation of a NumPy "object" data type whose + object codec id is appropriate for variable-length UTF-8 strings. """ return ( check_dtype_spec_v2(data) @@ -230,11 +527,38 @@ def _check_json_v2( ) @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["string"]]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["variable_length_utf8"]]: + """ + Check that the input is a valid JSON representation of this class in Zarr V3. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check.
+ + Returns + ------- + TypeGuard[Literal["variable_length_utf8"]] + Whether the input is a valid JSON representation of a variable length UTF-8 string + data type. + """ return data == cls._zarr_v3_name @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this class from a JSON representation of a NumPy "object" dtype. + + Parameters + ---------- + data : DTypeJSON + The JSON data to create an instance from. + + Returns + ------- + Self + An instance of this data type. + """ if cls._check_json_v2(data): return cls() msg = ( @@ -244,21 +568,44 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this class from a JSON representation of a variable length UTF-8 + string data type. + + Parameters + ---------- + data : DTypeJSON + The JSON data to create an instance from. + + Returns + ------- + Self + An instance of this data type. + """ if cls._check_json_v3(data): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." raise DataTypeValidationError(msg) - @overload # type: ignore[override] - def to_json( - self, zarr_format: Literal[2] - ) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]]: ... + @overload + def to_json(self, zarr_format: Literal[2]) -> VariableLengthUTF8JSON_V2: ... @overload def to_json(self, zarr_format: Literal[3]) -> Literal["string"]: ... - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]] | Literal["string"]: + def to_json(self, zarr_format: ZarrFormat) -> VariableLengthUTF8JSON_V2 | Literal["string"]: + """ + Convert this data type to a JSON representation. + + Parameters + ---------- + zarr_format : int + The zarr format to use for the JSON representation. + + Returns + ------- + ``VariableLengthUTF8JSON_V2 | Literal["string"]`` + The JSON representation of this data type. 
+ """ if zarr_format == 2: return {"name": "|O", "object_codec_id": self.object_codec_id} elif zarr_format == 3: @@ -266,44 +613,175 @@ def to_json( raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def default_scalar(self) -> str: + """ + Return the default scalar value for this data type. + + Returns + ------- + str + The default scalar value. + """ return "" def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + """ + Convert a scalar value to a JSON representation. + + Parameters + ---------- + data : object + The scalar value to convert. + zarr_format : int + The zarr format to use for the JSON representation. + + Returns + ------- + str + The JSON representation of the scalar value. + """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) raise TypeError(f"Invalid type: {data}. Expected a string.") def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> str: + """ + Convert a JSON representation of a scalar value to the native scalar type. + + Parameters + ---------- + data : JSON + The JSON representation of the scalar value. + zarr_format : int + The zarr format to use for the JSON representation. + + Returns + ------- + str + The native scalar type of the scalar value. + """ if not check_vlen_string_json_scalar(data): raise TypeError(f"Invalid type: {data}. Expected a string or number.") return str(data) def _check_scalar(self, data: object) -> TypeGuard[SupportsStr]: + """ + Check that the input is a valid scalar value for this data type. + + Parameters + ---------- + data : object + The scalar value to check. + + Returns + ------- + TypeGuard[SupportsStr] + Whether the input is a valid scalar value for this data type. + """ return isinstance(data, SupportsStr) def _cast_scalar_unchecked(self, data: SupportsStr) -> str: + """ + Cast a scalar value to a string. + + Parameters + ---------- + data : object + The scalar value to cast. 
+ + Returns + ------- + str + The string representation of the scalar value. + """ return str(data) def cast_scalar(self, data: object) -> str: + """ + Cast an object to a string. + + Parameters + ---------- + data : object + The value to cast. + + Returns + ------- + str + The input cast to str. + """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) - raise TypeError(f"Cannot convert object with type {type(data)} to a python string.") + msg = ( # pragma: no cover + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) + raise TypeError(msg) # pragma: no cover if _NUMPY_SUPPORTS_VLEN_STRING: @dataclass(frozen=True, kw_only=True) class VariableLengthUTF8(UTF8Base[np.dtypes.StringDType]): # type: ignore[type-var] + """ + A Zarr data type for arrays containing variable-length UTF-8 strings. + + Wraps the ``np.dtypes.StringDType`` data type. Scalars for this data type are instances + of ``str``. + + + Attributes + ---------- + dtype_cls : Type[np.dtypes.StringDType] + The NumPy dtype class for this data type. + _zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" + The name of this data type in Zarr V3. + object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8" + The object codec ID for this data type. + """ + dtype_cls = np.dtypes.StringDType def to_native_dtype(self) -> np.dtypes.StringDType: + """ + Create a NumPy string dtype from this VariableLengthUTF8 ZDType. + + Returns + ------- + np.dtypes.StringDType + The NumPy string dtype. + """ return self.dtype_cls() else: # Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead. @dataclass(frozen=True, kw_only=True) class VariableLengthUTF8(UTF8Base[np.dtypes.ObjectDType]): # type: ignore[no-redef] + """ + A Zarr data type for arrays containing variable-length UTF-8 strings. + + Wraps the ``np.dtypes.ObjectDType`` data type. 
Scalars for this data type are instances + of ``str``. + + + Attributes + ---------- + dtype_cls : Type[np.dtypes.ObjectDType] + The NumPy dtype class for this data type. + _zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8" + The name of this data type in Zarr V3. + object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8" + The object codec ID for this data type. + """ + dtype_cls = np.dtypes.ObjectDType def to_native_dtype(self) -> np.dtypes.ObjectDType: + """ + Create a NumPy object dtype from this VariableLengthUTF8 ZDType. + + Returns + ------- + np.dtypes.ObjectDType + The NumPy object dtype. + """ return self.dtype_cls() diff --git a/src/zarr/core/dtype/npy/structured.py b/src/zarr/core/dtype/npy/structured.py index 8f40132820..a0e3b0fbd4 100644 --- a/src/zarr/core/dtype/npy/structured.py +++ b/src/zarr/core/dtype/npy/structured.py @@ -1,15 +1,16 @@ from __future__ import annotations +from collections.abc import Sequence from dataclasses import dataclass -from typing import TYPE_CHECKING, Literal, Self, TypeGuard, cast, overload +from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypeGuard, cast, overload import numpy as np +from zarr.core.common import NamedConfig from zarr.core.dtype.common import ( DataTypeValidationError, DTypeConfig_V2, DTypeJSON, - DTypeSpec_V3, HasItemSize, StructuredName_V2, check_dtype_spec_v2, @@ -24,17 +25,86 @@ from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType if TYPE_CHECKING: - from collections.abc import Sequence - - from zarr.core.common import JSON, NamedConfig, ZarrFormat + from zarr.core.common import JSON, ZarrFormat StructuredScalarLike = list[object] | tuple[object, ...] | bytes | int +class StructuredJSON_V2(DTypeConfig_V2[StructuredName_V2, None]): + """ + A wrapper around the JSON representation of the ``Structured`` data type in Zarr V2. 
+ + The ``name`` field is a sequence of sequences, where each inner sequence has two values: + the field name and the data type name for that field (which could be another sequence). + The data type names are strings, and the object codec ID is always None. + + References + ---------- + The structure of the ``name`` field is defined in the Zarr V2 + `specification document `__. + + Examples + -------- + .. code-block:: python + + { + "name": [ + ["f0", "`__. + """ + + _zarr_v3_name: ClassVar[Literal["structured"]] = "structured" dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] - _zarr_v3_name = "structured" fields: tuple[tuple[str, ZDType[TBaseDType, TBaseScalar]], ...] def __post_init__(self) -> None: @@ -60,6 +130,30 @@ def _check_native_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create a Structured ZDType from a native NumPy data type. + + Parameters + ---------- + dtype : TBaseDType + The native data type. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input data type is not an instance of np.dtypes.VoidDType with a non-null + ``fields`` attribute. + + Notes + ----- + This method attempts to resolve the fields of the structured dtype using the data type + registry. + """ from zarr.core.dtype import get_data_type_from_native_dtype fields: list[tuple[str, ZDType[TBaseDType, TBaseScalar]]] = [] @@ -76,6 +170,19 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self: ) def to_native_dtype(self) -> np.dtypes.VoidDType[int]: + """ + Convert the structured Zarr data type to a native NumPy void dtype. + + This method constructs a NumPy dtype with fields corresponding to the + fields of the structured Zarr data type, by converting each field's + data type to its native dtype representation. 
+ + Returns + ------- + np.dtypes.VoidDType[int] + The native NumPy void dtype representing the structured data type. + """ + return cast( "np.dtypes.VoidDType[int]", np.dtype([(key, dtype.to_native_dtype()) for (key, dtype) in self.fields]), @@ -85,7 +192,25 @@ def to_native_dtype(self) -> np.dtypes.VoidDType[int]: def _check_json_v2( cls, data: DTypeJSON, - ) -> TypeGuard[DTypeConfig_V2[StructuredName_V2, None]]: + ) -> TypeGuard[StructuredJSON_V2]: + """ + Check if the input is a valid JSON representation of a Structured data type + for Zarr V2. + + The input data must be a mapping that contains a "name" key that is not a str, + and an "object_codec_id" key that is None. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + TypeGuard[StructuredJSON_V2] + True if the input is a valid JSON representation of a Structured data type + for Zarr V2, False otherwise. + """ return ( check_dtype_spec_v2(data) and not isinstance(data["name"], str) @@ -94,9 +219,22 @@ def _check_json_v2( ) @classmethod - def _check_json_v3( - cls, data: DTypeJSON - ) -> TypeGuard[NamedConfig[Literal["structured"], dict[str, Sequence[tuple[str, DTypeJSON]]]]]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[StructuredJSON_V3]: + """ + Check that the input is a valid JSON representation of this class in Zarr V3. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + TypeGuard[StructuredJSON_V3] + True if the input is a valid JSON representation of a structured data type for Zarr V3, + False otherwise. 
+ """ + return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} @@ -138,22 +276,38 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: meta_fields = config["fields"] return cls( fields=tuple( - (f_name, get_data_type_from_json(f_dtype, zarr_format=3)) + (f_name, get_data_type_from_json(f_dtype, zarr_format=3)) # type: ignore[misc] for f_name, f_dtype in meta_fields ) ) msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a JSON object with the key {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[StructuredName_V2, None]: ... + @overload + def to_json(self, zarr_format: Literal[2]) -> StructuredJSON_V2: ... @overload - def to_json(self, zarr_format: Literal[3]) -> DTypeSpec_V3: ... + def to_json(self, zarr_format: Literal[3]) -> StructuredJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> StructuredJSON_V2 | StructuredJSON_V3: + """ + Convert the structured data type to a JSON-serializable form. - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[StructuredName_V2, None] | DTypeSpec_V3: + Parameters + ---------- + zarr_format : ZarrFormat + The Zarr format version. Accepted values are 2 and 3. + + Returns + ------- + StructuredJSON_V2 | StructuredJSON_V3 + The JSON representation of the structured data type. + + Raises + ------ + ValueError + If the zarr_format is not 2 or 3. + """ if zarr_format == 2: fields = [ [f_name, f_dtype.to_json(zarr_format=zarr_format)["name"]] @@ -170,14 +324,46 @@ def to_json( "name": self._zarr_v3_name, "configuration": {"fields": fields}, } - return cast("DTypeSpec_V3", base_dict) + return cast("StructuredJSON_V3", base_dict) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def _check_scalar(self, data: object) -> TypeGuard[StructuredScalarLike]: # TODO: implement something more precise here! 
+ """ + Check that the input is a valid scalar value for this structured data type. + + Parameters + ---------- + data : object + The scalar value to check. + + Returns + ------- + TypeGuard[StructuredScalarLike] + Whether the input is a valid scalar value for this structured data type. + """ return isinstance(data, (bytes, list, tuple, int, np.void)) def _cast_scalar_unchecked(self, data: StructuredScalarLike) -> np.void: + """ + Cast a python object to a numpy structured scalar without type checking. + + Parameters + ---------- + data : StructuredScalarLike + The data to cast. + + Returns + ------- + np.void + The casted data as a numpy structured scalar. + + Notes + ----- + This method does not perform any type checking. + The input data must be castable to a numpy structured scalar. + + """ na_dtype = self.to_native_dtype() if isinstance(data, bytes): res = np.frombuffer(data, dtype=na_dtype)[0] @@ -188,15 +374,71 @@ def _cast_scalar_unchecked(self, data: StructuredScalarLike) -> np.void: return cast("np.void", res) def cast_scalar(self, data: object) -> np.void: + """ + Cast a Python object to a NumPy structured scalar. + + This function attempts to cast the provided data to a NumPy structured scalar. + If the data is compatible with the structured scalar type, it is cast without + type checking. Otherwise, a TypeError is raised. + + Parameters + ---------- + data : object + The data to be cast to a NumPy structured scalar. + + Returns + ------- + np.void + The data cast as a NumPy structured scalar. + + Raises + ------ + TypeError + If the data cannot be converted to a NumPy structured scalar. + """ + if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a numpy structured scalar." + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." 
+ ) raise TypeError(msg) def default_scalar(self) -> np.void: + """ + Get the default scalar value for this structured data type. + + Returns + ------- + np.void + The default scalar value, which is the scalar representation of 0 + cast to this structured data type. + """ + return self._cast_scalar_unchecked(0) def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: + """ + Read a JSON-serializable value as a NumPy structured scalar. + + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + np.void + The NumPy structured scalar. + + Raises + ------ + TypeError + If the input is not a base64-encoded string. + """ if check_json_str(data): as_bytes = bytes_from_json(data, zarr_format=zarr_format) dtype = self.to_native_dtype() @@ -204,9 +446,32 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: raise TypeError(f"Invalid type: {data}. Expected a string.") def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + """ + Convert a scalar to a JSON-serializable string representation. + + Parameters + ---------- + data : object + The scalar to convert. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + str + A string representation of the scalar, which is a base64-encoded + string of the bytes that make up the scalar. + """ return bytes_to_json(self.cast_scalar(data).tobytes(), zarr_format) @property def item_size(self) -> int: - # Lets have numpy do the arithmetic here + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. 
+ """ return self.to_native_dtype().itemsize diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 2fd140ef91..d523e16940 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -16,6 +16,7 @@ ) import numpy as np +from typing_extensions import ReadOnly from zarr.core.common import NamedConfig from zarr.core.dtype.common import ( @@ -38,7 +39,6 @@ if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat -_DTypeName = Literal["datetime64", "timedelta64"] TimeDeltaLike = str | int | bytes | np.timedelta64 | timedelta | None DateTimeLike = str | int | bytes | np.datetime64 | datetime | None @@ -58,7 +58,7 @@ def datetime_from_int(data: int, *, unit: DateTimeUnit, scale_factor: int) -> np Returns ------- - np.datetime64 + numpy.datetime64 The datetime64 value. """ dtype_name = f"datetime64[{scale_factor}{unit}]" @@ -71,7 +71,7 @@ def datetimelike_to_int(data: np.datetime64 | np.timedelta64) -> int: Parameters ---------- - data : np.datetime64 | np.timedelta64 + data : np.datetime64 | numpy.timedelta64 The value to convert. Returns @@ -101,21 +101,136 @@ def check_json_time(data: JSON) -> TypeGuard[Literal["NaT"] | int]: class TimeConfig(TypedDict): - unit: DateTimeUnit - scale_factor: int + """ + The configuration for the numpy.timedelta64 or numpy.datetime64 data type in Zarr V3. + + Attributes + ---------- + unit : ReadOnly[DateTimeUnit] + A string encoding a unit of time. + scale_factor : ReadOnly[int] + A scale factor. + + Examples + -------- + .. code-block:: python + + {"unit": "ms", "scale_factor": 1} + """ + + unit: ReadOnly[DateTimeUnit] + scale_factor: ReadOnly[int] + + +class DateTime64JSON_V3(NamedConfig[Literal["numpy.datetime64"], TimeConfig]): + """ + The JSON representation of the ``numpy.datetime64`` data type in Zarr V3. + + References + ---------- + This representation is defined in the ``numpy.datetime64`` + `specification document `__. + + Examples + -------- + .. 
code-block:: python + + { + "name": "numpy.datetime64", + "configuration": { + "unit": "ms", + "scale_factor": 1 + } + } + """ + +class TimeDelta64JSON_V3(NamedConfig[Literal["numpy.timedelta64"], TimeConfig]): + """ + The JSON representation of the ``TimeDelta64`` data type in Zarr V3. -DateTime64JSONV3 = NamedConfig[Literal["numpy.datetime64"], TimeConfig] -TimeDelta64JSONV3 = NamedConfig[Literal["numpy.timedelta64"], TimeConfig] + References + ---------- + This representation is defined in the numpy.timedelta64 + `specification document `__. + + Examples + -------- + .. code-block:: python + + { + "name": "numpy.timedelta64", + "configuration": { + "unit": "ms", + "scale_factor": 1 + } + } + """ + + +class TimeDelta64JSON_V2(DTypeConfig_V2[str, None]): + """ + A wrapper around the JSON representation of the ``TimeDelta64`` data type in Zarr V2. + + The ``name`` field of this class contains the value that would appear under the + ``dtype`` field in Zarr V2 array metadata. + + References + ---------- + The structure of the ``name`` field is defined in the Zarr V2 + `specification document `__. + + + Examples + -------- + .. code-block:: python + + { + "name": "`__. + + + Examples + -------- + .. code-block:: python + + { + "name": " None: @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create an instance of this class from a native NumPy data type. + + Parameters + ---------- + dtype : TBaseDType + The native NumPy dtype to convert. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the dtype is not a valid representation of this class. + """ + if cls._check_native_dtype(dtype): unit, scale_factor = np.datetime_data(dtype.name) unit = cast("DateTimeUnit", unit) @@ -145,58 +279,104 @@ def to_native_dtype(self) -> BaseTimeDType_co: # Numpy does not allow creating datetime64 or timedelta64 via # np.dtypes.{dtype_name}() # so we use np.dtype with a formatted string. 
+ """ + Convert this data type to a NumPy temporal data type with the appropriate + unit and scale factor. + + Returns + ------- + BaseTimeDType_co + A NumPy data type object representing the time data type with + the specified unit, scale factor, and byte order. + """ + dtype_string = f"{self._numpy_name}[{self.scale_factor}{self.unit}]" return np.dtype(dtype_string).newbyteorder(endianness_to_numpy_str(self.endianness)) # type: ignore[return-value] - @overload # type: ignore[override] - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... - @overload - def to_json(self, zarr_format: Literal[3]) -> DateTime64JSONV3 | TimeDelta64JSONV3: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[str, None] | DateTime64JSONV3 | TimeDelta64JSONV3: - if zarr_format == 2: - name = self.to_native_dtype().str - return {"name": name, "object_codec_id": None} - elif zarr_format == 3: - return cast( - "DateTime64JSONV3 | TimeDelta64JSONV3", - { - "name": self._zarr_v3_name, - "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, - }, - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> int: + """ + Convert a python object to a JSON representation of a datetime64 or timedelta64 scalar. + + Parameters + ---------- + data : object + The python object to convert. + zarr_format : ZarrFormat + The Zarr format version (2 or 3). + + Returns + ------- + int + The JSON representation of the scalar. + """ return datetimelike_to_int(data) # type: ignore[arg-type] @property def item_size(self) -> int: + """ + The size of a single scalar in bytes. + + Returns + ------- + int + The size of a single scalar in bytes. 
+ """ return 8 @dataclass(frozen=True, kw_only=True, slots=True) class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], HasEndianness): """ - A wrapper for the ``TimeDelta64`` data type defined in numpy. - Scalars of this type can be created by performing arithmetic with ``DateTime64`` scalars. - Like ``DateTime64``, ``TimeDelta64`` is parametrized by a unit, but unlike ``DateTime64``, the - unit for ``TimeDelta64`` is optional. + A Zarr data type for arrays containing NumPy TimeDelta64 data. + + Wraps the ``np.dtypesTimeDelta64DType`` data type. Scalars for this data type + are instances of `np.timedelta64`. + + Attributes + ---------- + dtype_cls : Type[np.dtypesTimeDelta64DType] + The NumPy dtype class for this data type. + scale_factor : int + The scale factor for this data type. + unit : DateTimeUnit + The unit for this data type. + + References + ---------- + The Zarr V2 representation of this data type is defined in the Zarr V2 + `specification document `__. + + The Zarr V3 representation of this data type is defined in the ``numpy.timedelta64`` + `specification document `__ """ # mypy infers the type of np.dtypes.TimeDelta64DType to be # "Callable[[Literal['Y', 'M', 'W', 'D'] | Literal['h', 'm', 's', 'ms', 'us', 'ns', 'ps', 'fs', 'as']], Never]" dtype_cls = np.dtypes.TimeDelta64DType # type: ignore[assignment] - _zarr_v3_name: ClassVar[Literal["numpy.timedelta64"]] = "numpy.timedelta64" - _zarr_v2_names = (">m8", "m8"], Literal["m8", " TypeGuard[DTypeConfig_V2[str, None]]: + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[TimeDelta64JSON_V2]: + """ + Validate that the provided JSON input accurately represents a NumPy timedelta64 data type, + which could be in the form of strings like "m8[10s]". This method serves as a type + guard, helping to refine the type of unknown JSON input by confirming its adherence to the + expected format for NumPy timedelta64 data types. 
+ + The JSON input should contain a "name" key with a value that matches the expected string + pattern for NumPy timedelta64 data types. The pattern includes an optional unit enclosed + within square brackets, following the base type identifier. + + Returns + ------- + bool + True if the JSON input is a valid representation of this class, + otherwise False. + """ if not check_dtype_spec_v2(data): return False name = data["name"] @@ -214,7 +394,16 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] return name[4:-1].endswith(DATETIME_UNIT) and name[-1] == "]" @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSONV3]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSON_V3]: + """ + Check that the input is a valid JSON representation of this class in Zarr V3. + + Returns + ------- + TypeGuard[DateTime64JSON_V3] + True if the JSON input is a valid representation of this class, + otherwise False. + """ return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} @@ -225,6 +414,24 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSONV3]: @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create a TimeDelta64 from a Zarr V2-flavored JSON. + + Parameters + ---------- + data : DTypeJSON + The JSON data. + + Returns + ------- + TimeDelta64 + An instance of TimeDelta64. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. + """ if cls._check_json_v2(data): name = data["name"] return cls.from_native_dtype(np.dtype(name)) @@ -236,6 +443,26 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create a TimeDelta64 from a Zarr V3-flavored JSON. 
+ + The JSON representation of a TimeDelta64 in Zarr V3 is a dict with a 'name' key + with the value 'numpy.timedelta64', and a 'configuration' key with a value of a dict + with a 'unit' key and a 'scale_factor' key. + + For example: + + .. code-block:: json + + { + "name": "numpy.timedelta64", + "configuration": { + "unit": "generic", + "scale_factor": 1 + } + } + + """ if cls._check_json_v3(data): unit = data["configuration"]["unit"] scale_factor = data["configuration"]["scale_factor"] @@ -248,24 +475,121 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: ) raise DataTypeValidationError(msg) + @overload + def to_json(self, zarr_format: Literal[2]) -> TimeDelta64JSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> TimeDelta64JSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> TimeDelta64JSON_V2 | TimeDelta64JSON_V3: + """ + Serialize this data type to JSON. + + Parameters + ---------- + zarr_format : ZarrFormat + The Zarr format version (2 or 3). + + Returns + ------- + TimeDelta64JSON_V2 | TimeDelta64JSON_V3 + The JSON representation of the data type. + + Raises + ------ + ValueError + If the zarr_format is not 2 or 3. + """ + if zarr_format == 2: + name = self.to_native_dtype().str + return {"name": name, "object_codec_id": None} + elif zarr_format == 3: + return { + "name": self._zarr_v3_name, + "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, + } + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _check_scalar(self, data: object) -> TypeGuard[TimeDeltaLike]: + """ + Check if the input is a scalar of this data type. + + Parameters + ---------- + data : object + The object to check. + + Returns + ------- + TypeGuard[TimeDeltaLike] + True if the input is a scalar of this data type, False otherwise. 
+ """ if data is None: return True return isinstance(data, str | int | bytes | np.timedelta64 | timedelta) def _cast_scalar_unchecked(self, data: TimeDeltaLike) -> np.timedelta64: + """ + Cast the provided scalar input to a numpy timedelta64 without any type checking. + + This method assumes that the input data is already a valid scalar of this data type, + and does not perform any validation or type checks. It directly casts the input + to a numpy timedelta64 scalar using the unit and scale factor defined in the class. + + Parameters + ---------- + data : TimeDeltaLike + The scalar input data to cast. + + Returns + ------- + numpy.timedelta64 + The input data cast as a numpy timedelta64 scalar. + """ return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") def cast_scalar(self, data: object) -> np.timedelta64: + """ + Cast the input to a numpy timedelta64 scalar. If the input is not a scalar of this data type, + raise a TypeError. + """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a numpy timedelta64 scalar." + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) raise TypeError(msg) def default_scalar(self) -> np.timedelta64: + """ + Return a default scalar of this data type. + + This method provides a default value for the timedelta64 scalar, which is + a 'Not-a-Time' (NaT) value. + """ return np.timedelta64("NaT") def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedelta64: + """ + Create a scalar of this data type from JSON input. + + Parameters + ---------- + data : JSON + The JSON representation of the scalar value. + zarr_format : int + The zarr format to use for the JSON representation. + + Returns + ------- + numpy.timedelta64 + The scalar value of this data type. 
+ + Raises + ------ + TypeError + If the input JSON is not a valid representation of a scalar for this data type. + """ if check_json_time(data): return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") raise TypeError(f"Invalid type: {data}. Expected an integer.") # pragma: no cover @@ -273,19 +597,52 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.timedel @dataclass(frozen=True, kw_only=True, slots=True) class DateTime64(TimeDTypeBase[np.dtypes.DateTime64DType, np.datetime64], HasEndianness): + """ + A Zarr data type for arrays containing NumPy Datetime64 data. + + Wraps the ``np.dtypes.DateTime64DType`` data type. Scalars for this data type + are instances of ``np.datetime64``. + + Attributes + ---------- + dtype_cls : Type[np.dtypes.DateTime64DType] + The numpy dtype class for this data type. + unit : DateTimeUnit + The unit of time for this data type. + scale_factor : int + The scale factor for the time unit. + + References + ---------- + The Zarr V2 representation of this data type is defined in the Zarr V2 + `specification document `__. + + The Zarr V3 representation of this data type is defined in the ``numpy.datetime64`` + `specification document `__ + """ + dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] _zarr_v3_name: ClassVar[Literal["numpy.datetime64"]] = "numpy.datetime64" - _zarr_v2_names = (">M8", "M8"], Literal["M8", " TypeGuard[DTypeConfig_V2[str, None]]: + def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSON_V2]: """ - Check that JSON input is a string representation of a NumPy datetime64 data type, like "M8[10s]". This function can be used as a type guard to narrow the type of unknown JSON - input. + Check that the input is a valid JSON representation of this data type. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check.
+ + Returns + ------- + TypeGuard[DateTime64JSON_V2] + True if the input is a valid JSON representation of a NumPy datetime64 data type, + otherwise False. """ if not check_dtype_spec_v2(data): return False @@ -302,7 +659,21 @@ def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]] return name[4:-1].endswith(DATETIME_UNIT) and name[-1] == "]" @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSONV3]: + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSON_V3]: + """ + Check that the input is a valid JSON representation of this class in Zarr V3. + + Parameters + ---------- + data : DTypeJSON + The JSON data to check. + + Returns + ------- + TypeGuard[DateTime64JSON_V3] + True if the input is a valid JSON representation of a numpy datetime64 data type in Zarr V3, False otherwise. + """ + return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} @@ -313,6 +684,29 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSONV3]: @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from a Zarr V2-flavored JSON representation. + + This method checks if the provided JSON data is a valid representation of this class. + If valid, it creates an instance using the native NumPy dtype. Otherwise, it raises a + DataTypeValidationError. + + Parameters + ---------- + data : DTypeJSON + The JSON data to parse. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. + """ + if cls._check_json_v2(data): name = data["name"] return cls.from_native_dtype(np.dtype(name)) @@ -324,6 +718,28 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: + """ + Create an instance of this data type from a Zarr V3-flavored JSON representation. 
+ + This method checks if the provided JSON data is a valid representation of this class. + If valid, it creates an instance using the native NumPy dtype. Otherwise, it raises a + DataTypeValidationError. + + Parameters + ---------- + data : DTypeJSON + The JSON data to parse. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input JSON is not a valid representation of this class. + """ if cls._check_json_v3(data): unit = data["configuration"]["unit"] scale_factor = data["configuration"]["scale_factor"] @@ -336,24 +752,134 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: ) raise DataTypeValidationError(msg) + @overload + def to_json(self, zarr_format: Literal[2]) -> DateTime64JSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> DateTime64JSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> DateTime64JSON_V2 | DateTime64JSON_V3: + """ + Serialize this data type to JSON. + + Parameters + ---------- + zarr_format : ZarrFormat + The Zarr format version (2 or 3). + + Returns + ------- + DateTime64JSON_V2 | DateTime64JSON_V3 + The JSON representation of the data type. + + Raises + ------ + ValueError + If the zarr_format is not 2 or 3. + """ + if zarr_format == 2: + name = self.to_native_dtype().str + return {"name": name, "object_codec_id": None} + elif zarr_format == 3: + return { + "name": self._zarr_v3_name, + "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, + } + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _check_scalar(self, data: object) -> TypeGuard[DateTimeLike]: + """ + Check if the input is convertible to a scalar of this data type. + + Parameters + ---------- + data : object + The object to check. + + Returns + ------- + TypeGuard[DateTimeLike] + True if the input is a scalar of this data type, False otherwise. 
+ """ if data is None: return True return isinstance(data, str | int | bytes | np.datetime64 | datetime) def _cast_scalar_unchecked(self, data: DateTimeLike) -> np.datetime64: + """ + Cast the input to a scalar of this data type without any type checking. + + Parameters + ---------- + data : DateTimeLike + The scalar data to cast. + + Returns + ------- + numpy.datetime64 + The input cast to a NumPy datetime scalar. + """ return self.to_native_dtype().type(data, f"{self.scale_factor}{self.unit}") def cast_scalar(self, data: object) -> np.datetime64: + """ + Cast the input to a scalar of this data type after a type check. + + Parameters + ---------- + data : object + The scalar value to cast. + + Returns + ------- + numpy.datetime64 + The input cast to a NumPy datetime scalar. + + Raises + ------ + TypeError + If the data cannot be converted to a numpy datetime scalar. + """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) - msg = f"Cannot convert object with type {type(data)} to a numpy datetime scalar." + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {self}." + ) raise TypeError(msg) def default_scalar(self) -> np.datetime64: + """ + Return the default scalar value for this data type. + + Returns + ------- + numpy.datetime64 + The default scalar value, which is a 'Not-a-Time' (NaT) value + """ + return np.datetime64("NaT") def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.datetime64: + """ + Read a JSON-serializable value as a scalar. + + Parameters + ---------- + data : JSON + The JSON-serializable value. + zarr_format : ZarrFormat + The zarr format version. + + Returns + ------- + numpy.datetime64 + The numpy datetime scalar. + + Raises + ------ + TypeError + If the input is not a valid integer type. + """ if check_json_time(data): return self._cast_scalar_unchecked(data) raise TypeError(f"Invalid type: {data}. 
Expected an integer.") # pragma: no cover diff --git a/src/zarr/core/dtype/registry.py b/src/zarr/core/dtype/registry.py index 1d2a97a90a..cb9ab50044 100644 --- a/src/zarr/core/dtype/registry.py +++ b/src/zarr/core/dtype/registry.py @@ -23,34 +23,131 @@ # have just 1 registry class in use. @dataclass(frozen=True, kw_only=True) class DataTypeRegistry: + """ + A registry for ZDType classes. + + This registry is a mapping from Zarr data type names to their + corresponding ZDType classes. + + Attributes + ---------- + contents : dict[str, type[ZDType[TBaseDType, TBaseScalar]]] + The mapping from Zarr data type names to their corresponding + ZDType classes. + """ + contents: dict[str, type[ZDType[TBaseDType, TBaseScalar]]] = field( default_factory=dict, init=False ) - lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) + _lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) - def lazy_load(self) -> None: - for e in self.lazy_load_list: + def _lazy_load(self) -> None: + """ + Load all data types from the lazy load list and register them with + the registry. After loading, clear the lazy load list. + """ + for e in self._lazy_load_list: self.register(e.load()._zarr_v3_name, e.load()) - self.lazy_load_list.clear() + self._lazy_load_list.clear() def register(self: Self, key: str, cls: type[ZDType[TBaseDType, TBaseScalar]]) -> None: - # don't register the same dtype twice + """ + Register a data type with the registry. + + Parameters + ---------- + key : str + The Zarr V3 name of the data type. + cls : type[ZDType[TBaseDType, TBaseScalar]] + The class of the data type to register. + + Notes + ----- + This method is idempotent. If the data type is already registered, this + method does nothing. + """ if key not in self.contents or self.contents[key] != cls: self.contents[key] = cls def unregister(self, key: str) -> None: - """Unregister a data type by its key.""" + """ + Unregister a data type from the registry. 
+ + Parameters + ---------- + key : str + The key associated with the ZDType class to be unregistered. + + Returns + ------- + None + + Raises + ------ + KeyError + If the data type is not found in the registry. + """ if key in self.contents: del self.contents[key] else: raise KeyError(f"Data type '{key}' not found in registry.") def get(self, key: str) -> type[ZDType[TBaseDType, TBaseScalar]]: + """ + Retrieve a registered ZDType class by its key. + + Parameters + ---------- + key : str + The key associated with the desired ZDType class. + + Returns + ------- + type[ZDType[TBaseDType, TBaseScalar]] + The ZDType class registered under the given key. + + Raises + ------ + KeyError + If the key is not found in the registry. + """ + return self.contents[key] def match_dtype(self, dtype: TBaseDType) -> ZDType[TBaseDType, TBaseScalar]: + """ + Match a native data type, e.g. a NumPy data type, to a registered ZDType. + + Parameters + ---------- + dtype : TBaseDType + The native data type to match. + + Returns + ------- + ZDType[TBaseDType, TBaseScalar] + The matched ZDType corresponding to the provided NumPy data type. + + Raises + ------ + ValueError + If the data type is a NumPy "Object" type, which is ambiguous, or if multiple + or no Zarr data types are found that match the provided dtype. + + Notes + ----- + This function attempts to resolve a Zarr data type from a given native data type. + If the dtype is a NumPy "Object" data type, it raises a ValueError, as this type + can represent multiple Zarr data types. In such cases, a specific Zarr data type + should be explicitly constructed instead of relying on dynamic resolution. + + If multiple matches are found, it will also raise a ValueError. In this case + conflicting data types must be unregistered, or the Zarr data type should be explicitly + constructed. + """ + if dtype == np.dtype("O"): msg = ( f"Zarr data type resolution from {dtype} failed. 
" @@ -82,6 +179,27 @@ def match_dtype(self, dtype: TBaseDType) -> ZDType[TBaseDType, TBaseScalar]: def match_json( self, data: DTypeJSON, *, zarr_format: ZarrFormat ) -> ZDType[TBaseDType, TBaseScalar]: + """ + Match a JSON representation of a data type to a registered ZDType. + + Parameters + ---------- + data : DTypeJSON + The JSON representation of a data type to match. + zarr_format : ZarrFormat + The Zarr format version to consider when matching data types. + + Returns + ------- + ZDType[TBaseDType, TBaseScalar] + The matched ZDType corresponding to the JSON representation. + + Raises + ------ + ValueError + If no matching Zarr data type is found for the given JSON data. + """ + for val in self.contents.values(): try: return val.from_json(data, zarr_format=zarr_format) diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 7be97fa4b4..b53018c137 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -80,7 +80,9 @@ class variable, and it should generally be unique across different data types. @classmethod def _check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[TDType_co]: """ - Check that a native data type matches the dtype_cls class attribute. Used as a type guard. + Check that a native data type matches the dtype_cls class attribute. + + Used as a type guard. Parameters ---------- @@ -98,9 +100,10 @@ def _check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[TDType_ @abstractmethod def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self: """ - Create a ZDType instance from a native data type. The default implementation first performs - a type check via ``cls._check_native_dtype``. If that type check succeeds, the ZDType class - instance is created. + Create a ZDType instance from a native data type. + + The base implementation first performs a type check via ``cls._check_native_dtype``. + If that type check succeeds, the ZDType class instance is created. 
This method is used when taking a user-provided native data type, like a NumPy data type, and creating the corresponding ZDType instance from them. @@ -150,8 +153,7 @@ def from_json(cls: type[Self], data: DTypeJSON, *, zarr_format: ZarrFormat) -> S Parameters ---------- data : DTypeJSON - The JSON representation of the data type. The type annotation includes - Mapping[str, object] to accommodate typed dictionaries. + The JSON representation of the data type. zarr_format : ZarrFormat The zarr format version. @@ -159,7 +161,7 @@ def from_json(cls: type[Self], data: DTypeJSON, *, zarr_format: ZarrFormat) -> S Returns ------- Self - The wrapped data type. + An instance of this data type. """ if zarr_format == 2: return cls._from_json_v2(data) @@ -211,6 +213,7 @@ def _check_scalar(self, data: object) -> bool: def cast_scalar(self, data: object) -> TScalar_co: """ Cast a python object to the wrapped scalar type. + The type of the provided scalar is first checked for compatibility. If it's incompatible with the associated scalar type, a ``TypeError`` will be raised. @@ -228,8 +231,9 @@ def cast_scalar(self, data: object) -> TScalar_co: @abstractmethod def default_scalar(self) -> TScalar_co: """ - Get the default scalar value for the wrapped data type. This is a method, rather than an - attribute, because the default value for some data types depends on parameters that are + Get the default scalar value for the wrapped data type. + + This is a method, rather than an attribute, because the default value for some data types depends on parameters that are not known until a concrete data type is wrapped. For example, data types parametrized by a length like fixed-length strings or bytes will generate scalars consistent with that length. 
@@ -263,8 +267,10 @@ def from_json_scalar(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> TSca @abstractmethod def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> JSON: """ - Serialize a python object to the JSON representation of a scalar. The value will first be - cast to the scalar type associated with this ZDType, then serialized to JSON. + Serialize a python object to the JSON representation of a scalar. + + The value will first be cast to the scalar type associated with this ZDType, then serialized + to JSON. Parameters ---------- diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 4c8ced21f4..bad710ed43 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -2181,7 +2181,7 @@ def create_hierarchy( group already exists at path ``a``, then this function will leave the group at ``a`` as-is. Yields - ------- + ------ tuple[str, Array | Group]. Examples diff --git a/src/zarr/dtype.py b/src/zarr/dtype.py index 6e3789543b..98485c7d65 100644 --- a/src/zarr/dtype.py +++ b/src/zarr/dtype.py @@ -1,3 +1,79 @@ -from zarr.core.dtype import ZDType, data_type_registry +from zarr.core.dtype import ( + Bool, + Complex64, + Complex128, + DataTypeValidationError, + DateTime64, + DateTime64JSON_V2, + DateTime64JSON_V3, + FixedLengthUTF32, + Float16, + Float32, + Float64, + Int8, + Int16, + Int32, + Int64, + NullTerminatedBytes, + NullTerminatedBytesJSON_V3, + RawBytes, + RawBytesJSON_V3, + Structured, + StructuredJSON_V2, + StructuredJSON_V3, + TimeDelta64, + TimeDelta64JSON_V2, + TimeDelta64JSON_V3, + UInt8, + UInt16, + UInt32, + UInt64, + VariableLengthBytes, + VariableLengthBytesJSON_V2, + VariableLengthUTF8, + VariableLengthUTF8JSON_V2, + ZDType, + data_type_registry, + parse_data_type, +) -__all__ = ["ZDType", "data_type_registry"] +__all__ = [ + "Bool", + "Complex64", + "Complex128", + "DataTypeValidationError", + "DateTime64", + "DateTime64JSON_V2", + "DateTime64JSON_V3", + "FixedLengthUTF32", + "Float16", + "Float32", 
+ "Float64", + "Int8", + "Int16", + "Int32", + "Int64", + "NullTerminatedBytes", + "NullTerminatedBytesJSON_V3", + "RawBytes", + "RawBytesJSON_V3", + "Structured", + "StructuredJSON_V2", + "StructuredJSON_V3", + "TimeDelta64", + "TimeDelta64", + "TimeDelta64JSON_V2", + "TimeDelta64JSON_V3", + "UInt8", + "UInt16", + "UInt32", + "UInt64", + "VariableLengthBytes", + "VariableLengthBytesJSON_V2", + "VariableLengthUTF8", + "VariableLengthUTF8JSON_V2", + "ZDType", + "data_type_registry", + "data_type_registry", + "parse_data_type", +] diff --git a/src/zarr/registry.py b/src/zarr/registry.py index eb345b24b1..189d42abed 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -93,8 +93,8 @@ def _collect_entrypoints() -> list[Registry[Any]]: __ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr.ndbuffer")) __ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="ndbuffer")) - data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr.data_type")) - data_type_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) + data_type_registry._lazy_load_list.extend(entry_points.select(group="zarr.data_type")) + data_type_registry._lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) __pipeline_registry.lazy_load_list.extend(entry_points.select(group="zarr.codec_pipeline")) __pipeline_registry.lazy_load_list.extend( diff --git a/tests/test_dtype/conftest.py b/tests/test_dtype/conftest.py index 0be1c60088..0650d143c6 100644 --- a/tests/test_dtype/conftest.py +++ b/tests/test_dtype/conftest.py @@ -65,7 +65,4 @@ class TestB(TestExample): for fixture_name in metafunc.fixturenames: if hasattr(metafunc.cls, fixture_name): params = getattr(metafunc.cls, fixture_name) - if len(params) == 0: - msg = f"{metafunc.cls}.{fixture_name} is empty. Please provide a non-empty sequence of values." 
- raise ValueError(msg) - metafunc.parametrize(fixture_name, params, scope="class") + metafunc.parametrize(fixture_name, params, scope="class", ids=str) diff --git a/tests/test_dtype/test_npy/test_bool.py b/tests/test_dtype/test_npy/test_bool.py index 010dec2e47..da30214b3b 100644 --- a/tests/test_dtype/test_npy/test_bool.py +++ b/tests/test_dtype/test_npy/test_bool.py @@ -38,4 +38,5 @@ class TestBool(BaseTestZDType): (Bool(), np.True_, np.True_), (Bool(), np.False_, np.False_), ) + invalid_scalar_params = (None,) item_size_params = (Bool(),) diff --git a/tests/test_dtype/test_npy/test_bytes.py b/tests/test_dtype/test_npy/test_bytes.py index 3f1ba9315e..78980f7809 100644 --- a/tests/test_dtype/test_npy/test_bytes.py +++ b/tests/test_dtype/test_npy/test_bytes.py @@ -24,6 +24,7 @@ class TestNullTerminatedBytes(BaseTestZDType): "|S", "|U10", "|f8", + {"name": "|S4", "object_codec_id": "vlen-bytes"}, ) invalid_json_v3 = ( {"name": "fixed_length_ascii", "configuration": {"length_bits": 0}}, @@ -45,6 +46,7 @@ class TestNullTerminatedBytes(BaseTestZDType): (NullTerminatedBytes(length=2), "ab", np.bytes_("ab")), (NullTerminatedBytes(length=4), "abcdefg", np.bytes_("abcd")), ) + invalid_scalar_params = ((NullTerminatedBytes(length=1), 1.0),) item_size_params = ( NullTerminatedBytes(length=1), NullTerminatedBytes(length=4), @@ -91,6 +93,7 @@ class TestRawBytes(BaseTestZDType): (RawBytes(length=2), b"ab", np.void(b"ab")), (RawBytes(length=4), b"abcd", np.void(b"abcd")), ) + invalid_scalar_params = ((RawBytes(length=1), 1.0),) item_size_params = ( RawBytes(length=1), RawBytes(length=4), @@ -133,11 +136,8 @@ class TestVariableLengthBytes(BaseTestZDType): (VariableLengthBytes(), "ab", b"ab"), (VariableLengthBytes(), "abcdefg", b"abcdefg"), ) - item_size_params = ( - VariableLengthBytes(), - VariableLengthBytes(), - VariableLengthBytes(), - ) + invalid_scalar_params = ((VariableLengthBytes(), 1.0),) + item_size_params = (VariableLengthBytes(),) @pytest.mark.parametrize( diff 
--git a/tests/test_dtype/test_npy/test_complex.py b/tests/test_dtype/test_npy/test_complex.py index b6a1e799eb..b4ce42be58 100644 --- a/tests/test_dtype/test_npy/test_complex.py +++ b/tests/test_dtype/test_npy/test_complex.py @@ -54,7 +54,7 @@ class TestComplex64(_BaseTestFloat): (Complex64(), complex(-1.0, math.inf), np.complex64(complex(-1.0, math.inf))), (Complex64(), complex(0, math.nan), np.complex64(complex(0, math.nan))), ) - + invalid_scalar_params = ((Complex64(), {"type": "dict"}),) item_size_params = (Complex64(),) @@ -97,4 +97,5 @@ class TestComplex128(_BaseTestFloat): (Complex128(), complex(-1.0, math.inf), np.complex128(complex(-1.0, math.inf))), (Complex128(), complex(0, math.nan), np.complex128(complex(0, math.nan))), ) + invalid_scalar_params = ((Complex128(), {"type": "dict"}),) item_size_params = (Complex128(),) diff --git a/tests/test_dtype/test_npy/test_float.py b/tests/test_dtype/test_npy/test_float.py index e875dc87e3..90fa27c9cf 100644 --- a/tests/test_dtype/test_npy/test_float.py +++ b/tests/test_dtype/test_npy/test_float.py @@ -65,7 +65,7 @@ class TestFloat16(_BaseTestFloat): (Float16(), -1.0, np.float16(-1.0)), (Float16(), "NaN", np.float16("NaN")), ) - + invalid_scalar_params = ((Float16(), {"set!"}),) hex_string_params = (("0x7fc0", np.nan), ("0x7fc1", np.nan), ("0x3c00", 1.0)) item_size_params = (Float16(),) @@ -113,7 +113,7 @@ class TestFloat32(_BaseTestFloat): (Float32(), -1.0, np.float32(-1.0)), (Float32(), "NaN", np.float32("NaN")), ) - + invalid_scalar_params = ((Float32(), {"set!"}),) hex_string_params = (("0x7fc00000", np.nan), ("0x7fc00001", np.nan), ("0x3f800000", 1.0)) item_size_params = (Float32(),) @@ -160,7 +160,7 @@ class TestFloat64(_BaseTestFloat): (Float64(), -1.0, np.float64(-1.0)), (Float64(), "NaN", np.float64("NaN")), ) - + invalid_scalar_params = ((Float64(), {"set!"}),) hex_string_params = ( ("0x7ff8000000000000", np.nan), ("0x7ff8000000000001", np.nan), diff --git a/tests/test_dtype/test_npy/test_int.py 
b/tests/test_dtype/test_npy/test_int.py index 71257907d5..efc4fae496 100644 --- a/tests/test_dtype/test_npy/test_int.py +++ b/tests/test_dtype/test_npy/test_int.py @@ -34,6 +34,7 @@ class TestInt8(BaseTestZDType): (Int8(), 1, np.int8(1)), (Int8(), -1, np.int8(-1)), ) + invalid_scalar_params = ((Int8(), {"set!"}), (Int8(), ("tuple",))) item_size_params = (Int8(),) @@ -68,7 +69,7 @@ class TestInt16(BaseTestZDType): (Int16(), 1, np.int16(1)), (Int16(), -1, np.int16(-1)), ) - + invalid_scalar_params = ((Int16(), {"set!"}), (Int16(), ("tuple",))) item_size_params = (Int16(),) @@ -106,6 +107,7 @@ class TestInt32(BaseTestZDType): (Int32(), 1, np.int32(1)), (Int32(), -1, np.int32(-1)), ) + invalid_scalar_params = ((Int32(), {"set!"}), (Int32(), ("tuple",))) item_size_params = (Int32(),) @@ -140,6 +142,7 @@ class TestInt64(BaseTestZDType): (Int64(), 1, np.int64(1)), (Int64(), -1, np.int64(-1)), ) + invalid_scalar_params = ((Int64(), {"set!"}), (Int64(), ("tuple",))) item_size_params = (Int64(),) @@ -171,6 +174,7 @@ class TestUInt8(BaseTestZDType): (UInt8(), 1, np.uint8(1)), (UInt8(), 0, np.uint8(0)), ) + invalid_scalar_params = ((UInt8(), {"set!"}), (UInt8(), ("tuple",))) item_size_params = (UInt8(),) @@ -205,6 +209,7 @@ class TestUInt16(BaseTestZDType): (UInt16(), 1, np.uint16(1)), (UInt16(), 0, np.uint16(0)), ) + invalid_scalar_params = ((UInt16(), {"set!"}), (UInt16(), ("tuple",))) item_size_params = (UInt16(),) @@ -239,6 +244,7 @@ class TestUInt32(BaseTestZDType): (UInt32(), 1, np.uint32(1)), (UInt32(), 0, np.uint32(0)), ) + invalid_scalar_params = ((UInt32(), {"set!"}), (UInt32(), ("tuple",))) item_size_params = (UInt32(),) @@ -273,4 +279,5 @@ class TestUInt64(BaseTestZDType): (UInt64(), 1, np.uint64(1)), (UInt64(), 0, np.uint64(0)), ) + invalid_scalar_params = ((UInt64(), {"set!"}), (UInt64(), ("tuple",))) item_size_params = (UInt64(),) diff --git a/tests/test_dtype/test_npy/test_string.py b/tests/test_dtype/test_npy/test_string.py index 7c3c6a8cd4..2cde6a1ac1 100644 
--- a/tests/test_dtype/test_npy/test_string.py +++ b/tests/test_dtype/test_npy/test_string.py @@ -40,6 +40,8 @@ class TestVariableLengthString(BaseTestZDType): (VariableLengthUTF8(), "", np.str_("")), (VariableLengthUTF8(), "hi", np.str_("hi")), ) + # anything can become a string + invalid_scalar_params = (None,) item_size_params = (VariableLengthUTF8(),) else: @@ -74,7 +76,8 @@ class TestVariableLengthString(BaseTestZDType): # type: ignore[no-redef] (VariableLengthUTF8(), "", np.str_("")), (VariableLengthUTF8(), "hi", np.str_("hi")), ) - + # anything can become a string + invalid_scalar_params = (None,) item_size_params = (VariableLengthUTF8(),) @@ -118,6 +121,8 @@ class TestFixedLengthUTF32(BaseTestZDType): FixedLengthUTF32(length=4), FixedLengthUTF32(length=10), ) + # anything can become a string + invalid_scalar_params = (None,) @pytest.mark.parametrize( diff --git a/tests/test_dtype/test_npy/test_structured.py b/tests/test_dtype/test_npy/test_structured.py index c51aa73ff3..e2cd2a6dfe 100644 --- a/tests/test_dtype/test_npy/test_structured.py +++ b/tests/test_dtype/test_npy/test_structured.py @@ -98,16 +98,21 @@ class TestStructured(BaseTestZDType): ), ) - def scalar_equals(self, scalar1: Any, scalar2: Any) -> bool: - if hasattr(scalar1, "shape") and hasattr(scalar2, "shape"): - return np.array_equal(scalar1, scalar2) - return super().scalar_equals(scalar1, scalar2) - item_size_params = ( Structured(fields=(("field1", Int32()), ("field2", Float64()))), Structured(fields=(("field1", Int64()), ("field2", Int32()))), ) + invalid_scalar_params = ( + (Structured(fields=(("field1", Int32()), ("field2", Float64()))), "i am a string"), + (Structured(fields=(("field1", Int32()), ("field2", Float64()))), {"type": "dict"}), + ) + + def scalar_equals(self, scalar1: Any, scalar2: Any) -> bool: + if hasattr(scalar1, "shape") and hasattr(scalar2, "shape"): + return np.array_equal(scalar1, scalar2) + return super().scalar_equals(scalar1, scalar2) + def test_invalid_size() -> 
None: """ diff --git a/tests/test_dtype/test_npy/test_time.py b/tests/test_dtype/test_npy/test_time.py index e201be5cf6..b94b600cbf 100644 --- a/tests/test_dtype/test_npy/test_time.py +++ b/tests/test_dtype/test_npy/test_time.py @@ -68,6 +68,10 @@ class TestDateTime64(_TestTimeBase): (DateTime64(unit="s", scale_factor=1), "2005-02-25", np.datetime64("2005-02-25", "s")), (DateTime64(unit="ns", scale_factor=1), "NaT", np.datetime64("NaT")), ) + invalid_scalar_params = ( + (DateTime64(unit="Y", scale_factor=1), 1.3), + (DateTime64(unit="Y", scale_factor=1), [1.3]), + ) item_size_params = (DateTime64(unit="ns", scale_factor=1),) @@ -113,6 +117,10 @@ class TestTimeDelta64(_TestTimeBase): (TimeDelta64(unit="ns", scale_factor=1), "1", np.timedelta64(1, "ns")), (TimeDelta64(unit="ns", scale_factor=1), "NaT", np.timedelta64("NaT")), ) + invalid_scalar_params = ( + (TimeDelta64(unit="Y", scale_factor=1), 1.3), + (TimeDelta64(unit="Y", scale_factor=1), [1.3]), + ) item_size_params = (TimeDelta64(unit="ns", scale_factor=1),) diff --git a/tests/test_dtype/test_wrapper.py b/tests/test_dtype/test_wrapper.py index 8f461f1a77..cc365e86d4 100644 --- a/tests/test_dtype/test_wrapper.py +++ b/tests/test_dtype/test_wrapper.py @@ -1,5 +1,6 @@ from __future__ import annotations +import re from typing import TYPE_CHECKING, Any, ClassVar import pytest @@ -59,6 +60,16 @@ class BaseTestZDType: A tuple of invalid JSON representations for Zarr format version 3. cast_value_params : ClassVar[tuple[tuple[Any, Any, Any], ...]] A tuple of (dtype, value, expected) tuples for testing ZDType.cast_value. 
+ scalar_v2_params : ClassVar[tuple[Any, ...]] + A tuple of (dtype, scalar json) tuples for testing + ZDType.from_json_scalar / ZDType.to_json_scalar for zarr v2 + scalar_v3_params : ClassVar[tuple[Any, ...]] + A tuple of (dtype, scalar json) tuples for testing + ZDType.from_json_scalar / ZDType.to_json_scalar for zarr v3 + invalid_scalar_params : ClassVar[tuple[Any, ...]] + A tuple of (dtype, value) tuples, where each value is expected to fail ZDType.cast_scalar. + item_size_params : ClassVar[tuple[Any, ...]] + A tuple of dtype instances for testing ZDType.item_size """ test_cls: type[ZDType[TBaseDType, TBaseScalar]] @@ -76,10 +87,13 @@ class BaseTestZDType: # pairs. the first element of the pair is used to create a dtype instance, and the second # element is the json serialization of the scalar that we want to round-trip. - scalar_v2_params: ClassVar[tuple[tuple[Any, Any], ...]] = () + scalar_v2_params: ClassVar[tuple[tuple[ZDType[Any, Any], Any], ...]] = () scalar_v3_params: ClassVar[tuple[tuple[Any, Any], ...]] = () - cast_value_params: ClassVar[tuple[tuple[Any, Any, Any], ...]] + cast_value_params: ClassVar[tuple[tuple[ZDType[Any, Any], Any, Any], ...]] = () + # Some data types, like bool and string, can consume any python object as a scalar. + # So we allow passing None in to this test to indicate that it should be skipped. + invalid_scalar_params: ClassVar[tuple[tuple[ZDType[Any, Any], Any], ...] | tuple[None]] = () + item_size_params: ClassVar[tuple[ZDType[Any, Any], ...]] = () def json_scalar_equals(self, scalar1: object, scalar2: object) -> bool: # An equality check for json-encoded scalars.
This defaults to regular equality, @@ -124,6 +138,21 @@ def test_cast_value(self, cast_value_params: tuple[ZDType[Any, Any], Any, Any]) zdtype, value, expected = cast_value_params observed = zdtype.cast_scalar(value) assert self.scalar_equals(expected, observed) + # check that casting is idempotent + assert self.scalar_equals(zdtype.cast_scalar(observed), observed) + + def test_invalid_scalar( + self, invalid_scalar_params: tuple[ZDType[Any, Any], Any] | None + ) -> None: + if invalid_scalar_params is None: + pytest.skip(f"No test data provided for {self}.{__name__}") + zdtype, data = invalid_scalar_params + msg = ( + f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " + f"data type {zdtype}." + ) + with pytest.raises(TypeError, match=re.escape(msg)): + zdtype.cast_scalar(data) def test_item_size(self, item_size_params: ZDType[Any, Any]) -> None: """ @@ -133,4 +162,4 @@ def test_item_size(self, item_size_params: ZDType[Any, Any]) -> None: if isinstance(item_size_params, HasItemSize): assert item_size_params.item_size == item_size_params.to_native_dtype().itemsize else: - pytest.skip(f"Dtype {item_size_params} does not implement HasItemSize") + pytest.skip(f"Data type {item_size_params} does not implement HasItemSize") diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py index d4e37440a7..95ede9e1d7 100644 --- a/tests/test_dtype_registry.py +++ b/tests/test_dtype_registry.py @@ -167,7 +167,7 @@ def set_path() -> Generator[None, None, None]: def test_entrypoint_dtype(zarr_format: ZarrFormat) -> None: from package_with_entrypoint import TestDataType - data_type_registry.lazy_load() + data_type_registry._lazy_load() instance = TestDataType() dtype_json = instance.to_json(zarr_format=zarr_format) assert get_data_type_from_json(dtype_json, zarr_format=zarr_format) == instance diff --git a/tests/test_examples.py b/tests/test_examples.py new file mode 100644 index 0000000000..c97766364b --- /dev/null +++ 
b/tests/test_examples.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +import re +import subprocess +import sys +from pathlib import Path +from typing import Final + +import pytest +import tomlkit +from packaging.requirements import Requirement + +examples_dir = "examples" +script_paths = Path(examples_dir).glob("*.py") + +PEP_723_REGEX: Final = r"(?m)^# /// (?P<type>[a-zA-Z0-9-]+)$\s(?P<content>(^#(| .*)$\s)+)^# ///$" + +# This is the absolute path to the local Zarr installation. Moving this test to a different directory will break it. +ZARR_PROJECT_PATH = Path(".").absolute() + + +def set_dep(script: str, dependency: str) -> str: + """ + Set a dependency in a PEP-723 script header. + If the package is already in the list, it will be replaced. + If the package is not already in the list, the dependency list is left unchanged. + + Source code modified from + https://packaging.python.org/en/latest/specifications/inline-script-metadata/#reference-implementation + """ + match = re.search(PEP_723_REGEX, script) + + if match is None: + raise ValueError(f"PEP-723 header not found in {script}") + + content = "".join( + line[2:] if line.startswith("# ") else line[1:] + for line in match.group("content").splitlines(keepends=True) + ) + + config = tomlkit.parse(content) + for idx, dep in enumerate(tuple(config["dependencies"])): + if Requirement(dep).name == Requirement(dependency).name: + config["dependencies"][idx] = dependency + + new_content = "".join( + f"# {line}" if line.strip() else f"#{line}" + for line in tomlkit.dumps(config).splitlines(keepends=True) + ) + + start, end = match.span("content") + return script[:start] + new_content + script[end:] + + +def resave_script(source_path: Path, dest_path: Path) -> None: + """
 + Read a script from source_path and save it to dest_path after inserting the absolute path to the + local Zarr project directory in the PEP-723 header.
+ """ + source_text = source_path.read_text() + dest_text = set_dep(source_text, f"zarr @ file:///{ZARR_PROJECT_PATH}") + dest_path.write_text(dest_text) + + +@pytest.mark.skipif( + sys.platform in ("win32",), reason="This test fails due for unknown reasons on Windows in CI." +) +@pytest.mark.parametrize("script_path", script_paths) +def test_scripts_can_run(script_path: Path, tmp_path: Path) -> None: + dest_path = tmp_path / script_path.name + # We resave the script after inserting the absolute path to the local Zarr project directory, + # and then test its behavior. + # This allows the example to be useful to users who don't have Zarr installed, but also testable. + resave_script(script_path, dest_path) + result = subprocess.run(["uv", "run", str(dest_path)], capture_output=True, text=True) + assert result.returncode == 0, ( + f"Script at {script_path} failed to run. Output: {result.stdout} Error: {result.stderr}" + )