Merge branch 'main' into codec-docstrings

d-v-b · web-flow · commit 1457e40e1bcf · 2025-07-22T15:29:17.000+02:00
diff --git a/changes/3264.fix.rst b/changes/3264.fix.rst
@@ -0,0 +1,4 @@
+- Expand the range of types accepted by ``parse_data_type`` to include strings and Sequences.
+- Move the functionality of ``parse_data_type`` to a new function called ``parse_dtype``. This change
+  ensures that nomenclature is consistent across the codebase. ``parse_data_type`` remains, so this
+  change is not breaking.
diff --git a/changes/3273.doc.rst b/changes/3273.doc.rst
@@ -0,0 +1 @@
+Add a section on codecs to the migration guide.
diff --git a/changes/3280.fix.rst b/changes/3280.fix.rst
@@ -0,0 +1,2 @@
+Fix a regression introduced in 3.1.0 that prevented ``inf``, ``-inf``, and ``nan`` values
+from being stored in ``attributes``.
diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst
@@ -412,17 +412,17 @@ attempt data type resolution against *every* data type class, and if, for some r
 type matches multiple Zarr data types, we treat this as an error and raise an exception.
 
 If you have a NumPy data type and you want to get the corresponding ``ZDType`` instance, you can use
-the ``parse_data_type`` function, which will use the dynamic resolution described above. ``parse_data_type``
+the ``parse_dtype`` function, which will use the dynamic resolution described above. ``parse_dtype``
 handles a range of input types:
 
 - NumPy data types:
 
   .. code-block:: python
 
     >>> import numpy as np
-    >>> from zarr.dtype import parse_data_type
+    >>> from zarr.dtype import parse_dtype
     >>> my_dtype = np.dtype('>M8[10s]')
-    >>> parse_data_type(my_dtype, zarr_format=2)
+    >>> parse_dtype(my_dtype, zarr_format=2)
     DateTime64(endianness='big', scale_factor=10, unit='s')
 
 
@@ -431,7 +431,7 @@ handles a range of input types:
   .. code-block:: python
 
     >>> dtype_str = '>M8[10s]'
-    >>> parse_data_type(dtype_str, zarr_format=2)
+    >>> parse_dtype(dtype_str, zarr_format=2)
     DateTime64(endianness='big', scale_factor=10, unit='s')
 
 - ``ZDType`` instances:
@@ -440,7 +440,7 @@ handles a range of input types:
 
     >>> from zarr.dtype import DateTime64
     >>> zdt = DateTime64(endianness='big', scale_factor=10, unit='s')
-    >>> parse_data_type(zdt, zarr_format=2) # Use a ZDType (this is a no-op)
+    >>> parse_dtype(zdt, zarr_format=2) # Use a ZDType (this is a no-op)
     DateTime64(endianness='big', scale_factor=10, unit='s')
 
 - Python dictionaries (requires ``zarr_format=3``). These dictionaries must be consistent with the
@@ -449,7 +449,7 @@ handles a range of input types:
   .. code-block:: python
 
     >>> dt_dict = {"name": "numpy.datetime64", "configuration": {"unit": "s", "scale_factor": 10}}
-    >>> parse_data_type(dt_dict, zarr_format=3)
+    >>> parse_dtype(dt_dict, zarr_format=3)
     DateTime64(endianness='little', scale_factor=10, unit='s')
-    >>> parse_data_type(dt_dict, zarr_format=3).to_json(zarr_format=3)
+    >>> parse_dtype(dt_dict, zarr_format=3).to_json(zarr_format=3)
     {'name': 'numpy.datetime64', 'configuration': {'unit': 's', 'scale_factor': 10}}
diff --git a/docs/user-guide/v3_migration.rst b/docs/user-guide/v3_migration.rst
@@ -58,7 +58,7 @@ the following actions in order:
      vendor the parts of the specific modules that you need.
 
      * ``zarr.attrs`` has gone, with no replacement
-     * ``zarr.codecs`` has gone, use ``numcodecs`` instead
+     * ``zarr.codecs`` has changed; see the "Codecs" section below for more information
      * ``zarr.context`` has gone, with no replacement
      * ``zarr.core`` remains but should be considered private API
      * ``zarr.hierarchy`` has gone, with no replacement (use ``zarr.Group`` inplace of ``zarr.hierarchy.Group``)
@@ -178,6 +178,18 @@ If you are interested in developing a custom store that targets these backends,
 :ref:`developing custom stores <user-guide-custom-stores>` or open an
 `issue <https://github.com/zarr-developers/zarr-python/issues>`_ to discuss your use case.
 
+
+Codecs
+~~~~~~
+Codecs defined in ``numcodecs`` (and also imported into the ``zarr.codecs`` namespace in Zarr-Python 2)
+should still be used when creating Zarr format 2 arrays.
+
+Codecs for creating Zarr format 3 arrays are available in two locations:
+
+- ``zarr.codecs`` contains Zarr format 3 codecs that are defined in the `codecs section of the Zarr format 3 specification <https://zarr-specs.readthedocs.io/en/latest/v3/codecs/index.html>`_.
+- ``numcodecs.zarr3`` contains codecs from ``numcodecs`` that can be used to create Zarr format 3 arrays, but are not necessarily part of the Zarr format 3 specification.
+
+
 Dependencies
 ~~~~~~~~~~~~
 
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
@@ -72,7 +72,7 @@
     VariableLengthUTF8,
     ZDType,
     ZDTypeLike,
-    parse_data_type,
+    parse_dtype,
 )
 from zarr.core.dtype.common import HasEndianness, HasItemSize, HasObjectCodec
 from zarr.core.indexing import (
@@ -617,7 +617,7 @@ async def _create(
         Deprecated in favor of :func:`zarr.api.asynchronous.create_array`.
         """
 
-        dtype_parsed = parse_data_type(dtype, zarr_format=zarr_format)
+        dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format)
         store_path = await make_store_path(store)
 
         shape = parse_shapelike(shape)
@@ -4238,7 +4238,7 @@ async def init_array(
 
     from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation
 
-    zdtype = parse_data_type(dtype, zarr_format=zarr_format)
+    zdtype = parse_dtype(dtype, zarr_format=zarr_format)
     shape_parsed = parse_shapelike(shape)
     chunk_key_encoding_parsed = _parse_chunk_key_encoding(
         chunk_key_encoding, zarr_format=zarr_format
diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from collections.abc import Sequence
 from typing import TYPE_CHECKING, Final, TypeAlias
 
 from zarr.core.dtype.common import (
@@ -94,6 +95,7 @@
     "ZDType",
     "data_type_registry",
     "parse_data_type",
+    "parse_dtype",
 ]
 
 data_type_registry = DataTypeRegistry()
@@ -188,22 +190,26 @@ def parse_data_type(
     zarr_format: ZarrFormat,
 ) -> ZDType[TBaseDType, TBaseScalar]:
     """
-    Interpret the input as a ZDType instance.
+    Interpret the input as a ZDType.
+
+    This function wraps ``parse_dtype``. The only difference is the function name. This function may
+    be deprecated in a future version of Zarr Python in favor of ``parse_dtype``.
 
     Parameters
     ----------
     dtype_spec : ZDTypeLike
-        The input to be interpreted as a ZDType instance. This could be a native data type
-        (e.g., a NumPy data type), a Python object that can be converted into a native data type,
-        a ZDType instance (in which case the input is returned unchanged), or a JSON object
-        representation of a data type.
+        The input to be interpreted as a ZDType. This could be a ZDType, which will be returned
+        directly, or a JSON representation of a ZDType, or a native dtype, or a python object that
+        can be converted into a native dtype.
     zarr_format : ZarrFormat
-        The zarr format version.
+        The Zarr format version. This parameter is required because this function will attempt to
+        parse the JSON representation of a data type, and the JSON representation of data types
+        varies between Zarr 2 and Zarr 3.
 
     Returns
     -------
     ZDType[TBaseDType, TBaseScalar]
-        The ZDType instance corresponding to the input.
+        The ZDType corresponding to the input.
 
     Examples
     --------
@@ -216,15 +222,57 @@ def parse_data_type(
     >>> parse_data_type({"name": "numpy.datetime64", "configuration": {"unit": "s", "scale_factor": 10}}, zarr_format=3)
     DateTime64(endianness='little', scale_factor=10, unit='s')
     """
+    return parse_dtype(dtype_spec, zarr_format=zarr_format)
+
+
+def parse_dtype(
+    dtype_spec: ZDTypeLike,
+    *,
+    zarr_format: ZarrFormat,
+) -> ZDType[TBaseDType, TBaseScalar]:
+    """
+    Convert the input to a ZDType.
+
+    Parameters
+    ----------
+    dtype_spec : ZDTypeLike
+        The input to be converted to a ZDType. This could be a ZDType, which will be returned
+        directly, or a JSON representation of a ZDType, or a native dtype, or a Python object that
+        can be converted into a native dtype.
+    zarr_format : ZarrFormat
+        The Zarr format version. This parameter is required because this function will attempt to
+        parse the JSON representation of a data type, and the JSON representation of data types
+        varies between Zarr 2 and Zarr 3.
+
+    Returns
+    -------
+    ZDType[TBaseDType, TBaseScalar]
+        The ZDType corresponding to the input.
+
+    Examples
+    --------
+    >>> from zarr.dtype import parse_dtype
+    >>> import numpy as np
+    >>> parse_dtype("int32", zarr_format=2)
+    Int32(endianness='little')
+    >>> parse_dtype(np.dtype('S10'), zarr_format=2)
+    NullTerminatedBytes(length=10)
+    >>> parse_dtype({"name": "numpy.datetime64", "configuration": {"unit": "s", "scale_factor": 10}}, zarr_format=3)
+    DateTime64(endianness='little', scale_factor=10, unit='s')
+    """
     if isinstance(dtype_spec, ZDType):
         return dtype_spec
-    # dict and zarr_format 3 means that we have a JSON object representation of the dtype
-    if zarr_format == 3 and isinstance(dtype_spec, Mapping):
-        return get_data_type_from_json(dtype_spec, zarr_format=3)
+    # First attempt to interpret the input as JSON
+    if isinstance(dtype_spec, Mapping | str | Sequence):
+        try:
+            return get_data_type_from_json(dtype_spec, zarr_format=zarr_format)  # type: ignore[arg-type]
+        except ValueError:
+            # no data type matched this JSON-like input
+            pass
     if dtype_spec in VLEN_UTF8_ALIAS:
         # If the dtype request is one of the aliases for variable-length UTF-8 strings,
         # return that dtype.
         return VariableLengthUTF8()  # type: ignore[return-value]
     # otherwise, we have either a numpy dtype string, or a zarr v3 dtype string, and in either case
-    # we can create a numpy dtype from it, and do the dtype inference from that
+    # we can create a native dtype from it, and do the dtype inference from that
     return get_data_type_from_native_dtype(dtype_spec)  # type: ignore[arg-type]
diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py
@@ -336,7 +336,7 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
         if self.zarr_format == 3:
             return {
                 ZARR_JSON: prototype.buffer.from_bytes(
-                    json.dumps(self.to_dict(), indent=json_indent, allow_nan=False).encode()
+                    json.dumps(self.to_dict(), indent=json_indent, allow_nan=True).encode()
                 )
             }
         else:
@@ -345,7 +345,7 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
                     json.dumps({"zarr_format": self.zarr_format}, indent=json_indent).encode()
                 ),
                 ZATTRS_JSON: prototype.buffer.from_bytes(
-                    json.dumps(self.attributes, indent=json_indent, allow_nan=False).encode()
+                    json.dumps(self.attributes, indent=json_indent, allow_nan=True).encode()
                 ),
             }
             if self.consolidated_metadata:
@@ -373,7 +373,7 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
 
                 items[ZMETADATA_V2_JSON] = prototype.buffer.from_bytes(
                     json.dumps(
-                        {"metadata": d, "zarr_consolidated_format": 1}, allow_nan=False
+                        {"metadata": d, "zarr_consolidated_format": 1}, allow_nan=True
                     ).encode()
                 )
 
diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py
@@ -132,10 +132,10 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
         json_indent = config.get("json_indent")
         return {
             ZARRAY_JSON: prototype.buffer.from_bytes(
-                json.dumps(zarray_dict, indent=json_indent, allow_nan=False).encode()
+                json.dumps(zarray_dict, indent=json_indent, allow_nan=True).encode()
             ),
             ZATTRS_JSON: prototype.buffer.from_bytes(
-                json.dumps(zattrs_dict, indent=json_indent, allow_nan=False).encode()
+                json.dumps(zattrs_dict, indent=json_indent, allow_nan=True).encode()
             ),
         }
 
diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py
@@ -288,7 +288,7 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
         d = self.to_dict()
         return {
             ZARR_JSON: prototype.buffer.from_bytes(
-                json.dumps(d, allow_nan=False, indent=json_indent).encode()
+                json.dumps(d, allow_nan=True, indent=json_indent).encode()
             )
         }
 
diff --git a/src/zarr/dtype.py b/src/zarr/dtype.py
@@ -38,7 +38,7 @@
     VariableLengthUTF8JSON_V2,
     ZDType,
     data_type_registry,
-    parse_data_type,
+    parse_dtype,
 )
 
 __all__ = [
@@ -83,5 +83,5 @@
     "ZDType",
     "data_type_registry",
     "data_type_registry",
-    "parse_data_type",
+    "parse_dtype",
 ]
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,7 +1,9 @@
 from __future__ import annotations
 
+import math
 import os
 import pathlib
+from collections.abc import Mapping, Sequence
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING
 
@@ -442,3 +444,21 @@ def skip_object_dtype(dtype: ZDType[Any, Any]) -> None:
             "type resolution"
         )
         pytest.skip(msg)
+
+
+def nan_equal(a: object, b: object) -> bool:
+    """
+    Convenience function for equality comparison between two values ``a`` and ``b``, that might both
+    be NaN. Returns True if both ``a`` and ``b`` are NaN, otherwise returns a == b
+    """
+    if math.isnan(a) and math.isnan(b):  # type: ignore[arg-type]
+        return True
+    return a == b
+
+
+def deep_nan_equal(a: object, b: object) -> bool:
+    if isinstance(a, Mapping) and isinstance(b, Mapping):
+        return all(deep_nan_equal(a[k], b[k]) for k in a)
+    if isinstance(a, Sequence) and isinstance(b, Sequence):
+        return all(deep_nan_equal(a[i], b[i]) for i in range(len(a)))
+    return nan_equal(a, b)
diff --git a/tests/test_array.py b/tests/test_array.py
@@ -53,7 +53,7 @@
     VariableLengthBytes,
     VariableLengthUTF8,
     ZDType,
-    parse_data_type,
+    parse_dtype,
 )
 from zarr.core.dtype.common import ENDIANNESS_STR, EndiannessStr
 from zarr.core.dtype.npy.common import NUMPY_ENDIANNESS_STR, endianness_from_numpy_str
@@ -1308,7 +1308,7 @@ async def test_v2_chunk_encoding(
             filters=filters,
         )
         filters_expected, compressor_expected = _parse_chunk_encoding_v2(
-            filters=filters, compressor=compressors, dtype=parse_data_type(dtype, zarr_format=2)
+            filters=filters, compressor=compressors, dtype=parse_dtype(dtype, zarr_format=2)
         )
         assert arr.metadata.zarr_format == 2  # guard for mypy
         assert arr.metadata.compressor == compressor_expected
diff --git a/tests/test_attributes.py b/tests/test_attributes.py
@@ -1,18 +1,26 @@
+import json
+from typing import Any
+
+import numpy as np
 import pytest
 
 import zarr.core
 import zarr.core.attributes
 import zarr.storage
+from tests.conftest import deep_nan_equal
+from zarr.core.common import ZarrFormat
 
 
-def test_put() -> None:
+@pytest.mark.parametrize("zarr_format", [2, 3])
+@pytest.mark.parametrize(
+    "data", [{"inf": np.inf, "-inf": -np.inf, "nan": np.nan}, {"a": 3, "c": 4}]
+)
+def test_put(data: dict[str, Any], zarr_format: ZarrFormat) -> None:
     store = zarr.storage.MemoryStore()
-    attrs = zarr.core.attributes.Attributes(
-        zarr.Group.from_store(store, attributes={"a": 1, "b": 2})
-    )
-    attrs.put({"a": 3, "c": 4})
-    expected = {"a": 3, "c": 4}
-    assert dict(attrs) == expected
+    attrs = zarr.core.attributes.Attributes(zarr.Group.from_store(store, zarr_format=zarr_format))
+    attrs.put(data)
+    expected = json.loads(json.dumps(data, allow_nan=True))
+    assert deep_nan_equal(dict(attrs), expected)
 
 
 def test_asdict() -> None:
diff --git a/tests/test_dtype/test_npy/test_common.py b/tests/test_dtype/test_npy/test_common.py
diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py
diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Add a section on codecs to the migration guide.`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	+Fix a regression introduced in 3.1.0 that prevented ``inf``, ``-inf``, and ``nan`` values
	`2`	+from being stored in ``attributes``.
Original file line number	Diff line number	Diff line change
`@@ -336,7 +336,7 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:`
`336`	`336`	`if self.zarr_format == 3:`
`337`	`337`	`return {`
`338`	`338`	`ZARR_JSON: prototype.buffer.from_bytes(`
`339`		`- json.dumps(self.to_dict(), indent=json_indent, allow_nan=False).encode()`
	`339`	`+ json.dumps(self.to_dict(), indent=json_indent, allow_nan=True).encode()`
`340`	`340`	`)`
`341`	`341`	`}`
`342`	`342`	`else:`
`@@ -345,7 +345,7 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:`
`345`	`345`	`json.dumps({"zarr_format": self.zarr_format}, indent=json_indent).encode()`
`346`	`346`	`),`
`347`	`347`	`ZATTRS_JSON: prototype.buffer.from_bytes(`
`348`		`- json.dumps(self.attributes, indent=json_indent, allow_nan=False).encode()`
	`348`	`+ json.dumps(self.attributes, indent=json_indent, allow_nan=True).encode()`
`349`	`349`	`),`
`350`	`350`	`}`
`351`	`351`	`if self.consolidated_metadata:`
`@@ -373,7 +373,7 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:`
`373`	`373`
`374`	`374`	`items[ZMETADATA_V2_JSON] = prototype.buffer.from_bytes(`
`375`	`375`	`json.dumps(`
`376`		`- {"metadata": d, "zarr_consolidated_format": 1}, allow_nan=False`
	`376`	`+ {"metadata": d, "zarr_consolidated_format": 1}, allow_nan=True`
`377`	`377`	`).encode()`
`378`	`378`	`)`
`379`	`379`
Original file line number	Diff line number	Diff line change
`@@ -288,7 +288,7 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:`
`288`	`288`	`d = self.to_dict()`
`289`	`289`	`return {`
`290`	`290`	`ZARR_JSON: prototype.buffer.from_bytes(`
`291`		`- json.dumps(d, allow_nan=False, indent=json_indent).encode()`
	`291`	`+ json.dumps(d, allow_nan=True, indent=json_indent).encode()`
`292`	`292`	`)`
`293`	`293`	`}`
`294`	`294`