support for datetime and timedelta dtypes (#2616)

rvestrum3vg · rvestrum3vg · commit 93b5bc019bb4 · 2025-03-03T14:26:17.000-08:00
* Add support for the datetime dtypes
    * Add support for the timedelta dtypes
    * Add test to validate the fill_values for datetime
    * Add test to validate the fill_values for timedelta
    * Add towncrier file for changes
diff --git a/changes/2616.feature.rst b/changes/2616.feature.rst
@@ -0,0 +1 @@
+NumPy’s datetime64 (‘M8’) and timedelta64 (‘m8’) dtypes are supported for Zarr arrays, as long as the units are specified.
diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst
@@ -619,6 +619,22 @@ In this example a shard shape of (1000, 1000) and a chunk shape of (100, 100) is
 This means that 10*10 chunks are stored in each shard, and there are 10*10 shards in total.
 Without the ``shards`` argument, there would be 10,000 chunks stored as individual files.
 
+.. _user-guide-datetime:
+
+Datetime and Timedelta arrays
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+NumPy’s datetime64 (‘M8’) and timedelta64 (‘m8’) dtypes are supported for Zarr arrays, as long as the units are specified. E.g.:
+
+   >>> data = np.array(['2007-07-13', '2006-01-13', '2010-08-13'], dtype='M8[D]')
+   >>> z = zarr.create_array(store='data/example-datetime.zarr', shape=data.shape, dtype=data.dtype)
+   >>> z[:] = data
+   >>> z[:]
+   array(['2007-07-13', '2006-01-13', '2010-08-13'], dtype='datetime64[D]')
+   >>> z[0] = '1999-12-31'
+   >>> z[:]
+   array(['1999-12-31', '2006-01-13', '2010-08-13'], dtype='datetime64[D]')
+
+
 Missing features in 3.0
 -----------------------
 
@@ -639,13 +655,6 @@ Fixed-length string arrays
 
 See the Zarr-Python 2 documentation on `Fixed-length string arrays <https://zarr.readthedocs.io/en/support-v2/tutorial.html#string-arrays>`_ for more details.
 
-.. _user-guide-datetime:
-
-Datetime and Timedelta arrays
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-See the Zarr-Python 2 documentation on `Datetime and Timedelta <https://zarr.readthedocs.io/en/support-v2/tutorial.html#datetimes-and-timedeltas>`_ for more details.
-
 .. _user-guide-copy:
 
 Copying and migrating data
diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import re
 import warnings
 from typing import TYPE_CHECKING, TypedDict, overload
 
@@ -174,11 +175,13 @@ def default(self, o: object) -> Any:
             return str(o)
         if np.isscalar(o):
             out: Any
-            if hasattr(o, "dtype") and o.dtype.kind == "M" and hasattr(o, "view"):
+            if hasattr(o, "dtype") and o.dtype.kind in "Mm" and hasattr(o, "view"):
                 # https://github.com/zarr-developers/zarr-python/issues/2119
                 # `.item()` on a datetime type might or might not return an
                 # integer, depending on the value.
                 # Explicitly cast to an int first, and then grab .item()
+                if np.isnat(o):
+                    return "NaT"
                 out = o.view("i8").item()
             else:
                 # convert numpy scalar to python type, and pass
@@ -440,12 +443,25 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self:
 FLOAT = np.float16 | np.float32 | np.float64
 COMPLEX_DTYPE = Literal["complex64", "complex128"]
 COMPLEX = np.complex64 | np.complex128
+DATETIME_DTYPE = Literal["datetime64"]
+DATETIME = np.datetime64
+TIMEDELTA_DTYPE = Literal["timedelta64"]
+TIMEDELTA = np.timedelta64
 STRING_DTYPE = Literal["string"]
 STRING = np.str_
 BYTES_DTYPE = Literal["bytes"]
 BYTES = np.bytes_
 
-ALL_DTYPES = BOOL_DTYPE | INTEGER_DTYPE | FLOAT_DTYPE | COMPLEX_DTYPE | STRING_DTYPE | BYTES_DTYPE
+ALL_DTYPES = (
+    BOOL_DTYPE
+    | INTEGER_DTYPE
+    | FLOAT_DTYPE
+    | COMPLEX_DTYPE
+    | DATETIME_DTYPE
+    | TIMEDELTA_DTYPE
+    | STRING_DTYPE
+    | BYTES_DTYPE
+)
 
 
 @overload
@@ -490,6 +506,20 @@ def parse_fill_value(
 ) -> BYTES: ...
 
 
+@overload
+def parse_fill_value(
+    fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool,
+    dtype: DATETIME_DTYPE,
+) -> DATETIME: ...
+
+
+@overload
+def parse_fill_value(
+    fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool,
+    dtype: TIMEDELTA_DTYPE,
+) -> TIMEDELTA: ...
+
+
 def parse_fill_value(
     fill_value: Any,
     dtype: ALL_DTYPES,
@@ -551,12 +581,24 @@ def parse_fill_value(
         # fill_value != casted_value below.
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore", category=DeprecationWarning)
-            casted_value = np.dtype(np_dtype).type(fill_value)
+            if np.dtype(np_dtype).kind in "Mm":
+                # datetime64 values have an associated precision
+                match = re.search(r"\[(.*?)\]", np.dtype(np_dtype).str)
+                if match:
+                    precision = match.group(1)
+                else:
+                    precision = "s"
+                casted_value = np.dtype(np_dtype).type(fill_value, precision)
+            else:
+                casted_value = np.dtype(np_dtype).type(fill_value)
     except (ValueError, OverflowError, TypeError) as e:
         raise ValueError(f"fill value {fill_value!r} is not valid for dtype {data_type}") from e
     # Check if the value is still representable by the dtype
-    if (fill_value == "NaN" and np.isnan(casted_value)) or (
-        fill_value in ["Infinity", "-Infinity"] and not np.isfinite(casted_value)
+    if (
+        (fill_value == "NaN" and np.isnan(casted_value))
+        or (fill_value in ["Infinity", "-Infinity"] and not np.isfinite(casted_value))
+        or (fill_value == "NaT" and np.isnat(casted_value))
+        or (np.dtype(np_dtype).kind in "Mm" and np.isnat(casted_value) and np.isnat(fill_value))
     ):
         pass
     elif np_dtype.kind == "f":
@@ -576,7 +618,6 @@ def parse_fill_value(
     else:
         if fill_value != casted_value:
             raise ValueError(f"fill value {fill_value!r} is not valid for dtype {data_type}")
-
     return casted_value
 
 
@@ -585,9 +626,17 @@ def default_fill_value(dtype: DataType) -> str | bytes | np.generic:
         return ""
     elif dtype == DataType.bytes:
         return b""
+    np_dtype = dtype.to_numpy()
+    np_dtype = cast(np.dtype[Any], np_dtype)
+    if np_dtype.kind in "Mm":
+        # datetime64 values have an associated precision
+        match = re.search(r"\[(.*?)\]", np_dtype.str)
+        if match:
+            precision = match.group(1)
+        else:
+            precision = "s"
+        return np_dtype.type("nat", precision)  # type: ignore[misc,call-arg]
     else:
-        np_dtype = dtype.to_numpy()
-        np_dtype = cast(np.dtype[Any], np_dtype)
         return np_dtype.type(0)  # type: ignore[misc]
 
 
@@ -610,6 +659,24 @@ class DataType(Enum):
     float64 = "float64"
     complex64 = "complex64"
     complex128 = "complex128"
+    datetime64ns = ("datetime[ns]",)
+    datetime64ms = ("datetime[ms]",)
+    datetime64s = ("datetime[s]",)
+    datetime64m = ("datetime[m]",)
+    datetime64h = ("datetime[h]",)
+    datetime64D = ("datetime[D]",)
+    datetime64W = ("datetime[W]",)
+    datetime64M = ("datetime[M]",)
+    datetime64Y = ("datetime[Y]",)
+    timedelta64ns = ("deltatime[ns]",)
+    timedelta64ms = ("deltatime[ms]",)
+    timedelta64s = ("deltatime[s]",)
+    timedelta64m = ("deltatime[m]",)
+    timedelta64h = ("deltatime[h]",)
+    timedelta64D = ("deltatime[D]",)
+    timedelta64W = ("deltatime[W]",)
+    timedelta64M = ("deltatime[M]",)
+    timedelta64Y = ("deltatime[Y]",)
     string = "string"
     bytes = "bytes"
 
@@ -630,6 +697,24 @@ def byte_count(self) -> int | None:
             DataType.float64: 8,
             DataType.complex64: 8,
             DataType.complex128: 16,
+            DataType.datetime64ns: 8,
+            DataType.datetime64ms: 8,
+            DataType.datetime64s: 8,
+            DataType.datetime64m: 8,
+            DataType.datetime64h: 8,
+            DataType.datetime64D: 8,
+            DataType.datetime64W: 8,
+            DataType.datetime64M: 8,
+            DataType.datetime64Y: 8,
+            DataType.timedelta64ns: 8,
+            DataType.timedelta64ms: 8,
+            DataType.timedelta64s: 8,
+            DataType.timedelta64m: 8,
+            DataType.timedelta64h: 8,
+            DataType.timedelta64D: 8,
+            DataType.timedelta64W: 8,
+            DataType.timedelta64M: 8,
+            DataType.timedelta64Y: 8,
         }
         try:
             return data_type_byte_counts[self]
@@ -657,6 +742,24 @@ def to_numpy_shortname(self) -> str:
             DataType.float64: "f8",
             DataType.complex64: "c8",
             DataType.complex128: "c16",
+            DataType.datetime64ns: "M8[ns]",
+            DataType.datetime64ms: "M8[ms]",
+            DataType.datetime64s: "M8[s]",
+            DataType.datetime64m: "M8[m]",
+            DataType.datetime64h: "M8[h]",
+            DataType.datetime64D: "M8[D]",
+            DataType.datetime64W: "M8[W]",
+            DataType.datetime64M: "M8[M]",
+            DataType.datetime64Y: "M8[Y]",
+            DataType.timedelta64ns: "m8[ns]",
+            DataType.timedelta64ms: "m8[ms]",
+            DataType.timedelta64s: "m8[s]",
+            DataType.timedelta64m: "m8[m]",
+            DataType.timedelta64h: "m8[h]",
+            DataType.timedelta64D: "m8[D]",
+            DataType.timedelta64W: "m8[W]",
+            DataType.timedelta64M: "m8[M]",
+            DataType.timedelta64Y: "m8[Y]",
         }
         return data_type_to_numpy[self]
 
@@ -700,6 +803,24 @@ def from_numpy(cls, dtype: np.dtype[Any]) -> DataType:
             "<f8": "float64",
             "<c8": "complex64",
             "<c16": "complex128",
+            "<M8[ns]": "datetime64ns",
+            "<M8[ms]": "datetime64ms",
+            "<M8[s]": "datetime64s",
+            "<M8[m]": "datetime64m",
+            "<M8[h]": "datetime64h",
+            "<M8[D]": "datetime64D",
+            "<M8[W]": "datetime64W",
+            "<M8[M]": "datetime64M",
+            "<M8[Y]": "datetime64Y",
+            "<m8[ns]": "timedelta64ns",
+            "<m8[ms]": "timedelta64ms",
+            "<m8[s]": "timedelta64s",
+            "<m8[m]": "timedelta64m",
+            "<m8[h]": "timedelta64h",
+            "<m8[D]": "timedelta64D",
+            "<m8[W]": "timedelta64W",
+            "<m8[M]": "timedelta64M",
+            "<m8[Y]": "timedelta64Y",
         }
         return DataType[dtype_to_data_type[dtype.str]]
 
diff --git a/tests/test_array.py b/tests/test_array.py
@@ -209,7 +209,15 @@ def test_array_v3_fill_value_default(
 @pytest.mark.parametrize("store", ["memory"], indirect=True)
 @pytest.mark.parametrize(
     ("dtype_str", "fill_value"),
-    [("bool", True), ("uint8", 99), ("float32", -99.9), ("complex64", 3 + 4j)],
+    [
+        ("bool", True),
+        ("uint8", 99),
+        ("float32", -99.9),
+        ("complex64", 3 + 4j),
+        ("m8[ns]", 0),
+        ("M8[s]", None),
+        ("<m8[D]", "NaT"),
+    ],
 )
 def test_array_v3_fill_value(store: MemoryStore, fill_value: int, dtype_str: str) -> None:
     shape = (10,)
@@ -221,9 +229,13 @@ def test_array_v3_fill_value(store: MemoryStore, fill_value: int, dtype_str: str
         chunks=shape,
         fill_value=fill_value,
     )
-
-    assert arr.fill_value == np.dtype(dtype_str).type(fill_value)
     assert arr.fill_value.dtype == arr.dtype
+    if np.isfinite(arr.fill_value):
+        assert arr.fill_value == np.dtype(dtype_str).type(fill_value)
+    else:
+        if arr.dtype.kind in "Mm":
+            assert np.isnat(arr.fill_value)
+            assert np.isnat(np.dtype(dtype_str).type(fill_value))
 
 
 def test_create_positional_args_deprecated() -> None:
diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py
@@ -313,43 +313,66 @@ def test_json_indent(indent: int):
         assert d == json.dumps(json.loads(d), indent=indent).encode()
 
 
-# @pytest.mark.parametrize("fill_value", [-1, 0, 1, 2932897])
-# @pytest.mark.parametrize("precision", ["ns", "D"])
-# async def test_datetime_metadata(fill_value: int, precision: str) -> None:
-#     metadata_dict = {
-#         "zarr_format": 3,
-#         "node_type": "array",
-#         "shape": (1,),
-#         "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": (1,)}},
-#         "data_type": f"<M8[{precision}]",
-#         "chunk_key_encoding": {"name": "default", "separator": "."},
-#         "codecs": (),
-#         "fill_value": np.datetime64(fill_value, precision),
-#     }
-#     metadata = ArrayV3Metadata.from_dict(metadata_dict)
-#     # ensure there isn't a TypeError here.
-#     d = metadata.to_buffer_dict(default_buffer_prototype())
-
-#     result = json.loads(d["zarr.json"].to_bytes())
-#     assert result["fill_value"] == fill_value
-
-
-def test_invalid_dtype_raises() -> None:
+@pytest.mark.parametrize("fill_value", [-1, 0, 1, 2932897, "NaT"])
+@pytest.mark.parametrize("precision", ["ns", "ms", "s", "m", "h", "D", "W", "M", "Y"])
+async def test_datetime_metadata(fill_value: int, precision: str) -> None:
     metadata_dict = {
         "zarr_format": 3,
         "node_type": "array",
         "shape": (1,),
         "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": (1,)}},
-        "data_type": "<M8[ns]",
+        "data_type": f"<M8[{precision}]",
         "chunk_key_encoding": {"name": "default", "separator": "."},
-        "codecs": (),
-        "fill_value": np.datetime64(0, "ns"),
+        "codecs": [BytesCodec()],
+        "fill_value": np.datetime64(fill_value, precision),
+    }
+    metadata = ArrayV3Metadata.from_dict(metadata_dict)
+    # ensure there isn't a TypeError here.
+    d = metadata.to_buffer_dict(default_buffer_prototype())
+    result = json.loads(d["zarr.json"].to_bytes())
+    assert result["fill_value"] == fill_value
+
+
+@pytest.mark.parametrize("fill_value", [None, -1, 0, 1, 2932897, "NaT"])
+@pytest.mark.parametrize("precision", ["ns", "ms", "s", "m", "h", "D", "W", "M", "Y"])
+async def test_deltatime_metadata(fill_value: int, precision: str) -> None:
+    metadata_dict = {
+        "zarr_format": 3,
+        "node_type": "array",
+        "shape": (1,),
+        "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": (1,)}},
+        "data_type": f"<m8[{precision}]",
+        "chunk_key_encoding": {"name": "default", "separator": "."},
+        "codecs": [BytesCodec()],
+        "fill_value": None if fill_value is None else np.timedelta64(fill_value, precision),
+    }
+    metadata = ArrayV3Metadata.from_dict(metadata_dict)
+    # ensure there isn't a TypeError here.
+    d = metadata.to_buffer_dict(default_buffer_prototype())
+    result = json.loads(d["zarr.json"].to_bytes())
+    if fill_value is None:
+        assert result["fill_value"] == "NaT"
+    else:
+        assert result["fill_value"] == fill_value
+
+
+@pytest.mark.parametrize("data", ["foo", object()])
+def test_invalid_dtype_raises(data) -> None:
+    metadata_dict = {
+        "zarr_format": 3,
+        "node_type": "array",
+        "shape": (1,),
+        "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": (1,)}},
+        "data_type": data,
+        "chunk_key_encoding": {"name": "default", "separator": "."},
+        "codecs": [BytesCodec()],
+        "fill_value": "",
     }
     with pytest.raises(ValueError, match=r"Invalid Zarr format 3 data_type: .*"):
         ArrayV3Metadata.from_dict(metadata_dict)
 
 
-@pytest.mark.parametrize("data", ["datetime64[s]", "foo", object()])
+@pytest.mark.parametrize("data", ["foo", object()])
 def test_parse_invalid_dtype_raises(data):
     with pytest.raises(ValueError, match=r"Invalid Zarr format 3 data_type: .*"):
         DataType.parse(data)

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+NumPy’s datetime64 (‘M8’) and timedelta64 (‘m8’) dtypes are supported for Zarr arrays, as long as the units are specified.`