make strings work

rabernat · rabernat · commit 1ae5e6315b1d · 2024-10-03T16:26:01.000-04:00
diff --git a/src/zarr/codecs/legacy_vlen.py b/src/zarr/codecs/legacy_vlen.py
@@ -3,12 +3,14 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
+import numpy as np
 from numcodecs.vlen import VLenUTF8
 
 from zarr.abc.codec import ArrayBytesCodec
 from zarr.core.buffer import Buffer, NDBuffer
 from zarr.core.common import JSON, parse_named_configuration
 from zarr.registry import register_codec
+from zarr.strings import cast_to_string_dtype
 
 if TYPE_CHECKING:
     from typing import Self
@@ -45,8 +47,11 @@ async def _decode_single(
 
         raw_bytes = chunk_bytes.as_array_like()
         decoded = vlen_utf8_codec.decode(raw_bytes)
+        assert decoded.dtype == np.object_
         decoded.shape = chunk_spec.shape
-        return chunk_spec.prototype.nd_buffer.from_numpy_array(decoded)
+        # coming out of the codec, we know this cast is safe, so don't issue a warning
+        as_string_dtype = cast_to_string_dtype(decoded, safe=True)
+        return chunk_spec.prototype.nd_buffer.from_numpy_array(as_string_dtype)
 
     async def _encode_single(
         self,
diff --git a/src/zarr/core/buffer/core.py b/src/zarr/core/buffer/core.py
@@ -313,10 +313,6 @@ class NDBuffer:
     """
 
     def __init__(self, array: NDArrayLike) -> None:
-        # assert array.ndim > 0
-
-        # Commented this out because string arrays have dtype object
-        # TODO: decide how to handle strings (e.g. numpy 2.0 StringDtype)
         # assert array.dtype != object
         self._data = array
 
@@ -470,9 +466,12 @@ def all_equal(self, other: Any, equal_nan: bool = True) -> bool:
             # Handle None fill_value for Zarr V2
             return False
         # use array_equal to obtain equal_nan=True functionality
+        # Note from Ryan: doesn't this lead to a huge amount of unnecessary memory allocation on every chunk?
+        # Since the fill value is a scalar, isn't there a faster path than allocating a new array for the
+        # fill value every single time we have to write data?
         _data, other = np.broadcast_arrays(self._data, other)
         return np.array_equal(
-            self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "UST" else False
+            self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "USTO" else False
         )
 
     def fill(self, value: Any) -> None:
diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py
@@ -29,6 +29,7 @@
 from zarr.core.config import config
 from zarr.core.metadata.common import ArrayMetadata, parse_attributes
 from zarr.registry import get_codec_class
+from zarr.strings import STRING_DTYPE
 
 
 def parse_zarr_format(data: object) -> Literal[3]:
@@ -312,7 +313,6 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self:
 FLOAT = np.float16 | np.float32 | np.float64
 COMPLEX_DTYPE = np.dtypes.Complex64DType | np.dtypes.Complex128DType
 COMPLEX = np.complex64 | np.complex128
-STRING = np.str_
 
 
 @overload
@@ -496,7 +496,7 @@ def to_numpy_shortname(self) -> str:
 
     def to_numpy_dtype(self) -> np.dtype[Any]:
         if self == DataType.string:
-            return np.dtypes.StringDType()
+            return STRING_DTYPE
         else:
             return np.dtype(self.to_numpy_shortname())
 
diff --git a/tests/v3/test_codecs/test_vlen.py b/tests/v3/test_codecs/test_vlen.py
@@ -8,10 +8,19 @@
 from zarr.codecs import VLenUTF8Codec
 from zarr.core.metadata.v3 import ArrayV3Metadata, DataType
 from zarr.store.common import StorePath
+from zarr.strings import NUMPY_SUPPORTS_VLEN_STRING
+
+numpy_str_dtypes: list[type | None] = [None, str, np.dtypes.StrDType]
+expected_zarr_string_dtype: np.dtype[Any]
+if NUMPY_SUPPORTS_VLEN_STRING:
+    numpy_str_dtypes.append(np.dtypes.StringDType)
+    expected_zarr_string_dtype = np.dtypes.StringDType()
+else:
+    expected_zarr_string_dtype = np.dtype("O")
 
 
 @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"])
-@pytest.mark.parametrize("dtype", [None, np.dtypes.StrDType])
+@pytest.mark.parametrize("dtype", numpy_str_dtypes)
 async def test_vlen_string(store: Store, dtype: None | np.dtype[Any]) -> None:
     strings = ["hello", "world", "this", "is", "a", "test"]
     data = np.array(strings).reshape((2, 3))
@@ -32,11 +41,11 @@ async def test_vlen_string(store: Store, dtype: None | np.dtype[Any]) -> None:
     a[:, :] = data
     assert np.array_equal(data, a[:, :])
     assert a.metadata.data_type == DataType.string
-    assert a.dtype == np.dtypes.StringDType()
+    assert a.dtype == expected_zarr_string_dtype
 
     # test round trip
     b = Array.open(sp)
     assert isinstance(b.metadata, ArrayV3Metadata)  # needed for mypy
     assert np.array_equal(data, b[:, :])
     assert b.metadata.data_type == DataType.string
-    assert b.dtype == np.dtypes.StringDType()
+    assert b.dtype == expected_zarr_string_dtype