Skip to content

Commit 1e828b4

Browse files
committed
add missing module
1 parent 7ba7077 commit 1e828b4

File tree

1 file changed

+87
-0
lines changed

1 file changed

+87
-0
lines changed

src/zarr/core/strings.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
"""This module contains utilities for working with string arrays across
2+
different versions of NumPy.
3+
"""
4+
5+
from typing import Any, Union, cast
6+
from warnings import warn
7+
8+
import numpy as np
9+
10+
# STRING_DTYPE is the in-memory datatype that will be used for V3 string arrays
11+
# when reading data back from Zarr.
12+
# Any valid string-like datatype should be fine for *setting* data.
13+
14+
# Annotation-only declarations: neither name is assigned here. Both are bound
# by the try/except block further down, depending on whether
# ``np.dtypes.StringDType`` can be instantiated.
STRING_DTYPE: Union["np.dtypes.StringDType", "np.dtypes.ObjectDType"]
# Set to True when the variable-length string dtype is available, False when
# the object-dtype fallback is used.
NUMPY_SUPPORTS_VLEN_STRING: bool
16+
17+
18+
def cast_array(
    data: np.ndarray[Any, np.dtype[Any]],
) -> np.ndarray[Any, Union["np.dtypes.StringDType", "np.dtypes.ObjectDType"]]:
    """Placeholder that declares the signature of ``cast_array``.

    This stub always raises ``NotImplementedError``. The try/except block
    that follows rebinds ``cast_array`` in each of its branches, so this
    definition is replaced as soon as that block runs.
    NOTE(review): presumably kept so type checkers see one canonical
    signature before the conditional definitions - confirm.
    """
    raise NotImplementedError
22+
23+
24+
try:
    # The variable-length string dtype was added in NumPy 2.0. Only this
    # lookup is guarded, so an AttributeError raised by anything else is not
    # mistaken for "StringDType is unavailable".
    STRING_DTYPE = np.dtypes.StringDType()
except AttributeError:
    # If not available, fall back on an object array of strings, as in
    # Zarr < 3. ``np.dtype(object)`` is the same dtype as
    # ``np.dtypes.ObjectDType()`` but does not require the ``np.dtypes``
    # namespace, which only exists in NumPy >= 1.25.
    STRING_DTYPE = cast("np.dtypes.ObjectDType", np.dtype(object))
    NUMPY_SUPPORTS_VLEN_STRING = False

    def cast_array(
        data: np.ndarray[Any, np.dtype[Any]],
    ) -> np.ndarray[Any, Union["np.dtypes.StringDType", "np.dtypes.ObjectDType"]]:
        """Return ``data`` converted to the object-dtype ``STRING_DTYPE``.

        ``copy=False`` lets NumPy hand back the input array when no
        conversion is needed.
        """
        out = data.astype(STRING_DTYPE, copy=False)
        # The cast target is a string so nothing from ``np.dtypes`` is
        # evaluated at call time.
        return cast("np.ndarray[Any, np.dtypes.ObjectDType]", out)
else:
    NUMPY_SUPPORTS_VLEN_STRING = True

    # The return annotation is the same quoted ``Union`` used elsewhere in
    # this module; an eagerly evaluated ``A | B`` between dtype classes
    # raises TypeError on Python 3.9.
    def cast_array(
        data: np.ndarray[Any, np.dtype[Any]],
    ) -> np.ndarray[Any, Union["np.dtypes.StringDType", "np.dtypes.ObjectDType"]]:
        """Return ``data`` converted to the variable-length ``STRING_DTYPE``.

        ``copy=False`` lets NumPy hand back the input array when no
        conversion is needed.
        """
        out = data.astype(STRING_DTYPE, copy=False)
        return cast("np.ndarray[Any, np.dtypes.StringDType]", out)
45+
46+
47+
def cast_to_string_dtype(
    data: np.ndarray[Any, np.dtype[Any]], safe: bool = False
) -> np.ndarray[Any, Union["np.dtypes.StringDType", "np.dtypes.ObjectDType"]]:
    """Convert an array to the module's preferred in-memory string dtype.

    Parameters
    ----------
    data : np.ndarray
        The array to convert.
    safe : bool
        Only consulted for object arrays when ``NUMPY_SUPPORTS_VLEN_STRING``
        is false: if True, the warning about treating the object array as a
        string array is not issued.

    Returns
    -------
    np.ndarray
        The result of passing the data through ``cast_array``.

    Raises
    ------
    ValueError
        If the dtype of ``data`` is neither a fixed-width string, the
        variable-length string dtype, nor object; or if an object array
        cannot be converted with ``StringDType(coerce=False)``.
    """
    src_dtype = data.dtype

    # Legacy fixed-width string type (e.g. "<U10").
    if np.issubdtype(src_dtype, np.str_):
        return cast_array(data)

    # Already a variable-length string dtype.
    if NUMPY_SUPPORTS_VLEN_STRING and np.issubdtype(src_dtype, STRING_DTYPE):
        return cast_array(data)

    # Anything other than object dtype is rejected at this point.
    if not np.issubdtype(src_dtype, np.object_):
        raise ValueError(f"Cannot cast dtype {data.dtype} to string dtype")

    # Object arrays require more careful handling.
    if not NUMPY_SUPPORTS_VLEN_STRING:
        if not safe:
            warn(
                "Treating object array as valid string array. To avoid this warning, "
                "cast the data to a string dtype before passing to Zarr or upgrade to NumPy >= 2.",
                stacklevel=2,
            )
        return cast_array(data)

    try:
        # Cast to the variable-length string dtype; with coerce=False this
        # fails if the object array contains non-string data.
        # mypy reports an unexpected "coerce" keyword for StringDType
        # [call-arg] and rejects "str" as astype's scalar type [type-var],
        # hence the ignore.
        strict = data.astype(np.dtypes.StringDType(coerce=False), copy=False)  # type: ignore[call-arg,type-var]
        return cast_array(strict)
    except ValueError as err:
        raise ValueError("Cannot cast object dtype to string dtype") from err

0 commit comments

Comments
 (0)