Skip to content

Commit 632a252

Browse files
committed
Fix UTF generation for numpy in property-based tests
1 parent 87557e3 commit 632a252

File tree

1 file changed

+21
-1
lines changed

1 file changed

+21
-1
lines changed

src/zarr/testing/strategies.py

Lines changed: 21 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,21 @@ def v2_dtypes() -> st.SearchStrategy[np.dtype]:
5151
)
5252

5353

54+
def safe_unicode_for_dtype(dtype: np.dtype[np.str_]) -> st.SearchStrategy[str]:
    """Build a text strategy whose strings fit inside ``dtype``.

    The character budget is ``dtype.itemsize // 4`` (four bytes per
    character, i.e. UTF-32 storage), with a floor of one character so the
    strategy can always produce a value. Generated strings are never empty,
    contain no code points below 32, and contain no characters from the
    ``Cs`` (surrogate) category.
    """
    # UTF-32 storage: every character occupies 4 bytes of the item.
    bytes_per_char = 4
    char_budget = max(1, dtype.itemsize // bytes_per_char)

    allowed_chars = st.characters(
        blacklist_categories=["Cs"],  # Avoid *technically allowed* surrogates
        min_codepoint=32,
    )
    return st.text(alphabet=allowed_chars, min_size=1, max_size=char_budget)
67+
68+
5469
# From https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#node-names
5570
# 1. must not be the empty string ("")
5671
# 2. must not include the character "/"
@@ -86,7 +101,12 @@ def numpy_arrays(
86101
Generate numpy arrays that can be saved in the provided Zarr format.
87102
"""
88103
zarr_format = draw(zarr_formats)
89-
return draw(npst.arrays(dtype=v3_dtypes() if zarr_format == 3 else v2_dtypes(), shape=shapes))
104+
dtype = draw(v3_dtypes() if zarr_format == 3 else v2_dtypes())
105+
if np.issubdtype(dtype, np.str_):
106+
safe_unicode_strings = safe_unicode_for_dtype(dtype)
107+
return draw(npst.arrays(dtype=dtype, shape=shapes, elements=safe_unicode_strings))
108+
109+
return draw(npst.arrays(dtype=dtype, shape=shapes))
90110

91111

92112
@st.composite # type: ignore[misc]

0 commit comments

Comments
 (0)