@@ -51,6 +51,21 @@ def v2_dtypes() -> st.SearchStrategy[np.dtype]:
5151 )
5252
5353
def safe_unicode_for_dtype(dtype: np.dtype[np.str_]) -> st.SearchStrategy[str]:
    """Return a strategy for non-empty text sized to fit ``dtype``.

    Parameters
    ----------
    dtype
        Fixed-width unicode dtype whose ``itemsize`` bounds the text length.

    Returns
    -------
    A text strategy producing between 1 and ``max(1, dtype.itemsize // 4)``
    characters, with surrogates (category "Cs") and codepoints below 32
    excluded.
    """
    # Each character is accounted for as 4 bytes (UTF-32), so the character
    # budget is the item size divided by four -- but never less than one,
    # because the generated text is required to be non-empty.
    bytes_per_char = 4
    char_budget = dtype.itemsize // bytes_per_char
    longest = char_budget if char_budget > 1 else 1

    allowed_chars = st.characters(
        blacklist_categories=["Cs"],  # Avoid *technically allowed* surrogates
        min_codepoint=32,
    )
    return st.text(alphabet=allowed_chars, min_size=1, max_size=longest)
67+
68+
5469# From https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#node-names
5570# 1. must not be the empty string ("")
5671# 2. must not include the character "/"
@@ -86,7 +101,12 @@ def numpy_arrays(
86101 Generate numpy arrays that can be saved in the provided Zarr format.
87102 """
88103 zarr_format = draw (zarr_formats )
89- return draw (npst .arrays (dtype = v3_dtypes () if zarr_format == 3 else v2_dtypes (), shape = shapes ))
104+ dtype = draw (v3_dtypes () if zarr_format == 3 else v2_dtypes ())
105+ if np .issubdtype (dtype , np .str_ ):
106+ safe_unicode_strings = safe_unicode_for_dtype (dtype )
107+ return draw (npst .arrays (dtype = dtype , shape = shapes , elements = safe_unicode_strings ))
108+
109+ return draw (npst .arrays (dtype = dtype , shape = shapes ))
90110
91111
92112@st .composite # type: ignore[misc]
0 commit comments