diff --git a/changes/2801.bugfix.rst b/changes/2801.bugfix.rst new file mode 100644 index 0000000000..294934aacf --- /dev/null +++ b/changes/2801.bugfix.rst @@ -0,0 +1 @@ +Ensure utf8 compliant strings are used to construct numpy arrays in property-based tests diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index b948651ce6..81bb482b06 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -51,6 +51,21 @@ def v2_dtypes() -> st.SearchStrategy[np.dtype]: ) +def safe_unicode_for_dtype(dtype: np.dtype[np.str_]) -> st.SearchStrategy[str]: + """Generate UTF-8-safe text constrained to max_len of dtype.""" + # account for utf-32 encoding (i.e. 4 bytes/character) + max_len = max(1, dtype.itemsize // 4) + + return st.text( + alphabet=st.characters( + blacklist_categories=["Cs"], # Avoid *technically allowed* surrogates + min_codepoint=32, + ), + min_size=1, + max_size=max_len, + ) + + # From https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#node-names # 1. must not be the empty string ("") # 2. must not include the character "/" @@ -86,7 +101,12 @@ def numpy_arrays( Generate numpy arrays that can be saved in the provided Zarr format. """ zarr_format = draw(zarr_formats) - return draw(npst.arrays(dtype=v3_dtypes() if zarr_format == 3 else v2_dtypes(), shape=shapes)) + dtype = draw(v3_dtypes() if zarr_format == 3 else v2_dtypes()) + if np.issubdtype(dtype, np.str_): + safe_unicode_strings = safe_unicode_for_dtype(dtype) + return draw(npst.arrays(dtype=dtype, shape=shapes, elements=safe_unicode_strings)) + + return draw(npst.arrays(dtype=dtype, shape=shapes)) @st.composite # type: ignore[misc]