Skip to content

Commit 2bffe1a

Browse files
committed
add failing test for round-tripping vlen strings
1 parent e9241b9 commit 2bffe1a

File tree

5 files changed

+130
-69
lines changed

5 files changed

+130
-69
lines changed

src/zarr/core/array.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4259,11 +4259,11 @@ def _get_default_chunk_encoding_v3(
42594259
# api for this.
42604260
# All of these things are acceptable right now because there is only 1 serializer that affects
42614261
# endianness, but this design will not last if this situation changes.
4262-
if "endian" in serializer["configuration"]:
4262+
if serializer.get("configuration") is not None:
42634263
if isinstance(dtype, HasEndianness):
42644264
serializer["configuration"]["endian"] = dtype.endianness
42654265
else:
4266-
serializer["configuration"].pop("endian")
4266+
serializer["configuration"].pop("endian", None)
42674267

42684268
return (
42694269
tuple(_parse_array_array_codec(f) for f in filters),

src/zarr/core/dtype/__init__.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,12 @@ def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[_BaseDType,
9999
# this is a valid _VoidDTypeLike check
100100
na_dtype = np.dtype([tuple(d) for d in dtype])
101101
else:
102-
na_dtype = np.dtype(dtype)
102+
if dtype == "|T16":
103+
# `|T16` is the numpy dtype str form for variable length strings. Unfortunately,
104+
# numpy cannot create these directly from np.dtype("|T16")
105+
na_dtype = np.dtypes.StringDType()
106+
else:
107+
na_dtype = np.dtype(dtype)
103108
else:
104109
na_dtype = dtype
105110
return data_type_registry.match_dtype(na_dtype)

src/zarr/core/dtype/_numpy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -907,7 +907,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void:
907907
raise DataTypeValidationError(f"Invalid type: {data}. Expected a string.")
908908

909909
def check_value(self, data: object) -> bool:
910-
return isinstance(data, np.bytes_ | str | bytes)
910+
return isinstance(data, np.bytes_ | str | bytes | np.void)
911911

912912
def _cast_value_unsafe(self, value: object) -> np.void:
913913
return self.to_dtype().type(value) # type: ignore[call-overload, no-any-return]

src/zarr/core/dtype/common.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -356,7 +356,10 @@ def bytes_from_json(data: str, zarr_format: ZarrFormat) -> bytes:
356356
"""
357357
if zarr_format == 2:
358358
return base64.b64decode(data.encode("ascii"))
359-
raise NotImplementedError(f"Invalid zarr format: {zarr_format}. Expected 2.")
359+
# TODO: differentiate these as needed. This is a spec question.
360+
if zarr_format == 3:
361+
return base64.b64decode(data.encode("ascii"))
362+
raise ValueError(f"Invalid zarr format: {zarr_format}. Expected 2 or 3.")
360363

361364

362365
def float_from_json_v2(data: JSONFloat) -> float:

tests/test_array.py

Lines changed: 117 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,13 @@
3939
from zarr.core.chunk_grids import _auto_partition
4040
from zarr.core.common import JSON, MemoryOrder, ZarrFormat
4141
from zarr.core.dtype import get_data_type_from_native_dtype
42-
from zarr.core.dtype._numpy import Float64, Int16, endianness_from_numpy_str
42+
from zarr.core.dtype._numpy import (
43+
DateTime64,
44+
Float64,
45+
Int16,
46+
Structured,
47+
endianness_from_numpy_str,
48+
)
4349
from zarr.core.dtype.common import Endianness
4450
from zarr.core.dtype.wrapper import ZDType
4551
from zarr.core.group import AsyncGroup
@@ -936,12 +942,59 @@ def test_chunks_and_shards(store: Store) -> None:
936942
assert arr_v2.shards is None
937943

938944
@staticmethod
939-
@pytest.mark.parametrize(
940-
("dtype", "fill_value_expected"), [("<U4", ""), ("<S4", b""), ("i", 0), ("f", 0.0)]
941-
)
942-
def test_default_fill_value(dtype: str, fill_value_expected: object, store: Store) -> None:
945+
@pytest.mark.parametrize("dtype", zdtype_examples)
946+
def test_default_fill_value(dtype: ZDType[Any, Any], store: Store) -> None:
947+
"""
948+
Test that the fill value of an array is set to the default value for the dtype object
949+
"""
943950
a = zarr.create_array(store, shape=(5,), chunks=(5,), dtype=dtype)
944-
assert a.fill_value == fill_value_expected
951+
if isinstance(dtype, DateTime64) and np.isnat(a.fill_value):
952+
assert np.isnat(dtype.default_value())
953+
else:
954+
assert a.fill_value == dtype.default_value()
955+
956+
@staticmethod
957+
@pytest.mark.parametrize("dtype", zdtype_examples)
958+
def test_dtype_forms(dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFormat) -> None:
959+
"""
960+
Test that the same array is produced from a ZDType instance, a numpy dtype, or a numpy string
961+
"""
962+
a = zarr.create_array(
963+
store, name="a", shape=(5,), chunks=(5,), dtype=dtype, zarr_format=zarr_format
964+
)
965+
b = zarr.create_array(
966+
store,
967+
name="b",
968+
shape=(5,),
969+
chunks=(5,),
970+
dtype=dtype.to_dtype(),
971+
zarr_format=zarr_format,
972+
)
973+
assert a.dtype == b.dtype
974+
975+
# Structured dtypes do not have a numpy string representation that uniquely identifies them
976+
if not isinstance(dtype, Structured):
977+
c = zarr.create_array(
978+
store,
979+
name="c",
980+
shape=(5,),
981+
chunks=(5,),
982+
dtype=dtype.to_dtype().str,
983+
zarr_format=zarr_format,
984+
)
985+
assert a.dtype == c.dtype
986+
987+
@staticmethod
988+
@pytest.mark.parametrize("dtype", zdtype_examples)
989+
def test_dtype_roundtrip(
990+
dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFormat
991+
) -> None:
992+
"""
993+
Test that creating an array, then opening it, gets the same array.
994+
"""
995+
a = zarr.create_array(store, shape=(5,), chunks=(5,), dtype=dtype, zarr_format=zarr_format)
996+
b = zarr.open_array(store)
997+
assert a.dtype == b.dtype
945998

946999
@staticmethod
9471000
@pytest.mark.parametrize("dtype", ["uint8", "float32", "str", "U3", "S4", "V1"])
@@ -1266,6 +1319,64 @@ async def test_name(store: Store, zarr_format: ZarrFormat, path: str | None) ->
12661319
store=store, path=parent_path, mode="r", zarr_format=zarr_format
12671320
)
12681321

1322+
@staticmethod
1323+
@pytest.mark.parametrize("endianness", get_args(Endianness))
1324+
def test_default_endianness(
1325+
store: Store, zarr_format: ZarrFormat, endianness: Endianness
1326+
) -> None:
1327+
"""
1328+
Test that endianness is correctly set when creating an array without specifying a serializer
1329+
"""
1330+
dtype = Int16(endianness=endianness)
1331+
arr = zarr.create_array(store=store, shape=(1,), dtype=dtype, zarr_format=zarr_format)
1332+
assert endianness_from_numpy_str(arr[:].dtype.byteorder) == endianness
1333+
if zarr_format == 3:
1334+
assert isinstance(arr.metadata, ArrayV3Metadata) # mypy
1335+
assert str(arr.metadata.codecs[0].endian.value) == endianness # type: ignore[union-attr]
1336+
1337+
@staticmethod
1338+
@pytest.mark.parametrize("endianness", get_args(Endianness))
1339+
def test_explicit_endianness(store: Store, endianness: Endianness) -> None:
1340+
"""
1341+
Test that a mismatch between the BytesCodec endianness and the dtype endianness is an error
1342+
"""
1343+
if endianness == "little":
1344+
dtype = Int16(endianness="big")
1345+
else:
1346+
dtype = Int16(endianness="little")
1347+
1348+
serializer = BytesCodec(endian=endianness)
1349+
1350+
msg = (
1351+
f"The endianness of the requested serializer ({serializer}) does not match the endianness of the dtype ({dtype.endianness}). "
1352+
"The endianness of the serializer and the dtype must match."
1353+
)
1354+
1355+
with pytest.raises(ValueError, match=re.escape(msg)):
1356+
_ = zarr.create_array(
1357+
store=store,
1358+
shape=(1,),
1359+
dtype=dtype,
1360+
zarr_format=3,
1361+
serializer=serializer,
1362+
)
1363+
1364+
# additional check for the case where the serializer has endian=None
1365+
none_serializer = dataclasses.replace(serializer, endian=None)
1366+
msg = (
1367+
f"The endianness of the requested serializer ({none_serializer}) does not match the endianness of the dtype ({dtype.endianness}). "
1368+
"The endianness of the serializer and the dtype must match."
1369+
)
1370+
1371+
with pytest.raises(ValueError, match=re.escape(msg)):
1372+
_ = zarr.create_array(
1373+
store=store,
1374+
shape=(1,),
1375+
dtype=dtype,
1376+
zarr_format=3,
1377+
serializer=none_serializer,
1378+
)
1379+
12691380

12701381
async def test_scalar_array() -> None:
12711382
arr = zarr.array(1.5)
@@ -1384,61 +1495,3 @@ async def test_sharding_coordinate_selection() -> None:
13841495
)
13851496
arr[:] = np.arange(2 * 3 * 4).reshape((2, 3, 4))
13861497
assert (arr[1, [0, 1]] == np.array([[12, 13, 14, 15], [16, 17, 18, 19]])).all()
1387-
1388-
1389-
@pytest.mark.parametrize("store", ["memory"], indirect=True)
1390-
@pytest.mark.parametrize("endianness", get_args(Endianness))
1391-
def test_default_endianness(store: Store, zarr_format: ZarrFormat, endianness: Endianness) -> None:
1392-
"""
1393-
Test that that endianness is correctly set when creating an array when not specifying a serializer
1394-
"""
1395-
dtype = Int16(endianness=endianness)
1396-
arr = zarr.create_array(store=store, shape=(1,), dtype=dtype, zarr_format=zarr_format)
1397-
assert endianness_from_numpy_str(arr[:].dtype.byteorder) == endianness
1398-
if zarr_format == 3:
1399-
assert isinstance(arr.metadata, ArrayV3Metadata) # mypy
1400-
assert str(arr.metadata.codecs[0].endian.value) == endianness # type: ignore[union-attr]
1401-
1402-
1403-
@pytest.mark.parametrize("store", ["memory"], indirect=True)
1404-
@pytest.mark.parametrize("endianness", get_args(Endianness))
1405-
def test_explicit_endianness(store: Store, endianness: Endianness) -> None:
1406-
"""
1407-
Test that that a mismatch between the bytescodec endianness and the dtype endianness is an error
1408-
"""
1409-
if endianness == "little":
1410-
dtype = Int16(endianness="big")
1411-
else:
1412-
dtype = Int16(endianness="little")
1413-
1414-
serializer = BytesCodec(endian=endianness)
1415-
1416-
msg = (
1417-
f"The endianness of the requested serializer ({serializer}) does not match the endianness of the dtype ({dtype.endianness}). "
1418-
"The endianness of the serializer and the dtype must match."
1419-
)
1420-
1421-
with pytest.raises(ValueError, match=re.escape(msg)):
1422-
_ = zarr.create_array(
1423-
store=store,
1424-
shape=(1,),
1425-
dtype=dtype,
1426-
zarr_format=3,
1427-
serializer=serializer,
1428-
)
1429-
1430-
# additional check for the case where the serializer has endian=None
1431-
none_serializer = dataclasses.replace(serializer, endian=None)
1432-
msg = (
1433-
f"The endianness of the requested serializer ({none_serializer}) does not match the endianness of the dtype ({dtype.endianness}). "
1434-
"The endianness of the serializer and the dtype must match."
1435-
)
1436-
1437-
with pytest.raises(ValueError, match=re.escape(msg)):
1438-
_ = zarr.create_array(
1439-
store=store,
1440-
shape=(1,),
1441-
dtype=dtype,
1442-
zarr_format=3,
1443-
serializer=none_serializer,
1444-
)

0 commit comments

Comments
 (0)