Skip to content

Commit a276c84

Browse files
committed
fix endianness
1 parent 9989c64 commit a276c84

File tree

3 files changed

+71
-10
lines changed

3 files changed

+71
-10
lines changed

src/zarr/core/array.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@
6969
ZDTypeLike,
7070
parse_data_type,
7171
)
72+
from zarr.core.dtype._numpy import HasEndianness
7273
from zarr.core.indexing import (
7374
BasicIndexer,
7475
BasicSelection,
@@ -4246,6 +4247,24 @@ def _get_default_chunk_encoding_v3(
42464247
else:
42474248
serializer = zarr_config.get("array.v3_default_serializer.default")
42484249

4250+
# Modify the default serializer so that it matches the endianness of the dtype, otherwise unset the
4251+
# endian key
4252+
4253+
# This is effectively problematic for many reasons:
4254+
# - we are assuming that endianness is set by the serializer, when it could also be changed
4255+
# by any one of the filters.
4256+
# - we are assuming that the serializer has a specific configuration. A different serializer that
4257+
# alters endianness might not use the same configuration structure.
4258+
# - we are mutating a configuration dictionary. It would be much better to work with the codec
4259+
# api for this.
4260+
# All of these things are acceptable right now because there is only 1 serializer that affects
4261+
# endianness, but this design will not last if this situation changes.
4262+
if "endian" in serializer["configuration"]:
4263+
if isinstance(dtype, HasEndianness):
4264+
serializer["configuration"]["endian"] = dtype.endianness
4265+
else:
4266+
serializer["configuration"].pop("endian")
4267+
42494268
return (
42504269
tuple(_parse_array_array_codec(f) for f in filters),
42514270
_parse_array_bytes_codec(serializer),
@@ -4352,6 +4371,20 @@ def _parse_chunk_encoding_v3(
43524371
out_array_bytes = default_array_bytes
43534372
else:
43544373
out_array_bytes = _parse_array_bytes_codec(serializer)
4374+
# check that the endianness of the requested serializer matches the dtype of the data, if applicable
4375+
if (
4376+
isinstance(out_array_bytes, BytesCodec)
4377+
and isinstance(dtype, HasEndianness)
4378+
and (
4379+
out_array_bytes.endian is None
4380+
or str(out_array_bytes.endian.value) != dtype.endianness
4381+
)
4382+
):
4383+
msg = (
4384+
f"The endianness of the requested serializer ({out_array_bytes}) does not match the endianness of the dtype ({dtype.endianness}). "
4385+
"The endianness of the serializer and the dtype must match."
4386+
)
4387+
raise ValueError(msg)
43554388

43564389
if compressors is None:
43574390
out_bytes_bytes: tuple[BytesBytesCodec, ...] = ()

src/zarr/dtype.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from zarr.core.dtype import ZDType, data_type_registry
2+
3+
__all__ = ["ZDType", "data_type_registry"]

tests/test_array.py

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,12 @@
3939
from zarr.core.chunk_grids import _auto_partition
4040
from zarr.core.common import JSON, MemoryOrder, ZarrFormat
4141
from zarr.core.dtype import get_data_type_from_native_dtype
42-
from zarr.core.dtype._numpy import Float64, endianness_from_numpy_str
42+
from zarr.core.dtype._numpy import Float64, Int16, endianness_from_numpy_str
4343
from zarr.core.dtype.common import Endianness
4444
from zarr.core.dtype.wrapper import ZDType
4545
from zarr.core.group import AsyncGroup
4646
from zarr.core.indexing import BasicIndexer, ceildiv
47+
from zarr.core.metadata.v3 import ArrayV3Metadata
4748
from zarr.core.sync import sync
4849
from zarr.errors import ContainsArrayError, ContainsGroupError
4950
from zarr.storage import LocalStore, MemoryStore, StorePath
@@ -53,7 +54,6 @@
5354
if TYPE_CHECKING:
5455
from zarr.core.array_spec import ArrayConfigLike
5556
from zarr.core.metadata.v2 import ArrayV2Metadata
56-
from zarr.core.metadata.v3 import ArrayV3Metadata
5757

5858

5959
@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"])
@@ -1388,16 +1388,41 @@ async def test_sharding_coordinate_selection() -> None:
13881388

13891389
@pytest.mark.parametrize("store", ["memory"], indirect=True)
13901390
@pytest.mark.parametrize("endianness", get_args(Endianness))
1391-
def test_endianness(store: Store, zarr_format: ZarrFormat, endianness: Endianness) -> None:
1391+
def test_default_endianness(store: Store, zarr_format: ZarrFormat, endianness: Endianness) -> None:
13921392
"""
1393-
Test that that endianness is correctly set when creating an array.
1393+
Test that endianness is correctly set when creating an array without specifying a serializer
1394+
"""
1395+
dtype = Int16(endianness=endianness)
1396+
arr = zarr.create_array(store=store, shape=(1,), dtype=dtype, zarr_format=zarr_format)
1397+
assert endianness_from_numpy_str(arr[:].dtype.byteorder) == endianness
1398+
if zarr_format == 3:
1399+
assert isinstance(arr.metadata, ArrayV3Metadata) # mypy
1400+
assert str(arr.metadata.codecs[0].endian.value) == endianness # type: ignore[union-attr]
1401+
1402+
1403+
@pytest.mark.parametrize("store", ["memory"], indirect=True)
1404+
@pytest.mark.parametrize("endianness", get_args(Endianness))
1405+
def test_explicit_endianness(store: Store, endianness: Endianness) -> None:
1406+
"""
1407+
Test that a mismatch between the BytesCodec endianness and the dtype endianness is an error
13941408
"""
13951409
if endianness == "little":
1396-
np_dtype = "<i2"
1410+
dtype = Int16(endianness="big")
13971411
else:
1398-
np_dtype = ">i2"
1412+
dtype = Int16(endianness="little")
13991413

1400-
arr = zarr.create_array(store=store, shape=(1,), dtype=np_dtype, zarr_format=zarr_format)
1401-
assert endianness_from_numpy_str(arr[:].dtype.byteorder) == endianness
1402-
if zarr_format == 3:
1403-
assert str(arr.metadata.codecs[0].endian.value) == endianness
1414+
serializer = BytesCodec(endian=endianness)
1415+
1416+
msg = (
1417+
f"The endianness of the requested serializer ({serializer}) does not match the endianness of the dtype ({dtype.endianness}). "
1418+
"The endianness of the serializer and the dtype must match."
1419+
)
1420+
1421+
with pytest.raises(ValueError, match=re.escape(msg)):
1422+
_ = zarr.create_array(
1423+
store=store,
1424+
shape=(1,),
1425+
dtype=dtype,
1426+
zarr_format=3,
1427+
serializer=serializer,
1428+
)

0 commit comments

Comments
 (0)