fix config test failures

d-v-b · d-v-b · commit 2a7b5a8cead0 · 2025-03-17T16:11:26.000+01:00
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
@@ -30,6 +30,7 @@
 from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec
 from zarr.abc.store import Store, set_or_delete
 from zarr.codecs._v2 import V2Codec
+from zarr.codecs.bytes import BytesCodec
 from zarr.core._info import ArrayInfo
 from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, ArraySpec, parse_array_config
 from zarr.core.attributes import Attributes
@@ -4231,7 +4232,6 @@ def _get_default_chunk_encoding_v3(
         compressors = zarr_config.get(f"array.v3_default_compressors.{dtype._zarr_v3_name}")
     else:
         compressors = zarr_config.get("array.v3_default_compressors.default")
-
     if dtype._zarr_v3_name in zarr_config.get("array.v3_default_serializer"):
         serializer = zarr_config.get(f"array.v3_default_serializer.{dtype._zarr_v3_name}")
     else:
@@ -4353,6 +4353,14 @@ def _parse_chunk_encoding_v3(
 
         out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes)
 
+    # specialize codecs as needed given the dtype
+
+    # TODO: refactor so that the config only contains the name of the codec, and we use the dtype
+    # to create the codec instance, instead of storing a dict representation of a full codec.
+
+    if isinstance(out_array_bytes, BytesCodec) and dtype.to_dtype().itemsize == 1:
+        # A 1-byte dtype has no byte order, so drop any endianness set by the default BytesCodec
+        out_array_bytes = replace(out_array_bytes, endian=None)
     return out_array_array, out_array_bytes, out_bytes_bytes
 
 
diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py
@@ -84,7 +84,7 @@ def enable_gpu(self) -> ConfigSet:
                     "fixed_length_ucs4": [{"id": "vlen-utf8"}],
                     "fixed_length_ascii": [{"id": "vlen-bytes"}],
                 },
-                "v3_default_filters": {"default": ()},
+                "v3_default_filters": {"default": []},
                 "v3_default_serializer": {
                     "default": {"name": "bytes", "configuration": {"endian": "little"}},
                     "variable_length_utf8": {"name": "vlen-utf8"},
diff --git a/src/zarr/core/dtype/_numpy.py b/src/zarr/core/dtype/_numpy.py
@@ -711,7 +711,7 @@ def to_dict(self) -> dict[str, JSON]:
     @classmethod
     def _from_dtype_unsafe(cls, dtype: np.dtypes.DateTime64DType) -> Self:
         unit = dtype.name[dtype.name.rfind("[") + 1 : dtype.name.rfind("]")]
-        if unit not in get_args(DateUnit | TimeUnit):
+        if unit not in get_args(DateUnit) and unit not in get_args(TimeUnit):
             raise DataTypeValidationError('Invalid unit for "numpy.datetime64"')
         return cls(unit=unit, endianness=endianness_from_numpy_str(dtype.byteorder))
 
diff --git a/tests/test_array.py b/tests/test_array.py
@@ -1,4 +1,5 @@
 import dataclasses
+import inspect
 import json
 import math
 import multiprocessing as mp
@@ -28,8 +29,6 @@
 from zarr.core.array import (
     CompressorsLike,
     FiltersLike,
-    _get_default_chunk_encoding_v2,
-    _get_default_chunk_encoding_v3,
     _parse_chunk_encoding_v2,
     _parse_chunk_encoding_v3,
     chunks_initialized,
@@ -1064,13 +1063,23 @@ async def test_default_filters_compressors(
             shape=(10,),
             zarr_format=zarr_format,
         )
+
+        sig = inspect.signature(create_array)
+
         if zarr_format == 3:
-            expected_filters, expected_serializer, expected_compressors = (
-                _get_default_chunk_encoding_v3(dtype=zdtype)
+            expected_filters, expected_serializer, expected_compressors = _parse_chunk_encoding_v3(
+                compressors=sig.parameters["compressors"].default,
+                filters=sig.parameters["filters"].default,
+                serializer=sig.parameters["serializer"].default,
+                dtype=zdtype,
             )
 
         elif zarr_format == 2:
-            default_filters, default_compressors = _get_default_chunk_encoding_v2(dtype=zdtype)
+            default_filters, default_compressors = _parse_chunk_encoding_v2(
+                compressor=sig.parameters["compressors"].default,
+                filters=sig.parameters["filters"].default,
+                dtype=zdtype,
+            )
             if default_filters is None:
                 expected_filters = ()
             else:
diff --git a/tests/test_config.py b/tests/test_config.py
@@ -19,10 +19,12 @@
     GzipCodec,
     ShardingCodec,
 )
+from zarr.core.array import create_array
 from zarr.core.array_spec import ArraySpec
 from zarr.core.buffer import NDBuffer
 from zarr.core.codec_pipeline import BatchedCodecPipeline
 from zarr.core.config import BadConfigError, config
+from zarr.core.dtype import get_data_type_from_numpy
 from zarr.core.indexing import SelectorTuple
 from zarr.registry import (
     fully_qualified_name,
@@ -52,33 +54,24 @@ def test_config_defaults_set() -> None:
             "array": {
                 "order": "C",
                 "write_empty_chunks": False,
-                "v2_default_compressor": {
-                    "numeric": {"id": "zstd", "level": 0, "checksum": False},
-                    "string": {"id": "zstd", "level": 0, "checksum": False},
-                    "bytes": {"id": "zstd", "level": 0, "checksum": False},
-                },
+                "v2_default_compressor": {"default": {"id": "zstd", "level": 0, "checksum": False}},
                 "v2_default_filters": {
-                    "numeric": None,
-                    "string": [{"id": "vlen-utf8"}],
-                    "bytes": [{"id": "vlen-bytes"}],
-                    "raw": None,
+                    "default": None,
+                    "variable_length_utf8": [{"id": "vlen-utf8"}],
+                    "fixed_length_ucs4": [{"id": "vlen-utf8"}],
+                    "fixed_length_ascii": [{"id": "vlen-bytes"}],
                 },
-                "v3_default_filters": {"numeric": [], "string": [], "bytes": []},
+                "v3_default_filters": {"default": []},
                 "v3_default_serializer": {
-                    "numeric": {"name": "bytes", "configuration": {"endian": "little"}},
-                    "string": {"name": "vlen-utf8"},
-                    "bytes": {"name": "vlen-bytes"},
+                    "default": {"name": "bytes", "configuration": {"endian": "little"}},
+                    "variable_length_utf8": {"name": "vlen-utf8"},
+                    "fixed_length_ucs4": {"name": "vlen-utf8"},
+                    "r*": {"name": "vlen-bytes"},
                 },
                 "v3_default_compressors": {
-                    "numeric": [
-                        {"name": "zstd", "configuration": {"level": 0, "checksum": False}},
-                    ],
-                    "string": [
-                        {"name": "zstd", "configuration": {"level": 0, "checksum": False}},
-                    ],
-                    "bytes": [
+                    "default": [
                         {"name": "zstd", "configuration": {"level": 0, "checksum": False}},
-                    ],
+                    ]
                 },
             },
             "async": {"concurrency": 10, "timeout": None},
@@ -306,26 +299,22 @@ class NewCodec2(BytesCodec):
 
 @pytest.mark.parametrize("dtype", ["int", "bytes", "str"])
 async def test_default_codecs(dtype: str) -> None:
-    with config.set(
-        {
-            "array.v3_default_compressors": {  # test setting non-standard codecs
-                "numeric": [
-                    {"name": "gzip", "configuration": {"level": 5}},
-                ],
-                "string": [
-                    {"name": "gzip", "configuration": {"level": 5}},
-                ],
-                "bytes": [
-                    {"name": "gzip", "configuration": {"level": 5}},
-                ],
-            }
-        }
-    ):
-        arr = await zarr.api.asynchronous.create_array(
+    """
+    Test that the default compressors are sensitive to the current setting of the config.
+    """
+    zdtype = get_data_type_from_numpy(dtype)
+    expected_compressors = (GzipCodec(),)
+    new_conf = {
+        f"array.v3_default_compressors.{zdtype._zarr_v3_name}": [
+            c.to_dict() for c in expected_compressors
+        ]
+    }
+    with config.set(new_conf):
+        arr = await create_array(
             shape=(100,),
             chunks=(100,),
-            dtype=np.dtype(dtype),
+            dtype=dtype,
             zarr_format=3,
             store=MemoryStore(),
         )
-        assert arr.compressors == (GzipCodec(),)
+        assert arr.compressors == expected_compressors