Merged
Changes from 10 commits
45 commits
1fa42d9
add default compressor to config
brokkoli71 Nov 6, 2024
02053e9
modify _default_compressor to _default_filters_and_compressor
brokkoli71 Nov 6, 2024
6ac38ea
fix test_metadata_to_dict
brokkoli71 Nov 6, 2024
9507e19
wip debugging
brokkoli71 Nov 6, 2024
3727b4a
Merge branch 'master' into default-compressor
brokkoli71 Nov 13, 2024
f93ced2
format
brokkoli71 Nov 13, 2024
07590ca
fix v2 decode string dtype
brokkoli71 Nov 13, 2024
4e2a3bc
fix config default tests
brokkoli71 Nov 13, 2024
0fc7b23
format
brokkoli71 Nov 13, 2024
35849c7
Merge branch 'main' into default-compressor
brokkoli71 Nov 17, 2024
8ec16e8
Update src/zarr/codecs/_v2.py
normanrz Dec 6, 2024
d6dc146
rename v2_dtype_kind_to_default_filters_and_compressor to v2_default_…
brokkoli71 Dec 11, 2024
78ab221
merge main into default-compressor
brokkoli71 Dec 11, 2024
15577ae
recover test_v2.py
brokkoli71 Dec 11, 2024
67010ce
incorporate feedback
brokkoli71 Dec 11, 2024
f6b98c3
incorporate feedback
brokkoli71 Dec 11, 2024
fcbae8b
fix mypy
brokkoli71 Dec 11, 2024
75a858d
Merge remote-tracking branch 'origin/default-compressor' into default…
brokkoli71 Dec 11, 2024
a77fb0d
allow only one default compressor
brokkoli71 Dec 11, 2024
d11bf30
Merge remote-tracking branch 'refs/remotes/upstream/main' into defaul…
brokkoli71 Dec 14, 2024
876e67d
put `v2_default_compressor` under `array`
brokkoli71 Dec 14, 2024
12dfaf4
deprecate zarr.storage.default_compressor
brokkoli71 Dec 14, 2024
6954b60
test v3_default_codecs
brokkoli71 Dec 14, 2024
80dfc40
use v3_default_codecs
brokkoli71 Dec 14, 2024
6001e93
fix tests that expected codecs==["bytes"]
brokkoli71 Dec 14, 2024
ff76617
fix test_default_codecs
brokkoli71 Dec 14, 2024
f04e0e6
fail-fast: false
brokkoli71 Dec 14, 2024
f63bb67
fix string codecs for np1.25
brokkoli71 Dec 14, 2024
00e241e
format
brokkoli71 Dec 14, 2024
58406c8
add docstrings to create in asynchronous.py and array.py
brokkoli71 Dec 18, 2024
fc09989
add docstrings to creation in group.py
brokkoli71 Dec 18, 2024
eed4427
Merge branch 'main' into default-compressor
brokkoli71 Dec 18, 2024
c62aff5
Apply suggestions from code review
brokkoli71 Dec 18, 2024
48c7448
apply suggestions from review
brokkoli71 Dec 18, 2024
083c4cb
correct code double backticks
brokkoli71 Dec 18, 2024
500bc7b
correct attribute links in docstring
brokkoli71 Dec 18, 2024
cdf5542
link zarr.core.config in docstrings
brokkoli71 Dec 18, 2024
43307b3
Merge branch 'main' into default-compressor
brokkoli71 Dec 18, 2024
390c435
improve docstring readability
brokkoli71 Dec 18, 2024
35e35c4
correct config docstring
brokkoli71 Dec 18, 2024
92de85c
correct config docstring
brokkoli71 Dec 18, 2024
6fd3f25
improve config docstring
brokkoli71 Dec 18, 2024
ea228ca
Merge branch 'main' into default-compressor
normanrz Dec 19, 2024
3933c05
Merge branch 'main' into default-compressor
normanrz Dec 19, 2024
9ac82d1
Merge branch 'main' into default-compressor
normanrz Dec 19, 2024
9 changes: 8 additions & 1 deletion src/zarr/codecs/_v2.py
@@ -5,6 +5,7 @@
from typing import TYPE_CHECKING

import numcodecs
import numpy as np
from numcodecs.compat import ensure_ndarray_like

from zarr.abc.codec import ArrayBytesCodec
@@ -43,10 +44,16 @@ async def _decode_single(

# view as numpy array with correct dtype
chunk = ensure_ndarray_like(chunk)
print(chunk)
print(chunk.dtype)
# special case object dtype, because incorrect handling can lead to
# segfaults and other bad things happening
if chunk_spec.dtype != object:
chunk = chunk.view(chunk_spec.dtype)
try:
chunk = chunk.view(chunk_spec.dtype)
except TypeError:
chunk = np.array(chunk).astype(chunk_spec.dtype)

elif chunk.dtype != object:
# If we end up here, someone must have hacked around with the filters.
# We cannot deal with object arrays unless there is an object
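Note: the `try`/`except TypeError` above covers chunks that decode to object arrays (e.g. after a vlen-bytes filter), which `ndarray.view` cannot reinterpret. A minimal standalone sketch of the same fallback, assuming a bytes target dtype:

```python
import numpy as np

# A vlen-bytes filter hands the decoder an object array of Python bytes.
decoded = np.array([b"a", b"bb", b"ccc"], dtype=object)
target = np.dtype("|S3")

try:
    # Object arrays cannot be reinterpreted in place with view().
    chunk = decoded.view(target)
except TypeError:
    # Fall back to a copying conversion, mirroring _decode_single above.
    chunk = np.array(decoded).astype(target)

print(chunk)  # [b'a' b'bb' b'ccc'], now dtype "|S3"
```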
8 changes: 0 additions & 8 deletions src/zarr/core/array.py
@@ -493,14 +493,6 @@ async def create(
order=order,
)
elif zarr_format == 2:
if dtype is str or dtype == "str":
# another special case: zarr v2 added the vlen-utf8 codec
vlen_codec: dict[str, JSON] = {"id": "vlen-utf8"}
if filters and not any(x["id"] == "vlen-utf8" for x in filters):
filters = list(filters) + [vlen_codec]
else:
filters = [vlen_codec]

if codecs is not None:
raise ValueError(
"codecs cannot be used for arrays with version 2. Use filters and compressor instead."
5 changes: 5 additions & 0 deletions src/zarr/core/config.py
@@ -64,6 +64,11 @@ def reset(self) -> None:
},
"buffer": "zarr.core.buffer.cpu.Buffer",
"ndbuffer": "zarr.core.buffer.cpu.NDBuffer",
"v2_dtype_kind_to_default_filters_and_compressor": {
"biufcmM": ["zstd"],
"U": ["vlen-utf8"],
"OSV": ["vlen-bytes"],
},
}
],
)
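Note: each key in `v2_dtype_kind_to_default_filters_and_compressor` is a string of numpy `dtype.kind` characters, and each value is a list of numcodecs codec ids applied as filters. The setting can be overridden at runtime with `zarr.config.set`, as the tests below do — a minimal sketch assuming the defaults shown here:

```python
import zarr

# Override the v2 default mapping for this block only; keys are numpy
# dtype.kind characters, values are lists of numcodecs codec ids.
with zarr.config.set(
    {
        "v2_dtype_kind_to_default_filters_and_compressor": {
            "biufcmM": ["zstd"],
        },
    }
):
    arr = zarr.create(shape=(3,), zarr_format=2, dtype="float64")
    print(arr.metadata.filters[0].codec_id)  # "zstd"
```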
27 changes: 26 additions & 1 deletion src/zarr/core/metadata/v2.py
@@ -4,7 +4,7 @@
from collections.abc import Iterable
from enum import Enum
from functools import cached_property
from typing import TYPE_CHECKING, TypedDict, cast
from typing import TYPE_CHECKING, Any, TypedDict, cast

from zarr.abc.metadata import Metadata

@@ -71,6 +71,14 @@ def __init__(
shape_parsed = parse_shapelike(shape)
dtype_parsed = parse_dtype(dtype)
chunks_parsed = parse_shapelike(chunks)
if not filters and not compressor:
filters, compressor = _default_filters_and_compressor(dtype_parsed)
if dtype is str or dtype == "str":
vlen_codec: dict[str, JSON] = {"id": "vlen-utf8"}
if filters and not any(x["id"] == "vlen-utf8" for x in filters):
filters = list(filters) + [vlen_codec]
else:
filters = [vlen_codec]
compressor_parsed = parse_compressor(compressor)
order_parsed = parse_indexing_order(order)
dimension_separator_parsed = parse_separator(dimension_separator)
@@ -326,3 +334,20 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any:
return ""
else:
return dtype.type(0)


def _default_filters_and_compressor(
dtype: np.dtype[Any],
) -> tuple[list[dict[str, str]], dict[str, str] | None]:
"""Get the default filters and compressor for a dtype.

The config contains a mapping from numpy dtype kind to the default compressor.
https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html
"""
dtype_kind_to_default_compressor = config.get("v2_dtype_kind_to_default_filters_and_compressor")
for dtype_kinds, filters_and_compressor in dtype_kind_to_default_compressor.items():
if dtype.kind in dtype_kinds:
filters = [{"id": f} for f in filters_and_compressor]
compressor = None
return filters, compressor
return [], None
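Note: the lookup scans the configured mapping and returns filter dicts for the first key that contains the dtype's `kind` character; dtypes with no matching kind fall through to no filters and no compressor. A standalone sketch of the same resolution logic, using the defaults from `config.py`:

```python
import numpy as np

# Default mapping from config.py: dtype.kind characters -> numcodecs ids.
mapping = {
    "biufcmM": ["zstd"],    # bool, (u)int, float, complex, timedelta, datetime
    "U": ["vlen-utf8"],     # fixed-width unicode strings
    "OSV": ["vlen-bytes"],  # object, bytes, void
}

def default_filters(dtype: np.dtype) -> list[dict[str, str]]:
    for kinds, codec_ids in mapping.items():
        if dtype.kind in kinds:
            return [{"id": codec_id} for codec_id in codec_ids]
    return []

print(default_filters(np.dtype("float64")))  # [{'id': 'zstd'}]
print(default_filters(np.dtype("U10")))      # [{'id': 'vlen-utf8'}]
```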
2 changes: 2 additions & 0 deletions tests/test_array.py
@@ -4,6 +4,7 @@
from itertools import accumulate
from typing import Any, Literal

import numcodecs
import numpy as np
import pytest

@@ -455,6 +456,7 @@ def test_info_v2(self) -> None:
_read_only=False,
_store_type="MemoryStore",
_count_bytes=128,
_filters=(numcodecs.Zstd(),),
)
assert result == expected

5 changes: 5 additions & 0 deletions tests/test_config.py
@@ -63,6 +63,11 @@ def test_config_defaults_set() -> None:
"vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec",
"vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec",
},
"v2_dtype_kind_to_default_filters_and_compressor": {
"biufcmM": ["zstd"],
"U": ["vlen-utf8"],
"OSV": ["vlen-bytes"],
},
}
]
assert config.get("array.order") == "C"
11 changes: 10 additions & 1 deletion tests/test_metadata/test_v2.py
@@ -11,7 +11,7 @@
from zarr.core.buffer import cpu
from zarr.core.group import ConsolidatedMetadata, GroupMetadata
from zarr.core.metadata import ArrayV2Metadata
from zarr.core.metadata.v2 import parse_zarr_format
from zarr.core.metadata.v2 import _default_filters_and_compressor, parse_zarr_format

if TYPE_CHECKING:
from typing import Any
@@ -77,6 +77,15 @@ def test_metadata_to_dict(
assert observed["dimension_separator"] == expected_dimension_sep
observed.pop("dimension_separator")

if not filters and not compressor:
assert observed["filters"], observed["compressor"] == _default_filters_and_compressor(
np.dtype(data_type)
)
observed.pop("filters")
observed.pop("compressor")
expected.pop("filters")
expected.pop("compressor")

assert observed == expected


116 changes: 82 additions & 34 deletions tests/test_v2.py
@@ -11,7 +11,7 @@
import zarr
import zarr.core.buffer
import zarr.storage
from zarr import Array
from zarr import Array, config
from zarr.storage import MemoryStore, StorePath


@@ -82,47 +82,76 @@ def test_codec_pipeline() -> None:

@pytest.mark.parametrize("dtype", ["|S", "|V"])
async def test_v2_encode_decode(dtype):
store = zarr.storage.MemoryStore()
g = zarr.group(store=store, zarr_format=2)
g.create_array(
name="foo",
shape=(3,),
chunks=(3,),
dtype=dtype,
fill_value=b"X",
)

result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype())
assert result is not None

serialized = json.loads(result.to_bytes())
expected = {
"chunks": [3],
"compressor": None,
"dtype": f"{dtype}0",
"fill_value": "WA==",
"filters": None,
"order": "C",
"shape": [3],
"zarr_format": 2,
"dimension_separator": ".",
}
assert serialized == expected

data = zarr.open_array(store=store, path="foo")[:]
expected = np.full((3,), b"X", dtype=dtype)
np.testing.assert_equal(data, expected)
with config.set(
{
"v2_dtype_kind_to_default_filters_and_compressor": {
"SV": ["vlen-bytes"],
},
}
):
store = zarr.storage.MemoryStore()
g = zarr.group(store=store, zarr_format=2)
g.create_array(
name="foo",
shape=(3,),
chunks=(3,),
dtype=dtype,
fill_value=b"X",
)

result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype())
assert result is not None

serialized = json.loads(result.to_bytes())
expected = {
"chunks": [3],
"compressor": None,
"dtype": f"{dtype}0",
"fill_value": "WA==",
"filters": [{"id": "vlen-bytes"}],
"order": "C",
"shape": [3],
"zarr_format": 2,
"dimension_separator": ".",
}
assert serialized == expected

data = zarr.open_array(store=store, path="foo")[:]
expected = np.full((3,), b"X", dtype=dtype)
np.testing.assert_equal(data, expected)


@pytest.mark.parametrize("dtype_value", [["|S", b"Y"], ["|U", "Y"], ["O", b"Y"]])
def test_v2_encode_decode_with_data(dtype_value):
dtype, value = dtype_value
with config.set(
{
"v2_dtype_kind_to_default_filters_and_compressor": {
"U": ["vlen-utf8"],
"OSV": ["vlen-bytes"],
},
}
):
expected = np.full((3,), value, dtype=dtype)
a = zarr.create(
shape=(3,),
zarr_format=2,
dtype=dtype,
)
a[:] = expected
data = a[:]
np.testing.assert_equal(data, expected)


@pytest.mark.parametrize("dtype", [str, "str"])
async def test_create_dtype_str(dtype: Any) -> None:
arr = zarr.create(shape=3, dtype=dtype, zarr_format=2)
assert arr.dtype.kind == "O"
assert arr.metadata.to_dict()["dtype"] == "|O"
assert arr.metadata.filters == (numcodecs.vlen.VLenUTF8(),)
arr[:] = ["a", "bb", "ccc"]
assert arr.metadata.filters == (numcodecs.vlen.VLenBytes(),)
arr[:] = [b"a", b"bb", b"ccc"]
result = arr[:]
np.testing.assert_array_equal(result, np.array(["a", "bb", "ccc"], dtype="object"))
np.testing.assert_array_equal(result, np.array([b"a", b"bb", b"ccc"], dtype="object"))


@pytest.mark.parametrize("filters", [[], [numcodecs.Delta(dtype="<i4")], [numcodecs.Zlib(level=2)]])
@@ -132,3 +161,22 @@ def test_v2_filters_codecs(filters: Any) -> None:
arr[:] = array_fixture
result = arr[:]
np.testing.assert_array_equal(result, array_fixture)


@pytest.mark.parametrize(
"dtype_expected",
[["b", "zstd"], ["i", "zstd"], ["f", "zstd"], ["|S1", "vlen-bytes"], ["|U1", "vlen-utf8"]],
)
def test_default_filters_and_compressor(dtype_expected: Any) -> None:
with config.set(
{
"v2_dtype_kind_to_default_filters_and_compressor": {
"biufcmM": ["zstd"],
"U": ["vlen-utf8"],
"OSV": ["vlen-bytes"],
},
}
):
dtype, expected = dtype_expected
arr = zarr.create(shape=(3,), path="foo", store={}, zarr_format=2, dtype=dtype)
assert arr.metadata.filters[0].codec_id == expected
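Note: with the shipped defaults and no explicit filters or compressor, a v2 array picks up its filters from the dtype kind alone — a short usage sketch, assuming the default config:

```python
import zarr

# No config override: defaults come from v2_dtype_kind_to_default_filters_and_compressor.
floats = zarr.create(shape=(3,), zarr_format=2, dtype="float64")
texts = zarr.create(shape=(3,), zarr_format=2, dtype="U5")

print(floats.metadata.filters[0].codec_id)  # "zstd"
print(texts.metadata.filters[0].codec_id)   # "vlen-utf8"
```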