merge

normanrz · normanrz · commit 305fdb74cd52 · 2025-01-02T11:26:34.000+01:00
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
@@ -680,10 +680,6 @@ async def _create_v2(
             dimension_separator = "."
 
         dtype = parse_dtype(dtype, zarr_format=2)
-        if not filters:
-            filters = _default_filters(dtype)
-        if not compressor:
-            compressor = _default_compressor(dtype)
 
         # inject VLenUTF8 for str dtype if not already present
         if np.issubdtype(dtype, np.str_):
@@ -3501,13 +3497,16 @@ def _get_default_codecs(
     Iterable[dict[str, JSON] | ArrayArrayCodec]
     | ArrayArrayCodec
     | Iterable[numcodecs.abc.Codec]
+    | numcodecs.abc.Codec
     | Literal["auto"]
+    | None
 )
 CompressorsParam: TypeAlias = (
     Iterable[dict[str, JSON] | BytesBytesCodec]
     | BytesBytesCodec
     | numcodecs.abc.Codec
     | Literal["auto"]
+    | None
 )
 ArrayBytesCodecParam: TypeAlias = dict[str, JSON] | ArrayBytesCodec | Literal["auto"]
 
@@ -3568,12 +3567,16 @@ async def create_array(
         of ``ArrayArrayCodec``.
         If ``filters`` and ``compressors`` are not specified, then the default codecs for
         Zarr v3 will be used.
-        These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
+        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit default filters.
 
         For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the
         the order if your filters is consistent with the behavior of each filter.
         If no ``filters`` are provided, a default set of filters will be used.
-        These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`.
+        These defaults can be changed by modifying the value of ``array.v2_default_filters``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit default filters.
     compressors : Iterable[Codec], optional
         List of compressors to apply to the array. Compressors are applied in order, and after any
         filters are applied (if any are specified).
@@ -3582,11 +3585,16 @@ async def create_array(
         returns another bytestream. Multiple compressors my be provided for Zarr v3.
         If ``filters`` and ``compressors`` are not specified, then the default codecs for
         Zarr v3 will be used.
-        These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
+        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit default compressors.
 
-        For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may be provided for Zarr v2.
+        For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may
+        be provided for Zarr v2.
         If no ``compressors`` are provided, a default compressor will be used.
-        These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
+        These defaults can be changed by modifying the value of ``array.v2_default_compressor``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit the default compressor.
     array_bytes_codec : dict[str, JSON] | ArrayBytesCodec, optional
         Array-to-bytes codec to use for encoding the array data.
         Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion.
@@ -3671,6 +3679,7 @@ async def create_array(
         filters_parsed, compressor_parsed = _parse_chunk_encoding_v2(
             compressor=compressors, filters=filters, dtype=np.dtype(dtype)
         )
+
         if dimension_names is not None:
             raise ValueError("Zarr v2 arrays do not support dimension names.")
         if order is None:
@@ -3836,26 +3845,34 @@ def _parse_chunk_encoding_v2(
     """
     default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype)
 
-    _filters: tuple[numcodecs.abc.Codec, ...] | None = None
-    _compressor: numcodecs.abc.Codec | None = None
+    _filters: tuple[numcodecs.abc.Codec, ...] | None
+    _compressor: numcodecs.abc.Codec | None
 
-    if compressor == "auto":
+    if compressor is None or compressor == ():
+        _compressor = None
+    elif compressor == "auto":
         _compressor = default_compressor
+    elif isinstance(compressor, tuple | list) and len(compressor) == 1:
+        _compressor = parse_compressor(compressor[0])
     else:
         if isinstance(compressor, Iterable) and not isinstance(compressor, dict):
             msg = f"For Zarr v2 arrays, the `compressor` must be a single codec. Got an iterable with type {type(compressor)} instead."
             raise TypeError(msg)
         _compressor = parse_compressor(compressor)
 
-    if filters == "auto":
+    if filters is None:
+        _filters = None
+    elif filters == "auto":
         _filters = default_filters
     else:
-        if isinstance(filters, Iterable) and not all(
-            isinstance(f, numcodecs.abc.Codec) for f in filters
-        ):
-            raise TypeError(
-                "For Zarr v2 arrays, all elements of `filters` must be numcodecs codecs."
-            )
+        if isinstance(filters, Iterable):
+            for idx, f in enumerate(filters):
+                if not isinstance(f, numcodecs.abc.Codec):
+                    msg = (
+                        "For Zarr v2 arrays, all elements of `filters` must be numcodecs codecs. "
+                        f"Element at index {idx} has type {type(f)}, which is not a numcodecs codec."
+                    )
+                    raise TypeError(msg)
         _filters = parse_filters(filters)
 
     return _filters, _compressor
@@ -3876,9 +3893,13 @@ def _parse_chunk_encoding_v3(
     )
     maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]]
     maybe_array_array: Iterable[Codec | dict[str, JSON]]
+    out_bytes_bytes: tuple[BytesBytesCodec, ...]
+    if compressors is None:
+        out_bytes_bytes = ()
 
-    if compressors == "auto":
+    elif compressors == "auto":
         out_bytes_bytes = default_bytes_bytes
+
     else:
         if isinstance(compressors, dict | Codec):
             maybe_bytes_bytes = (compressors,)
@@ -3888,8 +3909,10 @@ def _parse_chunk_encoding_v3(
             maybe_bytes_bytes = cast(Iterable[Codec | dict[str, JSON]], compressors)
 
         out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes)
-
-    if filters == "auto":
+    out_array_array: tuple[ArrayArrayCodec, ...]
+    if filters is None:
+        out_array_array = ()
+    elif filters == "auto":
         out_array_array = default_array_array
     else:
         if isinstance(filters, dict | Codec):
diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py
@@ -241,6 +241,9 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None:
                 msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead."
                 raise TypeError(msg)
         return tuple(out)
+    # take a single codec instance and wrap it in a tuple
+    if isinstance(data, numcodecs.abc.Codec):
+        return (data,)
     msg = f"Invalid filters. Expected None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead."
     raise TypeError(msg)
 
diff --git a/src/zarr/storage/memory.py b/src/zarr/storage/memory.py
@@ -19,7 +19,7 @@
 
 class MemoryStore(Store):
     """
-    In-memory store for testing purposes.
+    In-memory store.
 
     Parameters
     ----------
diff --git a/tests/test_array.py b/tests/test_array.py
@@ -26,6 +26,7 @@
     FiltersParam,
     _get_default_chunk_encoding_v2,
     _get_default_chunk_encoding_v3,
+    _parse_chunk_encoding_v2,
     _parse_chunk_encoding_v3,
     chunks_initialized,
     create_array,
@@ -953,42 +954,26 @@ def test_chunks_and_shards() -> None:
     assert arr_v2.shards is None
 
 
-@pytest.mark.parametrize("store", ["memory"], indirect=True)
-@pytest.mark.parametrize(
-    "compressors",
-    [
-        "auto",
-        (ZstdCodec(level=3),),
-        (ZstdCodec(level=3), GzipCodec(level=0)),
-        ZstdCodec(level=3),
-        {"name": "zstd", "configuration": {"level": 3}},
-        ({"name": "zstd", "configuration": {"level": 3}},),
-    ],
-)
-async def test_create_array_v3_compressors(
-    store: MemoryStore, compressors: CompressorsParam
-) -> None:
-    """
-    Test various possibilities for the compressors parameter to create_array
-    """
-    dtype = "uint8"
-    arr = await create_array(
-        store=store,
-        dtype=dtype,
-        shape=(10,),
-        zarr_format=3,
-        compressors=compressors,
-    )
-    _, _, bb_codecs_expected = _parse_chunk_encoding_v3(
-        filters=(), compressors=compressors, array_bytes_codec="auto", dtype=np.dtype(dtype)
-    )
-    # TODO: find a better way to get the compressors from the array.
-    assert arr.codec_pipeline.bytes_bytes_codecs == bb_codecs_expected  # type: ignore[attr-defined]
+def test_create_array_default_fill_values() -> None:
+    a = zarr.create_array(MemoryStore(), shape=5, chunks=5, dtype="<U4")
+    assert a.fill_value == ""
+
+    b = zarr.create_array(MemoryStore(), shape=5, chunks=5, dtype="<S4")
+    assert b.fill_value == b""
+
+    c = zarr.create_array(MemoryStore(), shape=5, chunks=5, dtype="i")
+    assert c.fill_value == 0
+
+    d = zarr.create_array(MemoryStore(), shape=5, chunks=5, dtype="f")
+    assert d.fill_value == 0.0
 
 
 @pytest.mark.parametrize("store", ["memory"], indirect=True)
 @pytest.mark.parametrize("dtype", ["uint8", "float32", "str"])
-async def test_create_array_no_filters_compressors(store: MemoryStore, dtype: str) -> None:
+@pytest.mark.parametrize("empty_value", [None, ()])
+async def test_create_array_no_filters_compressors(
+    store: MemoryStore, dtype: str, empty_value: Any
+) -> None:
     """
     Test that the default ``filters`` and ``compressors`` are removed when ``create_array`` is invoked.
     """
@@ -999,30 +984,21 @@ async def test_create_array_no_filters_compressors(store: MemoryStore, dtype: st
         dtype=dtype,
         shape=(10,),
         zarr_format=2,
-        compressors=None,
-        filters=None,
+        compressors=empty_value,
+        filters=empty_value,
     )
-    assert arr.metadata.filters is None  # type: ignore[union-attr]
-    assert arr.metadata.compressor is None  # type: ignore[union-attr]
-
-    arr = await create_array(
-        store=store,
-        dtype=dtype,
-        shape=(10,),
-        zarr_format=2,
-        compressors=(),
-        filters=(),
-    )
-    assert arr.metadata.filters == ()  # type: ignore[union-attr]
+    # The v2 metadata preserves the distinction between None and () for filters
+    assert arr.metadata.filters == empty_value  # type: ignore[union-attr]
+    # The v2 metadata does not allow a tuple for the compressor, so () is converted to None
     assert arr.metadata.compressor is None  # type: ignore[union-attr]
 
     # v3
     arr = await create_array(
         store=store,
         dtype=dtype,
         shape=(10,),
-        compressors=(),
-        filters=(),
+        compressors=empty_value,
+        filters=empty_value,
     )
     if dtype == "str":
         assert arr.metadata.codecs == [VLenUTF8Codec()]  # type: ignore[union-attr]
@@ -1031,10 +1007,26 @@ async def test_create_array_no_filters_compressors(store: MemoryStore, dtype: st
 
 
 @pytest.mark.parametrize("store", ["memory"], indirect=True)
+@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"])
+@pytest.mark.parametrize(
+    "compressors",
+    [
+        "auto",
+        None,
+        (),
+        (ZstdCodec(level=3),),
+        (ZstdCodec(level=3), GzipCodec(level=0)),
+        ZstdCodec(level=3),
+        {"name": "zstd", "configuration": {"level": 3}},
+        ({"name": "zstd", "configuration": {"level": 3}},),
+    ],
+)
 @pytest.mark.parametrize(
     "filters",
     [
         "auto",
+        None,
+        (),
         (
             TransposeCodec(
                 order=[
@@ -1063,23 +1055,60 @@ async def test_create_array_no_filters_compressors(store: MemoryStore, dtype: st
         ({"name": "transpose", "configuration": {"order": [0]}},),
     ],
 )
-async def test_create_array_v3_filters(store: MemoryStore, filters: FiltersParam) -> None:
+async def test_create_array_v3_chunk_encoding(
+    store: MemoryStore, compressors: CompressorsParam, filters: FiltersParam, dtype: str
+) -> None:
     """
-    Test various possibilities for the filters parameter to create_array
+    Test various possibilities for the compressors and filters parameters to create_array
     """
-    dtype = "uint8"
     arr = await create_array(
         store=store,
         dtype=dtype,
         shape=(10,),
         zarr_format=3,
         filters=filters,
+        compressors=compressors,
     )
-    aa_codecs_expected, _, _ = _parse_chunk_encoding_v3(
-        filters=filters, compressors=(), array_bytes_codec="auto", dtype=np.dtype(dtype)
+    aa_codecs_expected, _, bb_codecs_expected = _parse_chunk_encoding_v3(
+        filters=filters, compressors=compressors, array_bytes_codec="auto", dtype=np.dtype(dtype)
     )
-    # TODO: find a better way to get the filters from the array.
+    # TODO: find a better way to get the filters / compressors from the array.
     assert arr.codec_pipeline.array_array_codecs == aa_codecs_expected  # type: ignore[attr-defined]
+    assert arr.codec_pipeline.bytes_bytes_codecs == bb_codecs_expected  # type: ignore[attr-defined]
+
+
+@pytest.mark.parametrize("store", ["memory"], indirect=True)
+@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"])
+@pytest.mark.parametrize(
+    "compressors",
+    [
+        "auto",
+        None,
+        numcodecs.Zstd(level=3),
+        (),
+        (numcodecs.Zstd(level=3),),
+    ],
+)
+@pytest.mark.parametrize(
+    "filters", ["auto", None, numcodecs.GZip(level=1), (numcodecs.GZip(level=1),)]
+)
+async def test_create_array_v2_chunk_encoding(
+    store: MemoryStore, compressors: CompressorsParam, filters: FiltersParam, dtype: str
+) -> None:
+    arr = await create_array(
+        store=store,
+        dtype=dtype,
+        shape=(10,),
+        zarr_format=2,
+        compressors=compressors,
+        filters=filters,
+    )
+    filters_expected, compressor_expected = _parse_chunk_encoding_v2(
+        filters=filters, compressor=compressors, dtype=np.dtype(dtype)
+    )
+    # TODO: find a better way to get the filters/compressor from the array.
+    assert arr.metadata.compressor == compressor_expected  # type: ignore[union-attr]
+    assert arr.metadata.filters == filters_expected  # type: ignore[union-attr]
 
 
 @pytest.mark.parametrize("store", ["memory"], indirect=True)