Skip to content

Commit 305fdb7

Browse files
committed
merge
2 parents 2f6f8a0 + 9fb8a33 commit 305fdb7

File tree

4 files changed

+132
-77
lines changed

4 files changed

+132
-77
lines changed

src/zarr/core/array.py

Lines changed: 45 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -680,10 +680,6 @@ async def _create_v2(
680680
dimension_separator = "."
681681

682682
dtype = parse_dtype(dtype, zarr_format=2)
683-
if not filters:
684-
filters = _default_filters(dtype)
685-
if not compressor:
686-
compressor = _default_compressor(dtype)
687683

688684
# inject VLenUTF8 for str dtype if not already present
689685
if np.issubdtype(dtype, np.str_):
@@ -3501,13 +3497,16 @@ def _get_default_codecs(
35013497
Iterable[dict[str, JSON] | ArrayArrayCodec]
35023498
| ArrayArrayCodec
35033499
| Iterable[numcodecs.abc.Codec]
3500+
| numcodecs.abc.Codec
35043501
| Literal["auto"]
3502+
| None
35053503
)
35063504
CompressorsParam: TypeAlias = (
35073505
Iterable[dict[str, JSON] | BytesBytesCodec]
35083506
| BytesBytesCodec
35093507
| numcodecs.abc.Codec
35103508
| Literal["auto"]
3509+
| None
35113510
)
35123511
ArrayBytesCodecParam: TypeAlias = dict[str, JSON] | ArrayBytesCodec | Literal["auto"]
35133512

@@ -3568,12 +3567,16 @@ async def create_array(
35683567
of ``ArrayArrayCodec``.
35693568
If ``filters`` and ``compressors`` are not specified, then the default codecs for
35703569
Zarr v3 will be used.
3571-
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
3570+
These defaults can be changed by modifying the value of ``array.v3_default_codecs``
3571+
in :mod:`zarr.core.config`.
3572+
Use ``None`` to omit default filters.
35723573
35733574
For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the
35743575
order of your filters is consistent with the behavior of each filter.
35753576
If no ``filters`` are provided, a default set of filters will be used.
3576-
These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`.
3577+
These defaults can be changed by modifying the value of ``array.v2_default_filters``
3578+
in :mod:`zarr.core.config`.
3579+
Use ``None`` to omit default filters.
35773580
compressors : Iterable[Codec], optional
35783581
List of compressors to apply to the array. Compressors are applied in order, and after any
35793582
filters are applied (if any are specified).
@@ -3582,11 +3585,16 @@ async def create_array(
35823585
returns another bytestream. Multiple compressors may be provided for Zarr v3.
35833586
If ``filters`` and ``compressors`` are not specified, then the default codecs for
35843587
Zarr v3 will be used.
3585-
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
3588+
These defaults can be changed by modifying the value of ``array.v3_default_codecs``
3589+
in :mod:`zarr.core.config`.
3590+
Use ``None`` to omit default compressors.
35863591
3587-
For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may be provided for Zarr v2.
3592+
For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may
3593+
be provided for Zarr v2.
35883594
If no ``compressors`` are provided, a default compressor will be used.
3589-
These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
3595+
These defaults can be changed by modifying the value of ``array.v2_default_compressor``
3596+
in :mod:`zarr.core.config`.
3597+
Use ``None`` to omit the default compressor.
35903598
array_bytes_codec : dict[str, JSON] | ArrayBytesCodec, optional
35913599
Array-to-bytes codec to use for encoding the array data.
35923600
Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion.
@@ -3671,6 +3679,7 @@ async def create_array(
36713679
filters_parsed, compressor_parsed = _parse_chunk_encoding_v2(
36723680
compressor=compressors, filters=filters, dtype=np.dtype(dtype)
36733681
)
3682+
36743683
if dimension_names is not None:
36753684
raise ValueError("Zarr v2 arrays do not support dimension names.")
36763685
if order is None:
@@ -3836,26 +3845,34 @@ def _parse_chunk_encoding_v2(
38363845
"""
38373846
default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype)
38383847

3839-
_filters: tuple[numcodecs.abc.Codec, ...] | None = None
3840-
_compressor: numcodecs.abc.Codec | None = None
3848+
_filters: tuple[numcodecs.abc.Codec, ...] | None
3849+
_compressor: numcodecs.abc.Codec | None
38413850

3842-
if compressor == "auto":
3851+
if compressor is None or compressor == ():
3852+
_compressor = None
3853+
elif compressor == "auto":
38433854
_compressor = default_compressor
3855+
elif isinstance(compressor, tuple | list) and len(compressor) == 1:
3856+
_compressor = parse_compressor(compressor[0])
38443857
else:
38453858
if isinstance(compressor, Iterable) and not isinstance(compressor, dict):
38463859
msg = f"For Zarr v2 arrays, the `compressor` must be a single codec. Got an iterable with type {type(compressor)} instead."
38473860
raise TypeError(msg)
38483861
_compressor = parse_compressor(compressor)
38493862

3850-
if filters == "auto":
3863+
if filters is None:
3864+
_filters = None
3865+
elif filters == "auto":
38513866
_filters = default_filters
38523867
else:
3853-
if isinstance(filters, Iterable) and not all(
3854-
isinstance(f, numcodecs.abc.Codec) for f in filters
3855-
):
3856-
raise TypeError(
3857-
"For Zarr v2 arrays, all elements of `filters` must be numcodecs codecs."
3858-
)
3868+
if isinstance(filters, Iterable):
3869+
for idx, f in enumerate(filters):
3870+
if not isinstance(f, numcodecs.abc.Codec):
3871+
msg = (
3872+
"For Zarr v2 arrays, all elements of `filters` must be numcodecs codecs. "
3873+
f"Element at index {idx} has type {type(f)}, which is not a numcodecs codec."
3874+
)
3875+
raise TypeError(msg)
38593876
_filters = parse_filters(filters)
38603877

38613878
return _filters, _compressor
@@ -3876,9 +3893,13 @@ def _parse_chunk_encoding_v3(
38763893
)
38773894
maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]]
38783895
maybe_array_array: Iterable[Codec | dict[str, JSON]]
3896+
out_bytes_bytes: tuple[BytesBytesCodec, ...]
3897+
if compressors is None:
3898+
out_bytes_bytes = ()
38793899

3880-
if compressors == "auto":
3900+
elif compressors == "auto":
38813901
out_bytes_bytes = default_bytes_bytes
3902+
38823903
else:
38833904
if isinstance(compressors, dict | Codec):
38843905
maybe_bytes_bytes = (compressors,)
@@ -3888,8 +3909,10 @@ def _parse_chunk_encoding_v3(
38883909
maybe_bytes_bytes = cast(Iterable[Codec | dict[str, JSON]], compressors)
38893910

38903911
out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes)
3891-
3892-
if filters == "auto":
3912+
out_array_array: tuple[ArrayArrayCodec, ...]
3913+
if filters is None:
3914+
out_array_array = ()
3915+
elif filters == "auto":
38933916
out_array_array = default_array_array
38943917
else:
38953918
if isinstance(filters, dict | Codec):

src/zarr/core/metadata/v2.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,9 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None:
241241
msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead."
242242
raise TypeError(msg)
243243
return tuple(out)
244+
# take a single codec instance and wrap it in a tuple
245+
if isinstance(data, numcodecs.abc.Codec):
246+
return (data,)
244247
msg = f"Invalid filters. Expected None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead."
245248
raise TypeError(msg)
246249

src/zarr/storage/memory.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020
class MemoryStore(Store):
2121
"""
22-
In-memory store for testing purposes.
22+
In-memory store.
2323
2424
Parameters
2525
----------

tests/test_array.py

Lines changed: 83 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
FiltersParam,
2727
_get_default_chunk_encoding_v2,
2828
_get_default_chunk_encoding_v3,
29+
_parse_chunk_encoding_v2,
2930
_parse_chunk_encoding_v3,
3031
chunks_initialized,
3132
create_array,
@@ -953,42 +954,26 @@ def test_chunks_and_shards() -> None:
953954
assert arr_v2.shards is None
954955

955956

956-
@pytest.mark.parametrize("store", ["memory"], indirect=True)
957-
@pytest.mark.parametrize(
958-
"compressors",
959-
[
960-
"auto",
961-
(ZstdCodec(level=3),),
962-
(ZstdCodec(level=3), GzipCodec(level=0)),
963-
ZstdCodec(level=3),
964-
{"name": "zstd", "configuration": {"level": 3}},
965-
({"name": "zstd", "configuration": {"level": 3}},),
966-
],
967-
)
968-
async def test_create_array_v3_compressors(
969-
store: MemoryStore, compressors: CompressorsParam
970-
) -> None:
971-
"""
972-
Test various possibilities for the compressors parameter to create_array
973-
"""
974-
dtype = "uint8"
975-
arr = await create_array(
976-
store=store,
977-
dtype=dtype,
978-
shape=(10,),
979-
zarr_format=3,
980-
compressors=compressors,
981-
)
982-
_, _, bb_codecs_expected = _parse_chunk_encoding_v3(
983-
filters=(), compressors=compressors, array_bytes_codec="auto", dtype=np.dtype(dtype)
984-
)
985-
# TODO: find a better way to get the compressors from the array.
986-
assert arr.codec_pipeline.bytes_bytes_codecs == bb_codecs_expected # type: ignore[attr-defined]
957+
def test_create_array_default_fill_values() -> None:
958+
a = zarr.create_array(MemoryStore(), shape=5, chunks=5, dtype="<U4")
959+
assert a.fill_value == ""
960+
961+
b = zarr.create_array(MemoryStore(), shape=5, chunks=5, dtype="<S4")
962+
assert b.fill_value == b""
963+
964+
c = zarr.create_array(MemoryStore(), shape=5, chunks=5, dtype="i")
965+
assert c.fill_value == 0
966+
967+
d = zarr.create_array(MemoryStore(), shape=5, chunks=5, dtype="f")
968+
assert d.fill_value == 0.0
987969

988970

989971
@pytest.mark.parametrize("store", ["memory"], indirect=True)
990972
@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"])
991-
async def test_create_array_no_filters_compressors(store: MemoryStore, dtype: str) -> None:
973+
@pytest.mark.parametrize("empty_value", [None, ()])
974+
async def test_create_array_no_filters_compressors(
975+
store: MemoryStore, dtype: str, empty_value: Any
976+
) -> None:
992977
"""
993978
Test that the default ``filters`` and ``compressors`` are removed when ``create_array`` is invoked.
994979
"""
@@ -999,30 +984,21 @@ async def test_create_array_no_filters_compressors(store: MemoryStore, dtype: st
999984
dtype=dtype,
1000985
shape=(10,),
1001986
zarr_format=2,
1002-
compressors=None,
1003-
filters=None,
987+
compressors=empty_value,
988+
filters=empty_value,
1004989
)
1005-
assert arr.metadata.filters is None # type: ignore[union-attr]
1006-
assert arr.metadata.compressor is None # type: ignore[union-attr]
1007-
1008-
arr = await create_array(
1009-
store=store,
1010-
dtype=dtype,
1011-
shape=(10,),
1012-
zarr_format=2,
1013-
compressors=(),
1014-
filters=(),
1015-
)
1016-
assert arr.metadata.filters == () # type: ignore[union-attr]
990+
# The v2 metadata stores None and () separately
991+
assert arr.metadata.filters == empty_value # type: ignore[union-attr]
992+
# The v2 metadata does not allow tuple for compressor, therefore it is turned into None
1017993
assert arr.metadata.compressor is None # type: ignore[union-attr]
1018994

1019995
# v3
1020996
arr = await create_array(
1021997
store=store,
1022998
dtype=dtype,
1023999
shape=(10,),
1024-
compressors=(),
1025-
filters=(),
1000+
compressors=empty_value,
1001+
filters=empty_value,
10261002
)
10271003
if dtype == "str":
10281004
assert arr.metadata.codecs == [VLenUTF8Codec()] # type: ignore[union-attr]
@@ -1031,10 +1007,26 @@ async def test_create_array_no_filters_compressors(store: MemoryStore, dtype: st
10311007

10321008

10331009
@pytest.mark.parametrize("store", ["memory"], indirect=True)
1010+
@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"])
1011+
@pytest.mark.parametrize(
1012+
"compressors",
1013+
[
1014+
"auto",
1015+
None,
1016+
(),
1017+
(ZstdCodec(level=3),),
1018+
(ZstdCodec(level=3), GzipCodec(level=0)),
1019+
ZstdCodec(level=3),
1020+
{"name": "zstd", "configuration": {"level": 3}},
1021+
({"name": "zstd", "configuration": {"level": 3}},),
1022+
],
1023+
)
10341024
@pytest.mark.parametrize(
10351025
"filters",
10361026
[
10371027
"auto",
1028+
None,
1029+
(),
10381030
(
10391031
TransposeCodec(
10401032
order=[
@@ -1063,23 +1055,60 @@ async def test_create_array_no_filters_compressors(store: MemoryStore, dtype: st
10631055
({"name": "transpose", "configuration": {"order": [0]}},),
10641056
],
10651057
)
1066-
async def test_create_array_v3_filters(store: MemoryStore, filters: FiltersParam) -> None:
1058+
async def test_create_array_v3_chunk_encoding(
1059+
store: MemoryStore, compressors: CompressorsParam, filters: FiltersParam, dtype: str
1060+
) -> None:
10671061
"""
1068-
Test various possibilities for the filters parameter to create_array
1062+
Test various possibilities for the compressors and filters parameters to create_array
10691063
"""
1070-
dtype = "uint8"
10711064
arr = await create_array(
10721065
store=store,
10731066
dtype=dtype,
10741067
shape=(10,),
10751068
zarr_format=3,
10761069
filters=filters,
1070+
compressors=compressors,
10771071
)
1078-
aa_codecs_expected, _, _ = _parse_chunk_encoding_v3(
1079-
filters=filters, compressors=(), array_bytes_codec="auto", dtype=np.dtype(dtype)
1072+
aa_codecs_expected, _, bb_codecs_expected = _parse_chunk_encoding_v3(
1073+
filters=filters, compressors=compressors, array_bytes_codec="auto", dtype=np.dtype(dtype)
10801074
)
1081-
# TODO: find a better way to get the filters from the array.
1075+
# TODO: find a better way to get the filters / compressors from the array.
10821076
assert arr.codec_pipeline.array_array_codecs == aa_codecs_expected # type: ignore[attr-defined]
1077+
assert arr.codec_pipeline.bytes_bytes_codecs == bb_codecs_expected # type: ignore[attr-defined]
1078+
1079+
1080+
@pytest.mark.parametrize("store", ["memory"], indirect=True)
1081+
@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"])
1082+
@pytest.mark.parametrize(
1083+
"compressors",
1084+
[
1085+
"auto",
1086+
None,
1087+
numcodecs.Zstd(level=3),
1088+
(),
1089+
(numcodecs.Zstd(level=3),),
1090+
],
1091+
)
1092+
@pytest.mark.parametrize(
1093+
"filters", ["auto", None, numcodecs.GZip(level=1), (numcodecs.GZip(level=1),)]
1094+
)
1095+
async def test_create_array_v2_chunk_encoding(
1096+
store: MemoryStore, compressors: CompressorsParam, filters: FiltersParam, dtype: str
1097+
) -> None:
1098+
arr = await create_array(
1099+
store=store,
1100+
dtype=dtype,
1101+
shape=(10,),
1102+
zarr_format=2,
1103+
compressors=compressors,
1104+
filters=filters,
1105+
)
1106+
filters_expected, compressor_expected = _parse_chunk_encoding_v2(
1107+
filters=filters, compressor=compressors, dtype=np.dtype(dtype)
1108+
)
1109+
# TODO: find a better way to get the filters/compressor from the array.
1110+
assert arr.metadata.compressor == compressor_expected # type: ignore[union-attr]
1111+
assert arr.metadata.filters == filters_expected # type: ignore[union-attr]
10831112

10841113

10851114
@pytest.mark.parametrize("store", ["memory"], indirect=True)

0 commit comments

Comments
 (0)