Skip to content

Commit 5c63de3

Browse files
committed
merged #2652 in
1 parent c8b96a5 commit 5c63de3

File tree

7 files changed

+126
-134
lines changed

7 files changed

+126
-134
lines changed

docs/user-guide/config.rst

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ Configuration options include the following:
2828

2929
- Default Zarr format ``default_zarr_version``
3030
- Default array order in memory ``array.order``
31-
- Default codecs ``array.v3_default_codecs`` and ``array.v2_default_compressor``
31+
- Default filters, serializers and compressors, e.g. ``array.v3_default_filters``, ``array.v3_default_serializer``, ``array.v3_default_compressors``, ``array.v2_default_filters`` and ``array.v2_default_compressor``
3232
- Whether empty chunks are written to storage ``array.write_empty_chunks``
3333
- Async and threading options, e.g. ``async.concurrency`` and ``threading.max_workers``
3434
- Selections of implementations of codecs, codec pipelines and buffers
@@ -54,19 +54,20 @@ This is the current default configuration::
5454
'v2_default_filters': {'bytes': [{'id': 'vlen-bytes'}],
5555
'numeric': None,
5656
'string': [{'id': 'vlen-utf8'}]},
57-
'v3_default_codecs': {'bytes': [{'name': 'vlen-bytes'},
58-
{'configuration': {'checksum': False,
59-
'level': 0},
60-
'name': 'zstd'}],
61-
'numeric': [{'configuration': {'endian': 'little'},
62-
'name': 'bytes'},
63-
{'configuration': {'checksum': False,
64-
'level': 0},
65-
'name': 'zstd'}],
66-
'string': [{'name': 'vlen-utf8'},
67-
{'configuration': {'checksum': False,
68-
'level': 0},
69-
'name': 'zstd'}]},
57+
'v3_default_compressors': {'bytes': [{'configuration': {'checksum': False,
58+
'level': 0},
59+
'name': 'zstd'}],
60+
'numeric': [{'configuration': {'checksum': False,
61+
'level': 0},
62+
'name': 'zstd'}],
63+
'string': [{'configuration': {'checksum': False,
64+
'level': 0},
65+
'name': 'zstd'}]},
66+
'v3_default_filters': {'bytes': [], 'numeric': [], 'string': []},
67+
'v3_default_serializer': {'bytes': {'name': 'vlen-bytes'},
68+
'numeric': {'configuration': {'endian': 'little'},
69+
'name': 'bytes'},
70+
'string': {'name': 'vlen-utf8'}},
7071
'write_empty_chunks': False},
7172
'async': {'concurrency': 10, 'timeout': None},
7273
'buffer': 'zarr.core.buffer.cpu.Buffer',

src/zarr/api/asynchronous.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -892,7 +892,8 @@ async def create(
892892
- For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
893893
- For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
894894
895-
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
895+
These defaults can be changed by modifying the value of ``array.v3_default_filters``,
896+
``array.v3_default_serializer`` and ``array.v3_default_compressors`` in :mod:`zarr.core.config`.
896897
compressor : Codec, optional
897898
Primary compressor to compress chunk data.
898899
V2 only. V3 arrays should use ``codecs`` instead.

src/zarr/api/synchronous.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -788,9 +788,8 @@ def create_array(
788788
For Zarr v3, a "filter" is a codec that takes an array and returns an array,
789789
and these values must be instances of ``ArrayArrayCodec``, or dict representations
790790
of ``ArrayArrayCodec``.
791-
If ``filters`` and ``compressors`` are not specified, then the default codecs for
792-
Zarr v3 will be used.
793-
These defaults can be changed by modifying the value of ``array.v3_default_codecs``
791+
If no ``filters`` are provided, a default set of filters will be used.
792+
These defaults can be changed by modifying the value of ``array.v3_default_filters``
794793
in :mod:`zarr.core.config`.
795794
Use ``None`` to omit default filters.
796795
@@ -806,22 +805,22 @@ def create_array(
806805
807806
For Zarr v3, a "compressor" is a codec that takes a bytestream, and
808807
returns another bytestream. Multiple compressors may be provided for Zarr v3.
809-
If ``filters`` and ``compressors`` are not specified, then the default codecs for
810-
Zarr v3 will be used.
811-
These defaults can be changed by modifying the value of ``array.v3_default_codecs``
808+
If no ``compressors`` are provided, a default set of compressors will be used.
809+
These defaults can be changed by modifying the value of ``array.v3_default_compressors``
812810
in :mod:`zarr.core.config`.
813811
Use ``None`` to omit default compressors.
814812
815813
For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may
816814
be provided for Zarr v2.
817-
If no ``compressors`` are provided, a default compressor will be used.
818-
These defaults can be changed by modifying the value of ``array.v2_default_compressor``
815+
If no ``compressor`` is provided, a default compressor will be used. This default can be changed by modifying the value of ``array.v2_default_compressor``
819816
in :mod:`zarr.core.config`.
820817
Use ``None`` to omit the default compressor.
821818
serializer : dict[str, JSON] | ArrayBytesCodec, optional
822819
Array-to-bytes codec to use for encoding the array data.
823820
Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion.
824-
If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used.
821+
If no ``serializer`` is provided, a default serializer will be used.
822+
These defaults can be changed by modifying the value of ``array.v3_default_serializer``
823+
in :mod:`zarr.core.config`.
825824
fill_value : Any, optional
826825
Fill value for the array.
827826
order : {"C", "F"}, optional

src/zarr/core/array.py

Lines changed: 55 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -469,7 +469,8 @@ async def create(
469469
- For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
470470
- For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
471471
472-
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
472+
These defaults can be changed by modifying the value of ``array.v3_default_filters``,
473+
``array.v3_default_serializer`` and ``array.v3_default_compressors`` in :mod:`zarr.core.config`.
473474
dimension_names : Iterable[str], optional
474475
The names of the dimensions (default is None).
475476
V3 only. V2 arrays should not use this parameter.
@@ -1715,7 +1716,8 @@ def create(
17151716
- For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
17161717
- For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
17171718
1718-
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
1719+
These defaults can be changed by modifying the value of ``array.v3_default_filters``,
1720+
``array.v3_default_serializer`` and ``array.v3_default_compressors`` in :mod:`zarr.core.config`.
17191721
dimension_names : Iterable[str], optional
17201722
The names of the dimensions (default is None).
17211723
V3 only. V2 arrays should not use this parameter.
@@ -3698,17 +3700,9 @@ def _build_parents(
36983700

36993701
def _get_default_codecs(
37003702
np_dtype: np.dtype[Any],
3701-
) -> list[dict[str, JSON]]:
3702-
default_codecs = zarr_config.get("array.v3_default_codecs")
3703-
dtype = DataType.from_numpy(np_dtype)
3704-
if dtype == DataType.string:
3705-
dtype_key = "string"
3706-
elif dtype == DataType.bytes:
3707-
dtype_key = "bytes"
3708-
else:
3709-
dtype_key = "numeric"
3710-
3711-
return cast(list[dict[str, JSON]], default_codecs[dtype_key])
3703+
) -> tuple[Codec, ...]:
3704+
filters, serializer, compressors = _get_default_chunk_encoding_v3(np_dtype)
3705+
return filters + (serializer,) + compressors
37123706

37133707

37143708
FiltersLike: TypeAlias = (
@@ -3785,9 +3779,8 @@ async def create_array(
37853779
For Zarr v3, a "filter" is a codec that takes an array and returns an array,
37863780
and these values must be instances of ``ArrayArrayCodec``, or dict representations
37873781
of ``ArrayArrayCodec``.
3788-
If ``filters`` and ``compressors`` are not specified, then the default codecs for
3789-
Zarr v3 will be used.
3790-
These defaults can be changed by modifying the value of ``array.v3_default_codecs``
3782+
If no ``filters`` are provided, a default set of filters will be used.
3783+
These defaults can be changed by modifying the value of ``array.v3_default_filters``
37913784
in :mod:`zarr.core.config`.
37923785
Use ``None`` to omit default filters.
37933786
@@ -3803,22 +3796,22 @@ async def create_array(
38033796
38043797
For Zarr v3, a "compressor" is a codec that takes a bytestream, and
38053798
returns another bytestream. Multiple compressors may be provided for Zarr v3.
3806-
If ``filters`` and ``compressors`` are not specified, then the default codecs for
3807-
Zarr v3 will be used.
3808-
These defaults can be changed by modifying the value of ``array.v3_default_codecs``
3799+
If no ``compressors`` are provided, a default set of compressors will be used.
3800+
These defaults can be changed by modifying the value of ``array.v3_default_compressors``
38093801
in :mod:`zarr.core.config`.
38103802
Use ``None`` to omit default compressors.
38113803
38123804
For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may
38133805
be provided for Zarr v2.
3814-
If no ``compressors`` are provided, a default compressor will be used.
3815-
These defaults can be changed by modifying the value of ``array.v2_default_compressor``
3806+
If no ``compressor`` is provided, a default compressor will be used. This default can be changed by modifying the value of ``array.v2_default_compressor``
38163807
in :mod:`zarr.core.config`.
38173808
Use ``None`` to omit the default compressor.
38183809
serializer : dict[str, JSON] | ArrayBytesCodec, optional
38193810
Array-to-bytes codec to use for encoding the array data.
38203811
Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion.
3821-
If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used.
3812+
If no ``serializer`` is provided, a default serializer will be used.
3813+
These defaults can be changed by modifying the value of ``array.v3_default_serializer``
3814+
in :mod:`zarr.core.config`.
38223815
fill_value : Any, optional
38233816
Fill value for the array.
38243817
order : {"C", "F"}, optional
@@ -3997,7 +3990,6 @@ def _get_default_chunk_encoding_v3(
39973990
"""
39983991
Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype.
39993992
"""
4000-
default_codecs = zarr_config.get("array.v3_default_codecs")
40013993
dtype = DataType.from_numpy(np_dtype)
40023994
if dtype == DataType.string:
40033995
dtype_key = "string"
@@ -4006,31 +3998,34 @@ def _get_default_chunk_encoding_v3(
40063998
else:
40073999
dtype_key = "numeric"
40084000

4009-
codec_dicts = default_codecs[dtype_key]
4010-
codecs = tuple(_resolve_codec(c) for c in codec_dicts)
4011-
array_bytes_maybe = None
4012-
array_array: list[ArrayArrayCodec] = []
4013-
bytes_bytes: list[BytesBytesCodec] = []
4001+
default_filters = zarr_config.get("array.v3_default_filters").get(dtype_key)
4002+
default_serializer = zarr_config.get("array.v3_default_serializer").get(dtype_key)
4003+
default_compressors = zarr_config.get("array.v3_default_compressors").get(dtype_key)
40144004

4015-
for codec in codecs:
4016-
if isinstance(codec, ArrayBytesCodec):
4017-
if array_bytes_maybe is not None:
4018-
raise ValueError(
4019-
f"Got two instances of ArrayBytesCodec: {array_bytes_maybe} and {codec}. "
4020-
"Only one array-to-bytes codec is allowed."
4021-
)
4022-
array_bytes_maybe = codec
4023-
elif isinstance(codec, ArrayArrayCodec):
4024-
array_array.append(codec)
4025-
elif isinstance(codec, BytesBytesCodec):
4026-
bytes_bytes.append(codec)
4027-
else:
4028-
raise TypeError(f"Unexpected codec type: {type(codec)}")
4005+
filters_list: list[ArrayArrayCodec] = []
4006+
compressors_list: list[BytesBytesCodec] = []
40294007

4030-
if array_bytes_maybe is None:
4008+
serializer = _resolve_codec(default_serializer)
4009+
if serializer is None:
40314010
raise ValueError("Required ArrayBytesCodec was not found.")
4011+
if not isinstance(serializer, ArrayBytesCodec):
4012+
raise TypeError(f"Expected ArrayBytesCodec, got: {type(serializer)}")
4013+
4014+
for codec_dict in default_filters:
4015+
codec = _resolve_codec(codec_dict)
4016+
if isinstance(codec, ArrayArrayCodec):
4017+
filters_list.append(codec)
4018+
else:
4019+
raise TypeError(f"Expected ArrayArrayCodec, got: {type(codec)}")
40324020

4033-
return tuple(array_array), array_bytes_maybe, tuple(bytes_bytes)
4021+
for codec_dict in default_compressors:
4022+
codec = _resolve_codec(codec_dict)
4023+
if isinstance(codec, BytesBytesCodec):
4024+
compressors_list.append(codec)
4025+
else:
4026+
raise TypeError(f"Expected BytesBytesCodec, got: {type(codec)}")
4027+
4028+
return tuple(filters_list), serializer, tuple(compressors_list)
40344029

40354030

40364031
def _get_default_chunk_encoding_v2(
@@ -4114,21 +4109,7 @@ def _parse_chunk_encoding_v3(
41144109
maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]]
41154110
maybe_array_array: Iterable[Codec | dict[str, JSON]]
41164111
out_bytes_bytes: tuple[BytesBytesCodec, ...]
4117-
if compressors is None:
4118-
out_bytes_bytes = ()
4119-
4120-
elif compressors == "auto":
4121-
out_bytes_bytes = default_bytes_bytes
41224112

4123-
else:
4124-
if isinstance(compressors, dict | Codec):
4125-
maybe_bytes_bytes = (compressors,)
4126-
elif compressors is None:
4127-
maybe_bytes_bytes = ()
4128-
else:
4129-
maybe_bytes_bytes = cast(Iterable[Codec | dict[str, JSON]], compressors)
4130-
4131-
out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes)
41324113
out_array_array: tuple[ArrayArrayCodec, ...]
41334114
if filters is None:
41344115
out_array_array = ()
@@ -4148,6 +4129,22 @@ def _parse_chunk_encoding_v3(
41484129
else:
41494130
out_array_bytes = _parse_array_bytes_codec(serializer)
41504131

4132+
if compressors is None:
4133+
out_bytes_bytes = ()
4134+
4135+
elif compressors == "auto":
4136+
out_bytes_bytes = default_bytes_bytes
4137+
4138+
else:
4139+
if isinstance(compressors, dict | Codec):
4140+
maybe_bytes_bytes = (compressors,)
4141+
elif compressors is None:
4142+
maybe_bytes_bytes = ()
4143+
else:
4144+
maybe_bytes_bytes = cast(Iterable[Codec | dict[str, JSON]], compressors)
4145+
4146+
out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes)
4147+
41514148
return out_array_array, out_array_bytes, out_bytes_bytes
41524149

41534150

src/zarr/core/config.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,17 +76,20 @@ def reset(self) -> None:
7676
"string": [{"id": "vlen-utf8"}],
7777
"bytes": [{"id": "vlen-bytes"}],
7878
},
79-
"v3_default_codecs": {
79+
"v3_default_filters": {"numeric": [], "string": [], "bytes": []},
80+
"v3_default_serializer": {
81+
"numeric": {"name": "bytes", "configuration": {"endian": "little"}},
82+
"string": {"name": "vlen-utf8"},
83+
"bytes": {"name": "vlen-bytes"},
84+
},
85+
"v3_default_compressors": {
8086
"numeric": [
81-
{"name": "bytes", "configuration": {"endian": "little"}},
8287
{"name": "zstd", "configuration": {"level": 0, "checksum": False}},
8388
],
8489
"string": [
85-
{"name": "vlen-utf8"},
8690
{"name": "zstd", "configuration": {"level": 0, "checksum": False}},
8791
],
8892
"bytes": [
89-
{"name": "vlen-bytes"},
9093
{"name": "zstd", "configuration": {"level": 0, "checksum": False}},
9194
],
9295
},

0 commit comments

Comments
 (0)