Skip to content

Commit a358786

Browse files
authored
Merge branch 'main' into group-array-data
2 parents e7d28a5 + bc5877b commit a358786

File tree

14 files changed

+454
-243
lines changed

14 files changed

+454
-243
lines changed

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ ci:
66
default_stages: [pre-commit, pre-push]
77
repos:
88
- repo: https://github.com/astral-sh/ruff-pre-commit
9-
rev: v0.8.2
9+
rev: v0.8.6
1010
hooks:
1111
- id: ruff
1212
args: ["--fix", "--show-fixes"]
@@ -22,7 +22,7 @@ repos:
2222
- id: check-yaml
2323
- id: trailing-whitespace
2424
- repo: https://github.com/pre-commit/mirrors-mypy
25-
rev: v1.13.0
25+
rev: v1.14.1
2626
hooks:
2727
- id: mypy
2828
files: src|tests

docs/user-guide/config.rst

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ Configuration options include the following:
2828

2929
- Default Zarr format ``default_zarr_version``
3030
- Default array order in memory ``array.order``
31-
- Default codecs ``array.v3_default_codecs`` and ``array.v2_default_compressor``
31+
- Default filters, serializers and compressors, e.g. ``array.v3_default_filters``, ``array.v3_default_serializer``, ``array.v3_default_compressors``, ``array.v2_default_filters`` and ``array.v2_default_compressor``
3232
- Whether empty chunks are written to storage ``array.write_empty_chunks``
3333
- Async and threading options, e.g. ``async.concurrency`` and ``threading.max_workers``
3434
- Selections of implementations of codecs, codec pipelines and buffers
@@ -54,19 +54,20 @@ This is the current default configuration::
5454
'v2_default_filters': {'bytes': [{'id': 'vlen-bytes'}],
5555
'numeric': None,
5656
'string': [{'id': 'vlen-utf8'}]},
57-
'v3_default_codecs': {'bytes': [{'name': 'vlen-bytes'},
58-
{'configuration': {'checksum': False,
59-
'level': 0},
60-
'name': 'zstd'}],
61-
'numeric': [{'configuration': {'endian': 'little'},
62-
'name': 'bytes'},
63-
{'configuration': {'checksum': False,
64-
'level': 0},
65-
'name': 'zstd'}],
66-
'string': [{'name': 'vlen-utf8'},
67-
{'configuration': {'checksum': False,
68-
'level': 0},
69-
'name': 'zstd'}]},
57+
'v3_default_compressors': {'bytes': [{'configuration': {'checksum': False,
58+
'level': 0},
59+
'name': 'zstd'}],
60+
'numeric': [{'configuration': {'checksum': False,
61+
'level': 0},
62+
'name': 'zstd'}],
63+
'string': [{'configuration': {'checksum': False,
64+
'level': 0},
65+
'name': 'zstd'}]},
66+
'v3_default_filters': {'bytes': [], 'numeric': [], 'string': []},
67+
'v3_default_serializer': {'bytes': {'name': 'vlen-bytes'},
68+
'numeric': {'configuration': {'endian': 'little'},
69+
'name': 'bytes'},
70+
'string': {'name': 'vlen-utf8'}},
7071
'write_empty_chunks': False},
7172
'async': {'concurrency': 10, 'timeout': None},
7273
'buffer': 'zarr.core.buffer.cpu.Buffer',

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ test = [
6969
"s3fs",
7070
"pytest-asyncio",
7171
"pytest-accept",
72-
"moto[s3]",
72+
"moto[s3,server]",
7373
"requests",
7474
"rich",
7575
"mypy",

src/zarr/api/asynchronous.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,6 @@ async def consolidate_metadata(
188188
group.store_path.store._check_writable()
189189

190190
members_metadata = {k: v.metadata async for k, v in group.members(max_depth=None)}
191-
192191
# While consolidating, we want to be explicit about when child groups
193192
# are empty by inserting an empty dict for consolidated_metadata.metadata
194193
for k, v in members_metadata.items():
@@ -892,7 +891,8 @@ async def create(
892891
- For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
893892
- For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
894893
895-
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
894+
These defaults can be changed by modifying the value of ``array.v3_default_filters``,
895+
``array.v3_default_serializer`` and ``array.v3_default_compressors`` in :mod:`zarr.core.config`.
896896
compressor : Codec, optional
897897
Primary compressor to compress chunk data.
898898
Zarr format 2 only. Zarr format 3 arrays should use ``codecs`` instead.

src/zarr/api/synchronous.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -788,9 +788,8 @@ def create_array(
788788
For Zarr format 3, a "filter" is a codec that takes an array and returns an array,
789789
and these values must be instances of ``ArrayArrayCodec``, or dict representations
790790
of ``ArrayArrayCodec``.
791-
If ``filters`` and ``compressors`` are not specified, then the default codecs for
792-
Zarr format 3 will be used.
793-
These defaults can be changed by modifying the value of ``array.v3_default_codecs``
791+
If no ``filters`` are provided, a default set of filters will be used.
792+
These defaults can be changed by modifying the value of ``array.v3_default_filters``
794793
in :mod:`zarr.core.config`.
795794
Use ``None`` to omit default filters.
796795
@@ -806,22 +805,22 @@ def create_array(
806805
807806
For Zarr format 3, a "compressor" is a codec that takes a bytestream, and
808807
returns another bytestream. Multiple compressors may be provided for Zarr format 3.
809-
If ``filters`` and ``compressors`` are not specified, then the default codecs for
810-
Zarr format 3 will be used.
811-
These defaults can be changed by modifying the value of ``array.v3_default_codecs``
808+
If no ``compressors`` are provided, a default set of compressors will be used.
809+
These defaults can be changed by modifying the value of ``array.v3_default_compressors``
812810
in :mod:`zarr.core.config`.
813811
Use ``None`` to omit default compressors.
814812
815813
For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor may
816814
be provided for Zarr format 2.
817-
If no ``compressors`` are provided, a default compressor will be used.
818-
These defaults can be changed by modifying the value of ``array.v2_default_compressor``
815+
If no ``compressor`` is provided, a default compressor will be used. These defaults can be changed by modifying the value of ``array.v2_default_compressor``
819816
in :mod:`zarr.core.config`.
820817
Use ``None`` to omit the default compressor.
821818
serializer : dict[str, JSON] | ArrayBytesCodec, optional
822819
Array-to-bytes codec to use for encoding the array data.
823820
Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion.
824-
If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used.
821+
If no ``serializer`` is provided, a default serializer will be used.
822+
These defaults can be changed by modifying the value of ``array.v3_default_serializer``
823+
in :mod:`zarr.core.config`.
825824
fill_value : Any, optional
826825
Fill value for the array.
827826
order : {"C", "F"}, optional

src/zarr/core/array.py

Lines changed: 39 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,6 @@
110110
_parse_array_array_codec,
111111
_parse_array_bytes_codec,
112112
_parse_bytes_bytes_codec,
113-
_resolve_codec,
114113
get_pipeline_class,
115114
)
116115
from zarr.storage import StoreLike, make_store_path
@@ -469,7 +468,8 @@ async def create(
469468
- For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
470469
- For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
471470
472-
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
471+
These defaults can be changed by modifying the value of ``array.v3_default_filters``,
472+
``array.v3_default_serializer`` and ``array.v3_default_compressors`` in :mod:`zarr.core.config`.
473473
dimension_names : Iterable[str], optional
474474
The names of the dimensions (default is None).
475475
Zarr format 3 only. Zarr format 2 arrays should not use this parameter.
@@ -1715,7 +1715,8 @@ def create(
17151715
- For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
17161716
- For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
17171717
1718-
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
1718+
These defaults can be changed by modifying the value of ``array.v3_default_filters``,
1719+
``array.v3_default_serializer`` and ``array.v3_default_compressors`` in :mod:`zarr.core.config`.
17191720
dimension_names : Iterable[str], optional
17201721
The names of the dimensions (default is None).
17211722
Zarr format 3 only. Zarr format 2 arrays should not use this parameter.
@@ -1994,10 +1995,11 @@ def path(self) -> str:
19941995

19951996
@property
19961997
def name(self) -> str:
1998+
"""Array name following h5py convention."""
19971999
return self._async_array.name
19982000

19992001
@property
2000-
def basename(self) -> str | None:
2002+
def basename(self) -> str:
20012003
"""Final component of name."""
20022004
return self._async_array.basename
20032005

@@ -3698,17 +3700,9 @@ def _build_parents(
36983700

36993701
def _get_default_codecs(
37003702
np_dtype: np.dtype[Any],
3701-
) -> list[dict[str, JSON]]:
3702-
default_codecs = zarr_config.get("array.v3_default_codecs")
3703-
dtype = DataType.from_numpy(np_dtype)
3704-
if dtype == DataType.string:
3705-
dtype_key = "string"
3706-
elif dtype == DataType.bytes:
3707-
dtype_key = "bytes"
3708-
else:
3709-
dtype_key = "numeric"
3710-
3711-
return cast(list[dict[str, JSON]], default_codecs[dtype_key])
3703+
) -> tuple[Codec, ...]:
3704+
filters, serializer, compressors = _get_default_chunk_encoding_v3(np_dtype)
3705+
return filters + (serializer,) + compressors
37123706

37133707

37143708
FiltersLike: TypeAlias = (
@@ -3785,9 +3779,8 @@ async def create_array(
37853779
For Zarr format 3, a "filter" is a codec that takes an array and returns an array,
37863780
and these values must be instances of ``ArrayArrayCodec``, or dict representations
37873781
of ``ArrayArrayCodec``.
3788-
If ``filters`` and ``compressors`` are not specified, then the default codecs for
3789-
Zarr format 3 will be used.
3790-
These defaults can be changed by modifying the value of ``array.v3_default_codecs``
3782+
If no ``filters`` are provided, a default set of filters will be used.
3783+
These defaults can be changed by modifying the value of ``array.v3_default_filters``
37913784
in :mod:`zarr.core.config`.
37923785
Use ``None`` to omit default filters.
37933786
@@ -3803,22 +3796,22 @@ async def create_array(
38033796
38043797
For Zarr format 3, a "compressor" is a codec that takes a bytestream, and
38053798
returns another bytestream. Multiple compressors may be provided for Zarr format 3.
3806-
If ``filters`` and ``compressors`` are not specified, then the default codecs for
3807-
Zarr format 3 will be used.
3808-
These defaults can be changed by modifying the value of ``array.v3_default_codecs``
3799+
If no ``compressors`` are provided, a default set of compressors will be used.
3800+
These defaults can be changed by modifying the value of ``array.v3_default_compressors``
38093801
in :mod:`zarr.core.config`.
38103802
Use ``None`` to omit default compressors.
38113803
38123804
For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor may
38133805
be provided for Zarr format 2.
3814-
If no ``compressors`` are provided, a default compressor will be used.
3815-
These defaults can be changed by modifying the value of ``array.v2_default_compressor``
3806+
If no ``compressor`` is provided, a default compressor will be used. These defaults can be changed by modifying the value of ``array.v2_default_compressor``
38163807
in :mod:`zarr.core.config`.
38173808
Use ``None`` to omit the default compressor.
38183809
serializer : dict[str, JSON] | ArrayBytesCodec, optional
38193810
Array-to-bytes codec to use for encoding the array data.
38203811
Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion.
3821-
If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used.
3812+
If no ``serializer`` is provided, a default serializer will be used.
3813+
These defaults can be changed by modifying the value of ``array.v3_default_serializer``
3814+
in :mod:`zarr.core.config`.
38223815
fill_value : Any, optional
38233816
Fill value for the array.
38243817
order : {"C", "F"}, optional
@@ -3997,7 +3990,6 @@ def _get_default_chunk_encoding_v3(
39973990
"""
39983991
Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype.
39993992
"""
4000-
default_codecs = zarr_config.get("array.v3_default_codecs")
40013993
dtype = DataType.from_numpy(np_dtype)
40023994
if dtype == DataType.string:
40033995
dtype_key = "string"
@@ -4006,31 +3998,15 @@ def _get_default_chunk_encoding_v3(
40063998
else:
40073999
dtype_key = "numeric"
40084000

4009-
codec_dicts = default_codecs[dtype_key]
4010-
codecs = tuple(_resolve_codec(c) for c in codec_dicts)
4011-
array_bytes_maybe = None
4012-
array_array: list[ArrayArrayCodec] = []
4013-
bytes_bytes: list[BytesBytesCodec] = []
4014-
4015-
for codec in codecs:
4016-
if isinstance(codec, ArrayBytesCodec):
4017-
if array_bytes_maybe is not None:
4018-
raise ValueError(
4019-
f"Got two instances of ArrayBytesCodec: {array_bytes_maybe} and {codec}. "
4020-
"Only one array-to-bytes codec is allowed."
4021-
)
4022-
array_bytes_maybe = codec
4023-
elif isinstance(codec, ArrayArrayCodec):
4024-
array_array.append(codec)
4025-
elif isinstance(codec, BytesBytesCodec):
4026-
bytes_bytes.append(codec)
4027-
else:
4028-
raise TypeError(f"Unexpected codec type: {type(codec)}")
4001+
default_filters = zarr_config.get("array.v3_default_filters").get(dtype_key)
4002+
default_serializer = zarr_config.get("array.v3_default_serializer").get(dtype_key)
4003+
default_compressors = zarr_config.get("array.v3_default_compressors").get(dtype_key)
40294004

4030-
if array_bytes_maybe is None:
4031-
raise ValueError("Required ArrayBytesCodec was not found.")
4005+
filters = tuple(_parse_array_array_codec(codec_dict) for codec_dict in default_filters)
4006+
serializer = _parse_array_bytes_codec(default_serializer)
4007+
compressors = tuple(_parse_bytes_bytes_codec(codec_dict) for codec_dict in default_compressors)
40324008

4033-
return tuple(array_array), array_bytes_maybe, tuple(bytes_bytes)
4009+
return filters, serializer, compressors
40344010

40354011

40364012
def _get_default_chunk_encoding_v2(
@@ -4111,34 +4087,15 @@ def _parse_chunk_encoding_v3(
41114087
default_array_array, default_array_bytes, default_bytes_bytes = _get_default_chunk_encoding_v3(
41124088
dtype
41134089
)
4114-
maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]]
4115-
maybe_array_array: Iterable[Codec | dict[str, JSON]]
4116-
out_bytes_bytes: tuple[BytesBytesCodec, ...]
4117-
if compressors is None:
4118-
out_bytes_bytes = ()
4119-
4120-
elif compressors == "auto":
4121-
out_bytes_bytes = default_bytes_bytes
41224090

4123-
else:
4124-
if isinstance(compressors, dict | Codec):
4125-
maybe_bytes_bytes = (compressors,)
4126-
elif compressors is None:
4127-
maybe_bytes_bytes = ()
4128-
else:
4129-
maybe_bytes_bytes = cast(Iterable[Codec | dict[str, JSON]], compressors)
4130-
4131-
out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes)
4132-
out_array_array: tuple[ArrayArrayCodec, ...]
41334091
if filters is None:
4134-
out_array_array = ()
4092+
out_array_array: tuple[ArrayArrayCodec, ...] = ()
41354093
elif filters == "auto":
41364094
out_array_array = default_array_array
41374095
else:
4096+
maybe_array_array: Iterable[Codec | dict[str, JSON]]
41384097
if isinstance(filters, dict | Codec):
41394098
maybe_array_array = (filters,)
4140-
elif filters is None:
4141-
maybe_array_array = ()
41424099
else:
41434100
maybe_array_array = cast(Iterable[Codec | dict[str, JSON]], filters)
41444101
out_array_array = tuple(_parse_array_array_codec(c) for c in maybe_array_array)
@@ -4148,6 +4105,19 @@ def _parse_chunk_encoding_v3(
41484105
else:
41494106
out_array_bytes = _parse_array_bytes_codec(serializer)
41504107

4108+
if compressors is None:
4109+
out_bytes_bytes: tuple[BytesBytesCodec, ...] = ()
4110+
elif compressors == "auto":
4111+
out_bytes_bytes = default_bytes_bytes
4112+
else:
4113+
maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]]
4114+
if isinstance(compressors, dict | Codec):
4115+
maybe_bytes_bytes = (compressors,)
4116+
else:
4117+
maybe_bytes_bytes = cast(Iterable[Codec | dict[str, JSON]], compressors)
4118+
4119+
out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes)
4120+
41514121
return out_array_array, out_array_bytes, out_bytes_bytes
41524122

41534123

src/zarr/core/config.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,17 +76,20 @@ def reset(self) -> None:
7676
"string": [{"id": "vlen-utf8"}],
7777
"bytes": [{"id": "vlen-bytes"}],
7878
},
79-
"v3_default_codecs": {
79+
"v3_default_filters": {"numeric": [], "string": [], "bytes": []},
80+
"v3_default_serializer": {
81+
"numeric": {"name": "bytes", "configuration": {"endian": "little"}},
82+
"string": {"name": "vlen-utf8"},
83+
"bytes": {"name": "vlen-bytes"},
84+
},
85+
"v3_default_compressors": {
8086
"numeric": [
81-
{"name": "bytes", "configuration": {"endian": "little"}},
8287
{"name": "zstd", "configuration": {"level": 0, "checksum": False}},
8388
],
8489
"string": [
85-
{"name": "vlen-utf8"},
8690
{"name": "zstd", "configuration": {"level": 0, "checksum": False}},
8791
],
8892
"bytes": [
89-
{"name": "vlen-bytes"},
9093
{"name": "zstd", "configuration": {"level": 0, "checksum": False}},
9194
],
9295
},

0 commit comments

Comments
 (0)