Skip to content

Commit 0797c15

Browse files
committed
refactor default chunk encoding to skip config. add tests for deprecated config keys
1 parent 9d97b24 commit 0797c15

File tree

4 files changed

+159
-128
lines changed

4 files changed

+159
-128
lines changed

src/zarr/core/array.py

Lines changed: 85 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@
3030
from zarr.abc.store import Store, set_or_delete
3131
from zarr.codecs._v2 import V2Codec
3232
from zarr.codecs.bytes import BytesCodec
33+
from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec
34+
from zarr.codecs.zstd import ZstdCodec
3335
from zarr.core._info import ArrayInfo
3436
from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, parse_array_config
3537
from zarr.core.attributes import Attributes
@@ -72,7 +74,7 @@
7274
ZDTypeLike,
7375
parse_data_type,
7476
)
75-
from zarr.core.dtype.common import HasEndianness, HasItemSize
77+
from zarr.core.dtype.common import HasEndianness, HasItemSize, HasObjectCodec
7678
from zarr.core.indexing import (
7779
BasicIndexer,
7880
BasicSelection,
@@ -710,7 +712,10 @@ def _create_metadata_v3(
710712

711713
shape = parse_shapelike(shape)
712714
if codecs is None:
713-
filters, serializer, compressors = _get_default_chunk_encoding_v3(dtype)
715+
filters = default_filters_v3(dtype)
716+
serializer = default_serializer_v3(dtype)
717+
compressors = default_compressors_v3(dtype)
718+
714719
codecs_parsed = (*filters, serializer, *compressors)
715720
else:
716721
codecs_parsed = tuple(codecs)
@@ -850,10 +855,9 @@ async def _create_v2(
850855
else:
851856
await ensure_no_existing_node(store_path, zarr_format=2)
852857

853-
default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype)
854858
compressor_parsed: CompressorLikev2
855859
if compressor == "auto":
856-
compressor_parsed = default_compressor
860+
compressor_parsed = default_compressor_v2(dtype)
857861
elif isinstance(compressor, BytesBytesCodec):
858862
raise ValueError(
859863
"Cannot use a BytesBytesCodec as a compressor for zarr v2 arrays. "
@@ -863,7 +867,7 @@ async def _create_v2(
863867
compressor_parsed = compressor
864868

865869
if filters is None:
866-
filters = default_filters
870+
filters = default_filters_v2(dtype)
867871

868872
metadata = cls._create_metadata_v2(
869873
shape=shape,
@@ -4654,19 +4658,80 @@ def _get_default_chunk_encoding_v3(
46544658
)
46554659

46564660

4657-
def _get_default_chunk_encoding_v2(
4658-
dtype: ZDType[TBaseDType, TBaseScalar],
4659-
) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]:
4661+
def default_filters_v3(dtype: ZDType[Any, Any]) -> tuple[ArrayArrayCodec, ...]:
46604662
"""
4661-
Get the default chunk encoding for Zarr format 2 arrays, given a dtype
4663+
Given a data type, return the default filters for that data type.
4664+
4665+
This is an empty tuple. No data types have default filters.
46624666
"""
4663-
dtype_category = categorize_data_type(dtype)
4664-
filters = zarr_config.get("array.v2_default_filters").get(dtype_category)
4665-
compressor = zarr_config.get("array.v2_default_compressor").get(dtype_category)
4666-
if filters is not None:
4667-
filters = tuple(numcodecs.get_codec(f) for f in filters)
4667+
return ()
46684668

4669-
return filters, numcodecs.get_codec(compressor)
4669+
4670+
def default_compressors_v3(dtype: ZDType[Any, Any]) -> tuple[BytesBytesCodec, ...]:
4671+
"""
4672+
Given a data type, return the default compressors for that data type.
4673+
4674+
This is just a tuple containing a single ``ZstdCodec`` instance.
4675+
"""
4676+
return (ZstdCodec(),)
4677+
4678+
4679+
def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec:
4680+
"""
4681+
Given a data type, return the default serializer for that data type.
4682+
4683+
The default serializer for most data types is the ``BytesCodec``, which may or may not be
4684+
parameterized with an endianness, depending on whether the data type has endianness. Variable
4685+
length strings and variable length bytes have hard-coded serializers -- ``VLenUTF8Codec`` and
4686+
``VLenBytesCodec``, respectively.
4687+
4688+
"""
4689+
serializer: ArrayBytesCodec = BytesCodec()
4690+
4691+
if isinstance(dtype, HasEndianness):
4692+
serializer = BytesCodec(endian="little")
4693+
elif isinstance(dtype, HasObjectCodec):
4694+
if dtype.object_codec_id == "vlen-bytes":
4695+
serializer = VLenBytesCodec()
4696+
elif dtype.object_codec_id == "vlen-utf8":
4697+
serializer = VLenUTF8Codec()
4698+
else:
4699+
msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id}"
4700+
raise ValueError(msg)
4701+
return serializer
4702+
4703+
4704+
def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[numcodecs.abc.Codec] | None:
4705+
"""
4706+
Given a data type, return the default filters for that data type.
4707+
4708+
For data types that require an object codec, namely variable length data types,
4709+
this is a tuple containing the object codec. Otherwise it's ``None``.
4710+
"""
4711+
if isinstance(dtype, HasObjectCodec):
4712+
if dtype.object_codec_id == "vlen-bytes":
4713+
from numcodecs import VLenBytes
4714+
4715+
return (VLenBytes(),)
4716+
elif dtype.object_codec_id == "vlen-utf8":
4717+
from numcodecs import VLenUTF8
4718+
4719+
return (VLenUTF8(),)
4720+
else:
4721+
msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id}"
4722+
raise ValueError(msg)
4723+
return None
4724+
4725+
4726+
def default_compressor_v2(dtype: ZDType[Any, Any]) -> numcodecs.abc.Codec:
4727+
"""
4728+
Given a data type, return the default compressor for that data type.
4729+
4730+
This is just the numcodecs ``Zstd`` codec.
4731+
"""
4732+
from numcodecs import Zstd
4733+
4734+
return Zstd(level=0, checksum=False)
46704735

46714736

46724737
def _parse_chunk_encoding_v2(
@@ -4678,14 +4743,13 @@ def _parse_chunk_encoding_v2(
46784743
"""
46794744
Generate chunk encoding classes for Zarr format 2 arrays with optional defaults.
46804745
"""
4681-
default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype)
46824746
_filters: tuple[numcodecs.abc.Codec, ...] | None
46834747
_compressor: numcodecs.abc.Codec | None
46844748

46854749
if compressor is None or compressor == ():
46864750
_compressor = None
46874751
elif compressor == "auto":
4688-
_compressor = default_compressor
4752+
_compressor = default_compressor_v2(dtype)
46894753
elif isinstance(compressor, tuple | list) and len(compressor) == 1:
46904754
_compressor = parse_compressor(compressor[0])
46914755
else:
@@ -4697,7 +4761,7 @@ def _parse_chunk_encoding_v2(
46974761
if filters is None:
46984762
_filters = None
46994763
elif filters == "auto":
4700-
_filters = default_filters
4764+
_filters = default_filters_v2(dtype)
47014765
else:
47024766
if isinstance(filters, Iterable):
47034767
for idx, f in enumerate(filters):
@@ -4722,14 +4786,11 @@ def _parse_chunk_encoding_v3(
47224786
"""
47234787
Generate chunk encoding classes for v3 arrays with optional defaults.
47244788
"""
4725-
default_array_array, default_array_bytes, default_bytes_bytes = _get_default_chunk_encoding_v3(
4726-
dtype
4727-
)
47284789

47294790
if filters is None:
47304791
out_array_array: tuple[ArrayArrayCodec, ...] = ()
47314792
elif filters == "auto":
4732-
out_array_array = default_array_array
4793+
out_array_array = default_filters_v3(dtype)
47334794
else:
47344795
maybe_array_array: Iterable[Codec | dict[str, JSON]]
47354796
if isinstance(filters, dict | Codec):
@@ -4739,7 +4800,7 @@ def _parse_chunk_encoding_v3(
47394800
out_array_array = tuple(_parse_array_array_codec(c) for c in maybe_array_array)
47404801

47414802
if serializer == "auto":
4742-
out_array_bytes = default_array_bytes
4803+
out_array_bytes = default_serializer_v3(dtype)
47434804
else:
47444805
# TODO: ensure that the serializer is compatible with the ndarray produced by the
47454806
# array-array codecs. For example, if a sequence of array-array codecs produces an
@@ -4749,7 +4810,7 @@ def _parse_chunk_encoding_v3(
47494810
if compressors is None:
47504811
out_bytes_bytes: tuple[BytesBytesCodec, ...] = ()
47514812
elif compressors == "auto":
4752-
out_bytes_bytes = default_bytes_bytes
4813+
out_bytes_bytes = default_compressors_v3(dtype)
47534814
else:
47544815
maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]]
47554816
if isinstance(compressors, dict | Codec):

src/zarr/core/config.py

Lines changed: 20 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,25 @@ def enable_gpu(self) -> ConfigSet:
7878
)
7979

8080

81+
# these keys were removed from the config as part of the 3.1.0 release.
82+
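# these deprecations should be removed in 3.1.1 or thereabouts. NOTE(review): the ``default`` and ``variable-length-string`` sub-keys deleted from the config defaults are not listed here -- confirm whether they should be.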
83+
deprecations = {
84+
"array.v2_default_compressor.numeric": None,
85+
"array.v2_default_compressor.string": None,
86+
"array.v2_default_compressor.bytes": None,
87+
"array.v2_default_filters.string": None,
88+
"array.v2_default_filters.bytes": None,
89+
"array.v3_default_filters.numeric": None,
90+
"array.v3_default_filters.raw": None,
91+
"array.v3_default_filters.bytes": None,
92+
"array.v3_default_serializer.numeric": None,
93+
"array.v3_default_serializer.string": None,
94+
"array.v3_default_serializer.bytes": None,
95+
"array.v3_default_compressors.string": None,
96+
"array.v3_default_compressors.bytes": None,
97+
"array.v3_default_compressors": None,
98+
}
99+
81100
# The default configuration for zarr
82101
config = Config(
83102
"zarr",
@@ -87,27 +106,6 @@ def enable_gpu(self) -> ConfigSet:
87106
"array": {
88107
"order": "C",
89108
"write_empty_chunks": False,
90-
"v2_default_compressor": {
91-
"default": {"id": "zstd", "level": 0, "checksum": False},
92-
"variable-length-string": {"id": "zstd", "level": 0, "checksum": False},
93-
},
94-
"v2_default_filters": {
95-
"default": None,
96-
"variable-length-string": [{"id": "vlen-utf8"}],
97-
},
98-
"v3_default_filters": {"default": [], "variable-length-string": []},
99-
"v3_default_serializer": {
100-
"default": {"name": "bytes", "configuration": {"endian": "little"}},
101-
"variable-length-string": {"name": "vlen-utf8"},
102-
},
103-
"v3_default_compressors": {
104-
"default": [
105-
{"name": "zstd", "configuration": {"level": 0, "checksum": False}},
106-
],
107-
"variable-length-string": [
108-
{"name": "zstd", "configuration": {"level": 0, "checksum": False}}
109-
],
110-
},
111109
},
112110
"async": {"concurrency": 10, "timeout": None},
113111
"threading": {"max_workers": None},
@@ -132,6 +130,7 @@ def enable_gpu(self) -> ConfigSet:
132130
"ndbuffer": "zarr.buffer.cpu.NDBuffer",
133131
}
134132
],
133+
deprecations=deprecations,
135134
)
136135

137136

tests/test_config.py

Lines changed: 27 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import os
22
from collections.abc import Iterable
3-
from typing import TYPE_CHECKING, Any
3+
from typing import Any
44
from unittest import mock
55
from unittest.mock import Mock
66

@@ -16,16 +16,13 @@
1616
BloscCodec,
1717
BytesCodec,
1818
Crc32cCodec,
19-
GzipCodec,
2019
ShardingCodec,
2120
)
22-
from zarr.core.array import create_array
2321
from zarr.core.array_spec import ArraySpec
2422
from zarr.core.buffer import NDBuffer
2523
from zarr.core.buffer.core import Buffer
2624
from zarr.core.codec_pipeline import BatchedCodecPipeline
2725
from zarr.core.config import BadConfigError, config
28-
from zarr.core.dtype import Int8, VariableLengthUTF8
2926
from zarr.core.indexing import SelectorTuple
3027
from zarr.registry import (
3128
fully_qualified_name,
@@ -38,17 +35,13 @@
3835
register_ndbuffer,
3936
register_pipeline,
4037
)
41-
from zarr.storage import MemoryStore
4238
from zarr.testing.buffer import (
4339
NDBufferUsingTestNDArrayLike,
4440
StoreExpectingTestBuffer,
4541
TestBuffer,
4642
TestNDArrayLike,
4743
)
4844

49-
if TYPE_CHECKING:
50-
from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType
51-
5245

5346
def test_config_defaults_set() -> None:
5447
# regression test for available defaults
@@ -60,27 +53,6 @@ def test_config_defaults_set() -> None:
6053
"array": {
6154
"order": "C",
6255
"write_empty_chunks": False,
63-
"v2_default_compressor": {
64-
"default": {"id": "zstd", "level": 0, "checksum": False},
65-
"variable-length-string": {"id": "zstd", "level": 0, "checksum": False},
66-
},
67-
"v2_default_filters": {
68-
"default": None,
69-
"variable-length-string": [{"id": "vlen-utf8"}],
70-
},
71-
"v3_default_filters": {"default": [], "variable-length-string": []},
72-
"v3_default_serializer": {
73-
"default": {"name": "bytes", "configuration": {"endian": "little"}},
74-
"variable-length-string": {"name": "vlen-utf8"},
75-
},
76-
"v3_default_compressors": {
77-
"default": [
78-
{"name": "zstd", "configuration": {"level": 0, "checksum": False}},
79-
],
80-
"variable-length-string": [
81-
{"name": "zstd", "configuration": {"level": 0, "checksum": False}}
82-
],
83-
},
8456
},
8557
"async": {"concurrency": 10, "timeout": None},
8658
"threading": {"max_workers": None},
@@ -323,29 +295,31 @@ class NewCodec2(BytesCodec):
323295
get_codec_class("new_codec")
324296

325297

326-
@pytest.mark.parametrize("dtype_category", ["variable-length-string", "default"])
327-
@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning")
328-
async def test_default_codecs(dtype_category: str) -> None:
298+
@pytest.mark.parametrize(
299+
"key",
300+
[
301+
"array.v2_default_compressor.numeric",
302+
"array.v2_default_compressor.string",
303+
"array.v2_default_compressor.bytes",
304+
"array.v2_default_filters.string",
305+
"array.v2_default_filters.bytes",
306+
"array.v3_default_filters.numeric",
307+
"array.v3_default_filters.raw",
308+
"array.v3_default_filters.bytes",
309+
"array.v3_default_serializer.numeric",
310+
"array.v3_default_serializer.string",
311+
"array.v3_default_serializer.bytes",
312+
"array.v3_default_compressors.string",
313+
"array.v3_default_compressors.bytes",
314+
"array.v3_default_compressors",
315+
],
316+
)
317+
def test_deprecated_config(key: str) -> None:
329318
"""
330-
Test that the default compressors are sensitive to the current setting of the config.
319+
Test that a ValueError is raised when setting the default chunk encoding for a given
320+
data type category
331321
"""
332-
zdtype: ZDType[TBaseDType, TBaseScalar]
333-
if dtype_category == "variable-length-string":
334-
zdtype = VariableLengthUTF8() # type: ignore[assignment]
335-
else:
336-
zdtype = Int8()
337-
expected_compressors = (GzipCodec(),)
338-
new_conf = {
339-
f"array.v3_default_compressors.{dtype_category}": [
340-
c.to_dict() for c in expected_compressors
341-
]
342-
}
343-
with config.set(new_conf):
344-
arr = await create_array(
345-
shape=(100,),
346-
chunks=(100,),
347-
dtype=zdtype,
348-
zarr_format=3,
349-
store=MemoryStore(),
350-
)
351-
assert arr.compressors == expected_compressors
322+
323+
with pytest.raises(ValueError):
324+
with zarr.config.set({key: "foo"}):
325+
pass

0 commit comments

Comments
 (0)