1818from zarr .abc .codec import ArrayArrayCodec , ArrayBytesCodec , BytesBytesCodec , Codec
1919from zarr .abc .store import Store , set_or_delete
2020from zarr .codecs ._v2 import V2Codec
21- from zarr .codecs .zstd import ZstdCodec
2221from zarr .core ._info import ArrayInfo
2322from zarr .core .array_spec import ArrayConfig , ArrayConfigParams , parse_array_config
2423from zarr .core .attributes import Attributes
8786 ArrayV3MetadataDict ,
8887 T_ArrayMetadata ,
8988)
90- from zarr .core .metadata .v2 import _default_filters_and_compressor
89+ from zarr .core .metadata .v2 import (
90+ _default_compressor ,
91+ _default_filters ,
92+ )
9193from zarr .core .metadata .v3 import DataType , parse_node_type_array
9294from zarr .core .sync import sync
9395from zarr .errors import MetadataValidationError
@@ -438,8 +440,8 @@ async def create(
438440 If no codecs are provided, default codecs will be used:
439441
440442 - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
441- - For Unicode strings, the default is ``VLenUTF8Codec``.
442- - For bytes or objects, the default is ``VLenBytesCodec``.
443+ - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
444+ - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
443445
444446 These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
445447 dimension_names : Iterable[str], optional
@@ -460,14 +462,14 @@ async def create(
460462 order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
461463 filters : list[dict[str, JSON]], optional
462464 Sequence of filters to use to encode chunk data prior to compression.
463- V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor ``
464- nor ``filters`` are provided, a default compressor will be used. (see
465- ``compressor `` for details)
465+ V2 only. V3 arrays should use ``codecs`` instead. If no ``filters``
466+ are provided, a default set of filters will be used.
467+ These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`.
466468 compressor : dict[str, JSON], optional
467469 The compressor used to compress the data (default is None).
468470 V2 only. V3 arrays should use ``codecs`` instead.
469471
470- If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
472+ If no ``compressor`` is provided, a default compressor will be used:
471473
472474 - For numeric arrays, the default is ``ZstdCodec``.
473475 - For Unicode strings, the default is ``VLenUTF8Codec``.
@@ -677,8 +679,10 @@ async def _create_v2(
677679 dimension_separator = "."
678680
679681 dtype = parse_dtype (dtype , zarr_format = 2 )
680- if not filters and not compressor :
681- filters , compressor = _default_filters_and_compressor (dtype )
682+ if not filters :
683+ filters = _default_filters (dtype )
684+ if not compressor :
685+ compressor = _default_compressor (dtype )
682686 if np .issubdtype (dtype , np .str_ ):
683687 filters = filters or []
684688 if not any (x ["id" ] == "vlen-utf8" for x in filters ):
@@ -1572,8 +1576,8 @@ def create(
15721576 If no codecs are provided, default codecs will be used:
15731577
15741578 - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
1575- - For Unicode strings, the default is ``VLenUTF8Codec``.
1576- - For bytes or objects, the default is ``VLenBytesCodec``.
1579+ - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
1580+ - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
15771581
15781582 These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
15791583 dimension_names : Iterable[str], optional
@@ -1594,14 +1598,14 @@ def create(
15941598 order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
15951599 filters : list[dict[str, JSON]], optional
15961600 Sequence of filters to use to encode chunk data prior to compression.
1597- V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor ``
1598- nor ``filters`` are provided, a default compressor will be used. (see
1599- ``compressor `` for details)
1601+ V2 only. V3 arrays should use ``codecs`` instead. If no ``filters``
1602+ are provided, a default set of filters will be used.
1603+ These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`.
16001604 compressor : dict[str, JSON], optional
16011605 Primary compressor to compress chunk data.
16021606 V2 only. V3 arrays should use ``codecs`` instead.
16031607
1604- If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
1608+ If no ``compressor`` is provided, a default compressor will be used:
16051609
16061610 - For numeric arrays, the default is ``ZstdCodec``.
16071611 - For Unicode strings, the default is ``VLenUTF8Codec``.
@@ -3455,7 +3459,7 @@ def _get_default_codecs(
34553459 else :
34563460 dtype_key = "numeric"
34573461
3458- return [{ "name" : codec_id , "configuration" : {}} for codec_id in default_codecs [dtype_key ] ]
3462+ return default_codecs [dtype_key ]
34593463
34603464
34613465FiltersParam : TypeAlias = (
@@ -3673,49 +3677,56 @@ def _get_default_encoding_v3(
36733677 else :
36743678 dtype_key = "numeric"
36753679
3676- codec_names = default_codecs [dtype_key ]
3677- array_bytes_cls , * rest = tuple (get_codec_class (codec_name ) for codec_name in codec_names )
3678- array_bytes : ArrayBytesCodec = cast (ArrayBytesCodec , array_bytes_cls ())
3679- # TODO: we should compress bytes and strings by default!
3680- # The current default codecs only lists names, and strings / bytes are not compressed at all,
3681- # so we insert the ZstdCodec at the end of the list as a default
3682- bytes_bytes : tuple [BytesBytesCodec , ...]
3683- array_array : tuple [ArrayArrayCodec , ...] = ()
3684- if len (rest ) == 0 :
3685- bytes_bytes = (ZstdCodec (),)
3686- else :
3687- bytes_bytes = cast (tuple [BytesBytesCodec , ...], tuple (r () for r in rest ))
3680+ codec_dicts = default_codecs [dtype_key ]
3681+ codecs = tuple (get_codec_class (c ["name" ]).from_dict (c ) for c in codec_dicts )
3682+ array_bytes_maybe = None
3683+ array_array : list [ArrayArrayCodec ] = []
3684+ bytes_bytes : list [BytesBytesCodec ] = []
3685+
3686+ for codec in codecs :
3687+ if isinstance (codec , ArrayBytesCodec ):
3688+ if array_bytes_maybe is not None :
3689+ raise ValueError (
3690+ f"Got two instances of ArrayBytesCodec: { array_bytes_maybe } and { codec } . "
3691+ "Only one array-to-bytes codec is allowed."
3692+ )
3693+ array_bytes_maybe = codec
3694+ elif isinstance (codec , ArrayArrayCodec ):
3695+ array_array .append (codec )
3696+ elif isinstance (codec , BytesBytesCodec ):
3697+ bytes_bytes .append (codec )
3698+ else :
3699+ raise TypeError (f"Unexpected codec type: { type (codec )} " )
36883700
3689- return array_array , array_bytes , bytes_bytes
3701+ if array_bytes_maybe is None :
3702+ raise ValueError ("Required ArrayBytesCodec was not found." )
3703+
3704+ return tuple (array_array ), array_bytes_maybe , tuple (bytes_bytes )
36903705
36913706
36923707def _get_default_chunk_encoding_v2 (
36933708 dtype : np .dtype [Any ],
3694- ) -> tuple [tuple [numcodecs .abc .Codec , ...], numcodecs .abc .Codec ]:
3709+ ) -> tuple [tuple [numcodecs .abc .Codec , ...], numcodecs .abc .Codec | None ]:
36953710 """
36963711 Get the default chunk encoding for zarr v2 arrays, given a dtype
36973712 """
3698- codec_id_dict = zarr_config .get ("array.v2_default_compressor" )
3699-
37003713 if dtype .kind in "biufcmM" :
37013714 dtype_key = "numeric"
3702- codec_type = "compressor"
37033715 elif dtype .kind in "U" :
37043716 dtype_key = "string"
3705- codec_type = "filter"
37063717 elif dtype .kind in "OSV" :
37073718 dtype_key = "bytes"
3708- codec_type = "filter"
37093719 else :
37103720 raise ValueError (f"Unsupported dtype kind { dtype .kind } " )
3711- codec_id = codec_id_dict [dtype_key ]
3712- codec_instance = numcodecs .get_codec ({"id" : codec_id })
3713- if codec_type == "compressor" :
3714- return (), codec_instance
3715- elif codec_type == "filter" :
3716- return codec_instance , numcodecs .Zstd ()
3717- else :
3718- raise ValueError (f"Unsupported codec type { codec_type } " )
3721+
3722+ compressor_dict = zarr_config .get ("array.v2_default_compressor" ).get (dtype_key , None )
3723+ filter_dicts = zarr_config .get ("array.v2_default_filters" ).get (dtype_key , [])
3724+
3725+ compressor = None
3726+ if compressor_dict is not None :
3727+ compressor = numcodecs .get_codec (compressor_dict )
3728+ filters = tuple (numcodecs .get_codec (f ) for f in filter_dicts )
3729+ return filters , compressor
37193730
37203731
37213732def _parse_chunk_encoding_v2 (
0 commit comments