1818from zarr .abc .codec import ArrayArrayCodec , ArrayBytesCodec , BytesBytesCodec , Codec
1919from zarr .abc .store import Store , set_or_delete
2020from zarr .codecs ._v2 import V2Codec
21- from zarr .codecs .zstd import ZstdCodec
2221from zarr .core ._info import ArrayInfo
2322from zarr .core .array_spec import ArrayConfig , ArrayConfigParams , parse_array_config
2423from zarr .core .attributes import Attributes
8786 ArrayV3MetadataDict ,
8887 T_ArrayMetadata ,
8988)
90- from zarr .core .metadata .v2 import _default_filters_and_compressor
89+ from zarr .core .metadata .v2 import (
90+ _default_compressor ,
91+ _default_filters ,
92+ )
9193from zarr .core .metadata .v3 import DataType , parse_node_type_array
9294from zarr .core .sync import sync
9395from zarr .errors import MetadataValidationError
@@ -438,8 +440,8 @@ async def create(
438440 If no codecs are provided, default codecs will be used:
439441
440442 - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
441- - For Unicode strings, the default is ``VLenUTF8Codec``.
442- - For bytes or objects, the default is ``VLenBytesCodec``.
443+ - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
444+ - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
443445
444446 These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
445447 dimension_names : Iterable[str], optional
@@ -460,14 +462,14 @@ async def create(
460462 order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
461463 filters : list[dict[str, JSON]], optional
462464 Sequence of filters to use to encode chunk data prior to compression.
463- V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor ``
464- nor ``filters`` are provided, a default compressor will be used. (see
465- ``compressor `` for details)
465+ V2 only. V3 arrays should use ``codecs`` instead. If no ``filters``
466+ are provided, a default set of filters will be used.
467+ These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`.
466468 compressor : dict[str, JSON], optional
467469 The compressor used to compress the data (default is None).
468470 V2 only. V3 arrays should use ``codecs`` instead.
469471
470- If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
472+ If no ``compressor`` is provided, a default compressor will be used:
471473
472474 - For numeric arrays, the default is ``ZstdCodec``.
473475 - For Unicode strings, the default is ``VLenUTF8Codec``.
@@ -677,8 +679,10 @@ async def _create_v2(
677679 dimension_separator = "."
678680
679681 dtype = parse_dtype (dtype , zarr_format = 2 )
680- if not filters and not compressor :
681- filters , compressor = _default_filters_and_compressor (dtype )
682+ if not filters :
683+ filters = _default_filters (dtype )
684+ if not compressor :
685+ compressor = _default_compressor (dtype )
682686 if np .issubdtype (dtype , np .str_ ):
683687 filters = filters or []
684688 if not any (x ["id" ] == "vlen-utf8" for x in filters ):
@@ -1572,8 +1576,8 @@ def create(
15721576 If no codecs are provided, default codecs will be used:
15731577
15741578 - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
1575- - For Unicode strings, the default is ``VLenUTF8Codec``.
1576- - For bytes or objects, the default is ``VLenBytesCodec``.
1579+ - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
1580+ - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
15771581
15781582 These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
15791583 dimension_names : Iterable[str], optional
@@ -1594,14 +1598,14 @@ def create(
15941598 order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
15951599 filters : list[dict[str, JSON]], optional
15961600 Sequence of filters to use to encode chunk data prior to compression.
1597- V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor ``
1598- nor ``filters`` are provided, a default compressor will be used. (see
1599- ``compressor `` for details)
1601+ V2 only. V3 arrays should use ``codecs`` instead. If no ``filters``
1602+ are provided, a default set of filters will be used.
1603+ These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`.
16001604 compressor : dict[str, JSON], optional
16011605 Primary compressor to compress chunk data.
16021606 V2 only. V3 arrays should use ``codecs`` instead.
16031607
1604- If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
1608+ If no ``compressor`` is provided, a default compressor will be used:
16051609
16061610 - For numeric arrays, the default is ``ZstdCodec``.
16071611 - For Unicode strings, the default is ``VLenUTF8Codec``.
@@ -3455,7 +3459,7 @@ def _get_default_codecs(
34553459 else :
34563460 dtype_key = "numeric"
34573461
3458- return [{ "name" : codec_id , "configuration" : {}} for codec_id in default_codecs [dtype_key ] ]
3462+ return default_codecs [dtype_key ]
34593463
34603464
34613465FiltersParam : TypeAlias = (
@@ -3672,49 +3676,56 @@ def _get_default_encoding_v3(
36723676 else :
36733677 dtype_key = "numeric"
36743678
3675- codec_names = default_codecs [dtype_key ]
3676- array_bytes_cls , * rest = tuple (get_codec_class (codec_name ) for codec_name in codec_names )
3677- array_bytes : ArrayBytesCodec = cast (ArrayBytesCodec , array_bytes_cls ())
3678- # TODO: we should compress bytes and strings by default!
3679- # The current default codecs only lists names, and strings / bytes are not compressed at all,
3680- # so we insert the ZstdCodec at the end of the list as a default
3681- bytes_bytes : tuple [BytesBytesCodec , ...]
3682- array_array : tuple [ArrayArrayCodec , ...] = ()
3683- if len (rest ) == 0 :
3684- bytes_bytes = (ZstdCodec (),)
3685- else :
3686- bytes_bytes = cast (tuple [BytesBytesCodec , ...], tuple (r () for r in rest ))
3679+ codec_dicts = default_codecs [dtype_key ]
3680+ codecs = tuple (get_codec_class (c ["name" ]).from_dict (c ) for c in codec_dicts )
3681+ array_bytes_maybe = None
3682+ array_array : list [ArrayArrayCodec ] = []
3683+ bytes_bytes : list [BytesBytesCodec ] = []
3684+
3685+ for codec in codecs :
3686+ if isinstance (codec , ArrayBytesCodec ):
3687+ if array_bytes_maybe is not None :
3688+ raise ValueError (
3689+ f"Got two instances of ArrayBytesCodec: { array_bytes_maybe } and { codec } . "
3690+ "Only one array-to-bytes codec is allowed."
3691+ )
3692+ array_bytes_maybe = codec
3693+ elif isinstance (codec , ArrayArrayCodec ):
3694+ array_array .append (codec )
3695+ elif isinstance (codec , BytesBytesCodec ):
3696+ bytes_bytes .append (codec )
3697+ else :
3698+ raise TypeError (f"Unexpected codec type: { type (codec )} " )
36873699
3688- return array_array , array_bytes , bytes_bytes
3700+ if array_bytes_maybe is None :
3701+ raise ValueError ("Required ArrayBytesCodec was not found." )
3702+
3703+ return tuple (array_array ), array_bytes_maybe , tuple (bytes_bytes )
36893704
36903705
36913706def _get_default_chunk_encoding_v2 (
36923707 dtype : np .dtype [Any ],
3693- ) -> tuple [tuple [numcodecs .abc .Codec , ...], numcodecs .abc .Codec ]:
3708+ ) -> tuple [tuple [numcodecs .abc .Codec , ...], numcodecs .abc .Codec | None ]:
36943709 """
36953710 Get the default chunk encoding for zarr v2 arrays, given a dtype
36963711 """
3697- codec_id_dict = zarr_config .get ("array.v2_default_compressor" )
3698-
36993712 if dtype .kind in "biufcmM" :
37003713 dtype_key = "numeric"
3701- codec_type = "compressor"
37023714 elif dtype .kind in "U" :
37033715 dtype_key = "string"
3704- codec_type = "filter"
37053716 elif dtype .kind in "OSV" :
37063717 dtype_key = "bytes"
3707- codec_type = "filter"
37083718 else :
37093719 raise ValueError (f"Unsupported dtype kind { dtype .kind } " )
3710- codec_id = codec_id_dict [dtype_key ]
3711- codec_instance = numcodecs .get_codec ({"id" : codec_id })
3712- if codec_type == "compressor" :
3713- return (), codec_instance
3714- elif codec_type == "filter" :
3715- return codec_instance , numcodecs .Zstd ()
3716- else :
3717- raise ValueError (f"Unsupported codec type { codec_type } " )
3720+
3721+ compressor_dict = zarr_config .get ("array.v2_default_compressor" ).get (dtype_key , None )
3722+ filter_dicts = zarr_config .get ("array.v2_default_filters" ).get (dtype_key , [])
3723+
3724+ compressor = None
3725+ if compressor_dict is not None :
3726+ compressor = numcodecs .get_codec (compressor_dict )
3727+ filters = tuple (numcodecs .get_codec (f ) for f in filter_dicts )
3728+ return filters , compressor
37183729
37193730
37203731def _parse_chunk_encoding_v2 (
0 commit comments