3030from zarr .abc .store import Store , set_or_delete
3131from zarr .codecs ._v2 import V2Codec
3232from zarr .codecs .bytes import BytesCodec
33+ from zarr .codecs .vlen_utf8 import VLenBytesCodec , VLenUTF8Codec
34+ from zarr .codecs .zstd import ZstdCodec
3335from zarr .core ._info import ArrayInfo
3436from zarr .core .array_spec import ArrayConfig , ArrayConfigLike , parse_array_config
3537from zarr .core .attributes import Attributes
7274 ZDTypeLike ,
7375 parse_data_type ,
7476)
75- from zarr .core .dtype .common import HasEndianness , HasItemSize
77+ from zarr .core .dtype .common import HasEndianness , HasItemSize , HasObjectCodec
7678from zarr .core .indexing import (
7779 BasicIndexer ,
7880 BasicSelection ,
@@ -710,7 +712,10 @@ def _create_metadata_v3(
710712
711713 shape = parse_shapelike (shape )
712714 if codecs is None :
713- filters , serializer , compressors = _get_default_chunk_encoding_v3 (dtype )
715+ filters = default_filters_v3 (dtype )
716+ serializer = default_serializer_v3 (dtype )
717+ compressors = default_compressors_v3 (dtype )
718+
714719 codecs_parsed = (* filters , serializer , * compressors )
715720 else :
716721 codecs_parsed = tuple (codecs )
@@ -850,10 +855,9 @@ async def _create_v2(
850855 else :
851856 await ensure_no_existing_node (store_path , zarr_format = 2 )
852857
853- default_filters , default_compressor = _get_default_chunk_encoding_v2 (dtype )
854858 compressor_parsed : CompressorLikev2
855859 if compressor == "auto" :
856- compressor_parsed = default_compressor
860+ compressor_parsed = default_compressor_v2 ( dtype )
857861 elif isinstance (compressor , BytesBytesCodec ):
858862 raise ValueError (
859863 "Cannot use a BytesBytesCodec as a compressor for zarr v2 arrays. "
@@ -863,7 +867,7 @@ async def _create_v2(
863867 compressor_parsed = compressor
864868
865869 if filters is None :
866- filters = default_filters
870+ filters = default_filters_v2 ( dtype )
867871
868872 metadata = cls ._create_metadata_v2 (
869873 shape = shape ,
@@ -4654,19 +4658,80 @@ def _get_default_chunk_encoding_v3(
46544658 )
46554659
46564660
def default_filters_v3(dtype: ZDType[Any, Any]) -> tuple[ArrayArrayCodec, ...]:
    """
    Return the default Zarr format 3 filters (array-to-array codecs) for a data type.

    Parameters
    ----------
    dtype : ZDType
        The data type of the array. It is not inspected by this function.

    Returns
    -------
    tuple[ArrayArrayCodec, ...]
        Always the empty tuple: no data type has default filters.
    """
    no_filters: tuple[ArrayArrayCodec, ...] = ()
    return no_filters
4669+
def default_compressors_v3(dtype: ZDType[Any, Any]) -> tuple[BytesBytesCodec, ...]:
    """
    Return the default Zarr format 3 compressors (bytes-to-bytes codecs) for a data type.

    Parameters
    ----------
    dtype : ZDType
        The data type of the array. It is not inspected by this function.

    Returns
    -------
    tuple[BytesBytesCodec, ...]
        A one-element tuple holding a ``ZstdCodec`` built with its default arguments.
    """
    zstd = ZstdCodec()
    return (zstd,)
4677+
4678+
def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec:
    """
    Return the default Zarr format 3 serializer (array-to-bytes codec) for a data type.

    The choice is made in this order:

    1. Data types that implement ``HasEndianness`` get ``BytesCodec(endian="little")``.
    2. Data types that implement ``HasObjectCodec`` get the serializer matching their
       ``object_codec_id``: ``VLenBytesCodec`` for ``"vlen-bytes"`` and ``VLenUTF8Codec``
       for ``"vlen-utf8"``.
    3. Every other data type gets a ``BytesCodec`` built with its default arguments.

    Parameters
    ----------
    dtype : ZDType
        The data type of the array.

    Returns
    -------
    ArrayBytesCodec
        The serializer selected by the rules above.

    Raises
    ------
    ValueError
        If ``dtype`` implements ``HasObjectCodec`` (and not ``HasEndianness``) but its
        ``object_codec_id`` is neither ``"vlen-bytes"`` nor ``"vlen-utf8"``.
    """
    # The endianness check comes first, so it wins for a data type that satisfies both protocols.
    if isinstance(dtype, HasEndianness):
        return BytesCodec(endian="little")

    if not isinstance(dtype, HasObjectCodec):
        return BytesCodec()

    codec_id = dtype.object_codec_id
    if codec_id == "vlen-bytes":
        return VLenBytesCodec()
    if codec_id == "vlen-utf8":
        return VLenUTF8Codec()

    msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id}"
    raise ValueError(msg)
4702+
4703+
def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[numcodecs.abc.Codec] | None:
    """
    Return the default Zarr format 2 filters for a data type.

    Parameters
    ----------
    dtype : ZDType
        The data type of the array.

    Returns
    -------
    tuple[numcodecs.abc.Codec] | None
        ``None`` when ``dtype`` does not implement ``HasObjectCodec``. Otherwise a one-element
        tuple holding the numcodecs object codec named by ``dtype.object_codec_id``:
        ``VLenBytes`` for ``"vlen-bytes"`` or ``VLenUTF8`` for ``"vlen-utf8"``.

    Raises
    ------
    ValueError
        If ``dtype`` implements ``HasObjectCodec`` but its ``object_codec_id`` is neither
        ``"vlen-bytes"`` nor ``"vlen-utf8"``.
    """
    if not isinstance(dtype, HasObjectCodec):
        return None

    codec_id = dtype.object_codec_id

    # The numcodecs classes are imported lazily, only in the branch that needs them.
    if codec_id == "vlen-bytes":
        from numcodecs import VLenBytes

        return (VLenBytes(),)

    if codec_id == "vlen-utf8":
        from numcodecs import VLenUTF8

        return (VLenUTF8(),)

    msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id}"
    raise ValueError(msg)
4724+
4725+
def default_compressor_v2(dtype: ZDType[Any, Any]) -> numcodecs.abc.Codec:
    """
    Return the default Zarr format 2 compressor for a data type.

    Parameters
    ----------
    dtype : ZDType
        The data type of the array. It is not inspected by this function.

    Returns
    -------
    numcodecs.abc.Codec
        A numcodecs ``Zstd`` codec constructed with ``level=0`` and ``checksum=False``.
    """
    from numcodecs import Zstd

    compressor = Zstd(level=0, checksum=False)
    return compressor
46704735
46714736
46724737def _parse_chunk_encoding_v2 (
@@ -4678,14 +4743,13 @@ def _parse_chunk_encoding_v2(
46784743 """
46794744 Generate chunk encoding classes for Zarr format 2 arrays with optional defaults.
46804745 """
4681- default_filters , default_compressor = _get_default_chunk_encoding_v2 (dtype )
46824746 _filters : tuple [numcodecs .abc .Codec , ...] | None
46834747 _compressor : numcodecs .abc .Codec | None
46844748
46854749 if compressor is None or compressor == ():
46864750 _compressor = None
46874751 elif compressor == "auto" :
4688- _compressor = default_compressor
4752+ _compressor = default_compressor_v2 ( dtype )
46894753 elif isinstance (compressor , tuple | list ) and len (compressor ) == 1 :
46904754 _compressor = parse_compressor (compressor [0 ])
46914755 else :
@@ -4697,7 +4761,7 @@ def _parse_chunk_encoding_v2(
46974761 if filters is None :
46984762 _filters = None
46994763 elif filters == "auto" :
4700- _filters = default_filters
4764+ _filters = default_filters_v2 ( dtype )
47014765 else :
47024766 if isinstance (filters , Iterable ):
47034767 for idx , f in enumerate (filters ):
@@ -4722,14 +4786,11 @@ def _parse_chunk_encoding_v3(
47224786 """
47234787 Generate chunk encoding classes for v3 arrays with optional defaults.
47244788 """
4725- default_array_array , default_array_bytes , default_bytes_bytes = _get_default_chunk_encoding_v3 (
4726- dtype
4727- )
47284789
47294790 if filters is None :
47304791 out_array_array : tuple [ArrayArrayCodec , ...] = ()
47314792 elif filters == "auto" :
4732- out_array_array = default_array_array
4793+ out_array_array = default_filters_v3 ( dtype )
47334794 else :
47344795 maybe_array_array : Iterable [Codec | dict [str , JSON ]]
47354796 if isinstance (filters , dict | Codec ):
@@ -4739,7 +4800,7 @@ def _parse_chunk_encoding_v3(
47394800 out_array_array = tuple (_parse_array_array_codec (c ) for c in maybe_array_array )
47404801
47414802 if serializer == "auto" :
4742- out_array_bytes = default_array_bytes
4803+ out_array_bytes = default_serializer_v3 ( dtype )
47434804 else :
47444805 # TODO: ensure that the serializer is compatible with the ndarray produced by the
47454806 # array-array codecs. For example, if a sequence of array-array codecs produces an
@@ -4749,7 +4810,7 @@ def _parse_chunk_encoding_v3(
47494810 if compressors is None :
47504811 out_bytes_bytes : tuple [BytesBytesCodec , ...] = ()
47514812 elif compressors == "auto" :
4752- out_bytes_bytes = default_bytes_bytes
4813+ out_bytes_bytes = default_compressors_v3 ( dtype )
47534814 else :
47544815 maybe_bytes_bytes : Iterable [Codec | dict [str , JSON ]]
47554816 if isinstance (compressors , dict | Codec ):
0 commit comments