2929from zarr .abc .store import Store , set_or_delete
3030from zarr .codecs ._v2 import V2Codec
3131from zarr .codecs .bytes import BytesCodec
32+ from zarr .codecs .vlen_utf8 import VLenBytesCodec , VLenUTF8Codec
33+ from zarr .codecs .zstd import ZstdCodec
3234from zarr .core ._info import ArrayInfo
3335from zarr .core .array_spec import ArrayConfig , ArrayConfigLike , parse_array_config
3436from zarr .core .attributes import Attributes
6769from zarr .core .config import categorize_data_type
6870from zarr .core .config import config as zarr_config
6971from zarr .core .dtype import (
72+ VariableLengthBytes ,
73+ VariableLengthUTF8 ,
7074 ZDType ,
7175 ZDTypeLike ,
7276 parse_data_type ,
7377)
74- from zarr .core .dtype .common import HasEndianness , HasItemSize
78+ from zarr .core .dtype .common import HasEndianness , HasItemSize , HasObjectCodec
7579from zarr .core .indexing import (
7680 BasicIndexer ,
7781 BasicSelection ,
108112)
109113from zarr .core .metadata .v2 import (
110114 CompressorLikev2 ,
115+ get_object_codec_id ,
111116 parse_compressor ,
112117 parse_filters ,
113118)
@@ -708,7 +713,10 @@ def _create_metadata_v3(
708713
709714 shape = parse_shapelike (shape )
710715 if codecs is None :
711- filters , serializer , compressors = _get_default_chunk_encoding_v3 (dtype )
716+ filters = default_filters_v3 (dtype )
717+ serializer = default_serializer_v3 (dtype )
718+ compressors = default_compressors_v3 (dtype )
719+
712720 codecs_parsed = (* filters , serializer , * compressors )
713721 else :
714722 codecs_parsed = tuple (codecs )
@@ -848,10 +856,9 @@ async def _create_v2(
848856 else :
849857 await ensure_no_existing_node (store_path , zarr_format = 2 )
850858
851- default_filters , default_compressor = _get_default_chunk_encoding_v2 (dtype )
852859 compressor_parsed : CompressorLikev2
853860 if compressor == "auto" :
854- compressor_parsed = default_compressor
861+ compressor_parsed = default_compressor_v2 ( dtype )
855862 elif isinstance (compressor , BytesBytesCodec ):
856863 raise ValueError (
857864 "Cannot use a BytesBytesCodec as a compressor for zarr v2 arrays. "
@@ -861,7 +868,7 @@ async def _create_v2(
861868 compressor_parsed = compressor
862869
863870 if filters is None :
864- filters = default_filters
871+ filters = default_filters_v2 ( dtype )
865872
866873 metadata = cls ._create_metadata_v2 (
867874 shape = shape ,
@@ -4641,19 +4648,80 @@ def _get_default_chunk_encoding_v3(
46414648 )
46424649
46434650
4644- def _get_default_chunk_encoding_v2 (
4645- dtype : ZDType [TBaseDType , TBaseScalar ],
4646- ) -> tuple [tuple [numcodecs .abc .Codec , ...] | None , numcodecs .abc .Codec | None ]:
def default_filters_v3(dtype: ZDType[Any, Any]) -> tuple[ArrayArrayCodec, ...]:
    """
    Return the default filters (array-to-array codecs) for a Zarr format 3 array.

    Parameters
    ----------
    dtype : ZDType
        The data type of the array. It is not consulted: the result is the same
        for every data type.

    Returns
    -------
    tuple[ArrayArrayCodec, ...]
        Always the empty tuple.
    """
    no_filters: tuple[ArrayArrayCodec, ...] = ()
    return no_filters
4658+
4659+
def default_compressors_v3(dtype: ZDType[Any, Any]) -> tuple[BytesBytesCodec, ...]:
    """
    Return the default compressors (bytes-to-bytes codecs) for a Zarr format 3 array.

    Parameters
    ----------
    dtype : ZDType
        The data type of the array. It is not consulted: the result is the same
        for every data type.

    Returns
    -------
    tuple[BytesBytesCodec, ...]
        A one-element tuple holding a ``ZstdCodec`` built with its default arguments.
    """
    zstd = ZstdCodec()
    return (zstd,)
4667+
4668+
4669+ def default_serializer_v3 (dtype : ZDType [Any , Any ]) -> ArrayBytesCodec :
4670+ """
4671+ Given a data type, return the default serializer for that data type.
4672+
4673+ The default serializer for most data types is the ``BytesCodec``, which may or may not be
4674+ parameterized with an endianness, depending on whether the data type has endianness. Variable
4675+ length strings and variable length bytes have hard-coded serializers -- ``VLenUTF8Codec`` and
4676+ ``VLenBytesCodec``, respectively.
4677+
4678+ """
4679+ serializer : ArrayBytesCodec = BytesCodec (endian = None )
4680+
4681+ if isinstance (dtype , HasEndianness ):
4682+ serializer = BytesCodec (endian = "little" )
4683+ elif isinstance (dtype , HasObjectCodec ):
4684+ if dtype .object_codec_id == "vlen-bytes" :
4685+ serializer = VLenBytesCodec ()
4686+ elif dtype .object_codec_id == "vlen-utf8" :
4687+ serializer = VLenUTF8Codec ()
4688+ else :
4689+ msg = f"Data type { dtype } requires an unknown object codec: { dtype .object_codec_id !r} ."
4690+ raise ValueError (msg )
4691+ return serializer
4692+
4693+
def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[numcodecs.abc.Codec] | None:
    """
    Return the default filters for a Zarr format 2 array.

    Parameters
    ----------
    dtype : ZDType
        The data type of the array.

    Returns
    -------
    tuple[numcodecs.abc.Codec] | None
        ``None`` if ``dtype`` is not a ``HasObjectCodec`` instance. Otherwise a
        one-element tuple holding the numcodecs ``VLenBytes`` or ``VLenUTF8`` codec,
        chosen by ``dtype.object_codec_id`` (``"vlen-bytes"`` or ``"vlen-utf8"``).

    Raises
    ------
    ValueError
        If ``dtype`` is a ``HasObjectCodec`` instance with any other ``object_codec_id``.
    """
    # Data types that need no object codec have no default filters.
    if not isinstance(dtype, HasObjectCodec):
        return None

    codec_id = dtype.object_codec_id

    # The numcodecs classes are imported only in the branch that uses them.
    if codec_id == "vlen-bytes":
        from numcodecs import VLenBytes

        return (VLenBytes(),)

    if codec_id == "vlen-utf8":
        from numcodecs import VLenUTF8

        return (VLenUTF8(),)

    msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}."
    raise ValueError(msg)
4714+
4715+
def default_compressor_v2(dtype: ZDType[Any, Any]) -> numcodecs.abc.Codec:
    """
    Return the default compressor for a Zarr format 2 array.

    Parameters
    ----------
    dtype : ZDType
        The data type of the array. It is not consulted: the result is the same
        for every data type.

    Returns
    -------
    numcodecs.abc.Codec
        A numcodecs ``Zstd`` codec constructed with ``level=0`` and ``checksum=False``.
    """
    from numcodecs import Zstd

    zstd_settings = {"level": 0, "checksum": False}
    return Zstd(**zstd_settings)
46574725
46584726
46594727def _parse_chunk_encoding_v2 (
@@ -4665,14 +4733,13 @@ def _parse_chunk_encoding_v2(
46654733 """
46664734 Generate chunk encoding classes for Zarr format 2 arrays with optional defaults.
46674735 """
4668- default_filters , default_compressor = _get_default_chunk_encoding_v2 (dtype )
46694736 _filters : tuple [numcodecs .abc .Codec , ...] | None
46704737 _compressor : numcodecs .abc .Codec | None
46714738
46724739 if compressor is None or compressor == ():
46734740 _compressor = None
46744741 elif compressor == "auto" :
4675- _compressor = default_compressor
4742+ _compressor = default_compressor_v2 ( dtype )
46764743 elif isinstance (compressor , tuple | list ) and len (compressor ) == 1 :
46774744 _compressor = parse_compressor (compressor [0 ])
46784745 else :
@@ -4684,7 +4751,7 @@ def _parse_chunk_encoding_v2(
46844751 if filters is None :
46854752 _filters = None
46864753 elif filters == "auto" :
4687- _filters = default_filters
4754+ _filters = default_filters_v2 ( dtype )
46884755 else :
46894756 if isinstance (filters , Iterable ):
46904757 for idx , f in enumerate (filters ):
@@ -4695,7 +4762,33 @@ def _parse_chunk_encoding_v2(
46954762 )
46964763 raise TypeError (msg )
46974764 _filters = parse_filters (filters )
4698-
4765+ if isinstance (dtype , HasObjectCodec ):
4766+ # check the filters and the compressor for the object codec required for this data type
4767+ if _filters is None :
4768+ if _compressor is None :
4769+ object_codec_id = None
4770+ else :
4771+ object_codec_id = get_object_codec_id ((_compressor .get_config (),))
4772+ else :
4773+ object_codec_id = get_object_codec_id (
4774+ (
4775+ * [f .get_config () for f in _filters ],
4776+ _compressor .get_config () if _compressor is not None else None ,
4777+ )
4778+ )
4779+ if object_codec_id is None :
4780+ if isinstance (dtype , VariableLengthUTF8 ): # type: ignore[unreachable]
4781+ codec_name = "the numcodecs.VLenUTF8 codec" # type: ignore[unreachable]
4782+ elif isinstance (dtype , VariableLengthBytes ): # type: ignore[unreachable]
4783+ codec_name = "the numcodecs.VLenBytes codec" # type: ignore[unreachable]
4784+ else :
4785+ codec_name = f"an unknown object codec with id { dtype .object_codec_id !r} "
4786+ msg = (
4787+ f"Data type { dtype } requires { codec_name } , "
4788+ "but no such codec was specified in the filters or compressor parameters for "
4789+ "this array. "
4790+ )
4791+ raise ValueError (msg )
46994792 return _filters , _compressor
47004793
47014794
@@ -4709,14 +4802,11 @@ def _parse_chunk_encoding_v3(
47094802 """
47104803 Generate chunk encoding classes for v3 arrays with optional defaults.
47114804 """
4712- default_array_array , default_array_bytes , default_bytes_bytes = _get_default_chunk_encoding_v3 (
4713- dtype
4714- )
47154805
47164806 if filters is None :
47174807 out_array_array : tuple [ArrayArrayCodec , ...] = ()
47184808 elif filters == "auto" :
4719- out_array_array = default_array_array
4809+ out_array_array = default_filters_v3 ( dtype )
47204810 else :
47214811 maybe_array_array : Iterable [Codec | dict [str , JSON ]]
47224812 if isinstance (filters , dict | Codec ):
@@ -4726,7 +4816,7 @@ def _parse_chunk_encoding_v3(
47264816 out_array_array = tuple (_parse_array_array_codec (c ) for c in maybe_array_array )
47274817
47284818 if serializer == "auto" :
4729- out_array_bytes = default_array_bytes
4819+ out_array_bytes = default_serializer_v3 ( dtype )
47304820 else :
47314821 # TODO: ensure that the serializer is compatible with the ndarray produced by the
47324822 # array-array codecs. For example, if a sequence of array-array codecs produces an
@@ -4736,7 +4826,7 @@ def _parse_chunk_encoding_v3(
47364826 if compressors is None :
47374827 out_bytes_bytes : tuple [BytesBytesCodec , ...] = ()
47384828 elif compressors == "auto" :
4739- out_bytes_bytes = default_bytes_bytes
4829+ out_bytes_bytes = default_compressors_v3 ( dtype )
47404830 else :
47414831 maybe_bytes_bytes : Iterable [Codec | dict [str , JSON ]]
47424832 if isinstance (compressors , dict | Codec ):
@@ -4746,17 +4836,11 @@ def _parse_chunk_encoding_v3(
47464836
47474837 out_bytes_bytes = tuple (_parse_bytes_bytes_codec (c ) for c in maybe_bytes_bytes )
47484838
4749- # specialize codecs as needed given the dtype
4750-
4751- # TODO: refactor so that the config only contains the name of the codec, and we use the dtype
4752- # to create the codec instance, instead of storing a dict representation of a full codec.
4753-
47544839 # TODO: ensure that the serializer is compatible with the ndarray produced by the
47554840 # array-array codecs. For example, if a sequence of array-array codecs produces an
47564841 # array with a single-byte data type, then the serializer should not specify endianness.
4757- if isinstance (out_array_bytes , BytesCodec ) and not isinstance (dtype , HasEndianness ):
4758- # The default endianness in the bytescodec might not be None, so we need to replace it
4759- out_array_bytes = replace (out_array_bytes , endian = None )
4842+
4843+ # TODO: add checks to ensure that the right serializer is used for vlen data types
47604844 return out_array_array , out_array_bytes , out_bytes_bytes
47614845
47624846
0 commit comments