zarr-developers
diff --git a/changes/3228.removal.rst b/changes/3228.removal.rst
Lines changed: 16 additions & 0 deletions
diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst
Lines changed: 0 additions & 10 deletions
diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst
Lines changed: 1 addition & 19 deletions
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
Lines changed: 117 additions & 33 deletions
diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py
Lines changed: 2 additions & 2 deletions
@@ -0,0 +1,16 @@
+Removes default chunk encoding settings (filters, serializer, compressors) from the global
+configuration object.
+
+This removal is justified on the basis that storing chunk encoding settings in the config required
+a brittle, confusing, and inaccurate categorization of array data types, which was particularly
+unsuitable after the recent addition of new data types that didn't fit naturally into the
+pre-existing categories.
+
+The default chunk encoding is the same (Zstandard compression, and the required object codecs for
+variable length data types), but the chunk encoding is now generated by functions that cannot be
+reconfigured at runtime. Users who relied on setting the default chunk encoding via the global configuration object should
+instead specify the desired chunk encoding explicitly when creating an array.
+
+This change also adds an extra validation step to the creation of Zarr V2 arrays, which ensures that
+arrays with a ``VariableLengthUTF8`` or ``VariableLengthBytes`` data type cannot be created without the
+correct "object codec".
@@ -246,16 +246,6 @@ built-in delta filter::
    >>> z.compressors
    (LZMA(codec_name='numcodecs.lzma', codec_config={'filters': [{'id': 3, 'dist': 4}, {'id': 33, 'preset': 1}]}),)
 
-The default compressor can be changed by setting the value of the using Zarr's
-:ref:`user-guide-config`, e.g.::
-
-   >>> with zarr.config.set({'array.v2_default_compressor.default': {'id': 'blosc'}}):
-   ...     z = zarr.create_array(store={}, shape=(100000000,), chunks=(1000000,), dtype='int32', zarr_format=2)
-   >>> z.filters
-   ()
-   >>> z.compressors
-   (Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),)
-
 To disable compression, set ``compressors=None`` when creating an array, e.g.::
 
    >>> z = zarr.create_array(store='data/example-8.zarr', shape=(100000000,), chunks=(1000000,), dtype='int32', compressors=None)
 
@@ -43,25 +43,7 @@ This is the current default configuration::
 
    >>> zarr.config.pprint()
    {'array': {'order': 'C',
-            'v2_default_compressor': {'default': {'checksum': False,
-                                                   'id': 'zstd',
-                                                   'level': 0},
-                                       'variable-length-string': {'checksum': False,
-                                                                  'id': 'zstd',
-                                                                  'level': 0}},
-            'v2_default_filters': {'default': None,
-                                    'variable-length-string': [{'id': 'vlen-utf8'}]},
-            'v3_default_compressors': {'default': [{'configuration': {'checksum': False,
-                                                                        'level': 0},
-                                                      'name': 'zstd'}],
-                                       'variable-length-string': [{'configuration': {'checksum': False,
-                                                                                       'level': 0},
-                                                                     'name': 'zstd'}]},
-            'v3_default_filters': {'default': [], 'variable-length-string': []},
-            'v3_default_serializer': {'default': {'configuration': {'endian': 'little'},
-                                                   'name': 'bytes'},
-                                       'variable-length-string': {'name': 'vlen-utf8'}},
-            'write_empty_chunks': False},
+              'write_empty_chunks': False},
    'async': {'concurrency': 10, 'timeout': None},
    'buffer': 'zarr.buffer.cpu.Buffer',
    'codec_pipeline': {'batch_size': 1,
 
@@ -29,6 +29,8 @@
 from zarr.abc.store import Store, set_or_delete
 from zarr.codecs._v2 import V2Codec
 from zarr.codecs.bytes import BytesCodec
+from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec
+from zarr.codecs.zstd import ZstdCodec
 from zarr.core._info import ArrayInfo
 from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, parse_array_config
 from zarr.core.attributes import Attributes
@@ -67,11 +69,13 @@
 from zarr.core.config import categorize_data_type
 from zarr.core.config import config as zarr_config
 from zarr.core.dtype import (
+    VariableLengthBytes,
+    VariableLengthUTF8,
     ZDType,
     ZDTypeLike,
     parse_data_type,
 )
-from zarr.core.dtype.common import HasEndianness, HasItemSize
+from zarr.core.dtype.common import HasEndianness, HasItemSize, HasObjectCodec
 from zarr.core.indexing import (
     BasicIndexer,
     BasicSelection,
@@ -108,6 +112,7 @@
 )
 from zarr.core.metadata.v2 import (
     CompressorLikev2,
+    get_object_codec_id,
     parse_compressor,
     parse_filters,
 )
@@ -708,7 +713,10 @@ def _create_metadata_v3(
 
         shape = parse_shapelike(shape)
         if codecs is None:
-            filters, serializer, compressors = _get_default_chunk_encoding_v3(dtype)
+            filters = default_filters_v3(dtype)
+            serializer = default_serializer_v3(dtype)
+            compressors = default_compressors_v3(dtype)
+
             codecs_parsed = (*filters, serializer, *compressors)
         else:
             codecs_parsed = tuple(codecs)
@@ -848,10 +856,9 @@ async def _create_v2(
         else:
             await ensure_no_existing_node(store_path, zarr_format=2)
 
-        default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype)
         compressor_parsed: CompressorLikev2
         if compressor == "auto":
-            compressor_parsed = default_compressor
+            compressor_parsed = default_compressor_v2(dtype)
         elif isinstance(compressor, BytesBytesCodec):
             raise ValueError(
                 "Cannot use a BytesBytesCodec as a compressor for zarr v2 arrays. "
@@ -861,7 +868,7 @@ async def _create_v2(
             compressor_parsed = compressor
 
         if filters is None:
-            filters = default_filters
+            filters = default_filters_v2(dtype)
 
         metadata = cls._create_metadata_v2(
             shape=shape,
@@ -4641,19 +4648,80 @@ def _get_default_chunk_encoding_v3(
     )
 
 
-def _get_default_chunk_encoding_v2(
-    dtype: ZDType[TBaseDType, TBaseScalar],
-) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]:
+def default_filters_v3(dtype: ZDType[Any, Any]) -> tuple[ArrayArrayCodec, ...]:
     """
-    Get the default chunk encoding for Zarr format 2 arrays, given a dtype
+    Given a data type, return the default filters for that data type.
+
+    This is an empty tuple. No data types have default filters.
     """
-    dtype_category = categorize_data_type(dtype)
-    filters = zarr_config.get("array.v2_default_filters").get(dtype_category)
-    compressor = zarr_config.get("array.v2_default_compressor").get(dtype_category)
-    if filters is not None:
-        filters = tuple(numcodecs.get_codec(f) for f in filters)
+    return ()
+
+
+def default_compressors_v3(dtype: ZDType[Any, Any]) -> tuple[BytesBytesCodec, ...]:
+    """
+    Given a data type, return the default compressors for that data type.
+
+    This is just a tuple containing ``ZstdCodec``.
+    """
+    return (ZstdCodec(),)
+
+
+def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec:
+    """
+    Given a data type, return the default serializer for that data type.
+
+    The default serializer for most data types is the ``BytesCodec``, which may or may not be
+    parameterized with an endianness, depending on whether the data type has endianness. Variable
+    length strings and variable length bytes have hard-coded serializers -- ``VLenUTF8Codec`` and
+    ``VLenBytesCodec``, respectively.
+
+    """
+    serializer: ArrayBytesCodec = BytesCodec(endian=None)
+
+    if isinstance(dtype, HasEndianness):
+        serializer = BytesCodec(endian="little")
+    elif isinstance(dtype, HasObjectCodec):
+        if dtype.object_codec_id == "vlen-bytes":
+            serializer = VLenBytesCodec()
+        elif dtype.object_codec_id == "vlen-utf8":
+            serializer = VLenUTF8Codec()
+        else:
+            msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}."
+            raise ValueError(msg)
+    return serializer
+
+
+def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[numcodecs.abc.Codec] | None:
+    """
+    Given a data type, return the default filters for that data type.
+
+    For data types that require an object codec, namely variable length data types,
+    this is a tuple containing the object codec. Otherwise it's ``None``.
+    """
+    if isinstance(dtype, HasObjectCodec):
+        if dtype.object_codec_id == "vlen-bytes":
+            from numcodecs import VLenBytes
 
-    return filters, numcodecs.get_codec(compressor)
+            return (VLenBytes(),)
+        elif dtype.object_codec_id == "vlen-utf8":
+            from numcodecs import VLenUTF8
+
+            return (VLenUTF8(),)
+        else:
+            msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}."
+            raise ValueError(msg)
+    return None
+
+
+def default_compressor_v2(dtype: ZDType[Any, Any]) -> numcodecs.abc.Codec:
+    """
+    Given a data type, return the default compressor for that data type.
+
+    This is just the numcodecs ``Zstd`` codec.
+    """
+    from numcodecs import Zstd
+
+    return Zstd(level=0, checksum=False)
 
 
 def _parse_chunk_encoding_v2(
@@ -4665,14 +4733,13 @@ def _parse_chunk_encoding_v2(
     """
     Generate chunk encoding classes for Zarr format 2 arrays with optional defaults.
     """
-    default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype)
     _filters: tuple[numcodecs.abc.Codec, ...] | None
     _compressor: numcodecs.abc.Codec | None
 
     if compressor is None or compressor == ():
         _compressor = None
     elif compressor == "auto":
-        _compressor = default_compressor
+        _compressor = default_compressor_v2(dtype)
     elif isinstance(compressor, tuple | list) and len(compressor) == 1:
         _compressor = parse_compressor(compressor[0])
     else:
@@ -4684,7 +4751,7 @@ def _parse_chunk_encoding_v2(
     if filters is None:
         _filters = None
     elif filters == "auto":
-        _filters = default_filters
+        _filters = default_filters_v2(dtype)
     else:
         if isinstance(filters, Iterable):
             for idx, f in enumerate(filters):
@@ -4695,7 +4762,33 @@ def _parse_chunk_encoding_v2(
                     )
                     raise TypeError(msg)
         _filters = parse_filters(filters)
-
+    if isinstance(dtype, HasObjectCodec):
+        # check the filters and the compressor for the object codec required for this data type
+        if _filters is None:
+            if _compressor is None:
+                object_codec_id = None
+            else:
+                object_codec_id = get_object_codec_id((_compressor.get_config(),))
+        else:
+            object_codec_id = get_object_codec_id(
+                (
+                    *[f.get_config() for f in _filters],
+                    _compressor.get_config() if _compressor is not None else None,
+                )
+            )
+        if object_codec_id is None:
+            if isinstance(dtype, VariableLengthUTF8):  # type: ignore[unreachable]
+                codec_name = "the numcodecs.VLenUTF8 codec"  # type: ignore[unreachable]
+            elif isinstance(dtype, VariableLengthBytes):  # type: ignore[unreachable]
+                codec_name = "the numcodecs.VLenBytes codec"  # type: ignore[unreachable]
+            else:
+                codec_name = f"an unknown object codec with id {dtype.object_codec_id!r}"
+            msg = (
+                f"Data type {dtype} requires {codec_name}, "
+                "but no such codec was specified in the filters or compressor parameters for "
+                "this array. "
+            )
+            raise ValueError(msg)
     return _filters, _compressor
 
 
@@ -4709,14 +4802,11 @@ def _parse_chunk_encoding_v3(
     """
     Generate chunk encoding classes for v3 arrays with optional defaults.
     """
-    default_array_array, default_array_bytes, default_bytes_bytes = _get_default_chunk_encoding_v3(
-        dtype
-    )
 
     if filters is None:
         out_array_array: tuple[ArrayArrayCodec, ...] = ()
     elif filters == "auto":
-        out_array_array = default_array_array
+        out_array_array = default_filters_v3(dtype)
     else:
         maybe_array_array: Iterable[Codec | dict[str, JSON]]
         if isinstance(filters, dict | Codec):
@@ -4726,7 +4816,7 @@ def _parse_chunk_encoding_v3(
         out_array_array = tuple(_parse_array_array_codec(c) for c in maybe_array_array)
 
     if serializer == "auto":
-        out_array_bytes = default_array_bytes
+        out_array_bytes = default_serializer_v3(dtype)
     else:
         # TODO: ensure that the serializer is compatible with the ndarray produced by the
         # array-array codecs. For example, if a sequence of array-array codecs produces an
@@ -4736,7 +4826,7 @@ def _parse_chunk_encoding_v3(
     if compressors is None:
         out_bytes_bytes: tuple[BytesBytesCodec, ...] = ()
     elif compressors == "auto":
-        out_bytes_bytes = default_bytes_bytes
+        out_bytes_bytes = default_compressors_v3(dtype)
     else:
         maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]]
         if isinstance(compressors, dict | Codec):
@@ -4746,17 +4836,11 @@ def _parse_chunk_encoding_v3(
 
         out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes)
 
-    # specialize codecs as needed given the dtype
-
-    # TODO: refactor so that the config only contains the name of the codec, and we use the dtype
-    # to create the codec instance, instead of storing a dict representation of a full codec.
-
     # TODO: ensure that the serializer is compatible with the ndarray produced by the
     # array-array codecs. For example, if a sequence of array-array codecs produces an
     # array with a single-byte data type, then the serializer should not specify endianness.
-    if isinstance(out_array_bytes, BytesCodec) and not isinstance(dtype, HasEndianness):
-        # The default endianness in the bytescodec might not be None, so we need to replace it
-        out_array_bytes = replace(out_array_bytes, endian=None)
+
+    # TODO: add checks to ensure that the right serializer is used for vlen data types
     return out_array_array, out_array_bytes, out_bytes_bytes
 
 
 
@@ -195,7 +195,7 @@ def _warn_write_empty_chunks_kwarg() -> None:
     msg = (
         "The `write_empty_chunks` keyword argument is deprecated and will be removed in future versions. "
         "To control whether empty chunks are written to storage, either use the `config` keyword "
-        "argument, as in `config={'write_empty_chunks: True}`,"
+        "argument, as in `config={'write_empty_chunks': True}`,"
         "or change the global 'array.write_empty_chunks' configuration variable."
     )
     warnings.warn(msg, RuntimeWarning, stacklevel=2)
@@ -206,7 +206,7 @@ def _warn_order_kwarg() -> None:
     msg = (
         "The `order` keyword argument has no effect for Zarr format 3 arrays. "
         "To control the memory layout of the array, either use the `config` keyword "
-        "argument, as in `config={'order: 'C'}`,"
+        "argument, as in `config={'order': 'C'}`,"
         "or change the global 'array.order' configuration variable."
     )
     warnings.warn(msg, RuntimeWarning, stacklevel=2)