zarr-developers
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 2 additions & 2 deletions b/‎.pre-commit-config.yaml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/zarr/abc/metadata.py‎
Lines changed: 0 additions & 1 deletion b/‎src/zarr/abc/metadata.py‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/zarr/codecs/__init__.py‎
Lines changed: 21 additions & 0 deletions b/‎src/zarr/codecs/__init__.py‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎src/zarr/codecs/vlen_utf8.py‎
Lines changed: 117 additions & 0 deletions b/‎src/zarr/codecs/vlen_utf8.py‎
Lines changed: 117 additions & 0 deletions
diff --git a/‎src/zarr/core/array.py‎
Lines changed: 6 additions & 2 deletions b/‎src/zarr/core/array.py‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎src/zarr/core/buffer/core.py‎
Lines changed: 3 additions & 3 deletions b/‎src/zarr/core/buffer/core.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎src/zarr/core/config.py‎
Lines changed: 2 additions & 0 deletions b/‎src/zarr/core/config.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/zarr/core/group.py‎
Lines changed: 16 additions & 28 deletions b/‎src/zarr/core/group.py‎
Lines changed: 16 additions & 28 deletions
diff --git a/‎src/zarr/core/metadata/v2.py‎
Lines changed: 18 additions & 2 deletions b/‎src/zarr/core/metadata/v2.py‎
Lines changed: 18 additions & 2 deletions
@@ -7,7 +7,7 @@ default_language_version:
   python: python3
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.6.8
+    rev: v0.6.9
     hooks:
     - id: ruff
       args: ["--fix", "--show-fixes"]
@@ -18,7 +18,7 @@ repos:
       - id: codespell
         args: ["-L", "ba,ihs,kake,nd,noe,nwo,te,fo,zar", "-S", "fixture"]
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
+    rev: v5.0.0
     hooks:
     - id: check-yaml
   - repo: https://github.com/pre-commit/mirrors-mypy
 
@@ -22,7 +22,6 @@ def to_dict(self) -> dict[str, JSON]:
         are instances of `Metadata`. Sequences of `Metadata` are similarly recursed into, and
         the output of that recursion is collected in a list.
         """
-        ...
         out_dict = {}
         for field in fields(self):
             key = field.name
 
@@ -1,13 +1,20 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    import numpy as np
+
 from zarr.codecs.blosc import BloscCname, BloscCodec, BloscShuffle
 from zarr.codecs.bytes import BytesCodec, Endian
 from zarr.codecs.crc32c_ import Crc32cCodec
 from zarr.codecs.gzip import GzipCodec
 from zarr.codecs.pipeline import BatchedCodecPipeline
 from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation
 from zarr.codecs.transpose import TransposeCodec
+from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec
 from zarr.codecs.zstd import ZstdCodec
+from zarr.core.metadata.v3 import DataType
 
 __all__ = [
     "BatchedCodecPipeline",
@@ -21,5 +28,19 @@
     "ShardingCodec",
     "ShardingCodecIndexLocation",
     "TransposeCodec",
+    "VLenUTF8Codec",
+    "VLenBytesCodec",
     "ZstdCodec",
 ]
+
+
+def _get_default_array_bytes_codec(
+    np_dtype: np.dtype[Any],
+) -> BytesCodec | VLenUTF8Codec | VLenBytesCodec:
+    dtype = DataType.from_numpy(np_dtype)
+    if dtype == DataType.string:
+        return VLenUTF8Codec()
+    elif dtype == DataType.bytes:
+        return VLenBytesCodec()
+    else:
+        return BytesCodec()
@@ -0,0 +1,117 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+import numpy as np
+from numcodecs.vlen import VLenBytes, VLenUTF8
+
+from zarr.abc.codec import ArrayBytesCodec
+from zarr.core.buffer import Buffer, NDBuffer
+from zarr.core.common import JSON, parse_named_configuration
+from zarr.core.strings import cast_to_string_dtype
+from zarr.registry import register_codec
+
+if TYPE_CHECKING:
+    from typing import Self
+
+    from zarr.core.array_spec import ArraySpec
+
+
+# the numcodecs codecs take no parameters, so one shared module-level instance of each suffices
+_vlen_utf8_codec = VLenUTF8()
+_vlen_bytes_codec = VLenBytes()
+
+
+@dataclass(frozen=True)
+class VLenUTF8Codec(ArrayBytesCodec):
+    @classmethod
+    def from_dict(cls, data: dict[str, JSON]) -> Self:
+        _, configuration_parsed = parse_named_configuration(
+            data, "vlen-utf8", require_configuration=False
+        )
+        configuration_parsed = configuration_parsed or {}
+        return cls(**configuration_parsed)
+
+    def to_dict(self) -> dict[str, JSON]:
+        return {"name": "vlen-utf8", "configuration": {}}
+
+    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
+        return self
+
+    async def _decode_single(
+        self,
+        chunk_bytes: Buffer,
+        chunk_spec: ArraySpec,
+    ) -> NDBuffer:
+        assert isinstance(chunk_bytes, Buffer)
+
+        raw_bytes = chunk_bytes.as_array_like()
+        decoded = _vlen_utf8_codec.decode(raw_bytes)
+        assert decoded.dtype == np.object_
+        decoded.shape = chunk_spec.shape
+        # the array came straight out of the codec, so we know the cast is safe; don't issue a warning
+        as_string_dtype = cast_to_string_dtype(decoded, safe=True)
+        return chunk_spec.prototype.nd_buffer.from_numpy_array(as_string_dtype)
+
+    async def _encode_single(
+        self,
+        chunk_array: NDBuffer,
+        chunk_spec: ArraySpec,
+    ) -> Buffer | None:
+        assert isinstance(chunk_array, NDBuffer)
+        return chunk_spec.prototype.buffer.from_bytes(
+            _vlen_utf8_codec.encode(chunk_array.as_numpy_array())
+        )
+
+    def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
+        # NOTE: it is unclear what input_byte_length means for an object dtype, so no size is computed
+        raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")
+
+
+@dataclass(frozen=True)
+class VLenBytesCodec(ArrayBytesCodec):
+    @classmethod
+    def from_dict(cls, data: dict[str, JSON]) -> Self:
+        _, configuration_parsed = parse_named_configuration(
+            data, "vlen-bytes", require_configuration=False
+        )
+        configuration_parsed = configuration_parsed or {}
+        return cls(**configuration_parsed)
+
+    def to_dict(self) -> dict[str, JSON]:
+        return {"name": "vlen-bytes", "configuration": {}}
+
+    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
+        return self
+
+    async def _decode_single(
+        self,
+        chunk_bytes: Buffer,
+        chunk_spec: ArraySpec,
+    ) -> NDBuffer:
+        assert isinstance(chunk_bytes, Buffer)
+
+        raw_bytes = chunk_bytes.as_array_like()
+        decoded = _vlen_bytes_codec.decode(raw_bytes)
+        assert decoded.dtype == np.object_
+        decoded.shape = chunk_spec.shape
+        return chunk_spec.prototype.nd_buffer.from_numpy_array(decoded)
+
+    async def _encode_single(
+        self,
+        chunk_array: NDBuffer,
+        chunk_spec: ArraySpec,
+    ) -> Buffer | None:
+        assert isinstance(chunk_array, NDBuffer)
+        return chunk_spec.prototype.buffer.from_bytes(
+            _vlen_bytes_codec.encode(chunk_array.as_numpy_array())
+        )
+
+    def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
+        # NOTE: it is unclear what input_byte_length means for an object dtype, so no size is computed
+        raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")
+
+
+register_codec("vlen-utf8", VLenUTF8Codec)
+register_codec("vlen-bytes", VLenBytesCodec)
@@ -11,7 +11,7 @@
 
 from zarr._compat import _deprecate_positional_args
 from zarr.abc.store import Store, set_or_delete
-from zarr.codecs import BytesCodec
+from zarr.codecs import _get_default_array_bytes_codec
 from zarr.codecs._v2 import V2Compressor, V2Filters
 from zarr.core.attributes import Attributes
 from zarr.core.buffer import (
@@ -318,7 +318,11 @@ async def _create_v3(
             await ensure_no_existing_node(store_path, zarr_format=3)
 
         shape = parse_shapelike(shape)
-        codecs = list(codecs) if codecs is not None else [BytesCodec()]
+        codecs = (
+            list(codecs)
+            if codecs is not None
+            else [_get_default_array_bytes_codec(np.dtype(dtype))]
+        )
 
         if chunk_key_encoding is None:
             chunk_key_encoding = ("default", "/")
 
@@ -313,8 +313,6 @@ class NDBuffer:
     """
 
     def __init__(self, array: NDArrayLike) -> None:
-        # assert array.ndim > 0
-        assert array.dtype != object
         self._data = array
 
     @classmethod
@@ -467,9 +465,11 @@ def all_equal(self, other: Any, equal_nan: bool = True) -> bool:
             # Handle None fill_value for Zarr V2
             return False
         # use array_equal to obtain equal_nan=True functionality
+        # TODO: the fill value is a scalar, so there may be a faster path than allocating a new
+        # array for the fill value every single time we have to write data.
         _data, other = np.broadcast_arrays(self._data, other)
         return np.array_equal(
-            self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "US" else False
+            self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "USTO" else False
         )
 
     def fill(self, value: Any) -> None:
 
@@ -58,6 +58,8 @@ def reset(self) -> None:
                 "crc32c": "zarr.codecs.crc32c_.Crc32cCodec",
                 "sharding_indexed": "zarr.codecs.sharding.ShardingCodec",
                 "transpose": "zarr.codecs.transpose.TransposeCodec",
+                "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec",
+                "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec",
             },
             "buffer": "zarr.core.buffer.cpu.Buffer",
             "ndbuffer": "zarr.core.buffer.cpu.NDBuffer",
 
@@ -768,24 +768,20 @@ async def full(
         )
 
     async def empty_like(
-        self, *, name: str, prototype: async_api.ArrayLike, **kwargs: Any
+        self, *, name: str, data: async_api.ArrayLike, **kwargs: Any
     ) -> AsyncArray:
-        return await async_api.empty_like(a=prototype, store=self.store_path, path=name, **kwargs)
+        return await async_api.empty_like(a=data, store=self.store_path, path=name, **kwargs)
 
     async def zeros_like(
-        self, *, name: str, prototype: async_api.ArrayLike, **kwargs: Any
+        self, *, name: str, data: async_api.ArrayLike, **kwargs: Any
     ) -> AsyncArray:
-        return await async_api.zeros_like(a=prototype, store=self.store_path, path=name, **kwargs)
+        return await async_api.zeros_like(a=data, store=self.store_path, path=name, **kwargs)
 
-    async def ones_like(
-        self, *, name: str, prototype: async_api.ArrayLike, **kwargs: Any
-    ) -> AsyncArray:
-        return await async_api.ones_like(a=prototype, store=self.store_path, path=name, **kwargs)
+    async def ones_like(self, *, name: str, data: async_api.ArrayLike, **kwargs: Any) -> AsyncArray:
+        return await async_api.ones_like(a=data, store=self.store_path, path=name, **kwargs)
 
-    async def full_like(
-        self, *, name: str, prototype: async_api.ArrayLike, **kwargs: Any
-    ) -> AsyncArray:
-        return await async_api.full_like(a=prototype, store=self.store_path, path=name, **kwargs)
+    async def full_like(self, *, name: str, data: async_api.ArrayLike, **kwargs: Any) -> AsyncArray:
+        return await async_api.full_like(a=data, store=self.store_path, path=name, **kwargs)
 
     async def move(self, source: str, dest: str) -> None:
         raise NotImplementedError
@@ -1171,25 +1167,17 @@ def full(
             )
         )
 
-    def empty_like(self, *, name: str, prototype: async_api.ArrayLike, **kwargs: Any) -> Array:
-        return Array(
-            self._sync(self._async_group.empty_like(name=name, prototype=prototype, **kwargs))
-        )
+    def empty_like(self, *, name: str, data: async_api.ArrayLike, **kwargs: Any) -> Array:
+        return Array(self._sync(self._async_group.empty_like(name=name, data=data, **kwargs)))
 
-    def zeros_like(self, *, name: str, prototype: async_api.ArrayLike, **kwargs: Any) -> Array:
-        return Array(
-            self._sync(self._async_group.zeros_like(name=name, prototype=prototype, **kwargs))
-        )
+    def zeros_like(self, *, name: str, data: async_api.ArrayLike, **kwargs: Any) -> Array:
+        return Array(self._sync(self._async_group.zeros_like(name=name, data=data, **kwargs)))
 
-    def ones_like(self, *, name: str, prototype: async_api.ArrayLike, **kwargs: Any) -> Array:
-        return Array(
-            self._sync(self._async_group.ones_like(name=name, prototype=prototype, **kwargs))
-        )
+    def ones_like(self, *, name: str, data: async_api.ArrayLike, **kwargs: Any) -> Array:
+        return Array(self._sync(self._async_group.ones_like(name=name, data=data, **kwargs)))
 
-    def full_like(self, *, name: str, prototype: async_api.ArrayLike, **kwargs: Any) -> Array:
-        return Array(
-            self._sync(self._async_group.full_like(name=name, prototype=prototype, **kwargs))
-        )
+    def full_like(self, *, name: str, data: async_api.ArrayLike, **kwargs: Any) -> Array:
+        return Array(self._sync(self._async_group.full_like(name=name, data=data, **kwargs)))
 
     def move(self, source: str, dest: str) -> None:
         return self._sync(self._async_group.move(source, dest))
 
@@ -1,8 +1,9 @@
 from __future__ import annotations
 
+import base64
 from collections.abc import Iterable
 from enum import Enum
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, cast
 
 if TYPE_CHECKING:
     from typing import Any, Literal, Self
@@ -31,7 +32,7 @@ class ArrayV2Metadata(ArrayMetadata):
     shape: ChunkCoords
     chunk_grid: RegularChunkGrid
     data_type: np.dtype[Any]
-    fill_value: None | int | float = 0
+    fill_value: None | int | float | str | bytes = 0
     order: Literal["C", "F"] = "C"
     filters: tuple[numcodecs.abc.Codec, ...] | None = None
     dimension_separator: Literal[".", "/"] = "."
@@ -140,6 +141,13 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
         _data = data.copy()
         # check that the zarr_format attribute is correct
         _ = parse_zarr_format(_data.pop("zarr_format"))
+        dtype = parse_dtype(_data["dtype"])
+
+        if dtype.kind in "SV":
+            fill_value_encoded = _data.get("fill_value")
+            if fill_value_encoded is not None:
+                fill_value = base64.standard_b64decode(fill_value_encoded)
+                _data["fill_value"] = fill_value
 
         # zarr v2 allowed arbitrary keys here.
         # We don't want the ArrayV2Metadata constructor to fail just because someone put an
@@ -155,6 +163,14 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
 
     def to_dict(self) -> dict[str, JSON]:
         zarray_dict = super().to_dict()
+
+        if self.dtype.kind in "SV" and self.fill_value is not None:
+            # There's a relationship between self.dtype and self.fill_value
+            # that mypy isn't aware of. The fact that we have S or V dtype here
+            # means we should have a bytes-type fill_value.
+            fill_value = base64.standard_b64encode(cast(bytes, self.fill_value)).decode("ascii")
+            zarray_dict["fill_value"] = fill_value
+
         _ = zarray_dict.pop("chunk_grid")
         zarray_dict["chunks"] = self.chunk_grid.chunk_shape