Skip to content

Commit f44601b

Browse files
authored
Merge branch 'v3' into doc/storage
2 parents f2137fb + 9bce890 commit f44601b

31 files changed

+831
-219
lines changed

.pre-commit-config.yaml

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ default_language_version:
77
python: python3
88
repos:
99
- repo: https://github.com/astral-sh/ruff-pre-commit
10-
rev: v0.6.8
10+
rev: v0.6.9
1111
hooks:
1212
- id: ruff
1313
args: ["--fix", "--show-fixes"]
@@ -18,7 +18,7 @@ repos:
1818
- id: codespell
1919
args: ["-L", "ba,ihs,kake,nd,noe,nwo,te,fo,zar", "-S", "fixture"]
2020
- repo: https://github.com/pre-commit/pre-commit-hooks
21-
rev: v4.6.0
21+
rev: v5.0.0
2222
hooks:
2323
- id: check-yaml
2424
- repo: https://github.com/pre-commit/mirrors-mypy
@@ -49,3 +49,7 @@ repos:
4949
hooks:
5050
- id: rst-directive-colons
5151
- id: rst-inline-touching-normal
52+
- repo: https://github.com/numpy/numpydoc
53+
rev: v1.8.0
54+
hooks:
55+
- id: numpydoc-validation

pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -319,3 +319,7 @@ ignore = [
319319
"PC111", # fix Python code in documentation - enable later
320320
"PC180", # for JavaScript - not interested
321321
]
322+
323+
[tool.numpydoc_validation]
324+
# See https://numpydoc.readthedocs.io/en/latest/validation.html#built-in-validation-checks for list of checks
325+
checks = ["GL06", "GL07", "GL10", "PR03", "PR05", "PR06"]

src/zarr/abc/codec.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,11 +20,11 @@
2020
from zarr.core.indexing import SelectorTuple
2121

2222
__all__ = [
23-
"BaseCodec",
2423
"ArrayArrayCodec",
2524
"ArrayBytesCodec",
2625
"ArrayBytesCodecPartialDecodeMixin",
2726
"ArrayBytesCodecPartialEncodeMixin",
27+
"BaseCodec",
2828
"BytesBytesCodec",
2929
"CodecInput",
3030
"CodecOutput",

src/zarr/abc/metadata.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,6 @@ def to_dict(self) -> dict[str, JSON]:
2222
are instances of `Metadata`. Sequences of `Metadata` are similarly recursed into, and
2323
the output of that recursion is collected in a list.
2424
"""
25-
...
2625
out_dict = {}
2726
for field in fields(self):
2827
key = field.name

src/zarr/abc/store.py

Lines changed: 5 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -74,7 +74,7 @@ class Store(ABC):
7474
_mode: AccessMode
7575
_is_open: bool
7676

77-
def __init__(self, mode: AccessModeLiteral = "r", *args: Any, **kwargs: Any) -> None:
77+
def __init__(self, *args: Any, mode: AccessModeLiteral = "r", **kwargs: Any) -> None:
7878
self._is_open = False
7979
self._mode = AccessMode.from_literal(mode)
8080

@@ -129,13 +129,10 @@ async def _open(self) -> None:
129129
"""
130130
if self._is_open:
131131
raise ValueError("store is already open")
132-
if not await self.empty():
133-
if self.mode.update or self.mode.readonly:
134-
pass
135-
elif self.mode.overwrite:
136-
await self.clear()
137-
else:
138-
raise FileExistsError("Store already exists")
132+
if self.mode.str == "w":
133+
await self.clear()
134+
elif self.mode.str == "w-" and not await self.empty():
135+
raise FileExistsError("Store already exists")
139136
self._is_open = True
140137

141138
async def _ensure_open(self) -> None:

src/zarr/api/asynchronous.py

Lines changed: 15 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -159,7 +159,7 @@ async def load(
159159
160160
Parameters
161161
----------
162-
store : Store or string
162+
store : Store or str
163163
Store or path to directory in file system or name of zip file.
164164
path : str or None, optional
165165
The path within the store from which to load.
@@ -203,7 +203,7 @@ async def open(
203203
204204
Parameters
205205
----------
206-
store : Store or string, optional
206+
store : Store or str, optional
207207
Store or path to directory in file system or name of zip file.
208208
mode : {'r', 'r+', 'a', 'w', 'w-'}, optional
209209
Persistence mode: 'r' means read only (must exist); 'r+' means
@@ -267,7 +267,7 @@ async def save(
267267
268268
Parameters
269269
----------
270-
store : Store or string
270+
store : Store or str
271271
Store or path to directory in file system or name of zip file.
272272
args : ndarray
273273
NumPy arrays with data to save.
@@ -303,7 +303,7 @@ async def save_array(
303303
304304
Parameters
305305
----------
306-
store : Store or string
306+
store : Store or str
307307
Store or path to directory in file system or name of zip file.
308308
arr : ndarray
309309
NumPy array with data to save.
@@ -351,7 +351,7 @@ async def save_group(
351351
352352
Parameters
353353
----------
354-
store : Store or string
354+
store : Store or str
355355
Store or path to directory in file system or name of zip file.
356356
args : ndarray
357357
NumPy arrays with data to save.
@@ -467,7 +467,7 @@ async def group(
467467
468468
Parameters
469469
----------
470-
store : Store or string, optional
470+
store : Store or str, optional
471471
Store or path to directory in file system.
472472
overwrite : bool, optional
473473
If True, delete any pre-existing data in `store` at `path` before
@@ -481,7 +481,7 @@ async def group(
481481
to all attribute read operations.
482482
synchronizer : object, optional
483483
Array synchronizer.
484-
path : string, optional
484+
path : str, optional
485485
Group path within store.
486486
meta_array : array-like, optional
487487
An array instance to use for determining arrays to create and return
@@ -547,7 +547,7 @@ async def open_group(
547547
548548
Parameters
549549
----------
550-
store : Store, string, or mapping, optional
550+
store : Store, str, or mapping, optional
551551
Store or path to directory in file system or name of zip file.
552552
553553
Strings are interpreted as paths on the local file system
@@ -570,9 +570,9 @@ async def open_group(
570570
to all attribute read operations.
571571
synchronizer : object, optional
572572
Array synchronizer.
573-
path : string, optional
573+
path : str, optional
574574
Group path within store.
575-
chunk_store : Store or string, optional
575+
chunk_store : Store or str, optional
576576
Store or path to directory in file system or name of zip file.
577577
storage_options : dict
578578
If using an fsspec URL to create the store, these will be passed to
@@ -664,22 +664,22 @@ async def create(
664664
False, will be set to `shape`, i.e., single chunk for the whole array.
665665
If an int, the chunk size in each dimension will be given by the value
666666
of `chunks`. Default is True.
667-
dtype : string or dtype, optional
667+
dtype : str or dtype, optional
668668
NumPy dtype.
669669
compressor : Codec, optional
670670
Primary compressor.
671671
fill_value : object
672672
Default value to use for uninitialized portions of the array.
673673
order : {'C', 'F'}, optional
674674
Memory layout to be used within each chunk.
675-
store : Store or string
675+
store : Store or str
676676
Store or path to directory in file system or name of zip file.
677677
synchronizer : object, optional
678678
Array synchronizer.
679679
overwrite : bool, optional
680680
If True, delete all pre-existing data in `store` at `path` before
681681
creating the array.
682-
path : string, optional
682+
path : str, optional
683683
Path under which array is stored.
684684
chunk_store : MutableMapping, optional
685685
Separate storage for chunks. If not provided, `store` will be used
@@ -937,11 +937,11 @@ async def open_array(
937937
938938
Parameters
939939
----------
940-
store : Store or string
940+
store : Store or str
941941
Store or path to directory in file system or name of zip file.
942942
zarr_format : {2, 3, None}, optional
943943
The zarr format to use when saving.
944-
path : string, optional
944+
path : str, optional
945945
Path in store to array.
946946
storage_options : dict
947947
If using an fsspec URL to create the store, these will be passed to

src/zarr/codecs/__init__.py

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,20 @@
11
from __future__ import annotations
22

3+
from typing import TYPE_CHECKING, Any
4+
5+
if TYPE_CHECKING:
6+
import numpy as np
7+
38
from zarr.codecs.blosc import BloscCname, BloscCodec, BloscShuffle
49
from zarr.codecs.bytes import BytesCodec, Endian
510
from zarr.codecs.crc32c_ import Crc32cCodec
611
from zarr.codecs.gzip import GzipCodec
712
from zarr.codecs.pipeline import BatchedCodecPipeline
813
from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation
914
from zarr.codecs.transpose import TransposeCodec
15+
from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec
1016
from zarr.codecs.zstd import ZstdCodec
17+
from zarr.core.metadata.v3 import DataType
1118

1219
__all__ = [
1320
"BatchedCodecPipeline",
@@ -21,5 +28,19 @@
2128
"ShardingCodec",
2229
"ShardingCodecIndexLocation",
2330
"TransposeCodec",
31+
"VLenBytesCodec",
32+
"VLenUTF8Codec",
2433
"ZstdCodec",
2534
]
35+
36+
37+
def _get_default_array_bytes_codec(
38+
np_dtype: np.dtype[Any],
39+
) -> BytesCodec | VLenUTF8Codec | VLenBytesCodec:
40+
dtype = DataType.from_numpy(np_dtype)
41+
if dtype == DataType.string:
42+
return VLenUTF8Codec()
43+
elif dtype == DataType.bytes:
44+
return VLenBytesCodec()
45+
else:
46+
return BytesCodec()

src/zarr/codecs/vlen_utf8.py

Lines changed: 117 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,117 @@
1+
from __future__ import annotations
2+
3+
from dataclasses import dataclass
4+
from typing import TYPE_CHECKING
5+
6+
import numpy as np
7+
from numcodecs.vlen import VLenBytes, VLenUTF8
8+
9+
from zarr.abc.codec import ArrayBytesCodec
10+
from zarr.core.buffer import Buffer, NDBuffer
11+
from zarr.core.common import JSON, parse_named_configuration
12+
from zarr.core.strings import cast_to_string_dtype
13+
from zarr.registry import register_codec
14+
15+
if TYPE_CHECKING:
16+
from typing import Self
17+
18+
from zarr.core.array_spec import ArraySpec
19+
20+
21+
# can use a global because there are no parameters
22+
_vlen_utf8_codec = VLenUTF8()
23+
_vlen_bytes_codec = VLenBytes()
24+
25+
26+
@dataclass(frozen=True)
27+
class VLenUTF8Codec(ArrayBytesCodec):
28+
@classmethod
29+
def from_dict(cls, data: dict[str, JSON]) -> Self:
30+
_, configuration_parsed = parse_named_configuration(
31+
data, "vlen-utf8", require_configuration=False
32+
)
33+
configuration_parsed = configuration_parsed or {}
34+
return cls(**configuration_parsed)
35+
36+
def to_dict(self) -> dict[str, JSON]:
37+
return {"name": "vlen-utf8", "configuration": {}}
38+
39+
def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
40+
return self
41+
42+
async def _decode_single(
43+
self,
44+
chunk_bytes: Buffer,
45+
chunk_spec: ArraySpec,
46+
) -> NDBuffer:
47+
assert isinstance(chunk_bytes, Buffer)
48+
49+
raw_bytes = chunk_bytes.as_array_like()
50+
decoded = _vlen_utf8_codec.decode(raw_bytes)
51+
assert decoded.dtype == np.object_
52+
decoded.shape = chunk_spec.shape
53+
# coming out of the code, we know this is safe, so don't issue a warning
54+
as_string_dtype = cast_to_string_dtype(decoded, safe=True)
55+
return chunk_spec.prototype.nd_buffer.from_numpy_array(as_string_dtype)
56+
57+
async def _encode_single(
58+
self,
59+
chunk_array: NDBuffer,
60+
chunk_spec: ArraySpec,
61+
) -> Buffer | None:
62+
assert isinstance(chunk_array, NDBuffer)
63+
return chunk_spec.prototype.buffer.from_bytes(
64+
_vlen_utf8_codec.encode(chunk_array.as_numpy_array())
65+
)
66+
67+
def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
68+
# what is input_byte_length for an object dtype?
69+
raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")
70+
71+
72+
@dataclass(frozen=True)
73+
class VLenBytesCodec(ArrayBytesCodec):
74+
@classmethod
75+
def from_dict(cls, data: dict[str, JSON]) -> Self:
76+
_, configuration_parsed = parse_named_configuration(
77+
data, "vlen-bytes", require_configuration=False
78+
)
79+
configuration_parsed = configuration_parsed or {}
80+
return cls(**configuration_parsed)
81+
82+
def to_dict(self) -> dict[str, JSON]:
83+
return {"name": "vlen-bytes", "configuration": {}}
84+
85+
def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
86+
return self
87+
88+
async def _decode_single(
89+
self,
90+
chunk_bytes: Buffer,
91+
chunk_spec: ArraySpec,
92+
) -> NDBuffer:
93+
assert isinstance(chunk_bytes, Buffer)
94+
95+
raw_bytes = chunk_bytes.as_array_like()
96+
decoded = _vlen_bytes_codec.decode(raw_bytes)
97+
assert decoded.dtype == np.object_
98+
decoded.shape = chunk_spec.shape
99+
return chunk_spec.prototype.nd_buffer.from_numpy_array(decoded)
100+
101+
async def _encode_single(
102+
self,
103+
chunk_array: NDBuffer,
104+
chunk_spec: ArraySpec,
105+
) -> Buffer | None:
106+
assert isinstance(chunk_array, NDBuffer)
107+
return chunk_spec.prototype.buffer.from_bytes(
108+
_vlen_bytes_codec.encode(chunk_array.as_numpy_array())
109+
)
110+
111+
def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
112+
# what is input_byte_length for an object dtype?
113+
raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")
114+
115+
116+
register_codec("vlen-utf8", VLenUTF8Codec)
117+
register_codec("vlen-bytes", VLenBytesCodec)

0 commit comments

Comments (0)