
Commit b6bf2dd

Merge branch 'feat/batch-creation' of github.com:d-v-b/zarr-python into feat/batch-creation

2 parents: 97b768f + 986d68b

File tree

3 files changed: +234 −20 lines

src/zarr/core/group.py

Lines changed: 35 additions & 18 deletions
@@ -6,8 +6,9 @@
 import logging
 import warnings
 from collections import defaultdict
-from collections.abc import AsyncIterator
+from collections.abc import AsyncIterator, Awaitable
 from dataclasses import asdict, dataclass, field, fields, replace
+from functools import partial
 from typing import TYPE_CHECKING, Literal, TypeVar, assert_never, cast, overload

 import numpy as np
@@ -55,7 +56,7 @@
 from zarr.storage._common import ensure_no_existing_node

 if TYPE_CHECKING:
-    from collections.abc import AsyncGenerator, Generator, Iterable, Iterator
+    from collections.abc import AsyncGenerator, Callable, Generator, Iterable, Iterator
     from typing import Any

     from zarr.core.array_spec import ArrayConfig, ArrayConfigLike
@@ -1266,7 +1267,7 @@ async def require_array(

     async def create_nodes(
         self, nodes: dict[str, GroupMetadata | ArrayV2Metadata | ArrayV3Metadata]
-    ) -> tuple[tuple[str, AsyncGroup | AsyncArray]]:
+    ) -> tuple[tuple[str, AsyncGroup | AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]]]:
         """
         Create a set of arrays or groups rooted at this group.
         """
@@ -2817,23 +2818,36 @@ def array(
     )


-async def _save_metadata_return_node(
+async def _with_semaphore(
+    func: Callable[[Any], Awaitable[T]], semaphore: asyncio.Semaphore | None = None
+) -> T:
+    if semaphore is None:
+        return await func(None)
+    async with semaphore:
+        return await func(None)
+
+
+async def _save_metadata(
     node: AsyncArray[Any] | AsyncGroup,
 ) -> AsyncArray[Any] | AsyncGroup:
-    if isinstance(node, AsyncArray):
-        await node._save_metadata(node.metadata, ensure_parents=False)
-    else:
-        await node._save_metadata(ensure_parents=False)
+    """
+    Save the metadata for an array or group, and return the array or group
+    """
+    match node:
+        case AsyncArray():
+            await node._save_metadata(node.metadata, ensure_parents=False)
+        case AsyncGroup():
+            await node._save_metadata(ensure_parents=False)
+        case _:
+            raise ValueError(f"Unexpected node type {type(node)}")
     return node


-async def create_nodes_v2(
-    *, store: Store, path: str, nodes: dict[str, GroupMetadata | ArrayV2Metadata]
-) -> tuple[tuple[str, AsyncGroup | AsyncArray[ArrayV2Metadata]]]: ...
-
-
 async def create_nodes(
-    *, store_path: StorePath, nodes: dict[str, GroupMetadata | ArrayV3Metadata | ArrayV2Metadata]
+    *,
+    store_path: StorePath,
+    nodes: dict[str, GroupMetadata | ArrayV3Metadata | ArrayV2Metadata],
+    semaphore: asyncio.Semaphore | None = None,
 ) -> AsyncIterator[AsyncGroup | AsyncArray[Any]]:
     """
     Create a collection of arrays and groups concurrently and atomically. To ensure atomicity,
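
For readers unfamiliar with the pattern, `_with_semaphore` is a small concurrency-limiting combinator: when a semaphore is supplied, each awaitable acquires a slot before running. Below is a minimal, self-contained sketch of the same idea; the names (`work`, `gated`) are hypothetical and not part of this commit.

```python
import asyncio


async def work(i: int) -> int:
    await asyncio.sleep(0.01)  # stand-in for a metadata write
    return i


async def gated(i: int, sem: asyncio.Semaphore | None) -> int:
    # Mirrors the shape of _with_semaphore: no semaphore means unbounded concurrency
    if sem is None:
        return await work(i)
    async with sem:  # with Semaphore(2), at most 2 tasks run work() at once
        return await work(i)


async def main() -> None:
    sem = asyncio.Semaphore(2)
    print(await asyncio.gather(*(gated(i, sem) for i in range(10))))


asyncio.run(main())
```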
@@ -2850,15 +2864,18 @@ async def create_nodes(
                 node = AsyncGroup(value, store_path=new_store_path)
             case _:
                 raise ValueError(f"Unexpected metadata type {type(value)}")
-        create_tasks.append(_save_metadata_return_node(node))
+        partial_func = partial(_save_metadata, node)
+        fut = _with_semaphore(partial_func, semaphore)
+        create_tasks.append(fut)
+
     for coro in asyncio.as_completed(create_tasks):
         yield await coro


 T = TypeVar("T")


-def _tuplize_keys(data: dict[str, T], separator: str) -> dict[tuple[str, ...], T]:
+def _split_keys(data: dict[str, T], separator: str) -> dict[tuple[str, ...], T]:
     """
     Given a dict of {string: T} pairs, where the keys are strings separated by some separator,
     return the result of splitting each key with the separator.
@@ -2875,10 +2892,10 @@ def _tuplize_keys(data: dict[str, T], separator: str) -> dict[tuple[str, ...], T

     Examples
     --------
-    >>> _tuplize_tree({"a": 1}, separator='/')
+    >>> _split_keys({"a": 1}, separator='/')
     {("a",): 1}

-    >>> _tuplize_tree({"a/b": 1, "a/b/c": 2, "c": 3}, separator='/')
+    >>> _split_keys({"a/b": 1, "a/b/c": 2, "c": 3}, separator='/')
     {("a", "b"): 1, ("a", "b", "c"): 2, ("c",): 3}
     """
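
A hedged usage sketch of the new `create_nodes` signature from this branch: nodes are yielded in completion order, and the `semaphore` parameter caps concurrent metadata writes. The store, path, and node names here are illustrative assumptions, not taken from the commit.

```python
import asyncio

from zarr.core.group import GroupMetadata, create_nodes
from zarr.storage import MemoryStore, StorePath


async def main() -> None:
    store_path = StorePath(MemoryStore(), path="root")
    nodes = {"a": GroupMetadata(), "a/b": GroupMetadata()}
    sem = asyncio.Semaphore(4)  # limit concurrent metadata writes to 4
    async for node in create_nodes(store_path=store_path, nodes=nodes, semaphore=sem):
        print(node)  # yielded as each node's metadata write completes


asyncio.run(main())
```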

tests/conftest.py

Lines changed: 188 additions & 0 deletions
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import pathlib
+from collections.abc import Iterable
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING

@@ -10,7 +11,14 @@
 from hypothesis import HealthCheck, Verbosity, settings

 from zarr import AsyncGroup, config
+from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec
 from zarr.abc.store import Store
+from zarr.codecs.bytes import BytesCodec
+from zarr.codecs.sharding import ShardingCodec
+from zarr.core.chunk_grids import _guess_chunks
+from zarr.core.chunk_key_encodings import ChunkKeyEncoding
+from zarr.core.metadata.v2 import ArrayV2Metadata
+from zarr.core.metadata.v3 import ArrayV3Metadata
 from zarr.core.sync import sync
 from zarr.storage import FsspecStore, LocalStore, MemoryStore, StorePath, ZipStore

@@ -159,3 +167,183 @@ def zarr_format(request: pytest.FixtureRequest) -> ZarrFormat:
     suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.too_slow],
     verbosity=Verbosity.verbose,
 )
+import numcodecs
+
+
+def meta_from_array_v2(
+    array: np.ndarray[Any, Any],
+    chunks: ChunkCoords | Literal["auto"] = "auto",
+    compressor: numcodecs.abc.Codec | Literal["auto"] | None = "auto",
+    filters: Iterable[numcodecs.abc.Codec] | Literal["auto"] = "auto",
+    fill_value: Any = "auto",
+    order: MemoryOrder | Literal["auto"] = "auto",
+    dimension_separator: Literal[".", "/", "auto"] = "auto",
+    attributes: dict[str, Any] | None = None,
+) -> ArrayV2Metadata:
+    """
+    Create a v2 metadata object from a numpy array
+    """
+
+    _chunks = auto_chunks(chunks, array.shape, array.dtype)
+    _compressor = auto_compressor(compressor)
+    _filters = auto_filters(filters)
+    _fill_value = auto_fill_value(fill_value)
+    _order = auto_order(order)
+    _dimension_separator = auto_dimension_separator(dimension_separator)
+    return ArrayV2Metadata(
+        shape=array.shape,
+        dtype=array.dtype,
+        chunks=_chunks,
+        compressor=_compressor,
+        filters=_filters,
+        fill_value=_fill_value,
+        order=_order,
+        dimension_separator=_dimension_separator,
+        attributes=attributes,
+    )
+
+
+from typing import TypedDict
+
+
+class ChunkEncoding(TypedDict):
+    filters: tuple[ArrayArrayCodec]
+    compressors: tuple[BytesBytesCodec]
+    serializer: ArrayBytesCodec
+
+
+class ChunkingSpec(TypedDict):
+    shard_shape: tuple[int, ...]
+    chunk_shape: tuple[int, ...] | None
+    chunk_key_encoding: ChunkKeyEncoding
+
+
+def meta_from_array_v3(
+    array: np.ndarray[Any, Any],
+    shard_shape: tuple[int, ...] | Literal["auto"] | None,
+    chunk_shape: tuple[int, ...] | Literal["auto"],
+    serializer: ArrayBytesCodec | Literal["auto"] = "auto",
+    compressors: Iterable[BytesBytesCodec] | Literal["auto"] = "auto",
+    filters: Iterable[ArrayArrayCodec] | Literal["auto"] = "auto",
+    fill_value: Any = "auto",
+    chunk_key_encoding: ChunkKeyEncoding | Literal["auto"] = "auto",
+    dimension_names: Iterable[str] | None = None,
+    attributes: dict[str, Any] | None = None,
+) -> ArrayV3Metadata:
+    _write_chunks, _read_chunks = auto_chunks_v3(
+        shard_shape=shard_shape, chunk_shape=chunk_shape, array_shape=array.shape, dtype=array.dtype
+    )
+    _codecs = auto_codecs(serializer=serializer, compressors=compressors, filters=filters)
+    if _read_chunks is not None:
+        _codecs = (ShardingCodec(codecs=_codecs, chunk_shape=_read_chunks),)
+
+    _fill_value = auto_fill_value(fill_value)
+    _chunk_key_encoding = auto_chunk_key_encoding(chunk_key_encoding)
+    return ArrayV3Metadata(
+        shape=array.shape,
+        dtype=array.dtype,
+        codecs=_codecs,
+        chunk_key_encoding=_chunk_key_encoding,
+        fill_value=fill_value,
+        chunk_grid={"name": "regular", "config": {"chunk_shape": shard_shape}},
+        attributes=attributes,
+        dimension_names=dimension_names,
+    )
+
+
+from zarr.abc.codec import Codec
+from zarr.codecs import ZstdCodec
+
+
+def auto_codecs(
+    *,
+    filters: Iterable[ArrayArrayCodec] | Literal["auto"] = "auto",
+    compressors: Iterable[BytesBytesCodec] | Literal["auto"] = "auto",
+    serializer: ArrayBytesCodec | Literal["auto"] = "auto",
+) -> tuple[Codec, ...]:
+    """
+    Heuristically generate a tuple of codecs
+    """
+    _compressors: tuple[BytesBytesCodec, ...]
+    _filters: tuple[ArrayArrayCodec, ...]
+    _serializer: ArrayBytesCodec
+    if filters == "auto":
+        _filters = ()
+    else:
+        _filters = tuple(filters)
+
+    if compressors == "auto":
+        _compressors = (ZstdCodec(level=3),)
+    else:
+        _compressors = tuple(compressors)
+
+    if serializer == "auto":
+        _serializer = BytesCodec()
+    else:
+        _serializer = serializer
+    return (*_filters, _serializer, *_compressors)
+
+
+def auto_dimension_separator(dimension_separator: Literal[".", "/", "auto"]) -> Literal[".", "/"]:
+    if dimension_separator == "auto":
+        return "/"
+    return dimension_separator
+
+
+def auto_order(order: MemoryOrder | Literal["auto"]) -> MemoryOrder:
+    if order == "auto":
+        return "C"
+    return order
+
+
+def auto_fill_value(fill_value: Any) -> Any:
+    if fill_value == "auto":
+        return 0
+    return fill_value
+
+
+def auto_compressor(
+    compressor: numcodecs.abc.Codec | Literal["auto"] | None,
+) -> numcodecs.abc.Codec | None:
+    if compressor == "auto":
+        return numcodecs.Zstd(level=3)
+    return compressor
+
+
+def auto_filters(
+    filters: Iterable[numcodecs.abc.Codec] | Literal["auto"],
+) -> tuple[numcodecs.abc.Codec, ...]:
+    if filters == "auto":
+        return ()
+    return tuple(filters)
+
+
+def auto_chunks(
+    chunks: tuple[int, ...] | Literal["auto"], shape: tuple[int, ...], dtype: npt.DTypeLike
+) -> tuple[int, ...]:
+    if chunks == "auto":
+        return _guess_chunks(shape, np.dtype(dtype).itemsize)
+    return chunks
+
+
+def auto_chunks_v3(
+    *,
+    shard_shape: tuple[int, ...] | Literal["auto"],
+    chunk_shape: tuple[int, ...] | Literal["auto"] | None,
+    array_shape: tuple[int, ...],
+    dtype: npt.DTypeLike,
+) -> tuple[tuple[int, ...], tuple[int, ...] | None]:
+    match (shard_shape, chunk_shape):
+        case ("auto", "auto"):
+            # stupid default but easy to think about
+            return ((256,) * len(array_shape), (64,) * len(array_shape))
+        case ("auto", None):
+            return (_guess_chunks(array_shape, np.dtype(dtype).itemsize), None)
+        case ("auto", _):
+            return (chunk_shape, chunk_shape)
+        case (_, None):
+            return (shard_shape, None)
+        case (_, "auto"):
+            return (shard_shape, shard_shape)
+        case _:
+            return (shard_shape, chunk_shape)
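
A hedged sketch of how the v2 helper could be exercised from a test in this suite. Note that `meta_from_array_v3` calls `auto_chunk_key_encoding`, which does not appear in this diff, so the sketch sticks to the v2 path; the assertions reflect the "auto" defaults defined above and are assumptions, not part of the commit.

```python
import numpy as np

from .conftest import meta_from_array_v2


def test_meta_from_array_v2_defaults() -> None:
    arr = np.zeros((100, 100), dtype="uint8")
    meta = meta_from_array_v2(arr)  # every "auto" resolves to a concrete default

    assert meta.shape == (100, 100)
    # per the helpers above: "auto" order -> "C", "auto" separator -> "/"
    assert meta.order == "C"
    assert meta.dimension_separator == "/"
```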

tests/test_group.py

Lines changed: 11 additions & 2 deletions
@@ -18,12 +18,12 @@
 from zarr.abc.store import Store
 from zarr.core._info import GroupInfo
 from zarr.core.buffer import default_buffer_prototype
-from zarr.core.group import ConsolidatedMetadata, GroupMetadata
+from zarr.core.group import ConsolidatedMetadata, GroupMetadata, create_nodes
 from zarr.core.sync import sync
 from zarr.errors import ContainsArrayError, ContainsGroupError
 from zarr.storage import LocalStore, MemoryStore, StorePath, ZipStore, make_store_path

-from .conftest import parse_store
+from .conftest import meta_from_array_v2, parse_store

 if TYPE_CHECKING:
     from _pytest.compat import LEGACY_PATH
@@ -1440,6 +1440,15 @@ def test_delitem_removes_children(store: Store, zarr_format: ZarrFormat) -> None
     g1["0/0"]


+@pytest.mark.parametrize("store", ["memory"], indirect=True)
+async def test_create_nodes(store: Store) -> None:
+    """
+    Ensure that create_nodes works.
+    """
+    arrays = {str(idx): meta_from_array_v2(np.arange(idx)) for idx in range(1, 5)}
+    spath = await make_store_path(store, path="foo")
+    results = [a async for a in create_nodes(store_path=spath, nodes=arrays)]
+
 @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"])
 def test_deprecated_compressor(store: Store) -> None:
     g = zarr.group(store=store, zarr_format=2)
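
As written, `test_create_nodes` collects `results` but never asserts on it. A minimal sketch of checks one might append; the expected node type and count are assumptions based on the `create_nodes` contract, not part of the commit.

```python
from zarr.core.array import AsyncArray

# Hypothetical follow-up assertions for test_create_nodes:
assert len(results) == len(arrays)
assert all(isinstance(node, AsyncArray) for node in results)
```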
