test for auto sharding

d-v-b · d-v-b · commit 5dcd80bf765a · 2024-12-23T22:46:21.000+01:00
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
@@ -3472,8 +3472,8 @@ async def create_array(
     name: str | None = None,
     shape: ShapeLike,
     dtype: npt.DTypeLike,
-    chunk_shape: ChunkCoords | Literal["auto"] = "auto",
-    shard_shape: ChunkCoords | None = None,
+    chunks: ChunkCoords | Literal["auto"] = "auto",
+    shards: ChunkCoords | Literal["auto"] | None = None,
     filters: FiltersParam = "auto",
     compression: CompressionParam = "auto",
     fill_value: Any | None = 0,
@@ -3500,9 +3500,9 @@ async def create_array(
         Shape of the array.
     dtype : npt.DTypeLike
         Data type of the array.
-    chunk_shape : ChunkCoords
+    chunks : ChunkCoords | Literal["auto"], optional
         Chunk shape of the array.
-    shard_shape : ChunkCoords, optional
+    shards : ChunkCoords | Literal["auto"], optional
         Shard shape of the array. The default value of ``None`` results in no sharding at all.
     filters : Iterable[Codec], optional
         List of filters to apply to the array.
@@ -3552,15 +3552,16 @@ async def create_array(
     )
     store_path = await make_store_path(store, path=name, mode=mode, storage_options=storage_options)
     shard_shape_parsed, chunk_shape_parsed = _auto_partition(
-        shape_parsed, shard_shape, chunk_shape, dtype_parsed
+        array_shape=shape_parsed, shard_shape=shards, chunk_shape=chunks, dtype=dtype_parsed
     )
+    chunks_out: tuple[int, ...]
     result: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]
 
     if zarr_format == 2:
         if shard_shape_parsed is not None:
             msg = (
-                'Zarr v2 arrays can only be created with `shard_shape` set to `None` or `"auto"`.'
-                f"Got `shard_shape={shard_shape}` instead."
+                "Zarr v2 arrays can only be created with `shard_shape` set to `None`."
+                f"Got `shard_shape={shards}` instead."
             )
 
             raise ValueError(msg)
@@ -3604,10 +3605,10 @@ async def create_array(
             sharding_codec.validate(
                 shape=chunk_shape_parsed,
                 dtype=dtype_parsed,
-                chunk_grid=RegularChunkGrid(chunk_shape=shard_shape),
+                chunk_grid=RegularChunkGrid(chunk_shape=shard_shape_parsed),
             )
             codecs_out = (sharding_codec,)
-            chunks_out = shard_shape
+            chunks_out = shard_shape_parsed
         else:
             chunks_out = chunk_shape_parsed
             codecs_out = sub_codecs
diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py
@@ -197,9 +197,10 @@ def get_nchunks(self, array_shape: ChunkCoords) -> int:
 
 
 def _auto_partition(
+    *,
     array_shape: tuple[int, ...],
-    shard_shape: tuple[int, ...] | Literal["auto"] | None,
     chunk_shape: tuple[int, ...] | Literal["auto"],
+    shard_shape: tuple[int, ...] | Literal["auto"] | None,
     dtype: np.dtype[Any],
 ) -> tuple[tuple[int, ...] | None, tuple[int, ...]]:
     """
@@ -210,7 +211,6 @@ def _auto_partition(
     of the array; if the `chunk_shape` is also "auto", then the chunks will be set heuristically as well,
     given the dtype and shard shape. Otherwise, the chunks will be returned as-is.
     """
-
     item_size = dtype.itemsize
     if shard_shape is None:
         _shards_out: None | tuple[int, ...] = None
@@ -229,9 +229,9 @@ def _auto_partition(
             _shards_out = ()
             for a_shape, c_shape in zip(array_shape, _chunks_out, strict=True):
                 # TODO: make a better heuristic than this.
-                # for each axis, if there are more than 16 chunks along that axis, then make put
+                # for each axis, if there are more than 8 chunks along that axis, then put
                 # 2 chunks in each shard for that axis.
-                if a_shape // c_shape > 16:
+                if a_shape // c_shape > 8:
                     _shards_out += (c_shape * 2,)
                 else:
                     _shards_out += (c_shape,)
diff --git a/tests/test_array.py b/tests/test_array.py
@@ -13,10 +13,13 @@
 import zarr.api.asynchronous
 from zarr import Array, AsyncArray, Group
 from zarr.codecs import BytesCodec, VLenBytesCodec, ZstdCodec
+from zarr.codecs.sharding import ShardingCodec
 from zarr.core._info import ArrayInfo
 from zarr.core.array import chunks_initialized
 from zarr.core.buffer import default_buffer_prototype
 from zarr.core.buffer.cpu import NDBuffer
+from zarr.core.chunk_grids import _auto_partition
+from zarr.core.codec_pipeline import BatchedCodecPipeline
 from zarr.core.common import JSON, MemoryOrder, ZarrFormat
 from zarr.core.group import AsyncGroup
 from zarr.core.indexing import ceildiv
@@ -881,3 +884,47 @@ async def test_nbytes(
         assert arr._async_array.nbytes == np.prod(arr.shape) * arr.dtype.itemsize
     else:
         assert arr.nbytes == np.prod(arr.shape) * arr.dtype.itemsize
+
+
+def _get_partitioning(data: AsyncArray) -> tuple[tuple[int, ...], tuple[int, ...] | None]:
+    """
+    Return ``(chunk_shape, shard_shape)`` for an array. ``shard_shape`` is None when the array
+    is not sharded. Raises TypeError if the codec pipeline is not a BatchedCodecPipeline.
+    """
+    codecs = data.codec_pipeline
+    if not isinstance(codecs, BatchedCodecPipeline):
+        # `array_bytes_codec` is only accessed on a BatchedCodecPipeline; fail loudly for
+        # any other pipeline type instead of returning unbound locals.
+        msg = f"Cannot get partitioning from a codec pipeline of type {type(codecs)}."
+        raise TypeError(msg)
+    ab_codec = codecs.array_bytes_codec
+    if isinstance(ab_codec, ShardingCodec):
+        # With a sharding codec, `data.chunks` is the shard shape and the codec carries
+        # the inner chunk shape.
+        return ab_codec.chunk_shape, data.chunks
+    return data.chunks, None
+
+
+@pytest.mark.parametrize(
+    ("array_shape", "chunk_shape"),
+    [((256,), (2,)), ((16,), (2,)), ((16, 18), (2, 2))],
+)
+def test_auto_partition_auto_shards(
+    array_shape: tuple[int, ...], chunk_shape: tuple[int, ...]
+) -> None:
+    """
+    Test that automatically picking a shard size returns a tuple of 2 * the chunk shape for any axis
+    where there are more than 8 chunks, and the chunk shape itself for every other axis.
+    """
+    dtype = np.dtype("uint8")
+    expected_shards: tuple[int, ...] = ()
+    for cs, a_len in zip(chunk_shape, array_shape, strict=True):
+        # Same strict `> 8` threshold as `_auto_partition`; exactly 8 chunks is not enough.
+        if a_len // cs > 8:
+            expected_shards += (2 * cs,)
+        else:
+            expected_shards += (cs,)
+    auto_shards, _ = _auto_partition(
+        array_shape=array_shape, chunk_shape=chunk_shape, shard_shape="auto", dtype=dtype
+    )
+    assert auto_shards == expected_shards