Skip to content

Commit 31c831d

Browse files
committed
implementation (wip)
1 parent 0fd633f commit 31c831d

File tree

8 files changed

+1471
-166
lines changed

8 files changed

+1471
-166
lines changed

src/zarr/api/synchronous.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from zarr.errors import ZarrDeprecationWarning
1414

1515
if TYPE_CHECKING:
16-
from collections.abc import Iterable
16+
from collections.abc import Iterable, Sequence
1717

1818
import numpy as np
1919
import numpy.typing as npt
@@ -29,6 +29,7 @@
2929
)
3030
from zarr.core.array_spec import ArrayConfigLike
3131
from zarr.core.buffer import NDArrayLike, NDArrayLikeOrScalar
32+
from zarr.core.chunk_grids import ChunkGrid
3233
from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingLike
3334
from zarr.core.common import (
3435
JSON,
@@ -821,7 +822,7 @@ def create_array(
821822
shape: ShapeLike | None = None,
822823
dtype: ZDTypeLike | None = None,
823824
data: np.ndarray[Any, np.dtype[Any]] | None = None,
824-
chunks: tuple[int, ...] | Literal["auto"] = "auto",
825+
chunks: tuple[int, ...] | Sequence[Sequence[int]] | ChunkGrid | Literal["auto"] = "auto",
825826
shards: ShardsLike | None = None,
826827
filters: FiltersLike = "auto",
827828
compressors: CompressorsLike = "auto",
@@ -857,9 +858,14 @@ def create_array(
857858
data : np.ndarray, optional
858859
Array-like data to use for initializing the array. If this parameter is provided, the
859860
``shape`` and ``dtype`` parameters must be ``None``.
860-
chunks : tuple[int, ...] | Literal["auto"], default="auto"
861-
Chunk shape of the array.
862-
If chunks is "auto", a chunk shape is guessed based on the shape of the array and the dtype.
861+
chunks : tuple[int, ...] | Sequence[Sequence[int]] | ChunkGrid | Literal["auto"], default="auto"
862+
Chunk shape of the array. Several formats are supported:
863+
864+
- tuple of ints: Creates a RegularChunkGrid with uniform chunks, e.g., ``(10, 10)``
865+
- nested sequence: Creates a RectilinearChunkGrid with variable-sized chunks (Zarr format 3 only),
866+
e.g., ``[[10, 20, 30], [5, 5]]`` creates variable chunks along each dimension
867+
- ChunkGrid instance: Uses the provided chunk grid directly (Zarr format 3 only)
868+
- "auto": Automatically determines chunk shape based on array shape and dtype
863869
shards : tuple[int, ...], optional
864870
Shard shape of the array. The default value of ``None`` results in no sharding at all.
865871
filters : Iterable[Codec] | Literal["auto"], optional

src/zarr/core/array.py

Lines changed: 115 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import json
44
import warnings
55
from asyncio import gather
6-
from collections.abc import Iterable, Mapping
6+
from collections.abc import Iterable, Mapping, Sequence
77
from dataclasses import dataclass, field, replace
88
from itertools import starmap
99
from logging import getLogger
@@ -40,7 +40,7 @@
4040
default_buffer_prototype,
4141
)
4242
from zarr.core.buffer.cpu import buffer_prototype as cpu_buffer_prototype
43-
from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition, normalize_chunks
43+
from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid, _auto_partition, normalize_chunks
4444
from zarr.core.chunk_key_encodings import (
4545
ChunkKeyEncoding,
4646
ChunkKeyEncodingLike,
@@ -737,15 +737,25 @@ async def _create(
737737
def _create_metadata_v3(
738738
shape: ShapeLike,
739739
dtype: ZDType[TBaseDType, TBaseScalar],
740-
chunk_shape: tuple[int, ...],
740+
chunk_shape: tuple[int, ...] | None = None,
741741
fill_value: Any | None = DEFAULT_FILL_VALUE,
742742
chunk_key_encoding: ChunkKeyEncodingLike | None = None,
743743
codecs: Iterable[Codec | dict[str, JSON]] | None = None,
744744
dimension_names: DimensionNames = None,
745745
attributes: dict[str, JSON] | None = None,
746+
chunk_grid: ChunkGrid | None = None,
746747
) -> ArrayV3Metadata:
747748
"""
748749
Create an instance of ArrayV3Metadata.
750+
751+
Parameters
752+
----------
753+
chunk_grid : ChunkGrid, optional
754+
Custom chunk grid to use. If provided, chunk_shape is ignored.
755+
If not provided, a RegularChunkGrid is created from chunk_shape.
756+
chunk_shape : tuple[int, ...], optional
757+
Shape of chunks for creating a RegularChunkGrid.
758+
Only used if chunk_grid is not provided.
749759
"""
750760
filters: tuple[ArrayArrayCodec, ...]
751761
compressors: tuple[BytesBytesCodec, ...]
@@ -773,7 +783,14 @@ def _create_metadata_v3(
773783
else:
774784
fill_value_parsed = fill_value
775785

776-
chunk_grid_parsed = RegularChunkGrid(chunk_shape=chunk_shape)
786+
# Use provided chunk_grid or create RegularChunkGrid from chunk_shape
787+
if chunk_grid is not None:
788+
chunk_grid_parsed = chunk_grid
789+
elif chunk_shape is not None:
790+
chunk_grid_parsed = RegularChunkGrid(chunk_shape=chunk_shape)
791+
else:
792+
raise ValueError("Either chunk_grid or chunk_shape must be provided")
793+
777794
return ArrayV3Metadata(
778795
shape=shape,
779796
data_type=dtype,
@@ -4564,6 +4581,7 @@ async def init_array(
45644581
dimension_names: DimensionNames = None,
45654582
overwrite: bool = False,
45664583
config: ArrayConfigLike | None = None,
4584+
chunk_grid: ChunkGrid | None = None,
45674585
) -> AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]:
45684586
"""Create and persist an array metadata document.
45694587
@@ -4641,6 +4659,10 @@ async def init_array(
46414659
Configuration for this array.
46424660
If ``None``, the default array runtime configuration will be used. This default
46434661
is stored in the global configuration object.
4662+
chunk_grid : ChunkGrid, optional
4663+
Custom chunk grid to use for the array. If provided, the ``chunks`` parameter is ignored.
4664+
Zarr format 3 only. Use this to create arrays with variable-sized chunks (e.g., RectilinearChunkGrid).
4665+
If not provided, a RegularChunkGrid is created from the ``chunks`` parameter.
46444666
46454667
Returns
46464668
-------
@@ -4721,6 +4743,17 @@ async def init_array(
47214743
)
47224744
sub_codecs = cast("tuple[Codec, ...]", (*array_array, array_bytes, *bytes_bytes))
47234745
codecs_out: tuple[Codec, ...]
4746+
4747+
# Validate that RectilinearChunkGrid is not used with sharding
4748+
if shard_shape_parsed is not None and chunk_grid is not None:
4749+
from zarr.core.chunk_grids import RectilinearChunkGrid
4750+
4751+
if isinstance(chunk_grid, RectilinearChunkGrid):
4752+
raise ValueError(
4753+
"Sharding is not supported with RectilinearChunkGrid (variable-sized chunks). "
4754+
"Use RegularChunkGrid (uniform chunks) with sharding, or use RectilinearChunkGrid without sharding."
4755+
)
4756+
47244757
if shard_shape_parsed is not None:
47254758
index_location = None
47264759
if isinstance(shards, dict):
@@ -4731,9 +4764,11 @@ async def init_array(
47314764
chunk_shape=chunk_shape_parsed, codecs=sub_codecs, index_location=index_location
47324765
)
47334766
sharding_codec.validate(
4734-
shape=chunk_shape_parsed,
4767+
shape=chunk_shape_parsed, # Original code: inner chunk shape
47354768
dtype=zdtype,
4736-
chunk_grid=RegularChunkGrid(chunk_shape=shard_shape_parsed),
4769+
chunk_grid=RegularChunkGrid(
4770+
chunk_shape=shard_shape_parsed
4771+
), # Original code: shard shape
47374772
)
47384773
codecs_out = (sharding_codec,)
47394774
chunks_out = shard_shape_parsed
@@ -4748,11 +4783,12 @@ async def init_array(
47484783
shape=shape_parsed,
47494784
dtype=zdtype,
47504785
fill_value=fill_value,
4751-
chunk_shape=chunks_out,
4786+
chunk_shape=chunks_out if chunk_grid is None else None,
47524787
chunk_key_encoding=chunk_key_encoding_parsed,
47534788
codecs=codecs_out,
47544789
dimension_names=dimension_names,
47554790
attributes=attributes,
4791+
chunk_grid=chunk_grid,
47564792
)
47574793

47584794
arr = AsyncArray(metadata=meta, store_path=store_path, config=config)
@@ -4767,7 +4803,7 @@ async def create_array(
47674803
shape: ShapeLike | None = None,
47684804
dtype: ZDTypeLike | None = None,
47694805
data: np.ndarray[Any, np.dtype[Any]] | None = None,
4770-
chunks: tuple[int, ...] | Literal["auto"] = "auto",
4806+
chunks: tuple[int, ...] | Sequence[Sequence[int]] | ChunkGrid | Literal["auto"] = "auto",
47714807
shards: ShardsLike | None = None,
47724808
filters: FiltersLike = "auto",
47734809
compressors: CompressorsLike = "auto",
@@ -4801,9 +4837,14 @@ async def create_array(
48014837
data : np.ndarray, optional
48024838
Array-like data to use for initializing the array. If this parameter is provided, the
48034839
``shape`` and ``dtype`` parameters must be ``None``.
4804-
chunks : tuple[int, ...] | Literal["auto"], default="auto"
4805-
Chunk shape of the array.
4806-
If chunks is "auto", a chunk shape is guessed based on the shape of the array and the dtype.
4840+
chunks : tuple[int, ...] | Sequence[Sequence[int]] | ChunkGrid | Literal["auto"], default="auto"
4841+
Chunk shape of the array. Several formats are supported:
4842+
4843+
- tuple of ints: Creates a RegularChunkGrid with uniform chunks, e.g., ``(10, 10)``
4844+
- nested sequence: Creates a RectilinearChunkGrid with variable-sized chunks (Zarr format 3 only),
4845+
e.g., ``[[10, 20, 30], [5, 5]]`` creates variable chunks along each dimension
4846+
- ChunkGrid instance: Uses the provided chunk grid directly (Zarr format 3 only)
4847+
- "auto": Automatically determines chunk shape based on array shape and dtype
48074848
shards : tuple[int, ...], optional
48084849
Shard shape of the array. The default value of ``None`` results in no sharding at all.
48094850
filters : Iterable[Codec] | Literal["auto"], optional
@@ -4900,16 +4941,72 @@ async def create_array(
49004941
>>> fill_value=0)
49014942
<AsyncArray memory://140349042942400 shape=(100, 100) dtype=int32>
49024943
"""
4944+
# Handle chunks as ChunkGrid or nested sequence - convert to chunk_grid for init_array
4945+
chunk_grid: ChunkGrid | None = None
4946+
4947+
if isinstance(chunks, ChunkGrid):
4948+
chunk_grid = chunks
4949+
chunks = "auto" # Will be ignored since chunk_grid is set
4950+
elif chunks != "auto" and not isinstance(chunks, (tuple, int)):
4951+
# Check if it's a nested sequence for RectilinearChunkGrid
4952+
# We need to distinguish between flat sequences like [10, 10] and nested like [[10, 20], [5, 5]]
4953+
is_nested = False
4954+
try:
4955+
# Try to iterate and check if elements are sequences
4956+
if hasattr(chunks, "__iter__") and not isinstance(chunks, (str, bytes)): # type: ignore[unreachable]
4957+
first_elem = next(iter(chunks), None)
4958+
if (
4959+
first_elem is not None
4960+
and hasattr(first_elem, "__iter__")
4961+
and not isinstance(first_elem, (str, bytes, int))
4962+
):
4963+
is_nested = True
4964+
except (TypeError, StopIteration):
4965+
pass
4966+
4967+
if is_nested:
4968+
# It's a nested sequence - create RectilinearChunkGrid
4969+
from zarr.core.chunk_grids import RectilinearChunkGrid
4970+
4971+
if zarr_format == 2:
4972+
raise ValueError(
4973+
"Variable chunks (nested sequences) are only supported in Zarr format 3. "
4974+
"Use zarr_format=3 or provide a regular tuple for chunks."
4975+
)
4976+
4977+
try:
4978+
# Convert nested sequence to list of lists for RectilinearChunkGrid
4979+
chunk_shapes = [list(dim) for dim in chunks]
4980+
chunk_grid = RectilinearChunkGrid(chunk_shapes=chunk_shapes)
4981+
chunks = "auto" # Will be ignored since chunk_grid is set
4982+
except (TypeError, ValueError) as e:
4983+
raise TypeError(
4984+
f"Invalid chunks argument: {chunks}. "
4985+
"Expected a tuple of integers, a nested sequence for variable chunks, "
4986+
f"a ChunkGrid instance, or 'auto'. Got error: {e}"
4987+
) from e
4988+
# else: it's a flat sequence like [10, 10] or single int, let it pass through to existing code
4989+
49034990
data_parsed, shape_parsed, dtype_parsed = _parse_data_params(
49044991
data=data, shape=shape, dtype=dtype
49054992
)
49064993
if data_parsed is not None:
4994+
# from_array doesn't support ChunkGrid parameter, so error if chunk_grid was set
4995+
if chunk_grid is not None:
4996+
raise ValueError(
4997+
"Cannot use ChunkGrid or nested sequences for chunks when creating array from data. "
4998+
"Use a regular tuple for chunks instead."
4999+
)
5000+
# chunk_grid is None here, so chunks was neither a ChunkGrid nor a nested sequence; narrow its static type ("auto", "keep", or a flat chunk shape) for the from_array call
5001+
from typing import cast
5002+
5003+
chunks_narrowed = cast("Literal['auto', 'keep'] | tuple[int, ...]", chunks)
49075004
return await from_array(
49085005
store,
49095006
data=data_parsed,
49105007
write_data=write_data,
49115008
name=name,
4912-
chunks=chunks,
5009+
chunks=chunks_narrowed,
49135010
shards=shards,
49145011
filters=filters,
49155012
compressors=compressors,
@@ -4930,11 +5027,15 @@ async def create_array(
49305027
store_path = await make_store_path(
49315028
store, path=name, mode=mode, storage_options=storage_options
49325029
)
5030+
# chunks was reset to "auto" above whenever chunk_grid was set; otherwise it still holds the caller's value ("auto" or a flat chunk shape), so narrow its static type for init_array
5031+
from typing import cast
5032+
5033+
chunks_narrowed = cast("tuple[int, ...] | Literal['auto']", chunks)
49335034
return await init_array(
49345035
store_path=store_path,
49355036
shape=shape_parsed,
49365037
dtype=dtype_parsed,
4937-
chunks=chunks,
5038+
chunks=chunks_narrowed,
49385039
shards=shards,
49395040
filters=filters,
49405041
compressors=compressors,
@@ -4947,6 +5048,7 @@ async def create_array(
49475048
dimension_names=dimension_names,
49485049
overwrite=overwrite,
49495050
config=config,
5051+
chunk_grid=chunk_grid,
49505052
)
49515053

49525054

0 commit comments

Comments
 (0)