diff --git a/changes/3299.bugfix.rst b/changes/3299.bugfix.rst new file mode 100644 index 0000000000..29accdaa30 --- /dev/null +++ b/changes/3299.bugfix.rst @@ -0,0 +1,6 @@ +Fix a bug in ``create_array`` caused by iterating over chunk-aligned regions instead of +shard-aligned regions when writing data. To make the distinction between chunks and shards more +obvious in the ``Array`` API, new properties ``chunk_grid_shape``, +``shard_grid_shape``, ``nshards``, ``nshards_initialized`` were added to the ``Array`` class. +Additionally, the behavior of ``nchunks_initialized`` has been adjusted. This function consistently +reports the number of chunks present in stored objects, even when the array uses the sharding codec. \ No newline at end of file diff --git a/docs/user-guide/performance.rst b/docs/user-guide/performance.rst index 0f31e5d7be..f254c5bcb5 100644 --- a/docs/user-guide/performance.rst +++ b/docs/user-guide/performance.rst @@ -211,7 +211,7 @@ the time required to write an array with different values.:: ... start = time.time() ... arr[:] = value ... elapsed = time.time() - start - ... result.append((elapsed, arr.nchunks_initialized)) + ... result.append((elapsed, arr.nshards_initialized)) ... return result ... # log results >>> for write_empty_chunks in (True, False): diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 2ce33df7ba..840be87241 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -94,6 +94,7 @@ Selection, VIndex, _iter_grid, + _iter_regions, check_fields, check_no_multi_fields, is_pure_fancy_indexing, @@ -698,7 +699,7 @@ async def _create( overwrite=overwrite, ) else: - raise ValueError(f"Unsupported zarr_format. Got: {zarr_format}") + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover if data is not None: # insert user-provided data @@ -1190,27 +1191,75 @@ def cdata_shape(self) -> ChunkCoords: Returns ------- - Tuple[int] + tuple[int, ...] The shape of the chunk grid for this array. """ - return tuple(starmap(ceildiv, zip(self.shape, self.chunks, strict=False))) + return self._chunk_grid_shape + + @property + def _chunk_grid_shape(self) -> ChunkCoords: + """ + The shape of the chunk grid for this array. + + Returns + ------- + tuple[int, ...] + The shape of the chunk grid for this array. + """ + return tuple(starmap(ceildiv, zip(self.shape, self.chunks, strict=True))) + + @property + def _shard_grid_shape(self) -> ChunkCoords: + """ + The shape of the shard grid for this array. + + Returns + ------- + tuple[int, ...] + The shape of the shard grid for this array. + """ + if self.shards is None: + shard_shape = self.chunks + else: + shard_shape = self.shards + return tuple(starmap(ceildiv, zip(self.shape, shard_shape, strict=True))) @property def nchunks(self) -> int: """ - The number of chunks in the stored representation of this array. + The number of chunks in this array. + + Note that if a sharding codec is used, then the number of chunks may exceed the number of + stored objects supporting this array. To find out the number of stored objects that support + this array, see :func:`nshards`. Returns ------- int The total number of chunks in the array. """ - return product(self.cdata_shape) + return product(self._chunk_grid_shape) + + @property + def _nshards(self) -> int: + """ + The number of shards in this array. + + Returns + ------- + int + The total number of shards in the array. 
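# Illustrative sketch (not part of the patch): how the chunk grid and shard grid
# shapes relate for a concrete array, using the same ceiling division as the
# _chunk_grid_shape / _shard_grid_shape properties above. grid_shape() is a
# hypothetical helper written only for this example.
import math

def grid_shape(array_shape, region_shape):
    # One grid cell per region, rounding up for a partial region at the end of an axis.
    return tuple(math.ceil(a / r) for a, r in zip(array_shape, region_shape))

array_shape = (30, 10)
chunk_shape = (2, 5)    # addressable chunks
shard_shape = (10, 5)   # each shard stores a 5 x 1 block of chunks

print(grid_shape(array_shape, chunk_shape))  # (15, 2) -> nchunks = 30
print(grid_shape(array_shape, shard_shape))  # (3, 2)  -> nshards = 6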
+ """ + return product(self._shard_grid_shape) async def nchunks_initialized(self) -> int: """ - Calculate the number of chunks that have been initialized, i.e. the number of chunks that have - been persisted to the storage backend. + Calculate the number of chunks that have been initialized in storage. + + This value is calculated as the product of the number of initialized shards and the number + of chunks per shard. For arrays that do not use sharding, the number of chunks per shard is + effectively 1, and in that case the number of chunks initialized is the same as the number + of stored objects associated with an array. Returns ------- @@ -1224,14 +1273,49 @@ async def nchunks_initialized(self) -> int: Examples -------- - >>> arr = await zarr.api.asynchronous.create(shape=(10,), chunks=(2,)) + >>> arr = await zarr.api.asynchronous.create(shape=(10,), chunks=(1,), shards=(2,)) >>> await arr.nchunks_initialized() 0 >>> await arr.setitem(slice(5), 1) + >>> await arr.nshards_initialized() + 3 >>> await arr.nchunks_initialized() + 6 + """ + if self.shards is None: + chunks_per_shard = 1 + else: + chunks_per_shard = product( + tuple(a // b for a, b in zip(self.shards, self.chunks, strict=True)) + ) + return (await self._nshards_initialized()) * chunks_per_shard + + async def _nshards_initialized(self) -> int: + """ + Calculate the number of shards that have been initialized in storage. + + This is the number of shards that have been persisted to the storage backend. + + Returns + ------- + nshards_initialized : int + The number of shards that have been initialized. + + Notes + ----- + On :class:`AsyncArray` this is an asynchronous method, unlike the (synchronous) + property :attr:`Array.nshards_initialized`. + + Examples + -------- + >>> arr = await zarr.api.asynchronous.create(shape=(10,), chunks=(2,)) + >>> await arr.nshards_initialized() + 0 + >>> await arr.setitem(slice(5), 1) + >>> await arr.nshards_initialized() 3 """ - return len(await chunks_initialized(self)) + return len(await _shards_initialized(self)) async def nbytes_stored(self) -> int: return await self.store_path.store.getsize_prefix(self.store_path.path) @@ -1240,8 +1324,9 @@ def _iter_chunk_coords( self, *, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None ) -> Iterator[ChunkCoords]: """ - Create an iterator over the coordinates of chunks in chunk grid space. If the `origin` - keyword is used, iteration will start at the chunk index specified by `origin`. + Create an iterator over the coordinates of chunks in chunk grid space. + + If the `origin` keyword is used, iteration will start at the chunk index specified by `origin`. The default behavior is to start at the origin of the grid coordinate space. If the `selection_shape` keyword is used, iteration will be bounded over a contiguous region ranging from `[origin, origin selection_shape]`, where the upper bound is exclusive as @@ -1259,21 +1344,56 @@ def _iter_chunk_coords( chunk_coords: ChunkCoords The coordinates of each chunk in the selection. """ - return _iter_grid(self.cdata_shape, origin=origin, selection_shape=selection_shape) + return _iter_chunk_coords( + array=self, + origin=origin, + selection_shape=selection_shape, + ) + + def _iter_shard_coords( + self, *, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None + ) -> Iterator[ChunkCoords]: + """ + Create an iterator over the coordinates of shards in shard grid space. 
+ + Note that + + If the `origin` keyword is used, iteration will start at the shard index specified by `origin`. + The default behavior is to start at the origin of the grid coordinate space. + If the `selection_shape` keyword is used, iteration will be bounded over a contiguous region + ranging from `[origin, origin selection_shape]`, where the upper bound is exclusive as + per python indexing conventions. + + Parameters + ---------- + origin : Sequence[int] | None, default=None + The origin of the selection relative to the array's shard grid. + selection_shape : Sequence[int] | None, default=None + The shape of the selection in shard grid coordinates. + + Yields + ------ + chunk_coords: tuple[int, ...] + The coordinates of each shard in the selection. + """ + return _iter_shard_coords( + array=self, + origin=origin, + selection_shape=selection_shape, + ) - def _iter_chunk_keys( + def _iter_shard_keys( self, *, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None ) -> Iterator[str]: """ - Iterate over the storage keys of each chunk, relative to an optional origin, and optionally - limited to a contiguous region in chunk grid coordinates. + Iterate over the keys of the stored objects supporting this array. Parameters ---------- origin : Sequence[int] | None, default=None The origin of the selection relative to the array's chunk grid. selection_shape : Sequence[int] | None, default=None - The shape of the selection in chunk grid coordinates. + The shape of the selection in shard grid coordinates. Yields ------ @@ -1281,9 +1401,11 @@ def _iter_chunk_keys( The storage key of each chunk in the selection. """ # Iterate over the coordinates of chunks in chunk grid space. - for k in self._iter_chunk_coords(origin=origin, selection_shape=selection_shape): - # Encode the chunk key from the chunk coordinates. - yield self.metadata.encode_chunk_key(k) + return _iter_shard_keys( + array=self, + origin=origin, + selection_shape=selection_shape, + ) def _iter_chunk_regions( self, *, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None @@ -1303,15 +1425,31 @@ def _iter_chunk_regions( region: tuple[slice, ...] A tuple of slice objects representing the region spanned by each chunk in the selection. """ - for cgrid_position in self._iter_chunk_coords( - origin=origin, selection_shape=selection_shape - ): - out: tuple[slice, ...] = () - for c_pos, c_shape in zip(cgrid_position, self.chunks, strict=False): - start = c_pos * c_shape - stop = start + c_shape - out += (slice(start, stop, 1),) - yield out + return _iter_chunk_regions( + array=self, + origin=origin, + selection_shape=selection_shape, + ) + + def _iter_shard_regions( + self, *, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None + ) -> Iterator[tuple[slice, ...]]: + """ + Iterate over the regions spanned by each shard. + + Parameters + ---------- + origin : Sequence[int] | None, default=None + The origin of the selection relative to the array's shard grid. + selection_shape : Sequence[int] | None, default=None + The shape of the selection in shard grid coordinates. + + Yields + ------ + region: tuple[slice, ...] + A tuple of slice objects representing the region spanned by each shard in the selection. + """ + return _iter_shard_regions(array=self, origin=origin, selection_shape=selection_shape) @property def nbytes(self) -> int: @@ -1818,7 +1956,7 @@ async def info_complete(self) -> Any: A property giving just the statically known information about an array. 
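# Usage sketch for the shard-region iterators added in this hunk. _iter_shard_regions
# is private API, so this only illustrates the intended semantics: each yielded tuple
# of slices covers exactly one stored object (one shard, or one chunk for unsharded
# arrays), which is the smallest region that can be written without read-modify-write
# contention between writers.
import zarr

arr = zarr.create_array({}, shape=(40,), chunks=(5,), shards=(20,), dtype="int8")
for region in arr._iter_shard_regions():
    arr[region] = 1  # one whole shard per assignment; safe to run these concurrently

assert arr.nshards_initialized == 2  # 40 / 20 stored objects
assert arr.nchunks_initialized == 8  # 40 / 5 chunks, 4 per shard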
""" return self._info( - await self.nchunks_initialized(), + await self._nshards_initialized(), await self.store_path.store.getsize_prefix(self.store_path.path), ) @@ -2265,41 +2403,39 @@ def cdata_shape(self) -> ChunkCoords: """ The shape of the chunk grid for this array. """ - return tuple(starmap(ceildiv, zip(self.shape, self.chunks, strict=False))) + return self._async_array._chunk_grid_shape @property - def nchunks(self) -> int: + def _chunk_grid_shape(self) -> ChunkCoords: """ - The number of chunks in the stored representation of this array. + The shape of the chunk grid for this array. """ - return self._async_array.nchunks + return self._async_array._chunk_grid_shape - def _iter_chunk_coords( - self, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None - ) -> Iterator[ChunkCoords]: + @property + def _shard_grid_shape(self) -> ChunkCoords: """ - Create an iterator over the coordinates of chunks in chunk grid space. If the `origin` - keyword is used, iteration will start at the chunk index specified by `origin`. - The default behavior is to start at the origin of the grid coordinate space. - If the `selection_shape` keyword is used, iteration will be bounded over a contiguous region - ranging from `[origin, origin + selection_shape]`, where the upper bound is exclusive as - per python indexing conventions. + The shape of the shard grid for this array. + """ + return self._async_array._shard_grid_shape - Parameters - ---------- - origin : Sequence[int] | None, default=None - The origin of the selection relative to the array's chunk grid. - selection_shape : Sequence[int] | None, default=None - The shape of the selection in chunk grid coordinates. + @property + def nchunks(self) -> int: + """ + The number of chunks in this array. - Yields - ------ - chunk_coords: ChunkCoords - The coordinates of each chunk in the selection. + Note that if a sharding codec is used, then the number of chunks may exceed the number of + stored objects supporting this array. To find out the number of stored objects that support + this array, see :func:`nshards`. """ - yield from self._async_array._iter_chunk_coords( - origin=origin, selection_shape=selection_shape - ) + return self._async_array.nchunks + + @property + def _nshards(self) -> int: + """ + The number of shards in the stored representation of this array. + """ + return self._async_array._nshards @property def nbytes(self) -> int: @@ -2319,30 +2455,53 @@ def nbytes(self) -> int: @property def nchunks_initialized(self) -> int: """ - Calculate the number of chunks that have been initialized, i.e. the number of chunks that have - been persisted to the storage backend. + Calculate the number of chunks that have been initialized in storage. + + This value is calculated as the product of the number of initialized shards and the number of + chunks per shard. For arrays that do not use sharding, the number of chunks per shard is effectively 1, + and in that case the number of chunks initialized is the same as the number of stored objects associated with an + array. For a direct count of the number of initialized stored objects, see ``nshards_initialized``. Returns ------- nchunks_initialized : int The number of chunks that have been initialized. - Notes - ----- - On :class:`Array` this is a (synchronous) property, unlike asynchronous function - :meth:`AsyncArray.nchunks_initialized`. 
- Examples -------- - >>> arr = await zarr.create(shape=(10,), chunks=(2,)) + >>> arr = zarr.create_array(store={}, shape=(10,), chunks=(1,), shards=(2,)) >>> arr.nchunks_initialized 0 >>> arr[:5] = 1 - >>> arr.nchunks_initialized + >>> arr.nshards_initialized 3 + >>> arr.nchunks_initialized + 6 """ return sync(self._async_array.nchunks_initialized()) + @property + def nshards_initialized(self) -> int: + """ + Calculate the number of shards that have been initialized, i.e. the number of shards that have + been persisted to the storage backend. + + Returns + ------- + nshards_initialized : int + The number of shards that have been initialized. + + Examples + -------- + >>> arr = await zarr.create(shape=(10,), chunks=(2,)) + >>> arr.nshards_initialized + 0 + >>> arr[:5] = 1 + >>> arr.nshard_initialized + 3 + """ + return sync(self._async_array._nshards_initialized()) + def nbytes_stored(self) -> int: """ Determine the size, in bytes, of the array actually written to the store. @@ -2353,13 +2512,39 @@ def nbytes_stored(self) -> int: """ return sync(self._async_array.nbytes_stored()) - def _iter_chunk_keys( + def _iter_shard_keys( self, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None ) -> Iterator[str]: """ - Iterate over the storage keys of each chunk, relative to an optional origin, and optionally + Iterate over the storage keys of each shard, relative to an optional origin, and optionally limited to a contiguous region in chunk grid coordinates. + Parameters + ---------- + origin : Sequence[int] | None, default=None + The origin of the selection relative to the array's shard grid. + selection_shape : Sequence[int] | None, default=None + The shape of the selection in shard grid coordinates. + + Yields + ------ + key: str + The storage key of each shard in the selection. + """ + return self._async_array._iter_shard_keys(origin=origin, selection_shape=selection_shape) + + def _iter_chunk_coords( + self, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None + ) -> Iterator[ChunkCoords]: + """ + Create an iterator over the coordinates of chunks in chunk grid space. + + If the `origin` keyword is used, iteration will start at the chunk index specified by `origin`. + The default behavior is to start at the origin of the grid coordinate space. + If the `selection_shape` keyword is used, iteration will be bounded over a contiguous region + ranging from `[origin, origin + selection_shape]`, where the upper bound is exclusive as + per python indexing conventions. + Parameters ---------- origin : Sequence[int] | None, default=None @@ -2369,12 +2554,36 @@ def _iter_chunk_keys( Yields ------ - key: str - The storage key of each chunk in the selection. + chunk_coords: ChunkCoords + The coordinates of each chunk in the selection. """ - yield from self._async_array._iter_chunk_keys( - origin=origin, selection_shape=selection_shape - ) + return self._async_array._iter_chunk_coords(origin=origin, selection_shape=selection_shape) + + def _iter_shard_coords( + self, *, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None + ) -> Iterator[tuple[int, ...]]: + """ + Create an iterator over the coordinates of shards in shard grid space. + + If the `origin` keyword is used, iteration will start at the shard index specified by `origin`. + The default behavior is to start at the origin of the grid coordinate space. 
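# Sketch of what the shard-key iterator above yields, assuming the default Zarr v3
# chunk key encoding ("c" prefix with "/" separators); the exact key strings below
# are an assumption, not taken from this patch. _iter_shard_keys is private API.
import zarr

arr = zarr.create_array({}, shape=(4, 4), chunks=(1, 2), shards=(2, 2), dtype="uint8")
print(list(arr._iter_shard_keys()))
# expected under the default key encoding: ['c/0/0', 'c/0/1', 'c/1/0', 'c/1/1']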
+ If the `selection_shape` keyword is used, iteration will be bounded over a contiguous region + ranging from `[origin, origin selection_shape]`, where the upper bound is exclusive as + per python indexing conventions. + + Parameters + ---------- + origin : Sequence[int] | None, default=None + The origin of the selection relative to the array's shard grid. + selection_shape : Sequence[int] | None, default=None + The shape of the selection in shard grid coordinates. + + Yields + ------ + chunk_coords: tuple[int, ...] + The coordinates of each shard in the selection. + """ + return self._async_array._iter_shard_coords(origin=origin, selection_shape=selection_shape) def _iter_chunk_regions( self, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None @@ -2394,9 +2603,27 @@ def _iter_chunk_regions( region: tuple[slice, ...] A tuple of slice objects representing the region spanned by each chunk in the selection. """ - yield from self._async_array._iter_chunk_regions( - origin=origin, selection_shape=selection_shape - ) + return self._async_array._iter_chunk_regions(origin=origin, selection_shape=selection_shape) + + def _iter_shard_regions( + self, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None + ) -> Iterator[tuple[slice, ...]]: + """ + Iterate over the regions spanned by each shard. + + Parameters + ---------- + origin : Sequence[int] | None, default=None + The origin of the selection relative to the array's chunk grid. + selection_shape : Sequence[int] | None, default=None + The shape of the selection in chunk grid coordinates. + + Yields + ------ + region: tuple[slice, ...] + A tuple of slice objects representing the region spanned by each chunk in the selection. + """ + return self._async_array._iter_shard_regions(origin=origin, selection_shape=selection_shape) def __array__( self, dtype: npt.DTypeLike | None = None, copy: bool | None = None @@ -3833,7 +4060,7 @@ def info_complete(self) -> Any: return sync(self._async_array.info_complete()) -async def chunks_initialized( +async def _shards_initialized( array: AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata], ) -> tuple[str, ...]: """ @@ -3861,7 +4088,7 @@ async def chunks_initialized( _relativize_path(path=key, prefix=array.store_path.path) for key in store_contents ] return tuple( - chunk_key for chunk_key in array._iter_chunk_keys() if chunk_key in store_contents_relative + chunk_key for chunk_key in array._iter_shard_keys() if chunk_key in store_contents_relative ) @@ -4171,7 +4398,7 @@ async def _copy_array_region(chunk_coords: ChunkCoords | slice, _data: Array) -> # Stream data from the source array to the new array await concurrent_map( - [(region, data) for region in result._iter_chunk_regions()], + [(region, data) for region in result._iter_shard_regions()], _copy_array_region, zarr.core.config.config.get("async.concurrency"), ) @@ -4990,3 +5217,160 @@ def _parse_data_params( raise ValueError(msg) dtype_out = data.dtype return data, shape_out, dtype_out + + +def _iter_chunk_coords( + array: Array | AsyncArray[Any], + *, + origin: Sequence[int] | None = None, + selection_shape: Sequence[int] | None = None, +) -> Iterator[ChunkCoords]: + """ + Create an iterator over the coordinates of chunks in chunk grid space. If the `origin` + keyword is used, iteration will start at the chunk index specified by `origin`. + The default behavior is to start at the origin of the grid coordinate space. 
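# Minimal runnable sketch (not the library code) of lexicographic grid iteration
# with an origin and a selection shape, matching the semantics documented here:
# the upper bound is exclusive, and the selection defaults to "origin to grid edge".
import itertools

def iter_grid_sketch(grid_shape, origin=None, selection_shape=None):
    origin = origin or (0,) * len(grid_shape)
    if selection_shape is None:
        selection_shape = tuple(g - o for g, o in zip(grid_shape, origin))
    ranges = (range(o, o + s) for o, s in zip(origin, selection_shape))
    return itertools.product(*ranges)

print(tuple(iter_grid_sketch((2, 3))))                 # ((0, 0), (0, 1), ..., (1, 2))
print(tuple(iter_grid_sketch((2, 3), origin=(1, 1))))  # ((1, 1), (1, 2))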
+ If the `selection_shape` keyword is used, iteration will be bounded over a contiguous region + ranging from `[origin, origin selection_shape]`, where the upper bound is exclusive as + per python indexing conventions. + + Parameters + ---------- + array : Array | AsyncArray + The array to iterate over. + origin : Sequence[int] | None, default=None + The origin of the selection in grid coordinates. + selection_shape : Sequence[int] | None, default=None + The shape of the selection in grid coordinates. + + Yields + ------ + chunk_coords: ChunkCoords + The coordinates of each chunk in the selection. + """ + return _iter_grid(array._chunk_grid_shape, origin=origin, selection_shape=selection_shape) + + +def _iter_shard_coords( + array: Array | AsyncArray[Any], + *, + origin: Sequence[int] | None = None, + selection_shape: Sequence[int] | None = None, +) -> Iterator[ChunkCoords]: + """ + Create an iterator over the coordinates of shards in shard grid space. If the `origin` + keyword is used, iteration will start at the shard index specified by `origin`. + The default behavior is to start at the origin of the grid coordinate space. + If the `selection_shape` keyword is used, iteration will be bounded over a contiguous region + ranging from `[origin, origin selection_shape]`, where the upper bound is exclusive as + per python indexing conventions. + + Parameters + ---------- + array : Array | AsyncArray + The array to iterate over. + origin : Sequence[int] | None, default=None + The origin of the selection in grid coordinates. + selection_shape : Sequence[int] | None, default=None + The shape of the selection in grid coordinates. + + Yields + ------ + chunk_coords: ChunkCoords + The coordinates of each shard in the selection. + """ + return _iter_grid(array._shard_grid_shape, origin=origin, selection_shape=selection_shape) + + +def _iter_shard_keys( + array: Array | AsyncArray[Any], + *, + origin: Sequence[int] | None = None, + selection_shape: Sequence[int] | None = None, +) -> Iterator[str]: + """ + Iterate over the storage keys of each shard, relative to an optional origin, and optionally + limited to a contiguous region in shard grid coordinates. + + Parameters + ---------- + array : Array | AsyncArray + The array to iterate over. + origin : Sequence[int] | None, default=None + The origin of the selection in grid coordinates. + selection_shape : Sequence[int] | None, default=None + The shape of the selection in grid coordinates. + + Yields + ------ + key: str + The storage key of each chunk in the selection. + """ + # Iterate over the coordinates of chunks in chunk grid space. + _iter = _iter_grid(array._shard_grid_shape, origin=origin, selection_shape=selection_shape) + return (array.metadata.encode_chunk_key(k) for k in _iter) + + +def _iter_shard_regions( + array: Array | AsyncArray[Any], + *, + origin: Sequence[int] | None = None, + selection_shape: Sequence[int] | None = None, +) -> Iterator[tuple[slice, ...]]: + """ + Iterate over the regions spanned by each shard. + + These are the smallest regions of the array that are safe to write concurrently. + + Parameters + ---------- + array : Array | AsyncArray + The array to iterate over. + origin : Sequence[int] | None, default=None + The origin of the selection relative to the array's shard grid. + selection_shape : Sequence[int] | None, default=None + The shape of the selection in shard grid coordinates. + + Yields + ------ + region: tuple[slice, ...] 
+ A tuple of slice objects representing the region spanned by each shard in the selection. + """ + if array.shards is None: + shard_shape = array.chunks + else: + shard_shape = array.shards + + return _iter_regions( + array.shape, shard_shape, origin=origin, selection_shape=selection_shape, trim_excess=True + ) + + +def _iter_chunk_regions( + array: Array | AsyncArray[Any], + *, + origin: Sequence[int] | None = None, + selection_shape: Sequence[int] | None = None, +) -> Iterator[tuple[slice, ...]]: + """ + Iterate over the regions spanned by each shard. + + These are the smallest regions of the array that are efficient to read concurrently. + + Parameters + ---------- + array : Array | AsyncArray + The array to iterate over. + origin : Sequence[int] | None, default=None + The origin of the selection in grid coordinates. + selection_shape : Sequence[int] | None, default=None + The shape of the selection in grid coordinates. + + Yields + ------ + region: tuple[slice, ...] + A tuple of slice objects representing the region spanned by each shard in the selection. + """ + + return _iter_regions( + array.shape, array.chunks, origin=origin, selection_shape=selection_shape, trim_excess=True + ) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index 15cf6f0f1a..bc2bd3b9f2 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -124,7 +124,7 @@ def _iter_grid( Returns ------- - itertools.product object + Iterator[tuple[int, ...]] An iterator over tuples of integers Examples @@ -135,11 +135,11 @@ def _iter_grid( >>> tuple(iter_grid((2,3))) ((0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)) - >>> tuple(iter_grid((2,3)), origin=(1,1)) - ((1, 1), (1, 2), (1, 3), (2, 1), (2, 2), (2, 3)) + >>> tuple(iter_grid((2,3), origin=(1,1))) + ((1, 1), (1, 2)) - >>> tuple(iter_grid((2,3)), origin=(1,1), selection_shape=(2,2)) - ((1, 1), (1, 2), (1, 3), (2, 1)) + >>> tuple(iter_grid((2,3), origin=(0,0), selection_shape=(2,2))) + ((0, 0), (0, 1), (1, 0), (1, 1)) """ if origin is None: origin_parsed = (0,) * len(grid_shape) @@ -164,14 +164,74 @@ def _iter_grid( ): if o + ss > gs: raise IndexError( - f"Invalid selection shape ({selection_shape}) for origin ({origin}) and grid shape ({grid_shape}) at axis {idx}." + f"Invalid selection shape ({ss}) for origin ({o}) and grid shape ({gs}) at axis {idx}." ) dimensions += (range(o, o + ss),) - yield from itertools.product(*(dimensions)) + return itertools.product(*(dimensions)) else: - msg = f"Indexing order {order} is not supported at this time." # type: ignore[unreachable] - raise NotImplementedError(msg) + msg = f"Indexing order {order} is not supported at this time." # type: ignore[unreachable] # pragma: no cover + raise NotImplementedError(msg) # pragma: no cover + + +def _iter_regions( + domain_shape: Sequence[int], + region_shape: Sequence[int], + *, + origin: Sequence[int] | None = None, + selection_shape: Sequence[int] | None = None, + order: _ArrayIndexingOrder = "lexicographic", + trim_excess: bool = True, +) -> Iterator[tuple[slice, ...]]: + """ + Iterate over contiguous regions on a grid of integers, with the option to restrict the + domain of iteration to a contiguous subregion of that grid. + + Parameters + ---------- + domain_shape : Sequence[int] + The size of the domain to iterate over. + region_shape : Sequence[int] + The shape of the region to iterate over. + origin : Sequence[int] | None, default=None + The location, in grid coordinates, of the first region to return. 
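# Runnable sketch (not the library implementation) of the region iteration being
# documented here: one tuple of slices per grid cell, with the final region along
# each axis trimmed to the domain bound when trim_excess is True.
import itertools
import math

def iter_regions_sketch(domain_shape, region_shape, trim_excess=True):
    grid = [range(math.ceil(d / r)) for d, r in zip(domain_shape, region_shape)]
    for pos in itertools.product(*grid):
        yield tuple(
            slice(p * r, min(p * r + r, d) if trim_excess else p * r + r, 1)
            for p, r, d in zip(pos, region_shape, domain_shape)
        )

print(tuple(iter_regions_sketch((5,), (2,))))
# ((slice(0, 2, 1),), (slice(2, 4, 1),), (slice(4, 5, 1),))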
+ selection_shape : Sequence[int] | None, default=None + The shape of the selection, in grid coordinates. + order : Literal["lexicographic"], default="lexicographic" + The linear indexing order to use. + + Returns + ------- + + Iterator[tuple[slice, ...]] + An iterator over tuples of slices, where each slice spans a separate contiguous region + + Examples + -------- + >>> tuple(iter_regions((1,), (1,))) + ((slice(0, 1, 1),),) + + >>> tuple(iter_regions((2, 3), (1, 2))) + ((slice(0, 1, 1), slice(0, 2, 1)), (slice(1, 2, 1), slice(0, 2, 1))) + + >>> tuple(iter_regions((2,3), (1,2)), origin=(1,1)) + ((slice(1, 2, 1), slice(1, 3, 1)), (slice(2, 3, 1), slice(1, 3, 1))) + + >>> tuple(iter_regions((2,3), (1,2)), origin=(1,1), selection_shape=(2,2)) + ((slice(1, 2, 1), slice(1, 3, 1)), (slice(2, 3, 1), slice(1, 3, 1))) + """ + grid_shape = tuple(ceildiv(d, s) for d, s in zip(domain_shape, region_shape, strict=True)) + for grid_position in _iter_grid( + grid_shape=grid_shape, origin=origin, selection_shape=selection_shape, order=order + ): + out: list[slice] = [] + for g_pos, r_shape, d_shape in zip(grid_position, region_shape, domain_shape, strict=True): + start = g_pos * r_shape + stop = start + r_shape + if trim_excess: + stop = min(stop, d_shape) + out.append(slice(start, stop, 1)) + yield tuple(out) def is_integer(x: Any) -> TypeGuard[int]: diff --git a/tests/test_array.py b/tests/test_array.py index a316ee127f..2087013b1a 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -31,9 +31,14 @@ from zarr.core.array import ( CompressorsLike, FiltersLike, + _iter_chunk_coords, + _iter_chunk_regions, + _iter_shard_coords, + _iter_shard_keys, + _iter_shard_regions, _parse_chunk_encoding_v2, _parse_chunk_encoding_v3, - chunks_initialized, + _shards_initialized, create_array, default_filters_v2, default_serializer_v3, @@ -59,7 +64,7 @@ from zarr.core.dtype.npy.common import NUMPY_ENDIANNESS_STR, endianness_from_numpy_str from zarr.core.dtype.npy.string import UTF8Base from zarr.core.group import AsyncGroup -from zarr.core.indexing import BasicIndexer +from zarr.core.indexing import BasicIndexer, _iter_grid, _iter_regions from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync @@ -375,48 +380,74 @@ def test_nchunks(test_cls: type[Array] | type[AsyncArray[Any]], nchunks: int) -> @pytest.mark.parametrize("test_cls", [Array, AsyncArray[Any]]) -async def test_nchunks_initialized(test_cls: type[Array] | type[AsyncArray[Any]]) -> None: +@pytest.mark.parametrize( + ("shape", "shard_shape", "chunk_shape"), + [((10,), None, (1,)), ((10,), (1,), (1,)), ((40,), (20,), (5,))], +) +async def test_nchunks_initialized( + test_cls: type[Array] | type[AsyncArray[Any]], + shape: tuple[int, ...], + shard_shape: tuple[int, ...] | None, + chunk_shape: tuple[int, ...], +) -> None: """ - Test that nchunks_initialized accurately returns the number of stored chunks. + Test that nchunks_initialized accurately returns the number of stored partitions. 
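# Worked numbers for the third parametrization above, (shape, shards, chunks) =
# ((40,), (20,), (5,)): writing one shard region advances the shard count by one
# and the chunk count by chunks_per_shard. Plain arithmetic, independent of the
# test harness.
chunks_per_shard = 20 // 5   # 4 chunks stored inside each shard object
nshards = 40 // 20           # 2 stored objects in total
nchunks = 40 // 5            # 8 addressable chunks in total
assert nchunks == nshards * chunks_per_shard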
""" store = MemoryStore() - arr = zarr.create_array(store, shape=(100,), chunks=(10,), dtype="i4") + if shard_shape is None: + chunks_per_shard = 1 + else: + chunks_per_shard = np.prod(np.array(shard_shape) // np.array(chunk_shape)) + + arr = zarr.create_array(store, shape=shape, shards=shard_shape, chunks=chunk_shape, dtype="i1") # write chunks one at a time - for idx, region in enumerate(arr._iter_chunk_regions()): + for idx, region in enumerate(arr._iter_shard_regions()): arr[region] = 1 expected = idx + 1 if test_cls == Array: - observed = arr.nchunks_initialized + observed = arr.nshards_initialized + assert observed == arr.nchunks_initialized // chunks_per_shard else: - observed = await arr._async_array.nchunks_initialized() + observed = await arr._async_array._nshards_initialized() + assert observed == await arr._async_array.nchunks_initialized() // chunks_per_shard assert observed == expected # delete chunks - for idx, key in enumerate(arr._iter_chunk_keys()): + for idx, key in enumerate(arr._iter_shard_keys()): sync(arr.store_path.store.delete(key)) if test_cls == Array: - observed = arr.nchunks_initialized + observed = arr.nshards_initialized + assert observed == arr.nchunks_initialized // chunks_per_shard else: - observed = await arr._async_array.nchunks_initialized() - expected = arr.nchunks - idx - 1 + observed = await arr._async_array._nshards_initialized() + assert observed == await arr._async_array.nchunks_initialized() // chunks_per_shard + expected = arr._nshards - idx - 1 assert observed == expected @pytest.mark.parametrize("path", ["", "foo"]) -async def test_chunks_initialized(path: str) -> None: +@pytest.mark.parametrize( + ("shape", "shard_shape", "chunk_shape"), + [((10,), None, (1,)), ((10,), (1,), (1,)), ((40,), (20,), (5,))], +) +async def test_chunks_initialized( + path: str, shape: tuple[int, ...], shard_shape: tuple[int, ...], chunk_shape: tuple[int, ...] +) -> None: """ Test that chunks_initialized accurately returns the keys of stored chunks. """ store = MemoryStore() - arr = zarr.create_array(store, name=path, shape=(100,), chunks=(10,), dtype="i4") + arr = zarr.create_array( + store, name=path, shape=shape, shards=shard_shape, chunks=chunk_shape, dtype="i1" + ) chunks_accumulated = tuple( - accumulate(tuple(tuple(v.split(" ")) for v in arr._iter_chunk_keys())) + accumulate(tuple(tuple(v.split(" ")) for v in arr._iter_shard_keys())) ) - for keys, region in zip(chunks_accumulated, arr._iter_chunk_regions(), strict=False): + for keys, region in zip(chunks_accumulated, arr._iter_shard_regions(), strict=False): arr[region] = 1 - observed = sorted(await chunks_initialized(arr._async_array)) + observed = sorted(await _shards_initialized(arr._async_array)) expected = sorted(keys) assert observed == expected @@ -861,14 +892,14 @@ def test_write_empty_chunks_behavior( # initialize the store with some non-fill value chunks arr[:] = fill_value + 1 - assert arr.nchunks_initialized == arr.nchunks + assert arr.nshards_initialized == arr._nshards arr[:] = fill_value if not write_empty_chunks: - assert arr.nchunks_initialized == 0 + assert arr.nshards_initialized == 0 else: - assert arr.nchunks_initialized == arr.nchunks + assert arr.nshards_initialized == arr._nshards @pytest.mark.parametrize("store", ["memory"], indirect=True) @@ -1871,3 +1902,220 @@ def test_unknown_object_codec_default_filters_v2() -> None: msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}." 
with pytest.raises(ValueError, match=re.escape(msg)): default_filters_v2(dtype) + + +@pytest.mark.parametrize( + ("array_shape", "shard_shape", "chunk_shape"), + [ + ((10,), None, (1,)), + ((10,), (1,), (1,)), + ((30, 10), None, (2, 5)), + ((30, 10), (4, 10), (2, 5)), + ], +) +def test_chunk_grid_shape( + array_shape: tuple[int, ...], + shard_shape: tuple[int, ...] | None, + chunk_shape: tuple[int, ...], + zarr_format: ZarrFormat, +) -> None: + """ + Test that the shape of the chunk grid and the shard grid are correctly indicated + """ + if zarr_format == 2 and shard_shape is not None: + with pytest.raises( + ValueError, + match="Zarr format 2 arrays can only be created with `shard_shape` set to `None`.", + ): + arr = zarr.create_array( + {}, + dtype="uint8", + shape=array_shape, + chunks=chunk_shape, + shards=shard_shape, + zarr_format=zarr_format, + ) + pytest.skip("Zarr format 2 arrays can only be created with `shard_shape` set to `None`.") + else: + arr = zarr.create_array( + {}, + dtype="uint8", + shape=array_shape, + chunks=chunk_shape, + shards=shard_shape, + zarr_format=zarr_format, + ) + + chunk_grid_shape = tuple(ceildiv(a, b) for a, b in zip(array_shape, chunk_shape, strict=True)) + if shard_shape is None: + _shard_shape = chunk_shape + else: + _shard_shape = shard_shape + shard_grid_shape = tuple(ceildiv(a, b) for a, b in zip(array_shape, _shard_shape, strict=True)) + assert arr._chunk_grid_shape == chunk_grid_shape + assert arr.cdata_shape == chunk_grid_shape + assert arr._async_array.cdata_shape == chunk_grid_shape + assert arr._shard_grid_shape == shard_grid_shape + assert arr._nshards == np.prod(shard_grid_shape) + + +@pytest.mark.parametrize( + ("array_shape", "shard_shape", "chunk_shape"), [((10,), None, (1,)), ((30, 10), None, (2, 5))] +) +def test_iter_chunk_coords( + array_shape: tuple[int, ...], + shard_shape: tuple[int, ...] | None, + chunk_shape: tuple[int, ...], + zarr_format: ZarrFormat, +) -> None: + """ + Test that we can use the various invocations of iter_chunk_coords to iterate over the coordinates + of the origin of each chunk. + """ + + arr = zarr.create_array( + {}, + dtype="uint8", + shape=array_shape, + chunks=chunk_shape, + shards=shard_shape, + zarr_format=zarr_format, + ) + expected = tuple(_iter_grid(arr._shard_grid_shape)) + observed = tuple(_iter_chunk_coords(arr)) + assert observed == expected + assert observed == tuple(arr._iter_chunk_coords()) + assert observed == tuple(arr._async_array._iter_chunk_coords()) + + +@pytest.mark.parametrize( + ("array_shape", "shard_shape", "chunk_shape"), + [((10,), (1,), (1,)), ((10,), None, (1,)), ((30, 10), (10, 5), (2, 5))], +) +def test_iter_shard_coords( + array_shape: tuple[int, ...], + shard_shape: tuple[int, ...] | None, + chunk_shape: tuple[int, ...], + zarr_format: ZarrFormat, +) -> None: + """ + Test that we can use the various invocations of iter_shard_coords to iterate over the coordinates + of the origin of each shard. 
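# Worked grid-shape numbers for the last test_chunk_grid_shape parametrization
# above, (array, shard, chunk) = ((30, 10), (4, 10), (2, 5)), written out by hand.
from math import ceil

assert tuple(ceil(a / c) for a, c in zip((30, 10), (2, 5))) == (15, 2)   # chunk grid
assert tuple(ceil(a / s) for a, s in zip((30, 10), (4, 10))) == (8, 1)   # shard grid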
+ """ + + if zarr_format == 2 and shard_shape is not None: + pytest.skip("Zarr format 2 does not support shard shape.") + + arr = zarr.create_array( + {}, + dtype="uint8", + shape=array_shape, + chunks=chunk_shape, + shards=shard_shape, + zarr_format=zarr_format, + ) + expected = tuple(_iter_grid(arr._shard_grid_shape)) + observed = tuple(_iter_shard_coords(arr)) + assert observed == expected + assert observed == tuple(arr._iter_shard_coords()) + assert observed == tuple(arr._async_array._iter_shard_coords()) + + +@pytest.mark.parametrize( + ("array_shape", "shard_shape", "chunk_shape"), + [((10,), (1,), (1,)), ((10,), None, (1,)), ((30, 10), (10, 5), (2, 5))], +) +def test_iter_shard_keys( + array_shape: tuple[int, ...], + shard_shape: tuple[int, ...] | None, + chunk_shape: tuple[int, ...], + zarr_format: ZarrFormat, +) -> None: + """ + Test that we can use the various invocations of iter_shard_keys to iterate over the stored + keys of the shards of an array. + """ + + if zarr_format == 2 and shard_shape is not None: + pytest.skip("Zarr format 2 does not support shard shape.") + + arr = zarr.create_array( + {}, + dtype="uint8", + shape=array_shape, + chunks=chunk_shape, + shards=shard_shape, + zarr_format=zarr_format, + ) + expected = tuple( + arr.metadata.encode_chunk_key(key) for key in _iter_grid(arr._shard_grid_shape) + ) + observed = tuple(_iter_shard_keys(arr)) + assert observed == expected + assert observed == tuple(arr._iter_shard_keys()) + assert observed == tuple(arr._async_array._iter_shard_keys()) + + +@pytest.mark.parametrize( + ("array_shape", "shard_shape", "chunk_shape"), + [((10,), None, (1,)), ((10,), (1,), (1,)), ((30, 10), (10, 5), (2, 5))], +) +def test_iter_shard_regions( + array_shape: tuple[int, ...], + shard_shape: tuple[int, ...] | None, + chunk_shape: tuple[int, ...], + zarr_format: ZarrFormat, +) -> None: + """ + Test that we can use the various invocations of iter_shard_regions to iterate over the regions + spanned by the shards of an array. + """ + if zarr_format == 2 and shard_shape is not None: + pytest.skip("Zarr format 2 does not support shard shape.") + + arr = zarr.create_array( + {}, + dtype="uint8", + shape=array_shape, + chunks=chunk_shape, + shards=shard_shape, + zarr_format=zarr_format, + ) + if shard_shape is None: + _shard_shape = chunk_shape + else: + _shard_shape = shard_shape + expected = tuple(_iter_regions(arr.shape, _shard_shape)) + observed = tuple(_iter_shard_regions(arr)) + assert observed == expected + assert observed == tuple(arr._iter_shard_regions()) + assert observed == tuple(arr._async_array._iter_shard_regions()) + + +@pytest.mark.parametrize( + ("array_shape", "shard_shape", "chunk_shape"), [((10,), None, (1,)), ((30, 10), None, (2, 5))] +) +def test_iter_chunk_regions( + array_shape: tuple[int, ...], + shard_shape: tuple[int, ...] | None, + chunk_shape: tuple[int, ...], + zarr_format: ZarrFormat, +) -> None: + """ + Test that we can use the various invocations of iter_chunk_regions to iterate over the regions + spanned by the chunks of an array. 
+ """ + arr = zarr.create_array( + {}, + dtype="uint8", + shape=array_shape, + chunks=chunk_shape, + shards=shard_shape, + zarr_format=zarr_format, + ) + + expected = tuple(_iter_regions(arr.shape, chunk_shape)) + observed = tuple(_iter_chunk_regions(arr)) + assert observed == expected + assert observed == tuple(arr._iter_chunk_regions()) + assert observed == tuple(arr._async_array._iter_chunk_regions()) diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 24b4b65505..56f17ad46b 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -18,7 +18,10 @@ CoordinateSelection, OrthogonalSelection, Selection, + _ArrayIndexingOrder, _iter_grid, + _iter_regions, + ceildiv, make_slice_selection, normalize_integer_selection, oindex, @@ -1996,6 +1999,71 @@ def test_iter_chunk_regions(): assert_array_equal(a[region], np.zeros_like(a[region])) +@pytest.mark.parametrize( + ("domain_shape", "region_shape", "origin", "selection_shape"), + [ + ((9,), (1,), None, (9,)), + ((9,), (1,), (0,), (9,)), + ((3,), (2,), (0,), (1,)), + ((9,), (2,), (2,), (2,)), + ((9, 9), (2, 1), None, None), + ((9, 9), (4, 1), None, None), + ], +) +@pytest.mark.parametrize("order", ["lexicographic"]) +@pytest.mark.parametrize("trim_excess", [True, False]) +def test_iter_regions( + domain_shape: tuple[int, ...], + region_shape: tuple[int, ...], + origin: tuple[int, ...] | None, + selection_shape: tuple[int, ...] | None, + order: _ArrayIndexingOrder, + trim_excess: bool, +) -> None: + """ + Test that iter_regions properly iterates over contiguous regions of a gridded domain. + """ + expected_slices_by_dim: list[list[slice]] = [] + origin_parsed: tuple[int, ...] + selection_shape_parsed: tuple[int, ...] + if origin is None: + origin_parsed = (0,) * len(domain_shape) + else: + origin_parsed = origin + if selection_shape is None: + selection_shape_parsed = tuple( + ceildiv(ds, rs) - o + for ds, o, rs in zip(domain_shape, origin_parsed, region_shape, strict=True) + ) + else: + selection_shape_parsed = selection_shape + for d_s, r_s, o, ss in zip( + domain_shape, region_shape, origin_parsed, selection_shape_parsed, strict=True + ): + _expected_slices: list[slice] = [] + start = o * r_s + for incr in range(start, start + ss * r_s, r_s): + if trim_excess: + term = min(incr + r_s, d_s) + else: + term = incr + r_s + _expected_slices.append(slice(incr, term, 1)) + expected_slices_by_dim.append(_expected_slices) + + expected = tuple(itertools.product(*expected_slices_by_dim)) + observed = tuple( + _iter_regions( + domain_shape, + region_shape, + origin=origin, + selection_shape=selection_shape, + order=order, + trim_excess=trim_excess, + ) + ) + assert observed == expected + + class TestAsync: @pytest.mark.parametrize( ("indexer", "expected"),