From 86e7368d320e4512554a554d123c7cd3f5056eda Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Thu, 11 Jul 2024 14:04:56 -0400 Subject: [PATCH 01/10] basic test --- .../tests/test_manifests/test_array.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index befc86510..86736b814 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -366,3 +366,26 @@ def test_refuse_combine(array_v3_metadata): for func in [np.concatenate, np.stack]: with pytest.raises(ValueError, match="inconsistent dtypes"): func([marr1, marr2], axis=0) + + +class TestIndexing: + def test_slice_aligned_with_chunks(self): + marr = create_manifestarray(shape=(4,), chunks=(2,)) + marr[0:2] + marr[2:4] + marr[0:4] + + with pytest.raises( + NotImplementedError, match="slice would split individual chunks" + ): + marr[0] + + with pytest.raises( + NotImplementedError, match="slice would split individual chunks" + ): + marr[0:1] + + with pytest.raises( + NotImplementedError, match="slice would split individual chunks" + ): + marr[0:3] From 3ccb07c0fe1fa88b597ef9b5efeb7bacc3d9c38f Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Thu, 11 Jul 2024 14:05:10 -0400 Subject: [PATCH 02/10] implementation --- virtualizarr/manifests/array.py | 133 ++++++++++++++++++++++++++------ 1 file changed, 110 insertions(+), 23 deletions(-) diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 5838e4d58..c8d6bd493 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -1,4 +1,5 @@ import warnings +from types import EllipsisType, NoneType from typing import Any, Callable, Union import numpy as np @@ -205,36 +206,85 @@ def astype(self, dtype: np.dtype, /, *, copy: bool = True) -> "ManifestArray": def __getitem__( self, - key, + key: Union[ + int, + slice, + EllipsisType, + None, + tuple[Union[int, slice, EllipsisType, None], ...], + np.ndarray, + ], /, ) -> "ManifestArray": """ - Only supports extremely limited indexing. + Only supports indexing where slices are aligned with chunk boundaries. - Only here because xarray will apparently attempt to index into its lazy indexing classes even if the operation would be a no-op anyway. + Follows the array API standard otherwise. """ - from xarray.core.indexing import BasicIndexer + indexer = key - if isinstance(key, BasicIndexer): - indexer = key.tuple + indexer_nd: tuple[Union[int, slice, EllipsisType, None, np.ndarray], ...] + if isinstance(indexer, (int, slice, EllipsisType, NoneType, np.ndarray)): + indexer_nd = (indexer,) else: - indexer = key + indexer_nd = indexer - indexer = _possibly_expand_trailing_ellipsis(key, self.ndim) + # _validate_indexer(indexer) - if len(indexer) != self.ndim: + indexer_nd = _possibly_expand_trailing_ellipses(indexer_nd, self.ndim) + + if len(indexer_nd) != self.ndim: raise ValueError( - f"Invalid indexer for array with ndim={self.ndim}: {indexer}" + f"Invalid indexer for array with ndim={self.ndim}: {indexer_nd}" ) - if all( - isinstance(axis_indexer, slice) and axis_indexer == slice(None) - for axis_indexer in indexer + chunk_slices = [] + new_arr_shape = [] + for axis_num, (indexer_1d, arr_length, chunk_length) in enumerate( + zip(indexer_nd, self.shape, self.chunks) ): - # indexer is all slice(None)'s, so this is a no-op - return self - else: - raise NotImplementedError(f"Doesn't support slicing with {indexer}") + if isinstance(indexer_1d, int): + array_slice_1d = slice(indexer_1d, indexer_1d + 1, 1) + elif isinstance(indexer_1d, NoneType): + array_slice_1d = slice(0, arr_length, 1) + elif isinstance(indexer_1d, slice): + array_slice_1d = slice( + indexer_1d.start if indexer_1d.start is not None else 0, + indexer_1d.stop if indexer_1d.stop is not None else arr_length, + indexer_1d.step if indexer_1d.step is not None else 1, + ) + else: + # TODO we could attempt to also support indexing with numpy arrays + raise TypeError( + f"Can only perform indexing with keys of type (int, slice, EllipsisType, NoneType), but got type {type(indexer_1d)} for axis {axis_num}" + ) + + chunk_slice_1d = _array_slice_to_chunk_slice( + array_slice_1d, arr_length, chunk_length + ) + chunk_slices.append(chunk_slice_1d) + + n_elements_in_slice = abs( + (array_slice_1d.start - array_slice_1d.stop) / array_slice_1d.step + ) + new_arr_shape.append(n_elements_in_slice) + + print(chunk_slices) + + # do slicing of entries in manifest + sliced_paths = self.manifest._paths[tuple(chunk_slices)] + sliced_offsets = self.manifest._offsets[tuple(chunk_slices)] + sliced_lengths = self.manifest._lengths[tuple(chunk_slices)] + sliced_manifest = ChunkManifest.from_arrays( + paths=sliced_paths, + offsets=sliced_offsets, + lengths=sliced_lengths, + ) + + # chunk sizes are unchanged by slicing that aligns with chunks + new_zarray = self.zarray.replace(shape=tuple(new_arr_shape)) + + return ManifestArray(chunkmanifest=sliced_manifest, zarray=new_zarray) def rename_paths( self, @@ -277,10 +327,47 @@ def rename_paths( return ManifestArray(metadata=self.metadata, chunkmanifest=renamed_manifest) -def _possibly_expand_trailing_ellipsis(key, ndim: int): - if key[-1] == ...: - extra_slices_needed = ndim - (len(key) - 1) - *indexer, ellipsis = key - return tuple(tuple(indexer) + (slice(None),) * extra_slices_needed) +def _array_slice_to_chunk_slice( + array_slice: slice, + arr_length: int, + chunk_length: int, +) -> slice: + """ + Translate a slice into an array into a corresponding slice into the underlying chunk grid. + + Will raise on any array slices that require slicing within individual chunks. + """ + + if chunk_length == 1: + # alot of indexing is possible only in this case, because this is basically just a normal array along that axis + chunk_slice = array_slice + return chunk_slice + + # Check that start of slice aligns with start of a chunk + if array_slice.start % chunk_length != 0: + raise NotImplementedError( + f"Cannot index ManifestArray axis of length {arr_length} and chunk length {chunk_length} with {array_slice} as slice would split individual chunks" + ) + + # Check that slice spans integer number of chunks + slice_length = array_slice.stop - array_slice.start + if slice_length % chunk_length != 0: + raise NotImplementedError( + f"Cannot index ManifestArray axis of length {arr_length} and chunk length {chunk_length} with {array_slice} as slice would split individual chunks" + ) + + index_of_first_chunk = int(array_slice.start / chunk_length) + n_chunks = int(slice_length / chunk_length) + + chunk_slice = slice(index_of_first_chunk, index_of_first_chunk + n_chunks, 1) + + return chunk_slice + + +def _possibly_expand_trailing_ellipses(indexer, ndim: int): + if indexer[-1] == ...: + extra_slices_needed = ndim - (len(indexer) - 1) + *indexer_1d, ellipsis = indexer + return tuple(tuple(indexer_1d) + (slice(None),) * extra_slices_needed) else: - return key + return indexer From ec7f2366e1793389394168d6ec77d72cafd58b1b Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 21 Mar 2025 22:57:37 -0400 Subject: [PATCH 03/10] move into dedicated indexing.py file --- pyproject.toml | 2 +- virtualizarr/manifests/array.py | 100 +++--------------- virtualizarr/manifests/indexing.py | 89 ++++++++++++++++ .../tests/test_manifests/test_array.py | 19 +++- 4 files changed, 118 insertions(+), 92 deletions(-) create mode 100644 virtualizarr/manifests/indexing.py diff --git a/pyproject.toml b/pyproject.toml index 0bb5e6f4f..11ebdc4af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ dependencies = [ "ujson", "packaging", "zarr>=3.0.2", + "icechunk>=0.2.5", ] # Dependency sets under optional-dependencies are available via PyPI @@ -95,7 +96,6 @@ upstream = [ 'fsspec @ git+https://github.com/fsspec/filesystem_spec', 's3fs @ git+https://github.com/fsspec/s3fs', 'kerchunk @ git+https://github.com/fsspec/kerchunk', - 'icechunk @ git+https://github.com/earth-mover/icechunk#subdirectory=icechunk-python', ] docs = [ "sphinx", diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index c8d6bd493..aaaf0b2c4 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -10,6 +10,8 @@ _isnan, ) from virtualizarr.manifests.manifest import ChunkManifest +import virtualizarr.manifests.utils as utils +import virtualizarr.manifests.indexing as indexing class ManifestArray: @@ -217,7 +219,9 @@ def __getitem__( /, ) -> "ManifestArray": """ - Only supports indexing where slices are aligned with chunk boundaries. + Slice this ManifestArray by indexing in array element space (as opposed to in chunk grid space). + + Only supports indexing where slices are aligned exactly with chunk boundaries. Follows the array API standard otherwise. """ @@ -231,60 +235,30 @@ def __getitem__( # _validate_indexer(indexer) - indexer_nd = _possibly_expand_trailing_ellipses(indexer_nd, self.ndim) + indexer_nd = indexing.possibly_expand_trailing_ellipses(indexer_nd, self.ndim) if len(indexer_nd) != self.ndim: raise ValueError( f"Invalid indexer for array with ndim={self.ndim}: {indexer_nd}" ) - chunk_slices = [] - new_arr_shape = [] - for axis_num, (indexer_1d, arr_length, chunk_length) in enumerate( - zip(indexer_nd, self.shape, self.chunks) - ): - if isinstance(indexer_1d, int): - array_slice_1d = slice(indexer_1d, indexer_1d + 1, 1) - elif isinstance(indexer_1d, NoneType): - array_slice_1d = slice(0, arr_length, 1) - elif isinstance(indexer_1d, slice): - array_slice_1d = slice( - indexer_1d.start if indexer_1d.start is not None else 0, - indexer_1d.stop if indexer_1d.stop is not None else arr_length, - indexer_1d.step if indexer_1d.step is not None else 1, - ) - else: - # TODO we could attempt to also support indexing with numpy arrays - raise TypeError( - f"Can only perform indexing with keys of type (int, slice, EllipsisType, NoneType), but got type {type(indexer_1d)} for axis {axis_num}" - ) - - chunk_slice_1d = _array_slice_to_chunk_slice( - array_slice_1d, arr_length, chunk_length - ) - chunk_slices.append(chunk_slice_1d) - - n_elements_in_slice = abs( - (array_slice_1d.start - array_slice_1d.stop) / array_slice_1d.step - ) - new_arr_shape.append(n_elements_in_slice) - - print(chunk_slices) + chunk_indexer, new_arr_shape = indexing.array_indexer_to_chunk_grid_indexer() # do slicing of entries in manifest - sliced_paths = self.manifest._paths[tuple(chunk_slices)] - sliced_offsets = self.manifest._offsets[tuple(chunk_slices)] - sliced_lengths = self.manifest._lengths[tuple(chunk_slices)] + # TODO add ChunkManifest.__getitem__ for this + sliced_paths = self.manifest._paths[chunk_indexer] + sliced_offsets = self.manifest._offsets[chunk_indexer] + sliced_lengths = self.manifest._lengths[chunk_indexer] sliced_manifest = ChunkManifest.from_arrays( paths=sliced_paths, offsets=sliced_offsets, lengths=sliced_lengths, ) - # chunk sizes are unchanged by slicing that aligns with chunks - new_zarray = self.zarray.replace(shape=tuple(new_arr_shape)) + # chunk sizes are unchanged by slicing that aligns with chunk boundaries + new_metadata = utils.copy_and_replace_metadata(self.metadata, new_shape=new_arr_shape) - return ManifestArray(chunkmanifest=sliced_manifest, zarray=new_zarray) + return ManifestArray(chunkmanifest=sliced_manifest, metadata=new_metadata) def rename_paths( self, @@ -325,49 +299,3 @@ def rename_paths( """ renamed_manifest = self.manifest.rename_paths(new) return ManifestArray(metadata=self.metadata, chunkmanifest=renamed_manifest) - - -def _array_slice_to_chunk_slice( - array_slice: slice, - arr_length: int, - chunk_length: int, -) -> slice: - """ - Translate a slice into an array into a corresponding slice into the underlying chunk grid. - - Will raise on any array slices that require slicing within individual chunks. - """ - - if chunk_length == 1: - # alot of indexing is possible only in this case, because this is basically just a normal array along that axis - chunk_slice = array_slice - return chunk_slice - - # Check that start of slice aligns with start of a chunk - if array_slice.start % chunk_length != 0: - raise NotImplementedError( - f"Cannot index ManifestArray axis of length {arr_length} and chunk length {chunk_length} with {array_slice} as slice would split individual chunks" - ) - - # Check that slice spans integer number of chunks - slice_length = array_slice.stop - array_slice.start - if slice_length % chunk_length != 0: - raise NotImplementedError( - f"Cannot index ManifestArray axis of length {arr_length} and chunk length {chunk_length} with {array_slice} as slice would split individual chunks" - ) - - index_of_first_chunk = int(array_slice.start / chunk_length) - n_chunks = int(slice_length / chunk_length) - - chunk_slice = slice(index_of_first_chunk, index_of_first_chunk + n_chunks, 1) - - return chunk_slice - - -def _possibly_expand_trailing_ellipses(indexer, ndim: int): - if indexer[-1] == ...: - extra_slices_needed = ndim - (len(indexer) - 1) - *indexer_1d, ellipsis = indexer - return tuple(tuple(indexer_1d) + (slice(None),) * extra_slices_needed) - else: - return indexer diff --git a/virtualizarr/manifests/indexing.py b/virtualizarr/manifests/indexing.py new file mode 100644 index 000000000..92603bf41 --- /dev/null +++ b/virtualizarr/manifests/indexing.py @@ -0,0 +1,89 @@ +from types import NoneType, EllipsisType +from typing import Union + + +def array_slice_to_chunk_slice( + array_slice: slice, + arr_length: int, + chunk_length: int, +) -> slice: + """ + Translate a slice into an array into a corresponding slice into the underlying chunk grid. + + Will raise on any array slices that require slicing within individual chunks. + """ + + if chunk_length == 1: + # alot of indexing is possible only in this case, because this is basically just a normal array along that axis + chunk_slice = array_slice + return chunk_slice + + # Check that start of slice aligns with start of a chunk + if array_slice.start % chunk_length != 0: + raise NotImplementedError( + f"Cannot index ManifestArray axis of length {arr_length} and chunk length {chunk_length} with {array_slice} as slice would split individual chunks" + ) + + # Check that slice spans integer number of chunks + slice_length = array_slice.stop - array_slice.start + if slice_length % chunk_length != 0: + raise NotImplementedError( + f"Cannot index ManifestArray axis of length {arr_length} and chunk length {chunk_length} with {array_slice} as slice would split individual chunks" + ) + + index_of_first_chunk = int(array_slice.start / chunk_length) + n_chunks = int(slice_length / chunk_length) + + chunk_slice = slice(index_of_first_chunk, index_of_first_chunk + n_chunks, 1) + + return chunk_slice + + +def possibly_expand_trailing_ellipses(indexer, ndim: int) -> tuple[Union[int, slice, EllipsisType, None], ...]: + if indexer[-1] == ...: + extra_slices_needed = ndim - (len(indexer) - 1) + *indexer_1d, ellipsis = indexer + return tuple(tuple(indexer_1d) + (slice(None),) * extra_slices_needed) + else: + return indexer + + +def array_indexer_to_chunk_grid_indexer( + indexer_nd, + shape: tuple[int, ...], + chunks: tuple[int, ...], +) -> tuple[tuple[slice, ...], tuple[int, ...]]: + """Convert an indexer in array element space into the corresponding indexer in chunk grid space.""" + + chunk_slices = [] + new_arr_shape = [] + for axis_num, (indexer_1d, arr_length, chunk_length) in enumerate( + zip(indexer_nd, shape, chunks) + ): + if isinstance(indexer_1d, int): + array_slice_1d = slice(indexer_1d, indexer_1d + 1, 1) + elif isinstance(indexer_1d, NoneType): + array_slice_1d = slice(0, arr_length, 1) + elif isinstance(indexer_1d, slice): + array_slice_1d = slice( + indexer_1d.start if indexer_1d.start is not None else 0, + indexer_1d.stop if indexer_1d.stop is not None else arr_length, + indexer_1d.step if indexer_1d.step is not None else 1, + ) + else: + # TODO we could attempt to also support indexing with numpy arrays + raise TypeError( + f"Can only perform indexing with keys of type (int, slice, EllipsisType, NoneType), but got type {type(indexer_1d)} for axis {axis_num}" + ) + + chunk_slice_1d = array_slice_to_chunk_slice( + array_slice_1d, arr_length, chunk_length + ) + chunk_slices.append(chunk_slice_1d) + + n_elements_in_slice = abs( + (array_slice_1d.start - array_slice_1d.stop) / array_slice_1d.step + ) + new_arr_shape.append(n_elements_in_slice) + + return tuple(chunk_slices), tuple(new_arr_shape) diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index 86736b814..b2a15c024 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -369,11 +369,20 @@ def test_refuse_combine(array_v3_metadata): class TestIndexing: - def test_slice_aligned_with_chunks(self): - marr = create_manifestarray(shape=(4,), chunks=(2,)) - marr[0:2] - marr[2:4] - marr[0:4] + def test_slice_aligned_with_chunks(self, manifest_array): + marr = manifest_array(shape=(4,), chunks=(2,)) + + subarr = marr[0:4] + assert isinstance(subarr, ManifestArray) + assert subarr.metadata == marr.metadata + assert subarr.chunks == marr.chunks + assert subarr.shape == (4,) + + subarr = marr[0:2] + subarr = marr[2:4] + + def test_slice_misaligned_with_chunks(self, manifest_array): + marr = manifest_array(shape=(4,), chunks=(2,)) with pytest.raises( NotImplementedError, match="slice would split individual chunks" From 80259ba617f28e054ae376f7b3385dd95a2c0561 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Sat, 22 Mar 2025 00:38:15 -0400 Subject: [PATCH 04/10] sketch how chunk-aligned indexing could be implemented using primitives from zarr-python v3 internals --- virtualizarr/manifests/array.py | 54 ++++--- virtualizarr/manifests/indexing.py | 135 ++++++++++-------- .../tests/test_manifests/test_array.py | 15 +- virtualizarr/utils.py | 2 +- 4 files changed, 117 insertions(+), 89 deletions(-) diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index aaaf0b2c4..4a11565c1 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -1,9 +1,10 @@ import warnings -from types import EllipsisType, NoneType +from types import EllipsisType from typing import Any, Callable, Union import numpy as np from zarr.core.metadata.v3 import ArrayV3Metadata, RegularChunkGrid +from zarr.core.indexing import BasicIndexer from virtualizarr.manifests.array_api import ( MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS, @@ -14,6 +15,7 @@ import virtualizarr.manifests.indexing as indexing + class ManifestArray: """ Virtualized array representation of the chunk data in a single Zarr Array. @@ -208,7 +210,7 @@ def astype(self, dtype: np.dtype, /, *, copy: bool = True) -> "ManifestArray": def __getitem__( self, - key: Union[ + selection: Union[ int, slice, EllipsisType, @@ -223,38 +225,46 @@ def __getitem__( Only supports indexing where slices are aligned exactly with chunk boundaries. - Follows the array API standard otherwise. - """ - indexer = key - - indexer_nd: tuple[Union[int, slice, EllipsisType, None, np.ndarray], ...] - if isinstance(indexer, (int, slice, EllipsisType, NoneType, np.ndarray)): - indexer_nd = (indexer,) - else: - indexer_nd = indexer + Effectively, this means that the following indexing modes are supported: - # _validate_indexer(indexer) + - integer indexing + - slice indexing + - mixed slice and integer indexing - indexer_nd = indexing.possibly_expand_trailing_ellipses(indexer_nd, self.ndim) + Follows the array API standard otherwise. + """ + print(f"{selection=}") + + # TODO validate the selection, and identify if the selection can't be represented as a BasicIndexer + # TODO will this expand trailing ellipses? + indexer = BasicIndexer( + selection, + self.shape, + self.metadata.chunk_grid, + ) - if len(indexer_nd) != self.ndim: - raise ValueError( - f"Invalid indexer for array with ndim={self.ndim}: {indexer_nd}" - ) + # TODO is this where we would differ codepath for an uncompressed array? + chunk_grid_indexer = indexing.array_indexer_to_chunk_grid_indexer(indexer) - chunk_indexer, new_arr_shape = indexing.array_indexer_to_chunk_grid_indexer() + print(f"{chunk_grid_indexer=}") + + # TODO translate new chunk_grid_indexer BasicIndexer into normal Selection that numpy can understand # do slicing of entries in manifest - # TODO add ChunkManifest.__getitem__ for this - sliced_paths = self.manifest._paths[chunk_indexer] - sliced_offsets = self.manifest._offsets[chunk_indexer] - sliced_lengths = self.manifest._lengths[chunk_indexer] + # TODO add ChunkManifest.__getitem__ for this? + # TODO or add some kind of dedicated method that can't be confused with the API of Mapping + sliced_paths = self.manifest._paths[chunk_grid_indexer] + sliced_offsets = self.manifest._offsets[chunk_grid_indexer] + sliced_lengths = self.manifest._lengths[chunk_grid_indexer] sliced_manifest = ChunkManifest.from_arrays( paths=sliced_paths, offsets=sliced_offsets, lengths=sliced_lengths, ) + # TODO deduce the new array shape from the new chunk grid shape + + # chunk sizes are unchanged by slicing that aligns with chunk boundaries new_metadata = utils.copy_and_replace_metadata(self.metadata, new_shape=new_arr_shape) diff --git a/virtualizarr/manifests/indexing.py b/virtualizarr/manifests/indexing.py index 92603bf41..3917fbd56 100644 --- a/virtualizarr/manifests/indexing.py +++ b/virtualizarr/manifests/indexing.py @@ -1,18 +1,80 @@ from types import NoneType, EllipsisType from typing import Union +from virtualizarr.utils import determine_chunk_grid_shape -def array_slice_to_chunk_slice( - array_slice: slice, - arr_length: int, - chunk_length: int, -) -> slice: +from zarr.core.indexing import BasicIndexer, IntDimIndexer, SliceDimIndexer + + + +# TODO write a custom message for this +class SubChunkIndexingError(IndexError): + ... + + +def array_indexer_to_chunk_grid_indexer( + indexer: BasicIndexer, +) -> BasicIndexer: + """ + Translate an indexer into an array into a corresponding indexer into the underlying chunk grid. + + As compressed chunks cannot be subdivided, this will raise on any array slices that require slicing within individual chunks. + """ + + print(f"{indexer=}") + + [ + print(chunk_coords, chunk_selection, out_selection, is_complete_chunk) + for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer + ] + + # TODO move this check outside? Because we can arbitrarily index into uncompressed arrays... + if not all(is_complete_chunk for _, _, _, is_complete_chunk in indexer): + raise SubChunkIndexingError() + + array_shape = indexer.shape + chunk_shape = indexer.chunk_grid.chunk_shape + chunk_grid_shape = determine_chunk_grid_shape(array_shape, chunk_shape) + + chunk_grid_dim_indexers: list[IntDimIndexer | SliceDimIndexer] = [] + for dim_indexer, dim_len, dim_chunk_len in zip( + indexer.dim_indexers, + indexer.shape, + chunk_grid_shape, + strict=True, + ): + chunk_grid_dim_indexer: IntDimIndexer | SliceDimIndexer + if isinstance(dim_indexer, IntDimIndexer): + if dim_len == 1: + chunk_grid_dim_indexer = dim_indexer + else: + raise SubChunkIndexingError + + elif isinstance(dim_indexer, SliceDimIndexer): + dim_indexer = array_slice_to_chunk_grid_slice(dim_indexer) + + chunk_grid_dim_indexers.append(chunk_grid_dim_indexer) + + obj = BasicIndexer.__new__() + obj.dim_indexers = chunk_grid_dim_indexers + obj.shape = tuple(s.nitems for s in chunk_grid_dim_indexers if not isinstance(s, IntDimIndexer)) + obj.drop_axes = () + + return obj + + +def array_slice_to_chunk_grid_slice( + array_slice: SliceDimIndexer, +) -> SliceDimIndexer: """ Translate a slice into an array into a corresponding slice into the underlying chunk grid. - Will raise on any array slices that require slicing within individual chunks. + Will raise on any array slices that would require slicing within individual chunks. """ + arr_length = array_slice.dim_len + chunk_length = array_slice.dim_chunk_len + if chunk_length == 1: # alot of indexing is possible only in this case, because this is basically just a normal array along that axis chunk_slice = array_slice @@ -20,14 +82,14 @@ def array_slice_to_chunk_slice( # Check that start of slice aligns with start of a chunk if array_slice.start % chunk_length != 0: - raise NotImplementedError( + raise IndexError( f"Cannot index ManifestArray axis of length {arr_length} and chunk length {chunk_length} with {array_slice} as slice would split individual chunks" ) # Check that slice spans integer number of chunks slice_length = array_slice.stop - array_slice.start if slice_length % chunk_length != 0: - raise NotImplementedError( + raise IndexError( f"Cannot index ManifestArray axis of length {arr_length} and chunk length {chunk_length} with {array_slice} as slice would split individual chunks" ) @@ -36,54 +98,9 @@ def array_slice_to_chunk_slice( chunk_slice = slice(index_of_first_chunk, index_of_first_chunk + n_chunks, 1) - return chunk_slice - - -def possibly_expand_trailing_ellipses(indexer, ndim: int) -> tuple[Union[int, slice, EllipsisType, None], ...]: - if indexer[-1] == ...: - extra_slices_needed = ndim - (len(indexer) - 1) - *indexer_1d, ellipsis = indexer - return tuple(tuple(indexer_1d) + (slice(None),) * extra_slices_needed) - else: - return indexer - - -def array_indexer_to_chunk_grid_indexer( - indexer_nd, - shape: tuple[int, ...], - chunks: tuple[int, ...], -) -> tuple[tuple[slice, ...], tuple[int, ...]]: - """Convert an indexer in array element space into the corresponding indexer in chunk grid space.""" - - chunk_slices = [] - new_arr_shape = [] - for axis_num, (indexer_1d, arr_length, chunk_length) in enumerate( - zip(indexer_nd, shape, chunks) - ): - if isinstance(indexer_1d, int): - array_slice_1d = slice(indexer_1d, indexer_1d + 1, 1) - elif isinstance(indexer_1d, NoneType): - array_slice_1d = slice(0, arr_length, 1) - elif isinstance(indexer_1d, slice): - array_slice_1d = slice( - indexer_1d.start if indexer_1d.start is not None else 0, - indexer_1d.stop if indexer_1d.stop is not None else arr_length, - indexer_1d.step if indexer_1d.step is not None else 1, - ) - else: - # TODO we could attempt to also support indexing with numpy arrays - raise TypeError( - f"Can only perform indexing with keys of type (int, slice, EllipsisType, NoneType), but got type {type(indexer_1d)} for axis {axis_num}" - ) - - chunk_slice_1d = array_slice_to_chunk_slice( - array_slice_1d, arr_length, chunk_length - ) - chunk_slices.append(chunk_slice_1d) - - n_elements_in_slice = abs( - (array_slice_1d.start - array_slice_1d.stop) / array_slice_1d.step - ) - new_arr_shape.append(n_elements_in_slice) - - return tuple(chunk_slices), tuple(new_arr_shape) + return SliceDimIndexer( + dim_sel=chunk_slice, + # TODO which dim does this refer to? That of the chunk grid? + dim_len=..., + dim_chunk_len=..., + ) diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index b2a15c024..a8ee71d2a 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -369,16 +369,17 @@ def test_refuse_combine(array_v3_metadata): class TestIndexing: + # TODO parametrize over a bunch of valid options here def test_slice_aligned_with_chunks(self, manifest_array): - marr = manifest_array(shape=(4,), chunks=(2,)) + marr = manifest_array(shape=(8,), chunks=(2,)) - subarr = marr[0:4] - assert isinstance(subarr, ManifestArray) - assert subarr.metadata == marr.metadata - assert subarr.chunks == marr.chunks - assert subarr.shape == (4,) + # subarr = marr[0:4] + # assert isinstance(subarr, ManifestArray) + # assert subarr.metadata == marr.metadata + # assert subarr.chunks == marr.chunks + # assert subarr.shape == (4,) - subarr = marr[0:2] + subarr = marr[0:6] subarr = marr[2:4] def test_slice_misaligned_with_chunks(self, manifest_array): diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index d36f0826e..3aec8c077 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -106,7 +106,7 @@ def soft_import(name: str, reason: str, strict: Optional[bool] = True): else: return None - +# TODO move this and determine_chunk_grid_shape to manifests.utils.py def ceildiv(a: int, b: int) -> int: """ Ceiling division operator for integers. From 9e802a27317ee2474a3188c5547200486904dd80 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 22 Mar 2025 04:41:24 +0000 Subject: [PATCH 05/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- virtualizarr/manifests/array.py | 20 +++++++++---------- virtualizarr/manifests/indexing.py | 19 ++++++++---------- .../tests/test_manifests/test_array.py | 2 +- virtualizarr/utils.py | 1 + 4 files changed, 20 insertions(+), 22 deletions(-) diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 4a11565c1..254590fe0 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -3,17 +3,16 @@ from typing import Any, Callable, Union import numpy as np -from zarr.core.metadata.v3 import ArrayV3Metadata, RegularChunkGrid from zarr.core.indexing import BasicIndexer +from zarr.core.metadata.v3 import ArrayV3Metadata, RegularChunkGrid +import virtualizarr.manifests.indexing as indexing +import virtualizarr.manifests.utils as utils from virtualizarr.manifests.array_api import ( MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS, _isnan, ) from virtualizarr.manifests.manifest import ChunkManifest -import virtualizarr.manifests.utils as utils -import virtualizarr.manifests.indexing as indexing - class ManifestArray: @@ -222,7 +221,7 @@ def __getitem__( ) -> "ManifestArray": """ Slice this ManifestArray by indexing in array element space (as opposed to in chunk grid space). - + Only supports indexing where slices are aligned exactly with chunk boundaries. Effectively, this means that the following indexing modes are supported: @@ -238,8 +237,8 @@ def __getitem__( # TODO validate the selection, and identify if the selection can't be represented as a BasicIndexer # TODO will this expand trailing ellipses? indexer = BasicIndexer( - selection, - self.shape, + selection, + self.shape, self.metadata.chunk_grid, ) @@ -247,7 +246,7 @@ def __getitem__( chunk_grid_indexer = indexing.array_indexer_to_chunk_grid_indexer(indexer) print(f"{chunk_grid_indexer=}") - + # TODO translate new chunk_grid_indexer BasicIndexer into normal Selection that numpy can understand # do slicing of entries in manifest @@ -264,9 +263,10 @@ def __getitem__( # TODO deduce the new array shape from the new chunk grid shape - # chunk sizes are unchanged by slicing that aligns with chunk boundaries - new_metadata = utils.copy_and_replace_metadata(self.metadata, new_shape=new_arr_shape) + new_metadata = utils.copy_and_replace_metadata( + self.metadata, new_shape=new_arr_shape + ) return ManifestArray(chunkmanifest=sliced_manifest, metadata=new_metadata) diff --git a/virtualizarr/manifests/indexing.py b/virtualizarr/manifests/indexing.py index 3917fbd56..9d8a2a4a2 100644 --- a/virtualizarr/manifests/indexing.py +++ b/virtualizarr/manifests/indexing.py @@ -1,15 +1,10 @@ -from types import NoneType, EllipsisType -from typing import Union - -from virtualizarr.utils import determine_chunk_grid_shape - from zarr.core.indexing import BasicIndexer, IntDimIndexer, SliceDimIndexer +from virtualizarr.utils import determine_chunk_grid_shape # TODO write a custom message for this -class SubChunkIndexingError(IndexError): - ... +class SubChunkIndexingError(IndexError): ... def array_indexer_to_chunk_grid_indexer( @@ -31,7 +26,7 @@ def array_indexer_to_chunk_grid_indexer( # TODO move this check outside? Because we can arbitrarily index into uncompressed arrays... if not all(is_complete_chunk for _, _, _, is_complete_chunk in indexer): raise SubChunkIndexingError() - + array_shape = indexer.shape chunk_shape = indexer.chunk_grid.chunk_shape chunk_grid_shape = determine_chunk_grid_shape(array_shape, chunk_shape) @@ -49,7 +44,7 @@ def array_indexer_to_chunk_grid_indexer( chunk_grid_dim_indexer = dim_indexer else: raise SubChunkIndexingError - + elif isinstance(dim_indexer, SliceDimIndexer): dim_indexer = array_slice_to_chunk_grid_slice(dim_indexer) @@ -57,7 +52,9 @@ def array_indexer_to_chunk_grid_indexer( obj = BasicIndexer.__new__() obj.dim_indexers = chunk_grid_dim_indexers - obj.shape = tuple(s.nitems for s in chunk_grid_dim_indexers if not isinstance(s, IntDimIndexer)) + obj.shape = tuple( + s.nitems for s in chunk_grid_dim_indexers if not isinstance(s, IntDimIndexer) + ) obj.drop_axes = () return obj @@ -99,7 +96,7 @@ def array_slice_to_chunk_grid_slice( chunk_slice = slice(index_of_first_chunk, index_of_first_chunk + n_chunks, 1) return SliceDimIndexer( - dim_sel=chunk_slice, + dim_sel=chunk_slice, # TODO which dim does this refer to? That of the chunk grid? dim_len=..., dim_chunk_len=..., diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index a8ee71d2a..a04d4a798 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -381,7 +381,7 @@ def test_slice_aligned_with_chunks(self, manifest_array): subarr = marr[0:6] subarr = marr[2:4] - + def test_slice_misaligned_with_chunks(self, manifest_array): marr = manifest_array(shape=(4,), chunks=(2,)) diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index 3aec8c077..fa20fb6a7 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -106,6 +106,7 @@ def soft_import(name: str, reason: str, strict: Optional[bool] = True): else: return None + # TODO move this and determine_chunk_grid_shape to manifests.utils.py def ceildiv(a: int, b: int) -> int: """ From 55407e1522cfe6a51ff45e43429a4f9ad6f13501 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Sat, 22 Mar 2025 13:15:36 -0400 Subject: [PATCH 06/10] moved utilities to manifests.utils.py --- conftest.py | 7 +++--- virtualizarr/manifests/array_api.py | 32 +++++++++++----------------- virtualizarr/manifests/utils.py | 26 ++++++++++++++++++++++ virtualizarr/translators/kerchunk.py | 10 ++++----- virtualizarr/utils.py | 16 -------------- 5 files changed, 45 insertions(+), 46 deletions(-) diff --git a/conftest.py b/conftest.py index 7b827812f..f6e0083b0 100644 --- a/conftest.py +++ b/conftest.py @@ -15,8 +15,7 @@ # Local imports from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.manifests.manifest import join -from virtualizarr.manifests.utils import create_v3_array_metadata -from virtualizarr.utils import ceildiv +import virtualizarr.manifests.utils as utils # Pytest configuration @@ -76,7 +75,7 @@ def _generate_chunk_entries( Mapping of chunk keys to entry dictionaries """ chunk_grid_shape = tuple( - ceildiv(axis_length, chunk_length) + utils.ceildiv(axis_length, chunk_length) for axis_length, chunk_length in zip(shape, chunks) ) @@ -261,7 +260,7 @@ def _create_metadata( fill_value: int | None = None, ): codecs = codecs or [{"configuration": {"endian": "little"}, "name": "bytes"}] - return create_v3_array_metadata( + return utils.create_v3_array_metadata( shape=shape, chunk_shape=chunks, data_type=data_type, diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index c084f40d9..7c7ff7bc7 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -2,16 +2,8 @@ import numpy as np -from virtualizarr.utils import determine_chunk_grid_shape - -from .manifest import ChunkManifest -from .utils import ( - check_combinable_zarr_arrays, - check_same_ndims, - check_same_shapes, - check_same_shapes_except_on_concat_axis, - copy_and_replace_metadata, -) +from virtualizarr.manifests.manifest import ChunkManifest +import virtualizarr.manifests.utils as utils if TYPE_CHECKING: from .array import ManifestArray @@ -65,9 +57,9 @@ def concatenate( raise TypeError() # ensure dtypes, shapes, codecs etc. are consistent - check_combinable_zarr_arrays(arrays) + utils.check_combinable_zarr_arrays(arrays) - check_same_ndims([arr.ndim for arr in arrays]) + utils.check_same_ndims([arr.ndim for arr in arrays]) # Ensure we handle axis being passed as a negative integer first_arr = arrays[0] @@ -75,7 +67,7 @@ def concatenate( axis = axis % first_arr.ndim arr_shapes = [arr.shape for arr in arrays] - check_same_shapes_except_on_concat_axis(arr_shapes, axis) + utils.check_same_shapes_except_on_concat_axis(arr_shapes, axis) # find what new array shape must be new_length_along_concat_axis = sum([shape[axis] for shape in arr_shapes]) @@ -102,7 +94,7 @@ def concatenate( lengths=concatenated_lengths, ) - new_metadata = copy_and_replace_metadata( + new_metadata = utils.copy_and_replace_metadata( old_metadata=first_arr.metadata, new_shape=new_shape ) @@ -128,11 +120,11 @@ def stack( raise TypeError() # ensure dtypes, shapes, codecs etc. are consistent - check_combinable_zarr_arrays(arrays) + utils.check_combinable_zarr_arrays(arrays) - check_same_ndims([arr.ndim for arr in arrays]) + utils.check_same_ndims([arr.ndim for arr in arrays]) arr_shapes = [arr.shape for arr in arrays] - check_same_shapes(arr_shapes) + utils.check_same_shapes(arr_shapes) # Ensure we handle axis being passed as a negative integer first_arr = arrays[0] @@ -172,7 +164,7 @@ def stack( new_chunks = list(old_chunks) new_chunks.insert(axis, 1) - new_metadata = copy_and_replace_metadata( + new_metadata = utils.copy_and_replace_metadata( old_metadata=first_arr.metadata, new_shape=new_shape, new_chunks=new_chunks ) @@ -212,7 +204,7 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra ) # find new chunk grid shape by dividing new array shape by new chunk shape - new_chunk_grid_shape = determine_chunk_grid_shape(new_shape, new_chunk_shape) + new_chunk_grid_shape = utils.determine_chunk_grid_shape(new_shape, new_chunk_shape) # do broadcasting of entries in manifest broadcasted_paths = cast( # `np.broadcast_to` apparently is type hinted as if the output could have Any dtype @@ -237,7 +229,7 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra lengths=broadcasted_lengths, ) - new_metadata = copy_and_replace_metadata( + new_metadata = utils.copy_and_replace_metadata( old_metadata=x.metadata, new_shape=list(new_shape), new_chunks=list(new_chunk_shape), diff --git a/virtualizarr/manifests/utils.py b/virtualizarr/manifests/utils.py index b4dbfb175..96eec2019 100644 --- a/virtualizarr/manifests/utils.py +++ b/virtualizarr/manifests/utils.py @@ -187,3 +187,29 @@ def copy_and_replace_metadata( # ArrayV3Metadata.from_dict removes extra keys zarr_format and node_type new_metadata = ArrayV3Metadata.from_dict(metadata_copy) return new_metadata + + +def ceildiv(a: int, b: int) -> int: + """ + Ceiling division operator for integers. + + See https://stackoverflow.com/questions/14822184/is-there-a-ceiling-equivalent-of-operator-in-python + """ + return -(a // -b) + + +def determine_chunk_grid_shape( + shape: tuple[int, ...], chunks: tuple[int, ...] +) -> tuple[int, ...]: + """Calculate the shape of the chunk grid based on array shape and chunk size.""" + return tuple(ceildiv(length, chunksize) for length, chunksize in zip(shape, chunks)) + + +def determine_array_shape( + chunk_grid_shape: tuple[int, ...], + chunk_shape: tuple[int, ...], +) -> tuple[int, ...]: + return tuple( + dim_chunk_grid * dim_chunk_len + for dim_chunk_grid, dim_chunk_len in zip(chunk_grid_shape, chunk_shape) + ) diff --git a/virtualizarr/translators/kerchunk.py b/virtualizarr/translators/kerchunk.py index 3e1c7108c..df8403f6f 100644 --- a/virtualizarr/translators/kerchunk.py +++ b/virtualizarr/translators/kerchunk.py @@ -1,9 +1,8 @@ from typing import Any, Mapping, MutableMapping, cast import numpy as np -from xarray import Dataset from xarray.core.indexes import Index -from xarray.core.variable import Variable +from xarray import Dataset, Variable from zarr.core.common import JSON from zarr.core.metadata import ArrayV3Metadata from zarr.core.metadata.v2 import ArrayV2Metadata @@ -13,13 +12,12 @@ ) from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.manifests.manifest import ChunkEntry, ChunkKey -from virtualizarr.manifests.utils import create_v3_array_metadata +import virtualizarr.manifests.utils as utils from virtualizarr.readers.common import separate_coords from virtualizarr.types.kerchunk import ( KerchunkArrRefs, KerchunkStoreRefs, ) -from virtualizarr.utils import determine_chunk_grid_shape def to_kerchunk_json(v2_metadata: ArrayV2Metadata) -> str: @@ -84,7 +82,7 @@ def from_kerchunk_refs(decoded_arr_refs_zarray) -> "ArrayV3Metadata": numcodec_configs = [ numcodec_config_to_configurable(config) for config in codec_configs ] - return create_v3_array_metadata( + return utils.create_v3_array_metadata( chunk_shape=tuple(decoded_arr_refs_zarray["chunks"]), data_type=dtype, codecs=numcodec_configs, @@ -236,7 +234,7 @@ def variable_from_kerchunk_refs( # empty variables don't have physical chunks, but zarray shows that the variable # is at least 1D - shape = determine_chunk_grid_shape( + shape = utils.determine_chunk_grid_shape( metadata.shape, metadata.chunks, ) diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index 3aec8c077..540c2c076 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -106,22 +106,6 @@ def soft_import(name: str, reason: str, strict: Optional[bool] = True): else: return None -# TODO move this and determine_chunk_grid_shape to manifests.utils.py -def ceildiv(a: int, b: int) -> int: - """ - Ceiling division operator for integers. - - See https://stackoverflow.com/questions/14822184/is-there-a-ceiling-equivalent-of-operator-in-python - """ - return -(a // -b) - - -def determine_chunk_grid_shape( - shape: tuple[int, ...], chunks: tuple[int, ...] -) -> tuple[int, ...]: - """Calculate the shape of the chunk grid based on array shape and chunk size.""" - return tuple(ceildiv(length, chunksize) for length, chunksize in zip(shape, chunks)) - def convert_v3_to_v2_metadata( v3_metadata: ArrayV3Metadata, fill_value: Any = None From 20e93311f2747027eb93f49b4c15d78790cd8855 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Sat, 22 Mar 2025 13:18:19 -0400 Subject: [PATCH 07/10] some initial indexing tests working --- virtualizarr/manifests/array.py | 44 ++++++--- virtualizarr/manifests/indexing.py | 91 ++++++++++++++----- .../tests/test_manifests/test_array.py | 43 ++++++--- 3 files changed, 128 insertions(+), 50 deletions(-) diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 4a11565c1..35eda3bac 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -3,17 +3,16 @@ from typing import Any, Callable, Union import numpy as np -from zarr.core.metadata.v3 import ArrayV3Metadata, RegularChunkGrid from zarr.core.indexing import BasicIndexer +from zarr.core.metadata.v3 import ArrayV3Metadata, RegularChunkGrid +import virtualizarr.manifests.indexing as indexing +import virtualizarr.manifests.utils as utils from virtualizarr.manifests.array_api import ( MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS, _isnan, ) from virtualizarr.manifests.manifest import ChunkManifest -import virtualizarr.manifests.utils as utils -import virtualizarr.manifests.indexing as indexing - class ManifestArray: @@ -222,7 +221,7 @@ def __getitem__( ) -> "ManifestArray": """ Slice this ManifestArray by indexing in array element space (as opposed to in chunk grid space). - + Only supports indexing where slices are aligned exactly with chunk boundaries. Effectively, this means that the following indexing modes are supported: @@ -238,35 +237,52 @@ def __getitem__( # TODO validate the selection, and identify if the selection can't be represented as a BasicIndexer # TODO will this expand trailing ellipses? indexer = BasicIndexer( - selection, - self.shape, + selection, + self.shape, self.metadata.chunk_grid, ) # TODO is this where we would differ codepath for an uncompressed array? - chunk_grid_indexer = indexing.array_indexer_to_chunk_grid_indexer(indexer) + chunk_grid_indexer = indexing.array_indexer_to_chunk_grid_indexer( + indexer=indexer, + arr_shape=self.shape, + chunk_shape=self.chunks, + ) print(f"{chunk_grid_indexer=}") - + # TODO translate new chunk_grid_indexer BasicIndexer into normal Selection that numpy can understand + chunk_grid_selection = indexing.indexer_to_selection(chunk_grid_indexer) + + print(f"{chunk_grid_selection=}") # do slicing of entries in manifest # TODO add ChunkManifest.__getitem__ for this? # TODO or add some kind of dedicated method that can't be confused with the API of Mapping - sliced_paths = self.manifest._paths[chunk_grid_indexer] - sliced_offsets = self.manifest._offsets[chunk_grid_indexer] - sliced_lengths = self.manifest._lengths[chunk_grid_indexer] + sliced_paths = self.manifest._paths[chunk_grid_selection] + sliced_offsets = self.manifest._offsets[chunk_grid_selection] + sliced_lengths = self.manifest._lengths[chunk_grid_selection] + print(f"{sliced_paths=}") sliced_manifest = ChunkManifest.from_arrays( paths=sliced_paths, offsets=sliced_offsets, lengths=sliced_lengths, ) - # TODO deduce the new array shape from the new chunk grid shape + print(f"{sliced_manifest=}") + new_arr_shape = utils.determine_array_shape( + chunk_grid_shape=sliced_manifest.shape_chunk_grid, + chunk_shape=self.chunks, + ) + + print(f"{new_arr_shape=}") # chunk sizes are unchanged by slicing that aligns with chunk boundaries - new_metadata = utils.copy_and_replace_metadata(self.metadata, new_shape=new_arr_shape) + new_metadata = utils.copy_and_replace_metadata( + self.metadata, + new_shape=new_arr_shape, + ) return ManifestArray(chunkmanifest=sliced_manifest, metadata=new_metadata) diff --git a/virtualizarr/manifests/indexing.py b/virtualizarr/manifests/indexing.py index 3917fbd56..2a74d8e38 100644 --- a/virtualizarr/manifests/indexing.py +++ b/virtualizarr/manifests/indexing.py @@ -1,24 +1,31 @@ -from types import NoneType, EllipsisType -from typing import Union - -from virtualizarr.utils import determine_chunk_grid_shape - -from zarr.core.indexing import BasicIndexer, IntDimIndexer, SliceDimIndexer - +from zarr.core.indexing import ( + BasicIndexer, + BasicSelection, + BasicSelector, + IntDimIndexer, + SliceDimIndexer, +) # TODO write a custom message for this -class SubChunkIndexingError(IndexError): - ... +class SubChunkIndexingError(IndexError): ... def array_indexer_to_chunk_grid_indexer( indexer: BasicIndexer, + arr_shape: tuple[int, ...], + chunk_shape: tuple[int, ...], ) -> BasicIndexer: """ Translate an indexer into an array into a corresponding indexer into the underlying chunk grid. As compressed chunks cannot be subdivided, this will raise on any array slices that require slicing within individual chunks. + + Parameters + ---------- + indexer + arr_shape : tuple[int, ...] + Shape of the ManifestArray we are trying to index into. """ print(f"{indexer=}") @@ -29,38 +36,49 @@ def array_indexer_to_chunk_grid_indexer( ] # TODO move this check outside? Because we can arbitrarily index into uncompressed arrays... + # TODO or is that unrelated? Uncompressed arrays allow us to choose arbitrary chunking at reference generation time if not all(is_complete_chunk for _, _, _, is_complete_chunk in indexer): raise SubChunkIndexingError() - - array_shape = indexer.shape - chunk_shape = indexer.chunk_grid.chunk_shape - chunk_grid_shape = determine_chunk_grid_shape(array_shape, chunk_shape) + # TODO does the array shape have to match the indexer shape? Should this have been checked already? chunk_grid_dim_indexers: list[IntDimIndexer | SliceDimIndexer] = [] - for dim_indexer, dim_len, dim_chunk_len in zip( + for ( + dim_indexer, + dim_len, + dim_chunk_len, + ) in zip( indexer.dim_indexers, - indexer.shape, - chunk_grid_shape, + arr_shape, + chunk_shape, + # chunk_grid_shape, # TODO do we really not need this?? strict=True, ): chunk_grid_dim_indexer: IntDimIndexer | SliceDimIndexer if isinstance(dim_indexer, IntDimIndexer): - if dim_len == 1: + print(f"{dim_len=}") + if dim_chunk_len == 1: chunk_grid_dim_indexer = dim_indexer else: raise SubChunkIndexingError - + elif isinstance(dim_indexer, SliceDimIndexer): dim_indexer = array_slice_to_chunk_grid_slice(dim_indexer) chunk_grid_dim_indexers.append(chunk_grid_dim_indexer) - obj = BasicIndexer.__new__() - obj.dim_indexers = chunk_grid_dim_indexers - obj.shape = tuple(s.nitems for s in chunk_grid_dim_indexers if not isinstance(s, IntDimIndexer)) - obj.drop_axes = () + # TODO check this - I'm not sure if I've understood the "shape" of an indexer correctly + chunk_grid_dim_indexer_shape = tuple( + s.nitems for s in chunk_grid_dim_indexers if not isinstance(s, IntDimIndexer) + ) + + # The BasicIndexer constructor doesn't allow us to set the attributes we want to + # Also BasicIndexer is a frozen dataclass so we have to use .__setattr__() + chunk_grid_indexer = object.__new__(BasicIndexer) + object.__setattr__(chunk_grid_indexer, "dim_indexers", chunk_grid_dim_indexers) + object.__setattr__(chunk_grid_indexer, "shape", chunk_grid_dim_indexer_shape) + object.__setattr__(chunk_grid_indexer, "drop_axes", ()) - return obj + return chunk_grid_indexer def array_slice_to_chunk_grid_slice( @@ -99,8 +117,33 @@ def array_slice_to_chunk_grid_slice( chunk_slice = slice(index_of_first_chunk, index_of_first_chunk + n_chunks, 1) return SliceDimIndexer( - dim_sel=chunk_slice, + dim_sel=chunk_slice, # TODO which dim does this refer to? That of the chunk grid? dim_len=..., dim_chunk_len=..., ) + + +def indexer_to_selection(indexer: BasicIndexer) -> BasicSelection: + """Translate an indexer into a selection that numpy can understand.""" + selection = [] + for dim_indexer in indexer.dim_indexers: + dim_selection: BasicSelector + if isinstance(dim_indexer, IntDimIndexer): + # avoid numpy returning a scalar value instead of np.ndarray + dim_selection = slice( + dim_indexer.dim_sel, + dim_indexer.dim_sel + 1, + ) + elif isinstance(dim_indexer, SliceDimIndexer): + dim_selection = slice( + dim_indexer.start, + dim_indexer.stop, + dim_indexer.step, + ) + else: + raise NotImplementedError + + selection.append(dim_selection) + + return tuple(selection) diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index a8ee71d2a..e1f884211 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -2,6 +2,7 @@ import pytest from zarr.core.metadata.v3 import ArrayV3Metadata +import virtualizarr.manifests.utils as utils from conftest import ( ARRAYBYTES_CODEC, ZLIB_CODEC, @@ -370,18 +371,36 @@ def test_refuse_combine(array_v3_metadata): class TestIndexing: # TODO parametrize over a bunch of valid options here - def test_slice_aligned_with_chunks(self, manifest_array): - marr = manifest_array(shape=(8,), chunks=(2,)) - - # subarr = marr[0:4] - # assert isinstance(subarr, ManifestArray) - # assert subarr.metadata == marr.metadata - # assert subarr.chunks == marr.chunks - # assert subarr.shape == (4,) - - subarr = marr[0:6] - subarr = marr[2:4] - + @pytest.mark.parametrize( + "in_shape, in_chunks, selection, out_shape, out_chunks", + [ + ((2,), (1,), 0, (1,), (1,)), + ((2,), (1,), 1, (1,), (1,)), + ((8,), (2,), slice(0, 4), (4,), (2,)), + ((2,), (2,), slice(2, 4), (2,), (2,)), + ], + ) + def test_slice_aligned_with_chunks( + self, manifest_array, in_shape, in_chunks, selection, out_shape, out_chunks + ): + marr = manifest_array(shape=in_shape, chunks=in_chunks) + + subarr = marr[selection] + + assert isinstance(subarr, ManifestArray) + assert subarr.shape == out_shape + assert subarr.chunks == out_chunks + + # all metadata should be the same except for shape (and possibly chunks?) + expected_metadata = utils.copy_and_replace_metadata( + marr.metadata, + new_shape=out_shape, + ) + # expected_metadata.chunks = out_chunks + assert subarr.metadata == expected_metadata + + def test_slice_concat_roundtrip(self): ... + def test_slice_misaligned_with_chunks(self, manifest_array): marr = manifest_array(shape=(4,), chunks=(2,)) From 335fa02501fba585c6be42382389f6ee246f97e9 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Sat, 22 Mar 2025 13:18:40 -0400 Subject: [PATCH 08/10] linting --- conftest.py | 3 ++- virtualizarr/manifests/array_api.py | 2 +- virtualizarr/translators/kerchunk.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/conftest.py b/conftest.py index f6e0083b0..7ccbc4f50 100644 --- a/conftest.py +++ b/conftest.py @@ -12,10 +12,11 @@ import xarray as xr from xarray.core.variable import Variable +import virtualizarr.manifests.utils as utils + # Local imports from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.manifests.manifest import join -import virtualizarr.manifests.utils as utils # Pytest configuration diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 7c7ff7bc7..38ffa5fc2 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -2,8 +2,8 @@ import numpy as np -from virtualizarr.manifests.manifest import ChunkManifest import virtualizarr.manifests.utils as utils +from virtualizarr.manifests.manifest import ChunkManifest if TYPE_CHECKING: from .array import ManifestArray diff --git a/virtualizarr/translators/kerchunk.py b/virtualizarr/translators/kerchunk.py index df8403f6f..7d859667b 100644 --- a/virtualizarr/translators/kerchunk.py +++ b/virtualizarr/translators/kerchunk.py @@ -1,18 +1,18 @@ from typing import Any, Mapping, MutableMapping, cast import numpy as np -from xarray.core.indexes import Index from xarray import Dataset, Variable +from xarray.core.indexes import Index from zarr.core.common import JSON from zarr.core.metadata import ArrayV3Metadata from zarr.core.metadata.v2 import ArrayV2Metadata +import virtualizarr.manifests.utils as utils from virtualizarr.codecs import ( numcodec_config_to_configurable, ) from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.manifests.manifest import ChunkEntry, ChunkKey -import virtualizarr.manifests.utils as utils from virtualizarr.readers.common import separate_coords from virtualizarr.types.kerchunk import ( KerchunkArrRefs, From 9ebacb80943967d26411345ef3838690512407fc Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Sat, 22 Mar 2025 14:23:13 -0400 Subject: [PATCH 09/10] added tests for drop_axes --- virtualizarr/manifests/array.py | 2 +- virtualizarr/manifests/indexing.py | 61 +++++++++++++------ .../tests/test_manifests/test_array.py | 14 +++++ 3 files changed, 57 insertions(+), 20 deletions(-) diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 35eda3bac..26f3c89f0 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -235,7 +235,7 @@ def __getitem__( print(f"{selection=}") # TODO validate the selection, and identify if the selection can't be represented as a BasicIndexer - # TODO will this expand trailing ellipses? + # TODO will this expand trailing ellipses? (it should) indexer = BasicIndexer( selection, self.shape, diff --git a/virtualizarr/manifests/indexing.py b/virtualizarr/manifests/indexing.py index 2a74d8e38..1c410c5f3 100644 --- a/virtualizarr/manifests/indexing.py +++ b/virtualizarr/manifests/indexing.py @@ -40,6 +40,9 @@ def array_indexer_to_chunk_grid_indexer( if not all(is_complete_chunk for _, _, _, is_complete_chunk in indexer): raise SubChunkIndexingError() + if indexer.drop_axes: + raise NotImplementedError + # TODO does the array shape have to match the indexer shape? Should this have been checked already? chunk_grid_dim_indexers: list[IntDimIndexer | SliceDimIndexer] = [] for ( @@ -55,14 +58,10 @@ def array_indexer_to_chunk_grid_indexer( ): chunk_grid_dim_indexer: IntDimIndexer | SliceDimIndexer if isinstance(dim_indexer, IntDimIndexer): - print(f"{dim_len=}") - if dim_chunk_len == 1: - chunk_grid_dim_indexer = dim_indexer - else: - raise SubChunkIndexingError + chunk_grid_dim_indexer = array_int_indexer_to_chunk_grid_int_indexer(dim_indexer) elif isinstance(dim_indexer, SliceDimIndexer): - dim_indexer = array_slice_to_chunk_grid_slice(dim_indexer) + chunk_grid_dim_indexer = array_slice_indexer_to_chunk_grid_slice_indexer(dim_indexer) chunk_grid_dim_indexers.append(chunk_grid_dim_indexer) @@ -81,8 +80,27 @@ def array_indexer_to_chunk_grid_indexer( return chunk_grid_indexer -def array_slice_to_chunk_grid_slice( - array_slice: SliceDimIndexer, +def array_int_indexer_to_chunk_grid_int_indexer( + array_int_indexer: IntDimIndexer, +) -> IntDimIndexer: + """ + Translate an integer indexer into an array into a corresponding integer indexer into the underlying chunk grid. + + Will raise on any array indexer that would require slicing within individual chunks. + """ + if array_int_indexer.dim_chunk_len == 1: + # degenerate case where array space == chunk grid space, so array indexer == chunk grid indexer + # TODO pull out the degenerate case for both ints and slices? + chunk_grid_dim_indexer = array_int_indexer + else: + # TODO what about case of array of integers that don't split up chunks? + raise SubChunkIndexingError + + return chunk_grid_dim_indexer + + +def array_slice_indexer_to_chunk_grid_slice_indexer( + arr_slice_dim_indexer: SliceDimIndexer, ) -> SliceDimIndexer: """ Translate a slice into an array into a corresponding slice into the underlying chunk grid. @@ -90,39 +108,44 @@ def array_slice_to_chunk_grid_slice( Will raise on any array slices that would require slicing within individual chunks. """ - arr_length = array_slice.dim_len - chunk_length = array_slice.dim_chunk_len + arr_length = arr_slice_dim_indexer.dim_len + chunk_length = arr_slice_dim_indexer.dim_chunk_len if chunk_length == 1: - # alot of indexing is possible only in this case, because this is basically just a normal array along that axis - chunk_slice = array_slice - return chunk_slice + # degenerate case where array space == chunk grid space, so array indexer == chunk grid indexer + chunk_slice = slice( + arr_slice_dim_indexer.start, + arr_slice_dim_indexer.stop, + arr_slice_dim_indexer.step, + ) # Check that start of slice aligns with start of a chunk - if array_slice.start % chunk_length != 0: - raise IndexError( + if arr_slice_dim_indexer.start % chunk_length != 0: + raise SubChunkIndexingError( f"Cannot index ManifestArray axis of length {arr_length} and chunk length {chunk_length} with {array_slice} as slice would split individual chunks" ) # Check that slice spans integer number of chunks - slice_length = array_slice.stop - array_slice.start + slice_length = arr_slice_dim_indexer.stop - arr_slice_dim_indexer.start if slice_length % chunk_length != 0: - raise IndexError( + raise SubChunkIndexingError( f"Cannot index ManifestArray axis of length {arr_length} and chunk length {chunk_length} with {array_slice} as slice would split individual chunks" ) - index_of_first_chunk = int(array_slice.start / chunk_length) + index_of_first_chunk = int(arr_slice_dim_indexer.start / chunk_length) n_chunks = int(slice_length / chunk_length) chunk_slice = slice(index_of_first_chunk, index_of_first_chunk + n_chunks, 1) - return SliceDimIndexer( + chunk_grid_slice_dim_indexer = SliceDimIndexer( dim_sel=chunk_slice, # TODO which dim does this refer to? That of the chunk grid? dim_len=..., dim_chunk_len=..., ) + return chunk_grid_slice_dim_indexer + def indexer_to_selection(indexer: BasicIndexer) -> BasicSelection: """Translate an indexer into a selection that numpy can understand.""" diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index e1f884211..4dcefaf27 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -374,10 +374,24 @@ class TestIndexing: @pytest.mark.parametrize( "in_shape, in_chunks, selection, out_shape, out_chunks", [ + # single-dim integer selection ((2,), (1,), 0, (1,), (1,)), ((2,), (1,), 1, (1,), (1,)), + # multi-dim integer selection + ((2, 2), (1, 1), (0, 0), (1, 1), (1, 1)), + # drop axes + ((2, 2), (1, 2), (0,), (2,), (2,)), + ((2, 2), (1, 2), (1,), (2,), (2,)), + # single-dim slicing selection + ((2,), (1,), ..., (2,), (1,)), + # TODO: drop axes + ((2,), (1,), slice(0, 1), (1,), (1,)), + ((2,), (1,), slice(1, 2), (1,), (1,)), + ((2,), (1,), slice(0, 2), (2,), (1,)), ((8,), (2,), slice(0, 4), (4,), (2,)), ((2,), (2,), slice(2, 4), (2,), (2,)), + # TODO: multi-dim slicing selection + # TODO: mixed integer and slicing selection ], ) def test_slice_aligned_with_chunks( From 385790d94e9c0a749b2deb70c55fa8bddc937d61 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 22 Mar 2025 18:23:20 +0000 Subject: [PATCH 10/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- virtualizarr/manifests/indexing.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/virtualizarr/manifests/indexing.py b/virtualizarr/manifests/indexing.py index 1c410c5f3..613dc0081 100644 --- a/virtualizarr/manifests/indexing.py +++ b/virtualizarr/manifests/indexing.py @@ -58,10 +58,14 @@ def array_indexer_to_chunk_grid_indexer( ): chunk_grid_dim_indexer: IntDimIndexer | SliceDimIndexer if isinstance(dim_indexer, IntDimIndexer): - chunk_grid_dim_indexer = array_int_indexer_to_chunk_grid_int_indexer(dim_indexer) + chunk_grid_dim_indexer = array_int_indexer_to_chunk_grid_int_indexer( + dim_indexer + ) elif isinstance(dim_indexer, SliceDimIndexer): - chunk_grid_dim_indexer = array_slice_indexer_to_chunk_grid_slice_indexer(dim_indexer) + chunk_grid_dim_indexer = array_slice_indexer_to_chunk_grid_slice_indexer( + dim_indexer + ) chunk_grid_dim_indexers.append(chunk_grid_dim_indexer) @@ -95,7 +99,7 @@ def array_int_indexer_to_chunk_grid_int_indexer( else: # TODO what about case of array of integers that don't split up chunks? raise SubChunkIndexingError - + return chunk_grid_dim_indexer