diff --git a/conftest.py b/conftest.py index 7b827812f..7ccbc4f50 100644 --- a/conftest.py +++ b/conftest.py @@ -12,11 +12,11 @@ import xarray as xr from xarray.core.variable import Variable +import virtualizarr.manifests.utils as utils + # Local imports from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.manifests.manifest import join -from virtualizarr.manifests.utils import create_v3_array_metadata -from virtualizarr.utils import ceildiv # Pytest configuration @@ -76,7 +76,7 @@ def _generate_chunk_entries( Mapping of chunk keys to entry dictionaries """ chunk_grid_shape = tuple( - ceildiv(axis_length, chunk_length) + utils.ceildiv(axis_length, chunk_length) for axis_length, chunk_length in zip(shape, chunks) ) @@ -261,7 +261,7 @@ def _create_metadata( fill_value: int | None = None, ): codecs = codecs or [{"configuration": {"endian": "little"}, "name": "bytes"}] - return create_v3_array_metadata( + return utils.create_v3_array_metadata( shape=shape, chunk_shape=chunks, data_type=data_type, diff --git a/pyproject.toml b/pyproject.toml index 0bb5e6f4f..11ebdc4af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ dependencies = [ "ujson", "packaging", "zarr>=3.0.2", + "icechunk>=0.2.5", ] # Dependency sets under optional-dependencies are available via PyPI @@ -95,7 +96,6 @@ upstream = [ 'fsspec @ git+https://github.com/fsspec/filesystem_spec', 's3fs @ git+https://github.com/fsspec/s3fs', 'kerchunk @ git+https://github.com/fsspec/kerchunk', - 'icechunk @ git+https://github.com/earth-mover/icechunk#subdirectory=icechunk-python', ] docs = [ "sphinx", diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 5838e4d58..26f3c89f0 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -1,9 +1,13 @@ import warnings +from types import EllipsisType from typing import Any, Callable, Union import numpy as np +from zarr.core.indexing import BasicIndexer from zarr.core.metadata.v3 import ArrayV3Metadata, RegularChunkGrid +import virtualizarr.manifests.indexing as indexing +import virtualizarr.manifests.utils as utils from virtualizarr.manifests.array_api import ( MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS, _isnan, @@ -205,36 +209,82 @@ def astype(self, dtype: np.dtype, /, *, copy: bool = True) -> "ManifestArray": def __getitem__( self, - key, + selection: Union[ + int, + slice, + EllipsisType, + None, + tuple[Union[int, slice, EllipsisType, None], ...], + np.ndarray, + ], /, ) -> "ManifestArray": """ - Only supports extremely limited indexing. + Slice this ManifestArray by indexing in array element space (as opposed to in chunk grid space). - Only here because xarray will apparently attempt to index into its lazy indexing classes even if the operation would be a no-op anyway. + Only supports indexing where slices are aligned exactly with chunk boundaries. + + Effectively, this means that the following indexing modes are supported: + + - integer indexing + - slice indexing + - mixed slice and integer indexing + + Follows the array API standard otherwise. """ - from xarray.core.indexing import BasicIndexer + print(f"{selection=}") + + # TODO validate the selection, and identify if the selection can't be represented as a BasicIndexer + # TODO will this expand trailing ellipses? 
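+        # Note: the new code in virtualizarr/manifests/indexing.py below relies on
+        # BasicIndexer exposing one dim_indexer (IntDimIndexer or SliceDimIndexer)
+        # per dimension, and on iterating it as
+        # (chunk_coords, chunk_selection, out_selection, is_complete_chunk) tuples.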
(it should) + indexer = BasicIndexer( + selection, + self.shape, + self.metadata.chunk_grid, + ) - if isinstance(key, BasicIndexer): - indexer = key.tuple - else: - indexer = key + # TODO is this where we would differ codepath for an uncompressed array? + chunk_grid_indexer = indexing.array_indexer_to_chunk_grid_indexer( + indexer=indexer, + arr_shape=self.shape, + chunk_shape=self.chunks, + ) - indexer = _possibly_expand_trailing_ellipsis(key, self.ndim) + print(f"{chunk_grid_indexer=}") - if len(indexer) != self.ndim: - raise ValueError( - f"Invalid indexer for array with ndim={self.ndim}: {indexer}" - ) + # TODO translate new chunk_grid_indexer BasicIndexer into normal Selection that numpy can understand + chunk_grid_selection = indexing.indexer_to_selection(chunk_grid_indexer) - if all( - isinstance(axis_indexer, slice) and axis_indexer == slice(None) - for axis_indexer in indexer - ): - # indexer is all slice(None)'s, so this is a no-op - return self - else: - raise NotImplementedError(f"Doesn't support slicing with {indexer}") + print(f"{chunk_grid_selection=}") + + # do slicing of entries in manifest + # TODO add ChunkManifest.__getitem__ for this? + # TODO or add some kind of dedicated method that can't be confused with the API of Mapping + sliced_paths = self.manifest._paths[chunk_grid_selection] + sliced_offsets = self.manifest._offsets[chunk_grid_selection] + sliced_lengths = self.manifest._lengths[chunk_grid_selection] + print(f"{sliced_paths=}") + sliced_manifest = ChunkManifest.from_arrays( + paths=sliced_paths, + offsets=sliced_offsets, + lengths=sliced_lengths, + ) + + print(f"{sliced_manifest=}") + + new_arr_shape = utils.determine_array_shape( + chunk_grid_shape=sliced_manifest.shape_chunk_grid, + chunk_shape=self.chunks, + ) + + print(f"{new_arr_shape=}") + + # chunk sizes are unchanged by slicing that aligns with chunk boundaries + new_metadata = utils.copy_and_replace_metadata( + self.metadata, + new_shape=new_arr_shape, + ) + + return ManifestArray(chunkmanifest=sliced_manifest, metadata=new_metadata) def rename_paths( self, @@ -275,12 +325,3 @@ def rename_paths( """ renamed_manifest = self.manifest.rename_paths(new) return ManifestArray(metadata=self.metadata, chunkmanifest=renamed_manifest) - - -def _possibly_expand_trailing_ellipsis(key, ndim: int): - if key[-1] == ...: - extra_slices_needed = ndim - (len(key) - 1) - *indexer, ellipsis = key - return tuple(tuple(indexer) + (slice(None),) * extra_slices_needed) - else: - return key diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index c084f40d9..38ffa5fc2 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -2,16 +2,8 @@ import numpy as np -from virtualizarr.utils import determine_chunk_grid_shape - -from .manifest import ChunkManifest -from .utils import ( - check_combinable_zarr_arrays, - check_same_ndims, - check_same_shapes, - check_same_shapes_except_on_concat_axis, - copy_and_replace_metadata, -) +import virtualizarr.manifests.utils as utils +from virtualizarr.manifests.manifest import ChunkManifest if TYPE_CHECKING: from .array import ManifestArray @@ -65,9 +57,9 @@ def concatenate( raise TypeError() # ensure dtypes, shapes, codecs etc. 
are consistent - check_combinable_zarr_arrays(arrays) + utils.check_combinable_zarr_arrays(arrays) - check_same_ndims([arr.ndim for arr in arrays]) + utils.check_same_ndims([arr.ndim for arr in arrays]) # Ensure we handle axis being passed as a negative integer first_arr = arrays[0] @@ -75,7 +67,7 @@ def concatenate( axis = axis % first_arr.ndim arr_shapes = [arr.shape for arr in arrays] - check_same_shapes_except_on_concat_axis(arr_shapes, axis) + utils.check_same_shapes_except_on_concat_axis(arr_shapes, axis) # find what new array shape must be new_length_along_concat_axis = sum([shape[axis] for shape in arr_shapes]) @@ -102,7 +94,7 @@ def concatenate( lengths=concatenated_lengths, ) - new_metadata = copy_and_replace_metadata( + new_metadata = utils.copy_and_replace_metadata( old_metadata=first_arr.metadata, new_shape=new_shape ) @@ -128,11 +120,11 @@ def stack( raise TypeError() # ensure dtypes, shapes, codecs etc. are consistent - check_combinable_zarr_arrays(arrays) + utils.check_combinable_zarr_arrays(arrays) - check_same_ndims([arr.ndim for arr in arrays]) + utils.check_same_ndims([arr.ndim for arr in arrays]) arr_shapes = [arr.shape for arr in arrays] - check_same_shapes(arr_shapes) + utils.check_same_shapes(arr_shapes) # Ensure we handle axis being passed as a negative integer first_arr = arrays[0] @@ -172,7 +164,7 @@ def stack( new_chunks = list(old_chunks) new_chunks.insert(axis, 1) - new_metadata = copy_and_replace_metadata( + new_metadata = utils.copy_and_replace_metadata( old_metadata=first_arr.metadata, new_shape=new_shape, new_chunks=new_chunks ) @@ -212,7 +204,7 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra ) # find new chunk grid shape by dividing new array shape by new chunk shape - new_chunk_grid_shape = determine_chunk_grid_shape(new_shape, new_chunk_shape) + new_chunk_grid_shape = utils.determine_chunk_grid_shape(new_shape, new_chunk_shape) # do broadcasting of entries in manifest broadcasted_paths = cast( # `np.broadcast_to` apparently is type hinted as if the output could have Any dtype @@ -237,7 +229,7 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra lengths=broadcasted_lengths, ) - new_metadata = copy_and_replace_metadata( + new_metadata = utils.copy_and_replace_metadata( old_metadata=x.metadata, new_shape=list(new_shape), new_chunks=list(new_chunk_shape), diff --git a/virtualizarr/manifests/indexing.py b/virtualizarr/manifests/indexing.py new file mode 100644 index 000000000..613dc0081 --- /dev/null +++ b/virtualizarr/manifests/indexing.py @@ -0,0 +1,176 @@ +from zarr.core.indexing import ( + BasicIndexer, + BasicSelection, + BasicSelector, + IntDimIndexer, + SliceDimIndexer, +) + + +# TODO write a custom message for this +class SubChunkIndexingError(IndexError): ... + + +def array_indexer_to_chunk_grid_indexer( + indexer: BasicIndexer, + arr_shape: tuple[int, ...], + chunk_shape: tuple[int, ...], +) -> BasicIndexer: + """ + Translate an indexer into an array into a corresponding indexer into the underlying chunk grid. + + As compressed chunks cannot be subdivided, this will raise on any array slices that require slicing within individual chunks. + + Parameters + ---------- + indexer + arr_shape : tuple[int, ...] + Shape of the ManifestArray we are trying to index into. 
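+    chunk_shape : tuple[int, ...]
+        Shape of each chunk of the ManifestArray, used to check that the selection
+        aligns exactly with chunk boundaries.
+
+    Returns
+    -------
+    BasicIndexer
+        The equivalent indexer expressed in chunk grid space.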
+ """ + + print(f"{indexer=}") + + [ + print(chunk_coords, chunk_selection, out_selection, is_complete_chunk) + for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer + ] + + # TODO move this check outside? Because we can arbitrarily index into uncompressed arrays... + # TODO or is that unrelated? Uncompressed arrays allow us to choose arbitrary chunking at reference generation time + if not all(is_complete_chunk for _, _, _, is_complete_chunk in indexer): + raise SubChunkIndexingError() + + if indexer.drop_axes: + raise NotImplementedError + + # TODO does the array shape have to match the indexer shape? Should this have been checked already? + chunk_grid_dim_indexers: list[IntDimIndexer | SliceDimIndexer] = [] + for ( + dim_indexer, + dim_len, + dim_chunk_len, + ) in zip( + indexer.dim_indexers, + arr_shape, + chunk_shape, + # chunk_grid_shape, # TODO do we really not need this?? + strict=True, + ): + chunk_grid_dim_indexer: IntDimIndexer | SliceDimIndexer + if isinstance(dim_indexer, IntDimIndexer): + chunk_grid_dim_indexer = array_int_indexer_to_chunk_grid_int_indexer( + dim_indexer + ) + + elif isinstance(dim_indexer, SliceDimIndexer): + chunk_grid_dim_indexer = array_slice_indexer_to_chunk_grid_slice_indexer( + dim_indexer + ) + + chunk_grid_dim_indexers.append(chunk_grid_dim_indexer) + + # TODO check this - I'm not sure if I've understood the "shape" of an indexer correctly + chunk_grid_dim_indexer_shape = tuple( + s.nitems for s in chunk_grid_dim_indexers if not isinstance(s, IntDimIndexer) + ) + + # The BasicIndexer constructor doesn't allow us to set the attributes we want to + # Also BasicIndexer is a frozen dataclass so we have to use .__setattr__() + chunk_grid_indexer = object.__new__(BasicIndexer) + object.__setattr__(chunk_grid_indexer, "dim_indexers", chunk_grid_dim_indexers) + object.__setattr__(chunk_grid_indexer, "shape", chunk_grid_dim_indexer_shape) + object.__setattr__(chunk_grid_indexer, "drop_axes", ()) + + return chunk_grid_indexer + + +def array_int_indexer_to_chunk_grid_int_indexer( + array_int_indexer: IntDimIndexer, +) -> IntDimIndexer: + """ + Translate an integer indexer into an array into a corresponding integer indexer into the underlying chunk grid. + + Will raise on any array indexer that would require slicing within individual chunks. + """ + if array_int_indexer.dim_chunk_len == 1: + # degenerate case where array space == chunk grid space, so array indexer == chunk grid indexer + # TODO pull out the degenerate case for both ints and slices? + chunk_grid_dim_indexer = array_int_indexer + else: + # TODO what about case of array of integers that don't split up chunks? + raise SubChunkIndexingError + + return chunk_grid_dim_indexer + + +def array_slice_indexer_to_chunk_grid_slice_indexer( + arr_slice_dim_indexer: SliceDimIndexer, +) -> SliceDimIndexer: + """ + Translate a slice into an array into a corresponding slice into the underlying chunk grid. + + Will raise on any array slices that would require slicing within individual chunks. 
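+
+    For example, for an axis of length 8 with a chunk length of 2, the array slice
+    0:4 covers exactly the first two chunks and so corresponds to the chunk grid
+    slice 0:2, whereas the array slice 0:3 would split the second chunk and raises.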
+    """
+
+    arr_length = arr_slice_dim_indexer.dim_len
+    chunk_length = arr_slice_dim_indexer.dim_chunk_len
+
+    array_slice = slice(
+        arr_slice_dim_indexer.start,
+        arr_slice_dim_indexer.stop,
+        arr_slice_dim_indexer.step,
+    )
+
+    if chunk_length == 1:
+        # degenerate case where array space == chunk grid space, so array indexer == chunk grid indexer
+        chunk_slice = array_slice
+    else:
+        # Check that start of slice aligns with start of a chunk
+        if arr_slice_dim_indexer.start % chunk_length != 0:
+            raise SubChunkIndexingError(
+                f"Cannot index ManifestArray axis of length {arr_length} and chunk length {chunk_length} with {array_slice} as slice would split individual chunks"
+            )
+
+        # Check that slice spans integer number of chunks
+        slice_length = arr_slice_dim_indexer.stop - arr_slice_dim_indexer.start
+        if slice_length % chunk_length != 0:
+            raise SubChunkIndexingError(
+                f"Cannot index ManifestArray axis of length {arr_length} and chunk length {chunk_length} with {array_slice} as slice would split individual chunks"
+            )
+
+        index_of_first_chunk = int(arr_slice_dim_indexer.start / chunk_length)
+        n_chunks = int(slice_length / chunk_length)
+
+        chunk_slice = slice(index_of_first_chunk, index_of_first_chunk + n_chunks, 1)
+
+    chunk_grid_slice_dim_indexer = SliceDimIndexer(
+        dim_sel=chunk_slice,
+        # TODO which dim does this refer to? That of the chunk grid?
+        dim_len=...,
+        dim_chunk_len=...,
+    )
+
+    return chunk_grid_slice_dim_indexer
+
+
+def indexer_to_selection(indexer: BasicIndexer) -> BasicSelection:
+    """Translate an indexer into a selection that numpy can understand."""
+    selection = []
+    for dim_indexer in indexer.dim_indexers:
+        dim_selection: BasicSelector
+        if isinstance(dim_indexer, IntDimIndexer):
+            # avoid numpy returning a scalar value instead of np.ndarray
+            dim_selection = slice(
+                dim_indexer.dim_sel,
+                dim_indexer.dim_sel + 1,
+            )
+        elif isinstance(dim_indexer, SliceDimIndexer):
+            dim_selection = slice(
+                dim_indexer.start,
+                dim_indexer.stop,
+                dim_indexer.step,
+            )
+        else:
+            raise NotImplementedError
+
+        selection.append(dim_selection)
+
+    return tuple(selection)
diff --git a/virtualizarr/manifests/utils.py b/virtualizarr/manifests/utils.py
index b4dbfb175..96eec2019 100644
--- a/virtualizarr/manifests/utils.py
+++ b/virtualizarr/manifests/utils.py
@@ -187,3 +187,29 @@ def copy_and_replace_metadata(
     # ArrayV3Metadata.from_dict removes extra keys zarr_format and node_type
     new_metadata = ArrayV3Metadata.from_dict(metadata_copy)
     return new_metadata
+
+
+def ceildiv(a: int, b: int) -> int:
+    """
+    Ceiling division operator for integers.
+
+    See https://stackoverflow.com/questions/14822184/is-there-a-ceiling-equivalent-of-operator-in-python
+    """
+    return -(a // -b)
+
+
+def determine_chunk_grid_shape(
+    shape: tuple[int, ...], chunks: tuple[int, ...]
+) -> tuple[int, ...]: + """Calculate the shape of the chunk grid based on array shape and chunk size.""" + return tuple(ceildiv(length, chunksize) for length, chunksize in zip(shape, chunks)) + + +def determine_array_shape( + chunk_grid_shape: tuple[int, ...], + chunk_shape: tuple[int, ...], +) -> tuple[int, ...]: + return tuple( + dim_chunk_grid * dim_chunk_len + for dim_chunk_grid, dim_chunk_len in zip(chunk_grid_shape, chunk_shape) + ) diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index befc86510..4dcefaf27 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -2,6 +2,7 @@ import pytest from zarr.core.metadata.v3 import ArrayV3Metadata +import virtualizarr.manifests.utils as utils from conftest import ( ARRAYBYTES_CODEC, ZLIB_CODEC, @@ -366,3 +367,68 @@ def test_refuse_combine(array_v3_metadata): for func in [np.concatenate, np.stack]: with pytest.raises(ValueError, match="inconsistent dtypes"): func([marr1, marr2], axis=0) + + +class TestIndexing: + # TODO parametrize over a bunch of valid options here + @pytest.mark.parametrize( + "in_shape, in_chunks, selection, out_shape, out_chunks", + [ + # single-dim integer selection + ((2,), (1,), 0, (1,), (1,)), + ((2,), (1,), 1, (1,), (1,)), + # multi-dim integer selection + ((2, 2), (1, 1), (0, 0), (1, 1), (1, 1)), + # drop axes + ((2, 2), (1, 2), (0,), (2,), (2,)), + ((2, 2), (1, 2), (1,), (2,), (2,)), + # single-dim slicing selection + ((2,), (1,), ..., (2,), (1,)), + # TODO: drop axes + ((2,), (1,), slice(0, 1), (1,), (1,)), + ((2,), (1,), slice(1, 2), (1,), (1,)), + ((2,), (1,), slice(0, 2), (2,), (1,)), + ((8,), (2,), slice(0, 4), (4,), (2,)), + ((2,), (2,), slice(2, 4), (2,), (2,)), + # TODO: multi-dim slicing selection + # TODO: mixed integer and slicing selection + ], + ) + def test_slice_aligned_with_chunks( + self, manifest_array, in_shape, in_chunks, selection, out_shape, out_chunks + ): + marr = manifest_array(shape=in_shape, chunks=in_chunks) + + subarr = marr[selection] + + assert isinstance(subarr, ManifestArray) + assert subarr.shape == out_shape + assert subarr.chunks == out_chunks + + # all metadata should be the same except for shape (and possibly chunks?) + expected_metadata = utils.copy_and_replace_metadata( + marr.metadata, + new_shape=out_shape, + ) + # expected_metadata.chunks = out_chunks + assert subarr.metadata == expected_metadata + + def test_slice_concat_roundtrip(self): ... 
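+    # One possible sketch for the roundtrip test stub above (not yet implemented;
+    # uses only the fixture and behaviour exercised elsewhere in this file):
+    #     marr = manifest_array(shape=(4,), chunks=(2,))
+    #     roundtripped = np.concatenate([marr[0:2], marr[2:4]], axis=0)
+    #     assert roundtripped.shape == marr.shape
+    #     assert roundtripped.metadata == marr.metadata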
+ + def test_slice_misaligned_with_chunks(self, manifest_array): + marr = manifest_array(shape=(4,), chunks=(2,)) + + with pytest.raises( + NotImplementedError, match="slice would split individual chunks" + ): + marr[0] + + with pytest.raises( + NotImplementedError, match="slice would split individual chunks" + ): + marr[0:1] + + with pytest.raises( + NotImplementedError, match="slice would split individual chunks" + ): + marr[0:3] diff --git a/virtualizarr/translators/kerchunk.py b/virtualizarr/translators/kerchunk.py index 3e1c7108c..7d859667b 100644 --- a/virtualizarr/translators/kerchunk.py +++ b/virtualizarr/translators/kerchunk.py @@ -1,25 +1,23 @@ from typing import Any, Mapping, MutableMapping, cast import numpy as np -from xarray import Dataset +from xarray import Dataset, Variable from xarray.core.indexes import Index -from xarray.core.variable import Variable from zarr.core.common import JSON from zarr.core.metadata import ArrayV3Metadata from zarr.core.metadata.v2 import ArrayV2Metadata +import virtualizarr.manifests.utils as utils from virtualizarr.codecs import ( numcodec_config_to_configurable, ) from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.manifests.manifest import ChunkEntry, ChunkKey -from virtualizarr.manifests.utils import create_v3_array_metadata from virtualizarr.readers.common import separate_coords from virtualizarr.types.kerchunk import ( KerchunkArrRefs, KerchunkStoreRefs, ) -from virtualizarr.utils import determine_chunk_grid_shape def to_kerchunk_json(v2_metadata: ArrayV2Metadata) -> str: @@ -84,7 +82,7 @@ def from_kerchunk_refs(decoded_arr_refs_zarray) -> "ArrayV3Metadata": numcodec_configs = [ numcodec_config_to_configurable(config) for config in codec_configs ] - return create_v3_array_metadata( + return utils.create_v3_array_metadata( chunk_shape=tuple(decoded_arr_refs_zarray["chunks"]), data_type=dtype, codecs=numcodec_configs, @@ -236,7 +234,7 @@ def variable_from_kerchunk_refs( # empty variables don't have physical chunks, but zarray shows that the variable # is at least 1D - shape = determine_chunk_grid_shape( + shape = utils.determine_chunk_grid_shape( metadata.shape, metadata.chunks, ) diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index d36f0826e..540c2c076 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -107,22 +107,6 @@ def soft_import(name: str, reason: str, strict: Optional[bool] = True): return None -def ceildiv(a: int, b: int) -> int: - """ - Ceiling division operator for integers. - - See https://stackoverflow.com/questions/14822184/is-there-a-ceiling-equivalent-of-operator-in-python - """ - return -(a // -b) - - -def determine_chunk_grid_shape( - shape: tuple[int, ...], chunks: tuple[int, ...] -) -> tuple[int, ...]: - """Calculate the shape of the chunk grid based on array shape and chunk size.""" - return tuple(ceildiv(length, chunksize) for length, chunksize in zip(shape, chunks)) - - def convert_v3_to_v2_metadata( v3_metadata: ArrayV3Metadata, fill_value: Any = None ) -> ArrayV2Metadata:
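
A minimal usage sketch of the chunk-boundary-aligned indexing added above, for context only. The values mirror the parametrized cases in TestIndexing, and manifest_array refers to the fixture defined in conftest.py; this illustrates intended behaviour and is not part of the patch:

    marr = manifest_array(shape=(8,), chunks=(2,))  # chunk grid of 4 chunks, each of length 2

    # Slices that line up exactly with chunk boundaries return a smaller ManifestArray
    # whose manifest references only the selected chunks.
    sub = marr[0:4]
    assert sub.shape == (4,)
    assert sub.chunks == (2,)

    # Selections that would require splitting a compressed chunk cannot be represented
    # and are expected to raise ("slice would split individual chunks").
    marr[0:3]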