8 changes: 4 additions & 4 deletions conftest.py
@@ -12,11 +12,11 @@
import xarray as xr
from xarray.core.variable import Variable

import virtualizarr.manifests.utils as utils

# Local imports
from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.manifests.manifest import join
from virtualizarr.manifests.utils import create_v3_array_metadata
from virtualizarr.utils import ceildiv


# Pytest configuration
@@ -76,7 +76,7 @@ def _generate_chunk_entries(
Mapping of chunk keys to entry dictionaries
"""
chunk_grid_shape = tuple(
ceildiv(axis_length, chunk_length)
utils.ceildiv(axis_length, chunk_length)
for axis_length, chunk_length in zip(shape, chunks)
)

@@ -261,7 +261,7 @@ def _create_metadata(
fill_value: int | None = None,
):
codecs = codecs or [{"configuration": {"endian": "little"}, "name": "bytes"}]
return create_v3_array_metadata(
return utils.create_v3_array_metadata(
shape=shape,
chunk_shape=chunks,
data_type=data_type,
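For reference, the utils.ceildiv(axis_length, chunk_length) pattern above is the same per-axis ceiling division that determine_chunk_grid_shape performs in the virtualizarr/manifests/array_api.py changes further down. A minimal, self-contained sketch of that calculation (an illustration of what the helpers compute, not the library's actual implementation):

# Sketch only: number of chunks along each axis of an array.
def ceildiv(a: int, b: int) -> int:
    # ceiling division using integer arithmetic only, e.g. ceildiv(21, 10) == 3
    return -(a // -b)

def determine_chunk_grid_shape(
    shape: tuple[int, ...], chunks: tuple[int, ...]
) -> tuple[int, ...]:
    return tuple(
        ceildiv(axis_length, chunk_length)
        for axis_length, chunk_length in zip(shape, chunks)
    )

assert determine_chunk_grid_shape((5, 20), (5, 10)) == (1, 2)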
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -27,6 +27,7 @@ dependencies = [
"ujson",
"packaging",
"zarr>=3.0.2",
"icechunk>=0.2.5",
]

# Dependency sets under optional-dependencies are available via PyPI
@@ -95,7 +96,6 @@ upstream = [
'fsspec @ git+https://github.com/fsspec/filesystem_spec',
's3fs @ git+https://github.com/fsspec/s3fs',
'kerchunk @ git+https://github.com/fsspec/kerchunk',
'icechunk @ git+https://github.com/earth-mover/icechunk#subdirectory=icechunk-python',
]
docs = [
"sphinx",
101 changes: 71 additions & 30 deletions virtualizarr/manifests/array.py
@@ -1,9 +1,13 @@
import warnings
from types import EllipsisType
from typing import Any, Callable, Union

import numpy as np
from zarr.core.indexing import BasicIndexer
from zarr.core.metadata.v3 import ArrayV3Metadata, RegularChunkGrid

import virtualizarr.manifests.indexing as indexing
import virtualizarr.manifests.utils as utils
from virtualizarr.manifests.array_api import (
MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS,
_isnan,
@@ -205,36 +209,82 @@ def astype(self, dtype: np.dtype, /, *, copy: bool = True) -> "ManifestArray":

def __getitem__(
self,
key,
selection: Union[
int,
slice,
EllipsisType,
None,
tuple[Union[int, slice, EllipsisType, None], ...],
np.ndarray,
],
/,
) -> "ManifestArray":
"""
Only supports extremely limited indexing.
Slice this ManifestArray by indexing in array element space (as opposed to chunk grid space).

Only here because xarray will apparently attempt to index into its lazy indexing classes even if the operation would be a no-op anyway.
Only supports indexing where slices are aligned exactly with chunk boundaries.

Effectively, this means that the following indexing modes are supported:

- integer indexing
- slice indexing
- mixed slice and integer indexing

Follows the array API standard otherwise.
"""
from xarray.core.indexing import BasicIndexer
print(f"{selection=}")

# TODO validate the selection, and identify if the selection can't be represented as a BasicIndexer
# TODO will this expand trailing ellipses? (it should)
indexer = BasicIndexer(
selection,
self.shape,
self.metadata.chunk_grid,
)

if isinstance(key, BasicIndexer):
indexer = key.tuple
else:
indexer = key
# TODO is this where we would differ codepath for an uncompressed array?
chunk_grid_indexer = indexing.array_indexer_to_chunk_grid_indexer(
indexer=indexer,
arr_shape=self.shape,
chunk_shape=self.chunks,
)

indexer = _possibly_expand_trailing_ellipsis(key, self.ndim)
print(f"{chunk_grid_indexer=}")

if len(indexer) != self.ndim:
raise ValueError(
f"Invalid indexer for array with ndim={self.ndim}: {indexer}"
)
# TODO translate new chunk_grid_indexer BasicIndexer into normal Selection that numpy can understand
chunk_grid_selection = indexing.indexer_to_selection(chunk_grid_indexer)

if all(
isinstance(axis_indexer, slice) and axis_indexer == slice(None)
for axis_indexer in indexer
):
# indexer is all slice(None)'s, so this is a no-op
return self
else:
raise NotImplementedError(f"Doesn't support slicing with {indexer}")
print(f"{chunk_grid_selection=}")

# do slicing of entries in manifest
# TODO add ChunkManifest.__getitem__ for this?
# TODO or add some kind of dedicated method that can't be confused with the API of Mapping
sliced_paths = self.manifest._paths[chunk_grid_selection]
sliced_offsets = self.manifest._offsets[chunk_grid_selection]
sliced_lengths = self.manifest._lengths[chunk_grid_selection]
print(f"{sliced_paths=}")
sliced_manifest = ChunkManifest.from_arrays(
paths=sliced_paths,
offsets=sliced_offsets,
lengths=sliced_lengths,
)

print(f"{sliced_manifest=}")

new_arr_shape = utils.determine_array_shape(
chunk_grid_shape=sliced_manifest.shape_chunk_grid,
chunk_shape=self.chunks,
)

print(f"{new_arr_shape=}")

# chunk sizes are unchanged by slicing that aligns with chunk boundaries
new_metadata = utils.copy_and_replace_metadata(
self.metadata,
new_shape=new_arr_shape,
)

return ManifestArray(chunkmanifest=sliced_manifest, metadata=new_metadata)

def rename_paths(
self,
@@ -275,12 +325,3 @@ def rename_paths(
"""
renamed_manifest = self.manifest.rename_paths(new)
return ManifestArray(metadata=self.metadata, chunkmanifest=renamed_manifest)


def _possibly_expand_trailing_ellipsis(key, ndim: int):
if key[-1] == ...:
extra_slices_needed = ndim - (len(key) - 1)
*indexer, ellipsis = key
return tuple(tuple(indexer) + (slice(None),) * extra_slices_needed)
else:
return key
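A hedged usage sketch of the chunk-aligned __getitem__ added above. The file path, byte ranges, shape, dtype, and codec below are hypothetical, and the assertions describe the intended behavior of this draft (slicing exactly on a chunk boundary yields a ManifestArray referencing only the selected chunks) rather than a test taken from the PR:

import numpy as np

from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.manifests.utils import create_v3_array_metadata

# a (5, 20) int32 array stored as two (5, 10) chunks of a hypothetical file
manifest = ChunkManifest(
    entries={
        "0.0": {"path": "s3://bucket/data.nc", "offset": 0, "length": 200},
        "0.1": {"path": "s3://bucket/data.nc", "offset": 200, "length": 200},
    }
)
metadata = create_v3_array_metadata(
    shape=(5, 20),
    chunk_shape=(5, 10),
    data_type=np.dtype("int32"),
    codecs=[{"configuration": {"endian": "little"}, "name": "bytes"}],
)
marr = ManifestArray(chunkmanifest=manifest, metadata=metadata)

# slice aligned exactly with the chunk boundary at element 10 along axis 1
sub = marr[:, 10:20]
assert sub.shape == (5, 10)                      # element-space shape of the selection
assert sub.manifest.shape_chunk_grid == (1, 1)   # only the second chunk is referenced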
32 changes: 12 additions & 20 deletions virtualizarr/manifests/array_api.py
@@ -2,16 +2,8 @@

import numpy as np

from virtualizarr.utils import determine_chunk_grid_shape

from .manifest import ChunkManifest
from .utils import (
check_combinable_zarr_arrays,
check_same_ndims,
check_same_shapes,
check_same_shapes_except_on_concat_axis,
copy_and_replace_metadata,
)
import virtualizarr.manifests.utils as utils
from virtualizarr.manifests.manifest import ChunkManifest

if TYPE_CHECKING:
from .array import ManifestArray
@@ -65,17 +57,17 @@ def concatenate(
raise TypeError()

# ensure dtypes, shapes, codecs etc. are consistent
check_combinable_zarr_arrays(arrays)
utils.check_combinable_zarr_arrays(arrays)

check_same_ndims([arr.ndim for arr in arrays])
utils.check_same_ndims([arr.ndim for arr in arrays])

# Ensure we handle axis being passed as a negative integer
first_arr = arrays[0]
if axis < 0:
axis = axis % first_arr.ndim

arr_shapes = [arr.shape for arr in arrays]
check_same_shapes_except_on_concat_axis(arr_shapes, axis)
utils.check_same_shapes_except_on_concat_axis(arr_shapes, axis)

# find what new array shape must be
new_length_along_concat_axis = sum([shape[axis] for shape in arr_shapes])
@@ -102,7 +94,7 @@ def concatenate(
lengths=concatenated_lengths,
)

new_metadata = copy_and_replace_metadata(
new_metadata = utils.copy_and_replace_metadata(
old_metadata=first_arr.metadata, new_shape=new_shape
)

@@ -128,11 +120,11 @@ def stack(
raise TypeError()

# ensure dtypes, shapes, codecs etc. are consistent
check_combinable_zarr_arrays(arrays)
utils.check_combinable_zarr_arrays(arrays)

check_same_ndims([arr.ndim for arr in arrays])
utils.check_same_ndims([arr.ndim for arr in arrays])
arr_shapes = [arr.shape for arr in arrays]
check_same_shapes(arr_shapes)
utils.check_same_shapes(arr_shapes)

# Ensure we handle axis being passed as a negative integer
first_arr = arrays[0]
@@ -172,7 +164,7 @@ def stack(
new_chunks = list(old_chunks)
new_chunks.insert(axis, 1)

new_metadata = copy_and_replace_metadata(
new_metadata = utils.copy_and_replace_metadata(
old_metadata=first_arr.metadata, new_shape=new_shape, new_chunks=new_chunks
)

@@ -212,7 +204,7 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra
)

# find new chunk grid shape by dividing new array shape by new chunk shape
new_chunk_grid_shape = determine_chunk_grid_shape(new_shape, new_chunk_shape)
new_chunk_grid_shape = utils.determine_chunk_grid_shape(new_shape, new_chunk_shape)

# do broadcasting of entries in manifest
broadcasted_paths = cast( # `np.broadcast_to` apparently is type hinted as if the output could have Any dtype
@@ -237,7 +229,7 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra
lengths=broadcasted_lengths,
)

new_metadata = copy_and_replace_metadata(
new_metadata = utils.copy_and_replace_metadata(
old_metadata=x.metadata,
new_shape=list(new_shape),
new_chunks=list(new_chunk_shape),