diff --git a/pyproject.toml b/pyproject.toml index 66b6ce672..dd3e6d841 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -128,6 +128,7 @@ dev = [ "s3fs", "lithops", "dask", + "tifffile>=2026.2.16,<2027" ] [project.urls] @@ -176,6 +177,11 @@ h5netcdf = ">=1.5.0,<2" [tool.pixi.feature.icechunk-dev.dependencies] rust = "*" +[tool.pixi.feature.rectilinear.pypi-dependencies] +zarr = { git = "https://github.com/jhamman/zarr-python.git", branch = "feature/rectilinear-chunk-grid" } +xarray = { git = "https://github.com/maxrjones/xarray.git", branch = "variable-chunking" } +virtual-tiff = { git = "https://github.com/virtual-zarr/virtual-tiff.git", branch = "rectilinear-chunks" } + [tool.pixi.feature.minimum-versions.dependencies] xarray = "==2025.6.0" numpy = "==2.0.0" @@ -206,6 +212,7 @@ test-py312 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "ke minio = ["dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "tiff", "py312", "minio"] minimum-versions = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "tiff", "hdf5-lib", "minimum-versions"] upstream = ["dev", "test", "hdf", "hdf5-lib", "netcdf3", "upstream", "icechunk-dev", "py313"] +rectilinear = ["dev", "test", "tiff", "hdf5-lib", "rectilinear", "py313"] all = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "all_parsers", "all_writers", "py313"] docs = ["docs", "dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py313"] diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 967ea986d..a902f8883 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -3,7 +3,13 @@ import numpy as np import xarray as xr -from zarr.core.metadata.v3 import ArrayV3Metadata, RegularChunkGrid +from zarr.core.chunk_grids import RegularChunkGrid +from zarr.core.metadata.v3 import ArrayV3Metadata + +has_rectilinear_chunk_grid_support = hasattr( + __import__("zarr.core.chunk_grids", fromlist=["RectilinearChunkGrid"]), + "RectilinearChunkGrid", +) import virtualizarr.manifests.utils as utils from virtualizarr.manifests.array_api import ( @@ -49,11 +55,6 @@ def __init__( # try unpacking the dict _metadata = ArrayV3Metadata(**metadata) - if not isinstance(_metadata.chunk_grid, RegularChunkGrid): - raise NotImplementedError( - f"Only RegularChunkGrid is currently supported for chunk size, but got type {type(_metadata.chunk_grid)}" - ) - if isinstance(chunkmanifest, ChunkManifest): _chunkmanifest = chunkmanifest elif isinstance(chunkmanifest, dict): @@ -78,10 +79,17 @@ def metadata(self) -> ArrayV3Metadata: return self._metadata @property - def chunks(self) -> tuple[int, ...]: + def chunks(self) -> tuple[int, ...] | tuple[tuple[int, ...], ...]: """ Individual chunk size by number of elements. + + For RegularChunkGrid, returns a tuple of ints (e.g., (30, 50)). + For RectilinearChunkGrid, returns a tuple of tuples (e.g., ((30, 30, 30, 10), (50,))). """ + if has_rectilinear_chunk_grid_support and not isinstance( + self._metadata.chunk_grid, RegularChunkGrid + ): + return self._metadata.chunk_grid.chunk_shapes # type: ignore[attr-defined] return self._metadata.chunks @property diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 238876d43..b19eac763 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -3,6 +3,7 @@ import numpy as np from virtualizarr.utils import determine_chunk_grid_shape +from virtualizarr.vendor.zarr.core.chunk_grids import _is_nested_sequence from .manifest import ChunkManifest from .utils import ( @@ -188,7 +189,11 @@ def stack( # chunk shape has changed because a length-1 axis has been inserted old_chunks = first_arr.chunks new_chunks = list(old_chunks) - new_chunks.insert(axis, 1) + # For rectilinear grids, each element is a sequence; insert a single-element tuple + if _is_nested_sequence(old_chunks): + new_chunks.insert(axis, (1,)) + else: + new_chunks.insert(axis, 1) new_metadata = copy_and_replace_metadata( old_metadata=first_arr.metadata, new_shape=new_shape, new_chunks=new_chunks @@ -224,7 +229,8 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra # new chunk_shape is old chunk_shape with singleton dimensions prepended # (chunk shape can never change by more than adding length-1 axes because each chunk represents a fixed number of array elements) - old_chunk_shape = x.chunks + # broadcast_to only applies to regular chunk grids + old_chunk_shape: tuple[int, ...] = x.chunks # type: ignore[assignment] new_chunk_shape = _prepend_singleton_dimensions( old_chunk_shape, ndim=len(new_shape) ) diff --git a/virtualizarr/manifests/utils.py b/virtualizarr/manifests/utils.py index 1ea9cddcd..2881f1c4e 100644 --- a/virtualizarr/manifests/utils.py +++ b/virtualizarr/manifests/utils.py @@ -1,4 +1,5 @@ import re +from collections.abc import Sequence from typing import TYPE_CHECKING, Any, Dict, Iterable, Literal, Optional, Union import numpy as np @@ -12,6 +13,7 @@ from zarr.dtype import parse_data_type from virtualizarr.codecs import convert_to_codec_pipeline, get_codecs +from virtualizarr.vendor.zarr.core.chunk_grids import _is_nested_sequence if TYPE_CHECKING: from .array import ManifestArray @@ -177,7 +179,7 @@ def check_same_codecs(codecs: list[Any]) -> None: ) -def check_same_chunk_shapes(chunks_list: list[tuple[int, ...]]) -> None: +def check_same_chunk_shapes(chunks_list: list[Sequence]) -> None: """Check all the chunk shapes are the same""" first_chunks, *other_chunks_list = chunks_list @@ -214,11 +216,17 @@ def _remove_element_at_position(t: tuple[int, ...], pos: int) -> tuple[int, ...] def check_no_partial_chunks_on_concat_axis( - shapes: list[tuple[int, ...]], chunks: list[tuple[int, ...]], axis: int + shapes: list[tuple[int, ...]], chunks: list, axis: int ): - """Check that there are no partial chunks along the concatenation axis""" - # loop over the arrays to be concatenated + """Check that there are no partial chunks along the concatenation axis. + + Only applies to regular chunk grids; rectilinear grids explicitly encode + variable chunk sizes so partial-chunk checks are not needed. + """ for i, (shape, chunk_shape) in enumerate(zip(shapes, chunks)): + # Rectilinear grids have sequences along each axis; skip the check + if _is_nested_sequence(chunk_shape): + continue if shape[axis] % chunk_shape[axis] > 0: raise ValueError( "Cannot concatenate arrays with partial chunks because only regular chunk grids are currently supported. " @@ -271,7 +279,7 @@ def check_compatible_arrays( def copy_and_replace_metadata( old_metadata: ArrayV3Metadata, new_shape: list[int] | None = None, - new_chunks: list[int] | None = None, + new_chunks: list | None = None, new_dimension_names: Iterable[str] | None | Literal["default"] = "default", new_attributes: dict | None = None, ) -> ArrayV3Metadata: @@ -285,10 +293,19 @@ def copy_and_replace_metadata( if new_shape is not None: metadata_copy["shape"] = parse_shapelike(new_shape) # type: ignore[assignment] if new_chunks is not None: - metadata_copy["chunk_grid"] = { - "name": "regular", - "configuration": {"chunk_shape": tuple(new_chunks)}, - } + if _is_nested_sequence(new_chunks): + metadata_copy["chunk_grid"] = { + "name": "rectilinear", + "configuration": { + "chunk_shapes": [list(c) for c in new_chunks], + "kind": "inline", + }, + } + else: + metadata_copy["chunk_grid"] = { + "name": "regular", + "configuration": {"chunk_shape": tuple(new_chunks)}, + } if new_dimension_names != "default": # need the option to use the literal string "default" as a sentinel value because None is a valid choice for zarr dimension_names metadata_copy["dimension_names"] = parse_dimension_names(new_dimension_names) diff --git a/virtualizarr/tests/test_parsers/test_tiff.py b/virtualizarr/tests/test_parsers/test_tiff.py index cf0e110c6..e0fb3a776 100644 --- a/virtualizarr/tests/test_parsers/test_tiff.py +++ b/virtualizarr/tests/test_parsers/test_tiff.py @@ -1,10 +1,13 @@ +import numpy as np import pytest +import xarray as xr from obspec_utils.registry import ObjectStoreRegistry -from obstore.store import S3Store +from obstore.store import LocalStore, S3Store from xarray import Dataset, DataTree from virtualizarr import open_virtual_dataset, open_virtual_datatree -from virtualizarr.tests import requires_network, requires_tiff +from virtualizarr.manifests.array import has_rectilinear_chunk_grid_support +from virtualizarr.tests import requires_network, requires_tiff, requires_tifffile virtual_tiff = pytest.importorskip("virtual_tiff") @@ -40,3 +43,64 @@ def test_virtual_tiff_dataset() -> None: var = vds["0"].variable assert var.sizes == {"y": 10980, "x": 10980} assert var.dtype == " None: + """Test concatenating two virtual TIFF datasets with rectilinear chunk grids. + + Creates stripped TIFFs where image_height is not evenly divisible by rows_per_strip, + producing rectilinear chunks, then verifies they can be concatenated. + """ + import tifffile + + # Create two stripped TIFFs where image_height (100) is not evenly divisible + # by rows_per_strip (30), creating rectilinear chunks: [[30, 30, 30, 10], [50]] + shape = (100, 50) + rows_per_strip = 30 + + filepath1 = tmp_path / "test1.tif" + filepath2 = tmp_path / "test2.tif" + + tifffile.imwrite( + str(filepath1), np.ones(shape, dtype=np.uint8), rowsperstrip=rows_per_strip + ) + tifffile.imwrite( + str(filepath2), np.ones(shape, dtype=np.uint8) * 2, rowsperstrip=rows_per_strip + ) + + parser = virtual_tiff.VirtualTIFF(ifd=0) + registry = ObjectStoreRegistry({"file://": LocalStore()}) + + with ( + open_virtual_dataset( + url=f"file://{filepath1}", parser=parser, registry=registry + ) as vds1, + open_virtual_dataset( + url=f"file://{filepath2}", parser=parser, registry=registry + ) as vds2, + ): + # Verify both datasets have the expected shape + assert vds1["0"].sizes == {"y": 100, "x": 50} + assert vds2["0"].sizes == {"y": 100, "x": 50} + + # Verify both datasets have rectilinear (non-regular) chunk grids + from zarr.core.chunk_grids import RegularChunkGrid + + assert not isinstance( + vds1["0"].variable.data.metadata.chunk_grid, RegularChunkGrid + ) + assert not isinstance( + vds2["0"].variable.data.metadata.chunk_grid, RegularChunkGrid + ) + + # Concatenate along a new dimension + combined = xr.concat([vds1, vds2], dim="time") + + assert isinstance(combined, Dataset) + assert combined["0"].sizes == {"time": 2, "y": 100, "x": 50} diff --git a/virtualizarr/vendor/zarr/core/chunk_grids.py b/virtualizarr/vendor/zarr/core/chunk_grids.py new file mode 100644 index 000000000..309ceb3ac --- /dev/null +++ b/virtualizarr/vendor/zarr/core/chunk_grids.py @@ -0,0 +1,35 @@ +""" +Vendored utilities from zarr-python for chunk grid handling. + +See https://github.com/zarr-developers/zarr-python/pull/3534 +""" + +from typing import Any + +from zarr.core.chunk_grids import ChunkGrid + + +def _is_nested_sequence(chunks: Any) -> bool: + """ + Check if chunks is a nested sequence (tuple of tuples/lists). + + Returns True for inputs like [[10, 20], [5, 5]] or [(10, 20), (5, 5)]. + Returns False for flat sequences like (10, 10) or [10, 10]. + + Vendored from https://github.com/zarr-developers/zarr-python/pull/3534 + """ + if isinstance(chunks, str | int | ChunkGrid): + return False + + if not hasattr(chunks, "__iter__"): + return False + + try: + first_elem = next(iter(chunks), None) + if first_elem is None: + return False + return hasattr(first_elem, "__iter__") and not isinstance( + first_elem, str | bytes | int + ) + except (TypeError, StopIteration): + return False