Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ dev = [
"s3fs",
"lithops",
"dask",
"tifffile>=2026.2.16,<2027"
]

[project.urls]
Expand Down Expand Up @@ -176,6 +177,11 @@ h5netcdf = ">=1.5.0,<2"
[tool.pixi.feature.icechunk-dev.dependencies]
rust = "*"

[tool.pixi.feature.rectilinear.pypi-dependencies]
zarr = { git = "https://github.com/jhamman/zarr-python.git", branch = "feature/rectilinear-chunk-grid" }
xarray = { git = "https://github.com/maxrjones/xarray.git", branch = "variable-chunking" }
virtual-tiff = { git = "https://github.com/virtual-zarr/virtual-tiff.git", branch = "rectilinear-chunks" }

[tool.pixi.feature.minimum-versions.dependencies]
xarray = "==2025.6.0"
numpy = "==2.0.0"
Expand Down Expand Up @@ -206,6 +212,7 @@ test-py312 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "ke
minio = ["dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "tiff", "py312", "minio"]
minimum-versions = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "tiff", "hdf5-lib", "minimum-versions"]
upstream = ["dev", "test", "hdf", "hdf5-lib", "netcdf3", "upstream", "icechunk-dev", "py313"]
rectilinear = ["dev", "test", "tiff", "hdf5-lib", "rectilinear", "py313"]
all = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "all_parsers", "all_writers", "py313"]
docs = ["docs", "dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py313"]

Expand Down
22 changes: 15 additions & 7 deletions virtualizarr/manifests/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,13 @@

import numpy as np
import xarray as xr
from zarr.core.metadata.v3 import ArrayV3Metadata, RegularChunkGrid
from zarr.core.chunk_grids import RegularChunkGrid
from zarr.core.metadata.v3 import ArrayV3Metadata

has_rectilinear_chunk_grid_support = hasattr(
__import__("zarr.core.chunk_grids", fromlist=["RectilinearChunkGrid"]),
"RectilinearChunkGrid",
)

import virtualizarr.manifests.utils as utils
from virtualizarr.manifests.array_api import (
Expand Down Expand Up @@ -49,11 +55,6 @@ def __init__(
# try unpacking the dict
_metadata = ArrayV3Metadata(**metadata)

if not isinstance(_metadata.chunk_grid, RegularChunkGrid):
raise NotImplementedError(
f"Only RegularChunkGrid is currently supported for chunk size, but got type {type(_metadata.chunk_grid)}"
)

if isinstance(chunkmanifest, ChunkManifest):
_chunkmanifest = chunkmanifest
elif isinstance(chunkmanifest, dict):
Expand All @@ -78,10 +79,17 @@ def metadata(self) -> ArrayV3Metadata:
return self._metadata

@property
def chunks(self) -> tuple[int, ...]:
def chunks(self) -> tuple[int, ...] | tuple[tuple[int, ...], ...]:
"""
Individual chunk size by number of elements.

For RegularChunkGrid, returns a tuple of ints (e.g., (30, 50)).
For RectilinearChunkGrid, returns a tuple of tuples (e.g., ((30, 30, 30, 10), (50,))).
"""
if has_rectilinear_chunk_grid_support and not isinstance(
self._metadata.chunk_grid, RegularChunkGrid
):
return self._metadata.chunk_grid.chunk_shapes # type: ignore[attr-defined]
return self._metadata.chunks

@property
Expand Down
10 changes: 8 additions & 2 deletions virtualizarr/manifests/array_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np

from virtualizarr.utils import determine_chunk_grid_shape
from virtualizarr.vendor.zarr.core.chunk_grids import _is_nested_sequence

from .manifest import ChunkManifest
from .utils import (
Expand Down Expand Up @@ -188,7 +189,11 @@ def stack(
# chunk shape has changed because a length-1 axis has been inserted
old_chunks = first_arr.chunks
new_chunks = list(old_chunks)
new_chunks.insert(axis, 1)
# For rectilinear grids, each element is a sequence; insert a single-element tuple
if _is_nested_sequence(old_chunks):
new_chunks.insert(axis, (1,))
else:
new_chunks.insert(axis, 1)

new_metadata = copy_and_replace_metadata(
old_metadata=first_arr.metadata, new_shape=new_shape, new_chunks=new_chunks
Expand Down Expand Up @@ -224,7 +229,8 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra

# new chunk_shape is old chunk_shape with singleton dimensions prepended
# (chunk shape can never change by more than adding length-1 axes because each chunk represents a fixed number of array elements)
old_chunk_shape = x.chunks
# broadcast_to only applies to regular chunk grids
old_chunk_shape: tuple[int, ...] = x.chunks # type: ignore[assignment]
new_chunk_shape = _prepend_singleton_dimensions(
old_chunk_shape, ndim=len(new_shape)
)
Expand Down
35 changes: 26 additions & 9 deletions virtualizarr/manifests/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
from collections.abc import Sequence
from typing import TYPE_CHECKING, Any, Dict, Iterable, Literal, Optional, Union

import numpy as np
Expand All @@ -12,6 +13,7 @@
from zarr.dtype import parse_data_type

from virtualizarr.codecs import convert_to_codec_pipeline, get_codecs
from virtualizarr.vendor.zarr.core.chunk_grids import _is_nested_sequence

if TYPE_CHECKING:
from .array import ManifestArray
Expand Down Expand Up @@ -177,7 +179,7 @@ def check_same_codecs(codecs: list[Any]) -> None:
)


def check_same_chunk_shapes(chunks_list: list[tuple[int, ...]]) -> None:
def check_same_chunk_shapes(chunks_list: list[Sequence]) -> None:
"""Check all the chunk shapes are the same"""

first_chunks, *other_chunks_list = chunks_list
Expand Down Expand Up @@ -214,11 +216,17 @@ def _remove_element_at_position(t: tuple[int, ...], pos: int) -> tuple[int, ...]


def check_no_partial_chunks_on_concat_axis(
shapes: list[tuple[int, ...]], chunks: list[tuple[int, ...]], axis: int
shapes: list[tuple[int, ...]], chunks: list, axis: int
):
"""Check that there are no partial chunks along the concatenation axis"""
# loop over the arrays to be concatenated
"""Check that there are no partial chunks along the concatenation axis.

Only applies to regular chunk grids; rectilinear grids explicitly encode
variable chunk sizes so partial-chunk checks are not needed.
"""
for i, (shape, chunk_shape) in enumerate(zip(shapes, chunks)):
# Rectilinear grids have sequences along each axis; skip the check
if _is_nested_sequence(chunk_shape):
continue
if shape[axis] % chunk_shape[axis] > 0:
raise ValueError(
"Cannot concatenate arrays with partial chunks because only regular chunk grids are currently supported. "
Expand Down Expand Up @@ -271,7 +279,7 @@ def check_compatible_arrays(
def copy_and_replace_metadata(
old_metadata: ArrayV3Metadata,
new_shape: list[int] | None = None,
new_chunks: list[int] | None = None,
new_chunks: list | None = None,
new_dimension_names: Iterable[str] | None | Literal["default"] = "default",
new_attributes: dict | None = None,
) -> ArrayV3Metadata:
Expand All @@ -285,10 +293,19 @@ def copy_and_replace_metadata(
if new_shape is not None:
metadata_copy["shape"] = parse_shapelike(new_shape) # type: ignore[assignment]
if new_chunks is not None:
metadata_copy["chunk_grid"] = {
"name": "regular",
"configuration": {"chunk_shape": tuple(new_chunks)},
}
if _is_nested_sequence(new_chunks):
metadata_copy["chunk_grid"] = {
"name": "rectilinear",
"configuration": {
"chunk_shapes": [list(c) for c in new_chunks],
"kind": "inline",
},
}
else:
metadata_copy["chunk_grid"] = {
"name": "regular",
"configuration": {"chunk_shape": tuple(new_chunks)},
}
if new_dimension_names != "default":
# need the option to use the literal string "default" as a sentinel value because None is a valid choice for zarr dimension_names
metadata_copy["dimension_names"] = parse_dimension_names(new_dimension_names)
Expand Down
68 changes: 66 additions & 2 deletions virtualizarr/tests/test_parsers/test_tiff.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import numpy as np
import pytest
import xarray as xr
from obspec_utils.registry import ObjectStoreRegistry
from obstore.store import S3Store
from obstore.store import LocalStore, S3Store
from xarray import Dataset, DataTree

from virtualizarr import open_virtual_dataset, open_virtual_datatree
from virtualizarr.tests import requires_network, requires_tiff
from virtualizarr.manifests.array import has_rectilinear_chunk_grid_support
from virtualizarr.tests import requires_network, requires_tiff, requires_tifffile

virtual_tiff = pytest.importorskip("virtual_tiff")

Expand Down Expand Up @@ -40,3 +43,64 @@ def test_virtual_tiff_dataset() -> None:
var = vds["0"].variable
assert var.sizes == {"y": 10980, "x": 10980}
assert var.dtype == "<u2"


@requires_tiff
@requires_tifffile
@pytest.mark.skipif(
not has_rectilinear_chunk_grid_support,
reason="requires zarr with RectilinearChunkGrid support",
)
def test_concat_rectilinear_tiff_datasets(tmp_path) -> None:
"""Test concatenating two virtual TIFF datasets with rectilinear chunk grids.

Creates stripped TIFFs where image_height is not evenly divisible by rows_per_strip,
producing rectilinear chunks, then verifies they can be concatenated.
"""
import tifffile

# Create two stripped TIFFs where image_height (100) is not evenly divisible
# by rows_per_strip (30), creating rectilinear chunks: [[30, 30, 30, 10], [50]]
shape = (100, 50)
rows_per_strip = 30

filepath1 = tmp_path / "test1.tif"
filepath2 = tmp_path / "test2.tif"

tifffile.imwrite(
str(filepath1), np.ones(shape, dtype=np.uint8), rowsperstrip=rows_per_strip
)
tifffile.imwrite(
str(filepath2), np.ones(shape, dtype=np.uint8) * 2, rowsperstrip=rows_per_strip
)

parser = virtual_tiff.VirtualTIFF(ifd=0)
registry = ObjectStoreRegistry({"file://": LocalStore()})

with (
open_virtual_dataset(
url=f"file://{filepath1}", parser=parser, registry=registry
) as vds1,
open_virtual_dataset(
url=f"file://{filepath2}", parser=parser, registry=registry
) as vds2,
):
# Verify both datasets have the expected shape
assert vds1["0"].sizes == {"y": 100, "x": 50}
assert vds2["0"].sizes == {"y": 100, "x": 50}

# Verify both datasets have rectilinear (non-regular) chunk grids
from zarr.core.chunk_grids import RegularChunkGrid

assert not isinstance(
vds1["0"].variable.data.metadata.chunk_grid, RegularChunkGrid
)
assert not isinstance(
vds2["0"].variable.data.metadata.chunk_grid, RegularChunkGrid
)

# Concatenate along a new dimension
combined = xr.concat([vds1, vds2], dim="time")

assert isinstance(combined, Dataset)
assert combined["0"].sizes == {"time": 2, "y": 100, "x": 50}
35 changes: 35 additions & 0 deletions virtualizarr/vendor/zarr/core/chunk_grids.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""
Vendored utilities from zarr-python for chunk grid handling.

See https://github.com/zarr-developers/zarr-python/pull/3534
"""

from typing import Any

from zarr.core.chunk_grids import ChunkGrid


def _is_nested_sequence(chunks: Any) -> bool:
"""
Check if chunks is a nested sequence (tuple of tuples/lists).

Returns True for inputs like [[10, 20], [5, 5]] or [(10, 20), (5, 5)].
Returns False for flat sequences like (10, 10) or [10, 10].

Vendored from https://github.com/zarr-developers/zarr-python/pull/3534
"""
if isinstance(chunks, str | int | ChunkGrid):
return False

if not hasattr(chunks, "__iter__"):
return False

try:
first_elem = next(iter(chunks), None)
if first_elem is None:
return False
return hasattr(first_elem, "__iter__") and not isinstance(
first_elem, str | bytes | int
)
except (TypeError, StopIteration):
return False
Loading