
Commit 48bb91e

Merge branch 'main' into pandas-3.0
2 parents 1768e7e + d013fe0

File tree

10 files changed, +136 -27 lines


docs/installation.md

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 # Installation
 
-`spatialdata` requires Python version >= 3.9 to run and the installation time requires a few minutes on a standard desktop computer.
+`spatialdata` requires Python (the minimum required version is specified in PyPI) to run and the installation time requires a few minutes on a standard desktop computer.
 
 ## PyPI
 

docs/tutorials/notebooks

Submodule notebooks updated 53 files

src/spatialdata/_core/query/spatial_query.py

Lines changed: 12 additions & 15 deletions
@@ -5,7 +5,6 @@
 from functools import singledispatch
 from typing import TYPE_CHECKING, Any
 
-import dask.array as da
 import dask.dataframe as dd
 import numpy as np
 from dask.dataframe import DataFrame as DaskDataFrame
@@ -385,7 +384,7 @@ def _bounding_box_mask_points(
     axes: tuple[str, ...],
     min_coordinate: list[Number] | ArrayLike,
     max_coordinate: list[Number] | ArrayLike,
-) -> da.Array:
+) -> list[ArrayLike]:
     """Compute a mask that is true for the points inside axis-aligned bounding boxes.
 
     Parameters
@@ -427,12 +426,9 @@ def _bounding_box_mask_points(
                 continue
             min_value = min_coordinate[box, axis_index]
             max_value = max_coordinate[box, axis_index]
-            box_masks.append(
-                points[axis_name].gt(min_value).to_dask_array(lengths=True)
-                & points[axis_name].lt(max_value).to_dask_array(lengths=True)
-            )
-        bounding_box_mask = da.stack(box_masks, axis=-1)
-        in_bounding_box_masks.append(da.all(bounding_box_mask, axis=1))
+            box_masks.append(points[axis_name].gt(min_value).compute() & points[axis_name].lt(max_value).compute())
+        bounding_box_mask = np.stack(box_masks, axis=-1)
+        in_bounding_box_masks.append(np.all(bounding_box_mask, axis=1))
     return in_bounding_box_masks
 
 
@@ -673,19 +669,20 @@ def _(
     )
 
     if not (len_df := len(in_intrinsic_bounding_box)) == (len_bb := len(min_coordinate)):
-        raise ValueError(f"Number of dataframes `{len_df}` is not equal to the number of bounding boxes `{len_bb}`.")
+        raise ValueError(
+            f"Length of list of dataframes `{len_df}` is not equal to the number of bounding boxes axes `{len_bb}`."
+        )
     points_in_intrinsic_bounding_box: list[DaskDataFrame | None] = []
     points_pd = points.compute()
     attrs = points.attrs.copy()
-    for mask in in_intrinsic_bounding_box:
-        if mask.sum() == 0:
+    for mask_np in in_intrinsic_bounding_box:
+        if mask_np.sum() == 0:
             points_in_intrinsic_bounding_box.append(None)
         else:
             # TODO there is a problem when mixing dask dataframe graph with dask array graph. Need to compute for now.
             # we can't compute either mask or points as when we calculate either one of them
             # test_query_points_multiple_partitions will fail as the mask will be used to index each partition.
             # However, if we compute and then create the dask array again we get the mixed dask graph problem.
-            mask_np = mask.compute()
             filtered_pd = points_pd[mask_np]
             points_filtered = dd.from_pandas(filtered_pd, npartitions=points.npartitions)
             points_filtered.attrs.update(attrs)
@@ -724,9 +721,9 @@ def _(
             min_coordinate=min_c,  # type: ignore[arg-type]
             max_coordinate=max_c,  # type: ignore[arg-type]
         )
-        if len(bounding_box_mask) == 1:
-            bounding_box_mask = bounding_box_mask[0]
-        bounding_box_indices = np.where(bounding_box_mask.compute())[0]
+        if len(bounding_box_mask) != 1:
+            raise ValueError(f"Expected a single mask, got {len(bounding_box_mask)} masks. Please report this bug.")
+        bounding_box_indices = np.where(bounding_box_mask[0])[0]
 
         if len(bounding_box_indices) == 0:
             output.append(None)
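
A minimal sketch (not library code) of the numpy-based masking strategy the hunks above switch to; the toy DataFrame and box coordinates are invented for illustration:

import dask.dataframe as dd
import numpy as np
import pandas as pd

points = dd.from_pandas(
    pd.DataFrame({"x": [0.0, 2.0, 5.0], "y": [1.0, 3.0, 9.0]}), npartitions=2
)
min_coordinate = np.array([[1.0, 0.0]])  # one bounding box, axes ("x", "y")
max_coordinate = np.array([[6.0, 4.0]])

box_masks = []
for axis_index, axis_name in enumerate(("x", "y")):
    # .compute() materializes each per-axis boolean Series as a pandas object,
    # sidestepping the mixed dask-dataframe/dask-array graph problem noted in
    # the TODO comment above
    box_masks.append(
        points[axis_name].gt(min_coordinate[0, axis_index]).compute()
        & points[axis_name].lt(max_coordinate[0, axis_index]).compute()
    )
mask_np = np.all(np.stack(box_masks, axis=-1), axis=1)
print(mask_np)  # [False  True False]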

src/spatialdata/_core/spatialdata.py

Lines changed: 5 additions & 2 deletions
@@ -16,6 +16,7 @@
 from dask.dataframe import Scalar, read_parquet
 from geopandas import GeoDataFrame
 from shapely import MultiPolygon, Polygon
+from upath import UPath
 from xarray import DataArray, DataTree
 from zarr.errors import GroupNotFoundError
 
@@ -1810,15 +1811,17 @@ def tables(self, tables: dict[str, AnnData]) -> None:
 
     @staticmethod
     def read(
-        file_path: Path | str, selection: tuple[str] | None = None, reconsolidate_metadata: bool = False
+        file_path: str | Path | UPath | zarr.Group,
+        selection: tuple[str] | None = None,
+        reconsolidate_metadata: bool = False,
     ) -> SpatialData:
         """
         Read a SpatialData object from a Zarr storage (on-disk or remote).
 
         Parameters
         ----------
         file_path
-            The path or URL to the Zarr storage.
+            The path, URL, or zarr.Group to the Zarr storage.
         selection
             The elements to read (images, labels, points, shapes, table). If None, all elements are read.
         reconsolidate_metadata
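
A short usage sketch of the widened signature ("data.zarr" is a placeholder path; the same calls are exercised by the new test in tests/io/test_readwrite.py below):

import zarr
from upath import UPath
from spatialdata import SpatialData

sdata = SpatialData.read("data.zarr")  # str
sdata = SpatialData.read(UPath("data.zarr"))  # UPath
sdata = SpatialData.read(zarr.open_group("data.zarr", mode="r"))  # zarr.Group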

src/spatialdata/_io/_utils.py

Lines changed: 2 additions & 2 deletions
@@ -470,8 +470,8 @@ def _resolve_zarr_store(
     if isinstance(path, zarr.Group):
         # if the input is a zarr.Group, wrap it with a store
         if isinstance(path.store, LocalStore):
-            # create a simple FSStore if the store is a LocalStore with just the path
-            return FsspecStore(os.path.join(path.store.path, path.path), **kwargs)
+            store_path = UPath(path.store.root) / path.path
+            return LocalStore(store_path.path)
         if isinstance(path.store, FsspecStore):
             # if the store within the zarr.Group is an FSStore, return it
             # but extend the path of the store with that of the zarr.Group
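
For context, a sketch of the path resolution the new branch performs, assuming zarr-python v3 where a local group's store is a LocalStore exposing its root directory as .root ("data.zarr" is a placeholder path):

import zarr
from upath import UPath
from zarr.storage import LocalStore

group = zarr.open_group("data.zarr", mode="r")
if isinstance(group.store, LocalStore):
    # join the store's root directory with the group's subpath, then
    # rewrap it as a LocalStore rooted at the group itself
    store_path = UPath(group.store.root) / group.path
    store = LocalStore(store_path.path)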

src/spatialdata/_io/io_zarr.py

Lines changed: 4 additions & 3 deletions
@@ -11,6 +11,7 @@
 from geopandas import GeoDataFrame
 from ome_zarr.format import Format
 from pyarrow import ArrowInvalid
+from upath import UPath
 from zarr.errors import ArrayNotFoundError
 
 from spatialdata._core.spatialdata import SpatialData
@@ -120,7 +121,7 @@ def get_raster_format_for_read(
 
 
 def read_zarr(
-    store: str | Path,
+    store: str | Path | UPath | zarr.Group,
     selection: None | tuple[str] = None,
     on_bad_files: Literal[BadFileHandleMethod.ERROR, BadFileHandleMethod.WARN] = BadFileHandleMethod.ERROR,
 ) -> SpatialData:
@@ -130,7 +131,7 @@ def read_zarr(
     Parameters
     ----------
     store
-        Path to the zarr store (on-disk or remote).
+        Path, URL, or zarr.Group to the zarr store (on-disk or remote).
 
     selection
         List of elements to read from the zarr store (images, labels, points, shapes, table). If None, all elements are
@@ -228,7 +229,7 @@ def read_zarr(
         tables=tables,
         attrs=attrs,
     )
-    sdata.path = Path(store)
+    sdata.path = resolved_store.root
     return sdata
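
read_zarr accepts the same widened input types; a usage sketch with a placeholder path (the top-level import matches the one used in the test file below):

import zarr
from spatialdata import read_zarr

sdata = read_zarr(zarr.open_group("data.zarr", mode="r"))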

Lines changed: 20 additions & 3 deletions
@@ -1,4 +1,21 @@
-try:
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+import spatialdata
+
+if TYPE_CHECKING:
     from spatialdata.dataloader.datasets import ImageTilesDataset
-except ImportError:
-    ImageTilesDataset = None  # type: ignore[assignment, misc]
+
+__all__ = [
+    "ImageTilesDataset",
+]
+
+
+def __getattr__(attr_name: str) -> ImageTilesDataset | Any:
+    if attr_name == "ImageTilesDataset":
+        from spatialdata.dataloader.datasets import ImageTilesDataset
+
+        return ImageTilesDataset
+
+    return getattr(spatialdata.dataloader, attr_name)
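
A generic sketch of the lazy-import pattern introduced above (a PEP 562 module-level __getattr__); mypackage.heavy and HeavyClass are hypothetical names:

from __future__ import annotations

from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # seen only by type checkers, never imported at runtime
    from mypackage.heavy import HeavyClass

__all__ = ["HeavyClass"]


def __getattr__(attr_name: str) -> Any:
    # runs only when normal attribute lookup on the module fails,
    # so the heavy import is deferred until first real access
    if attr_name == "HeavyClass":
        from mypackage.heavy import HeavyClass

        return HeavyClass
    raise AttributeError(f"module {__name__!r} has no attribute {attr_name!r}")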

src/spatialdata/models/models.py

Lines changed: 4 additions & 0 deletions
@@ -239,6 +239,10 @@ def parse(
                 chunks=chunks,
             )
             _parse_transformations(data, parsed_transform)
+        else:
+            # Chunk single scale images
+            if chunks is not None:
+                data = data.chunk(chunks=chunks)
         cls()._check_chunk_size_not_too_large(data)
         # recompute coordinates for (multiscale) spatial image
         return compute_coordinates(data)
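
A usage sketch of the new behavior: the chunks argument now also rechunks single-scale images (array shape and expected chunk size taken from the new test in tests/models/test_models.py below):

import numpy as np
from spatialdata.models import Image2DModel

arr = np.zeros((1, 10, 10))  # (c, y, x)
img = Image2DModel.parse(arr, chunks=(1, 5, 5))
assert img.data.chunksize == (1, 5, 5)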

tests/io/test_readwrite.py

Lines changed: 28 additions & 0 deletions
@@ -11,6 +11,7 @@
 import zarr
 from anndata import AnnData
 from numpy.random import default_rng
+from upath import UPath
 from zarr.errors import GroupNotFoundError
 
 from spatialdata import SpatialData, deepcopy, read_zarr
@@ -963,3 +964,30 @@ def test_can_read_sdata_with_reconsolidation(full_sdata, sdata_container_format:
 
     new_sdata = SpatialData.read(path, reconsolidate_metadata=True)
     assert_spatial_data_objects_are_identical(full_sdata, new_sdata)
+
+
+def test_read_sdata(tmp_path: Path, points: SpatialData) -> None:
+    sdata_path = tmp_path / "sdata.zarr"
+    points.write(sdata_path)
+
+    # path as Path
+    sdata_from_path = SpatialData.read(sdata_path)
+    assert sdata_from_path.path == sdata_path
+
+    # path as str
+    sdata_from_str = SpatialData.read(str(sdata_path))
+    assert sdata_from_str.path == sdata_path
+
+    # path as UPath
+    sdata_from_upath = SpatialData.read(UPath(sdata_path))
+    assert sdata_from_upath.path == sdata_path
+
+    # path as zarr Group
+    zarr_group = zarr.open_group(sdata_path, mode="r")
+    sdata_from_zarr_group = SpatialData.read(zarr_group)
+    assert sdata_from_zarr_group.path == sdata_path
+
+    # Assert all read methods produce identical SpatialData objects
+    assert_spatial_data_objects_are_identical(sdata_from_path, sdata_from_str)
+    assert_spatial_data_objects_are_identical(sdata_from_path, sdata_from_upath)
+    assert_spatial_data_objects_are_identical(sdata_from_path, sdata_from_zarr_group)

tests/models/test_models.py

Lines changed: 59 additions & 0 deletions
@@ -195,6 +195,65 @@ def test_raster_schema(
         with pytest.raises(ValueError):
             model.parse(image, **kwargs)
 
+    @pytest.mark.parametrize(
+        "model,chunks,expected",
+        [
+            (Labels2DModel, None, (10, 10)),
+            (Labels2DModel, 5, (5, 5)),
+            (Labels2DModel, (5, 5), (5, 5)),
+            (Labels2DModel, {"x": 5, "y": 5}, (5, 5)),
+            (Labels3DModel, None, (2, 10, 10)),
+            (Labels3DModel, 5, (2, 5, 5)),
+            (Labels3DModel, (2, 5, 5), (2, 5, 5)),
+            (Labels3DModel, {"z": 2, "x": 5, "y": 5}, (2, 5, 5)),
+            (Image2DModel, None, (1, 10, 10)),  # Image2D Models always have a c dimension
+            (Image2DModel, 5, (1, 5, 5)),
+            (Image2DModel, (1, 5, 5), (1, 5, 5)),
+            (Image2DModel, {"c": 1, "x": 5, "y": 5}, (1, 5, 5)),
+            (Image3DModel, None, (1, 2, 10, 10)),  # Image3D models have z in addition, so 4 total dimensions
+            (Image3DModel, 5, (1, 2, 5, 5)),
+            (Image3DModel, (1, 2, 5, 5), (1, 2, 5, 5)),
+            (
+                Image3DModel,
+                {"c": 1, "z": 2, "x": 5, "y": 5},
+                (1, 2, 5, 5),
+            ),
+        ],
+    )
+    def test_raster_models_parse_with_chunks_parameter(self, model, chunks, expected):
+        image: ArrayLike = np.arange(100).reshape((10, 10))
+        if model in [Labels3DModel, Image3DModel]:
+            image = np.stack([image] * 2)
+
+        if model in [Image2DModel, Image3DModel]:
+            image = np.expand_dims(image, axis=0)
+
+        # parse as numpy array
+        # single scale
+        x_ss = model.parse(image, chunks=chunks)
+        assert x_ss.data.chunksize == expected
+        # multi scale
+        x_ms = model.parse(image, chunks=chunks, scale_factors=(2,))
+        assert x_ms["scale0"]["image"].data.chunksize == expected
+
+        # parse as dask array
+        dask_image = from_array(image)
+        # single scale
+        y_ss = model.parse(dask_image, chunks=chunks)
+        assert y_ss.data.chunksize == expected
+        # multi scale
+        y_ms = model.parse(dask_image, chunks=chunks, scale_factors=(2,))
+        assert y_ms["scale0"]["image"].data.chunksize == expected
+
+        # parse as DataArray
+        data_array = DataArray(image, dims=model.dims.dims)
+        # single scale
+        z_ss = model.parse(data_array, chunks=chunks)
+        assert z_ss.data.chunksize == expected
+        # multi scale
+        z_ms = model.parse(data_array, chunks=chunks, scale_factors=(2,))
+        assert z_ms["scale0"]["image"].data.chunksize == expected
+
     @pytest.mark.parametrize("model", [Labels2DModel, Labels3DModel])
     def test_labels_model_with_multiscales(self, model):
         # Passing "scale_factors" should generate multiscales with a "method" appropriate for labels
