MannLabs · sophiamaedler · Nov 11, 2024 · Nov 3, 2025 · Nov 3, 2025 · Nov 6, 2025
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
@@ -7,6 +7,8 @@ anndata<0.12
 spatialdata>=0.3.0,<0.6
 pyarrow<22.0.0
 py-lmd>=1.3.1
+affine
+rasterio
 
 spatialdata-plot>=0.2.14
 matplotlib

diff --git a/src/scportrait/io/h5sc.py b/src/scportrait/io/h5sc.py
@@ -12,6 +12,7 @@
 
 from scportrait.pipeline._utils.constants import (
     DEFAULT_CELL_ID_NAME,
+    DEFAULT_IDENTIFIER_FILENAME,
     DEFAULT_NAME_SINGLE_CELL_IMAGES,
     DEFAULT_SEGMENTATION_DTYPE,
     DEFAULT_SINGLE_CELL_IMAGE_DTYPE,
@@ -55,6 +56,9 @@ def read_h5sc(filename: str | Path) -> AnnData:
         _clean_uns(adata)
 
     adata.obsm[DEFAULT_NAME_SINGLE_CELL_IMAGES] = f.get(IMAGE_DATACONTAINER_NAME)
+    adata.uns[DEFAULT_IDENTIFIER_FILENAME] = filename
+    adata.uns["_h5sc_file_handle"] = f
+
     return adata
 
 

diff --git a/src/scportrait/pipeline/_utils/constants.py b/src/scportrait/pipeline/_utils/constants.py
@@ -35,6 +35,7 @@
 IMAGE_DATACONTAINER_NAME = f"obsm/{DEFAULT_NAME_SINGLE_CELL_IMAGES}"
 DEFAULT_CELL_ID_NAME = "scportrait_cell_id"
 INDEX_DATACONTAINER_NAME = f"obs/{DEFAULT_CELL_ID_NAME}"
+DEFAULT_IDENTIFIER_FILENAME = "h5sc_source_path"
 
 DEFAULT_IMAGE_DTYPE: np.dtype = np.uint16
 DEFAULT_SEGMENTATION_DTYPE: np.dtype = np.uint64

diff --git a/src/scportrait/pipeline/_utils/spatialdata_helper.py b/src/scportrait/pipeline/_utils/spatialdata_helper.py
@@ -182,6 +182,7 @@ def calculate_centroids(mask: xarray.DataArray, coordinate_system: str = "global
     transform = get_transformation(mask, coordinate_system)
 
     if check_memory(mask):
+        print("Array fits in memory, using in-memory calculation.")
         centers, _, _ids = numba_mask_centroid(mask.values)
         return make_centers_object(centers, _ids, transform, coordinate_system)
 

diff --git a/src/scportrait/pipeline/mask_filtering/region_masking.py b/src/scportrait/pipeline/mask_filtering/region_masking.py
@@ -0,0 +1,102 @@
+import dask
+import numpy as np
+import rasterio
+import spatialdata as sd
+from napari_spatialdata import Interactive
+from rasterio.features import geometry_mask
+from scipy import ndimage
+from shapely import unary_union
+from shapely.geometry import Polygon, mapping
+from skimage.draw import disk
+from skimage.measure import find_contours
+from skimage.segmentation import watershed
+from spatialdata.models import Image2DModel
+
+
+def mask_image(sdata, image, mask, invert, automatic_masking, threshold, overwrite, masked_image_name):
+    """
+    Given an image and mask, either masks or crops the image.
+
+    Parameters
+    ----------
+    sdata : sd.SpatialData
+        spatialdata object containing the image and mask.
+    image : str
+        Name of the image in sdata.images to mask.
+    mask : str | shapely.geometry.Polygon
+        Mask, either str of the name of the shape in sdata.shapes or a shapely polygon.
+    invert : bool
+        If True, inverts the mask, such that only pixels within mask remain, while the rest gets cropped.
+    automatic_masking : bool
+        If True, uses threshold + watershed to automatically create a mask based on shapes. Threshold needs to be adjusted manually.
+    threshold : float
+            Threshold for pixel intensity values at which to segment image into foreground and background.
+    overwrite : bool
+        Whether to overwrite the image in sdata.images.
+    masked_image_name : None | str
+        Name of the masked image in sdata.images if overwrite==True. Defaults to f"{image}_masked".
+    Returns
+    -------
+    sd.SpatialData
+        spatialdata object with masked image
+    """
+    channels, height, width = sdata.images[image].data.shape
+
+    if automatic_masking:
+        polygon = _draw_polygons(sdata.images[image].data, threshold)
+    elif isinstance(mask, str):
+        polygon = sdata.shapes[mask].iloc[0].geometry
+    else:
+        polygon = mask
+
+    polygon_geom = [mapping(polygon)]
+
+    transform = rasterio.transform.Affine(1, 0, 0, 0, 1, 0)  # identity transform
+
+    image_mask = geometry_mask(polygon_geom, invert=invert, out_shape=(height, width), transform=transform)
+
+    if channels > 1:
+        image_mask = dask.array.broadcast_to(image_mask, (channels, height, width))
+
+    masked_image = sdata.images[image].data * image_mask
+    images = {}
+    images["masked_image"] = Image2DModel.parse(masked_image)
+
+    if overwrite:
+        sdata.images[image] = images["masked_image"]
+    else:
+        if masked_image_name is None:
+            masked_image_name = f"{image}_masked"
+        sdata.images[masked_image_name] = images["masked_image"]
+
+
+def _draw_polygons(image, threshold):
+    """
+    Given an image, detect regions to turn into polygon shapes, which are then used as a mask.
+
+    Parameters
+    ----------
+    image : np.ndarray
+        Image to find regions in.
+    threshold : float
+        Threshold for pixel intensity values at which to segment image into foreground and background.
+    Returns
+    -------
+    shapely.geometry.Polygon
+        Polygon containing the detected regions.
+    """
+    if image.shape[0] == 1:
+        image = image[0]
+    binary_image = image > np.percentile(image.flatten(), threshold)
+
+    distance = ndimage.distance_transform_edt(binary_image)
+    markers, _ = ndimage.label(distance)
+
+    segmented = watershed(-distance, markers, mask=binary_image)
+
+    contours = find_contours(segmented, level=0.5)
+
+    polygons = [Polygon(contour) for contour in contours if len(contour) > 2]
+    polygon = unary_union(polygons)
+
+    return polygon
diff --git a/src/scportrait/pipeline/project.py b/src/scportrait/pipeline/project.py
@@ -123,6 +123,9 @@ class Project(Logable):
     DEFAULT_SINGLE_CELL_IMAGE_DTYPE = DEFAULT_SINGLE_CELL_IMAGE_DTYPE
     DEFAULT_CELL_ID_NAME = DEFAULT_CELL_ID_NAME
 
+    _h5sc_handle = None
+    _h5sc_adata = None
+
     PALETTE = [
         "blue",
         "green",
@@ -232,6 +235,15 @@ def __exit__(self):
     def __del__(self):
         self._clear_temp_dir()
 
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["_h5sc_handle"] = None  # ensure closed before pickling
+        return state
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+        self._h5sc_handle = None  # will be reopened lazily
+
     @property
     def sdata_path(self) -> str:
         return self._get_sdata_path()
@@ -243,14 +255,22 @@ def sdata(self) -> SpatialData:
 
     @property
     def h5sc(self) -> AnnData:
-        if self.extraction_f is None:
-            raise ValueError("No extraction method has been set.")
-        else:
-            if self.extraction_f.output_path is None:
-                path = self.extraction_f.extraction_file
-            else:
-                path = self.extraction_f.output_path
-            return read_h5sc(path)
+        # Always safely close previous handle if it exists
+        if hasattr(self, "_h5sc_handle") and self._h5sc_handle is not None:
+            try:
+                self._h5sc_handle.close()
+            except (ValueError, RuntimeError):
+                # handle was already closed or in invalid state
+                pass
+            self._h5sc_handle = None
+
+        # Load a fresh AnnData
+        adata = read_h5sc(self.extraction_f.output_path or self.extraction_f.extraction_file)
-        # Load a fresh AnnData
-        adata = read_h5sc(self.extraction_f.output_path or self.extraction_f.extraction_file)
+        # Ensure extraction configuration is available before accessing its attributes
+        extraction_f = getattr(self, "extraction_f", None)
+        if extraction_f is None:
+            raise ValueError(
+                "Extraction configuration 'extraction_f' is not set. "
+                "Run the extraction step before accessing 'h5sc'."
+            )
+
+        # Load a fresh AnnData
+        adata = read_h5sc(extraction_f.output_path or extraction_f.extraction_file)
-        # Load a fresh AnnData
-        adata = read_h5sc(self.extraction_f.output_path or self.extraction_f.extraction_file)
+        # Ensure extraction configuration is available before accessing its attributes
+        extraction_f = getattr(self, "extraction_f", None)
+        if extraction_f is None:
+            raise ValueError(
+                "Extraction configuration 'extraction_f' is not set. "
+                "Run the extraction step before accessing 'h5sc'."
+            )
+
+        # Load a fresh AnnData
+        adata = read_h5sc(extraction_f.output_path or extraction_f.extraction_file)
+
+        # Track only the handle, not the adata
+        self._h5sc_handle = adata.uns["_h5sc_file_handle"]
+
+        return adata
 
     ##### Setup Functions #####
 

diff --git a/src/scportrait/plotting/h5sc.py b/src/scportrait/plotting/h5sc.py
@@ -3,8 +3,10 @@
 import warnings
 from collections.abc import Iterable
 
+import h5py
 import matplotlib.pyplot as plt
 import numpy as np
+import pandas as pd
 from anndata import AnnData
 from matplotlib.axes import Axes
 from matplotlib.figure import Figure
@@ -272,7 +274,20 @@ def cell_grid_single_channel(
         fig = ax.get_figure()
 
     spacing = spacing * single_cell_size
-    images = get_image_with_cellid(adata, _cell_ids, channel_id)
+
+    # Collect images in a list
+    if isinstance(adata.obsm["single_cell_images"], h5py.Dataset):
+        # if working on a memory-backed array
+        images = get_image_with_cellid(adata, _cell_ids, channel_id)
+
+    else:
+        # non backed h5sc adata objects can be accessed directly
+        # these are created by slicing original h5sc objects
+        col = "scportrait_cell_id"
-        col = "scportrait_cell_id"
+        col = DEFAULT_CELL_ID_NAME
-        col = "scportrait_cell_id"
+        col = DEFAULT_CELL_ID_NAME
+        mapping = pd.Series(data=np.arange(len(adata.obs), dtype=int), index=adata.obs[col].values)
+        idx = mapping.loc[_cell_ids].to_numpy()
+        images = adata.obsm["single_cell_images"][idx, channel_id, :, :]
+
     _plot_image_grid(
         ax,
         images,
@@ -368,7 +383,18 @@ def cell_grid_multi_channel(
     channel_names = adata.uns["single_cell_images"]["channel_names"]
 
     # Collect images in a list
-    images = get_image_with_cellid(adata, _cell_ids)
+    if isinstance(adata.obsm["single_cell_images"], h5py.Dataset):
+        # if working on a memory-backed array
+        images = get_image_with_cellid(adata, _cell_ids)
+
+    else:
+        # non backed h5sc adata objects can be accessed directly
+        # these are created by slicing original h5sc objects
+        col = "scportrait_cell_id"
+        mapping = pd.Series(data=np.arange(len(adata.obs), dtype=int), index=adata.obs[col].values)
+        idx = mapping.loc[_cell_ids].to_numpy()
+        images = adata.obsm["single_cell_images"][idx]
+
     if select_channels is not None:
         if not isinstance(select_channels, Iterable):
             select_channels = [select_channels]

diff --git a/src/scportrait/plotting/sdata.py b/src/scportrait/plotting/sdata.py
@@ -4,8 +4,10 @@
 
 import matplotlib as mpl
 import matplotlib.pyplot as plt
+import numpy as np
 import spatialdata
 import xarray
+from geopandas.geodataframe import GeoDataFrame
 from matplotlib.axes import Axes
 
 PALETTE = [
@@ -48,6 +50,11 @@ def _get_shape_element(sdata, element_name) -> tuple[int, int]:
     """
     if isinstance(sdata[element_name], xarray.DataTree):
         shape = sdata[element_name].scale0.image.shape
+    elif isinstance(sdata[element_name], GeoDataFrame):
+        bounds = sdata[element_name].geometry.bounds
+        x = int(np.ceil(bounds["maxx"] - bounds["minx"]))
+        y = int(np.ceil(bounds["maxy"] - bounds["miny"]))
+        shape = (x, y)
     else:
         shape = sdata[element_name].data.shape
 

diff --git a/src/scportrait/tools/h5sc/__init__.py b/src/scportrait/tools/h5sc/__init__.py
@@ -5,6 +5,20 @@
 Functions to work with scPortrait's standardized single-cell data format.
 """
 
-from .operations import get_image_index, get_image_with_cellid
+from .operations import (
+    add_spatial_coordinates,
+    get_cell_id_index,
+    get_image_with_cellid,
+    subset_cells_region,
+    subset_h5sc,
+    update_obs_on_disk,
+)
 
-__all__ = ["get_image_with_cellid", "get_image_index"]
+__all__ = [
+    "update_obs_on_disk",
+    "get_image_with_cellid",
+    "get_cell_id_index",
+    "add_spatial_coordinates",
+    "subset_h5sc",
+    "subset_cells_region",
+]