🔧 Add Zarr-based Caching and Memory-Aware Patch Merging for WSI Inference (#949)

shaneahmed · web-flow · commit cf70c97b8252 · 2025-08-22T13:22:33.000+01:00
This PR introduces Zarr-based caching and memory-efficient patch merging for semantic segmentation workflows. The changes support inference on large whole-slide images (WSIs) by adding dynamic memory management and falling back to disk-based caching when system resources are constrained.

Key changes:
- Adds Zarr integration for intermediate canvas and count array storage during WSI inference
- Implements memory monitoring using psutil to trigger spilling to disk when RAM usage exceeds the configured `memory_threshold`
- Refactors patch merging logic into modular helper functions for better maintainability
diff --git a/tests/engines/test_semantic_segmentor.py b/tests/engines/test_semantic_segmentor.py
@@ -5,7 +5,7 @@
 import json
 import sqlite3
 from pathlib import Path
-from typing import Callable
+from typing import TYPE_CHECKING, Callable
 
 import numpy as np
 import torch
@@ -16,6 +16,9 @@
 from tiatoolbox.utils import env_detection as toolbox_env
 from tiatoolbox.utils.misc import imread
 
+if TYPE_CHECKING:
+    import pytest
+
 device = "cuda" if toolbox_env.has_gpu() else "cpu"
 
 
@@ -212,14 +215,16 @@ def test_wsi_segmentor_zarr(
     remote_sample: Callable,
     sample_svs: Path,
     tmp_path: Path,
+    caplog: pytest.LogCaptureFixture,
 ) -> None:
     """Test SemanticSegmentor for WSIs with zarr output."""
     wsi1_2k_2k_svs = Path(remote_sample("wsi1_2k_2k_svs"))
 
     segmentor = SemanticSegmentor(
         model="fcn-tissue_mask",
-        batch_size=32,
+        batch_size=64,
         verbose=False,
+        num_loader_workers=1,
     )
     # Return Probabilities is False
     output = segmentor.run(
@@ -229,15 +234,26 @@ def test_wsi_segmentor_zarr(
         device=device,
         patch_mode=False,
         save_dir=tmp_path / "wsi_out_check",
+        batch_size=2,
         output_type="zarr",
+        memory_threshold=1,
     )
 
     output_ = zarr.open(output[sample_svs], mode="r")
     assert 0.17 < np.mean(output_["predictions"][:]) < 0.19
     assert "probabilities" not in output_
+    assert "canvas" not in output_
+    assert "count" not in output_
+    assert "Current Memory usage:" in caplog.text
 
     # Return Probabilities is True
     # Using small image for faster run
+    segmentor = SemanticSegmentor(
+        model="fcn-tissue_mask",
+        batch_size=32,
+        verbose=False,
+        num_loader_workers=1,
+    )
     segmentor.drop_keys = []
     output = segmentor.run(
         images=[sample_svs, wsi1_2k_2k_svs],
diff --git a/tiatoolbox/models/dataset/dataset_abc.py b/tiatoolbox/models/dataset/dataset_abc.py
@@ -441,20 +441,7 @@ def __init__(  # skipcq: PY-R1000
         patch_input_shape = np.array(patch_input_shape)
         stride_shape = np.array(stride_shape)
 
-        if (
-            not np.issubdtype(patch_input_shape.dtype, np.integer)
-            or np.size(patch_input_shape) > 2  # noqa: PLR2004
-            or np.any(patch_input_shape < 0)
-        ):
-            msg = f"Invalid `patch_input_shape` value {patch_input_shape}."
-            raise ValueError(msg)
-        if (
-            not np.issubdtype(stride_shape.dtype, np.integer)
-            or np.size(stride_shape) > 2  # noqa: PLR2004
-            or np.any(stride_shape < 0)
-        ):
-            msg = f"Invalid `stride_shape` value {stride_shape}."
-            raise ValueError(msg)
+        _validate_patch_stride_shape(patch_input_shape, stride_shape)
 
         self.preproc_func = preproc_func
         img_path = Path(img_path)
@@ -475,6 +462,7 @@ def __init__(  # skipcq: PY-R1000
                 stride_shape=stride_shape[::-1],
                 patch_output_shape=patch_output_shape,
             )
+            self.full_outputs = self.outputs
         else:
             self.inputs = PatchExtractor.get_coordinates(
                 image_shape=wsi_shape,
@@ -510,6 +498,7 @@ def __init__(  # skipcq: PY-R1000
             )
             self.inputs = self.inputs[selected]
             if hasattr(self, "outputs"):
+                self.full_outputs = self.outputs  # Full list of outputs
                 self.outputs = self.outputs[selected]
 
         if len(self.inputs) == 0:
@@ -639,3 +628,40 @@ def __getitem__(self: PatchDataset, idx: int) -> dict:
             return data
 
         return data
+
+
+def _validate_patch_stride_shape(
+    patch_input_shape: np.ndarray, stride_shape: np.ndarray
+) -> None:
+    """Validate patch and stride shape inputs for semantic segmentation.
+
+    Checks that both `patch_input_shape` and `stride_shape` are integer arrays of
+    length ≤ 2 and contain non-negative values. Raises a ValueError if any
+    condition fails.
+
+    Args:
+        patch_input_shape (np.ndarray):
+            Shape of the input patch (e.g., height, width).
+        stride_shape (np.ndarray):
+            Stride dimensions used for patch extraction.
+
+    Raises:
+        ValueError:
+            If either input is not a valid integer array of appropriate
+            shape and values.
+
+    """
+    if (
+        not np.issubdtype(patch_input_shape.dtype, np.integer)
+        or np.size(patch_input_shape) > 2  # noqa: PLR2004
+        or np.any(patch_input_shape < 0)
+    ):
+        msg = f"Invalid `patch_input_shape` value {patch_input_shape}."
+        raise ValueError(msg)
+    if (
+        not np.issubdtype(stride_shape.dtype, np.integer)
+        or np.size(stride_shape) > 2  # noqa: PLR2004
+        or np.any(stride_shape < 0)
+    ):
+        msg = f"Invalid `stride_shape` value {stride_shape}."
+        raise ValueError(msg)
diff --git a/tiatoolbox/models/engine/engine_abc.py b/tiatoolbox/models/engine/engine_abc.py
@@ -43,6 +43,7 @@
 import dask.array as da
 import numpy as np
 import torch
+import zarr
 from dask import compute
 from dask.diagnostics import ProgressBar
 from torch import nn
@@ -71,58 +72,6 @@
     from tiatoolbox.wsicore.wsireader import WSIReader
 
 
-def prepare_engines_save_dir(
-    save_dir: str | Path | None,
-    *,
-    patch_mode: bool,
-    overwrite: bool = False,
-) -> Path | None:
-    """Create or validate the save directory for engine outputs.
-
-    Args:
-        save_dir (str | Path | None):
-            Path to the output directory.
-        patch_mode (bool):
-            Whether the input is treated as patches.
-        overwrite (bool):
-            Whether to overwrite existing directory. Default is False.
-
-    Returns:
-        Path | None:
-            Path to the output directory if created or validated, else None.
-
-    Raises:
-        OSError:
-            If patch_mode is False and save_dir is not provided.
-
-    """
-    if patch_mode:
-        if save_dir is not None:
-            save_dir = Path(save_dir)
-            save_dir.mkdir(parents=True, exist_ok=overwrite)
-            return save_dir
-        return None
-
-    if save_dir is None:
-        msg = (
-            "Input WSIs detected but no save directory provided. "
-            "Please provide a 'save_dir'."
-        )
-        raise OSError(msg)
-
-    logger.info(
-        "When providing multiple whole slide images, "
-        "the outputs will be saved and the locations of outputs "
-        "will be returned to the calling function when `run()` "
-        "finishes successfully."
-    )
-
-    save_dir = Path(save_dir)
-    save_dir.mkdir(parents=True, exist_ok=overwrite)
-
-    return save_dir
-
-
 class EngineABCRunParams(TypedDict, total=False):
     """Parameters for configuring the :func:`EngineABC.run()` method.
 
@@ -180,6 +129,9 @@ class EngineABCRunParams(TypedDict, total=False):
     return_labels: bool
     scale_factor: tuple[float, float]
     stride_shape: IntPair
+    memory_threshold: int
+    da_length_threshold: int
+    auto_get_mask: bool
     verbose: bool
 
 
@@ -432,6 +384,7 @@ def get_dataloader(
         ioconfig: ModelIOConfigABC | None = None,
         *,
         patch_mode: bool = True,
+        auto_get_mask: bool = True,
     ) -> torch.utils.data.DataLoader:
         """Pre-process images and masks and return a DataLoader for inference.
 
@@ -450,6 +403,12 @@ def get_dataloader(
                 IO configuration object specifying patch size, stride, and resolution.
             patch_mode (bool):
                 Whether to treat input as patches (`True`) or WSIs (`False`).
+            auto_get_mask (bool):
+                Whether to automatically generate a tissue mask using
+                `wsireader.tissue_mask()` when `patch_mode` is False.
+                If `True`, only patches within the tissue regions of the image
+                are processed. If `False`, all patches in the image are processed.
+                Default is `True`.
 
         Returns:
             torch.utils.data.DataLoader:
@@ -468,6 +427,7 @@ def get_dataloader(
                 stride_shape=ioconfig.stride_shape,
                 resolution=ioconfig.input_resolutions[0]["resolution"],
                 units=ioconfig.input_resolutions[0]["units"],
+                auto_get_mask=auto_get_mask,
             )
 
             dataset.preproc_func = self.model.preproc_func
@@ -692,6 +652,11 @@ def save_predictions(
             with ProgressBar():
                 compute(*write_tasks)
 
+            zarr_group = zarr.open(save_path, mode="r+")
+            for key in self.drop_keys:
+                if key in zarr_group:
+                    del zarr_group[key]
+
             return save_path
 
         values_to_compute = [processed_predictions[k] for k in keys_to_compute]
@@ -726,6 +691,7 @@ def save_predictions(
     def infer_wsi(
         self: EngineABC,
         dataloader: DataLoader,
+        save_path: Path,
         **kwargs: Unpack[EngineABCRunParams],
     ) -> dict:
         """Run model inference on a whole slide image (WSI).
@@ -737,6 +703,9 @@ def infer_wsi(
         Args:
             dataloader (DataLoader):
                 PyTorch DataLoader configured for WSI processing.
+            save_path (Path):
+                Path to save the intermediate output. The intermediate output is saved
+                in a zarr file.
             **kwargs (EngineABCRunParams):
                 Additional runtime parameters used during inference.
 
@@ -746,6 +715,7 @@ def infer_wsi(
 
         """
         _ = kwargs.get("patch_mode", False)
+        _ = save_path
         return self.infer_patches(
             dataloader=dataloader,
             return_coordinates=True,
@@ -1267,12 +1237,14 @@ def _run_wsi_mode(
                 masks=mask,
                 patch_mode=False,
                 ioconfig=self._ioconfig,
+                auto_get_mask=kwargs.get("auto_get_mask", True),
             )
 
             scale_factor = self._calculate_scale_factor(dataloader=self.dataloader)
 
             raw_predictions = self.infer_wsi(
                 dataloader=self.dataloader,
+                save_path=save_path[image],
                 **kwargs,
             )
 
@@ -1403,3 +1375,55 @@ def run(
             save_dir=save_dir,
             **kwargs,
         )
+
+
+def prepare_engines_save_dir(
+    save_dir: str | Path | None,
+    *,
+    patch_mode: bool,
+    overwrite: bool = False,
+) -> Path | None:
+    """Create or validate the save directory for engine outputs.
+
+    Args:
+        save_dir (str | Path | None):
+            Path to the output directory.
+        patch_mode (bool):
+            Whether the input is treated as patches.
+        overwrite (bool):
+            Whether to overwrite existing directory. Default is False.
+
+    Returns:
+        Path | None:
+            Path to the output directory if created or validated, else None.
+
+    Raises:
+        OSError:
+            If patch_mode is False and save_dir is not provided.
+
+    """
+    if patch_mode:
+        if save_dir is not None:
+            save_dir = Path(save_dir)
+            save_dir.mkdir(parents=True, exist_ok=overwrite)
+            return save_dir
+        return None
+
+    if save_dir is None:
+        msg = (
+            "Input WSIs detected but no save directory provided. "
+            "Please provide a 'save_dir'."
+        )
+        raise OSError(msg)
+
+    logger.info(
+        "When providing multiple whole slide images, "
+        "the outputs will be saved and the locations of outputs "
+        "will be returned to the calling function when `run()` "
+        "finishes successfully."
+    )
+
+    save_dir = Path(save_dir)
+    save_dir.mkdir(parents=True, exist_ok=overwrite)
+
+    return save_dir
diff --git a/tiatoolbox/models/engine/semantic_segmentor.py b/tiatoolbox/models/engine/semantic_segmentor.py