Merge pull request #679 from DiamondLightSource/snapshot_saver

dkazanc · web-flow · commit e0a5808aa2dd · 2026-02-04T11:32:48.000Z
Snapshot saver feature to help with debugging
diff --git a/docs/source/howto/run_httomo.rst b/docs/source/howto/run_httomo.rst
@@ -235,14 +235,15 @@ directory created by HTTomo would be
 Options/flags
 #############
 
-The :code:`run` command has 16 options/flags:
+The :code:`run` command has 17 options/flags:
 
 - :code:`--output-folder-name`
 - :code:`--save-all`
 - :code:`--gpu-id`
 - :code:`--reslice-dir`
 - :code:`--max-cpu-slices`
 - :code:`--max-memory`
+- :code:`--save-snapshots`
 - :code:`--monitor`
 - :code:`--monitor-output`
 - :code:`--intermediate-format`
@@ -364,6 +365,22 @@ The :code:`--max-memory` flag is for telling HTTomo how much RAM the machine has
 so then it can switch to using a file during execution of the pipeline if
 necessary.
 
+:code:`--save-snapshots`
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+When this flag is enabled, the pipeline saves image snapshots at specific execution points.
+These snapshots are captured during selected methods - typically when a section boundary
+is reached and data is transferred to the CPU, at which point a slice of the data is saved for
+inspection.
+
+This feature is particularly useful for complex pipelines (e.g. 360 degrees with stitching and phase contrast),
+where intermediate processing steps involved in reconstruction may unintentionally alter
+the data. By reviewing these snapshot images (JPEGs), users can more easily pinpoint
+where issues are introduced in the pipeline. 
+
+Enabling snapshots incurs almost no additional computational cost, unlike the :code:`--save-all` 
+flag, which requires saving the entire dataset into a file for each method.
+
 :code:`--monitor`
 ~~~~~~~~~~~~~~~~~
 
diff --git a/httomo/cli.py b/httomo/cli.py
@@ -136,6 +136,11 @@ def check(pipeline: Union[Path, str], in_data_file: Optional[Path] = None):
     default="0",
     help="Limit the amount of memory used by the pipeline to the given memory (supports strings like 3.2G or bytes)",
 )
+@click.option(
+    "--save-snapshots",
+    is_flag=True,
+    help="Save intermediate images (snapshots) from some methods in the pipeline.",
+)
 @click.option(
     "--monitor",
     type=click.STRING,
@@ -211,6 +216,7 @@ def run(
     reslice_dir: Union[Path, None],
     max_cpu_slices: int,
     max_memory: str,
+    save_snapshots: bool,
     monitor: List[str],
     pipeline_format: str,
     monitor_output: TextIO,
@@ -264,6 +270,7 @@ def run(
             monitor,
             monitor_output,
             reslice_dir,
+            save_snapshots,
         )
     else:
         execute_sweep_run(pipeline, global_comm)
@@ -397,6 +404,7 @@ def execute_high_throughput_run(
     monitor: List[str],
     monitor_output: TextIO,
     reslice_dir: Union[Path, None],
+    save_snapshots: bool,
 ) -> None:
     # we use half the memory for blocks since we typically have inputs/output
     memory_limit = transform_limit_str_to_bytes(max_memory) // 2
@@ -415,6 +423,7 @@ def execute_high_throughput_run(
             global_comm,
             monitor=mon,
             memory_limit_bytes=memory_limit,
+            save_snapshots=save_snapshots,
         )
         runner.execute()
         if mon is not None:
diff --git a/httomo/method_wrappers/images.py b/httomo/method_wrappers/images.py
@@ -8,7 +8,6 @@
 
 from mpi4py.MPI import Comm
 
-
 import os
 from typing import Dict, Optional
 
diff --git a/httomo/runner/task_runner.py b/httomo/runner/task_runner.py
@@ -19,6 +19,7 @@
     DummySink,
     ReadableDataSetSink,
 )
+from httomo.utils import save_2d_snapshot
 from httomo.runner.gpu_utils import get_available_gpu_memory, gpumem_cleanup
 from httomo.runner.monitoring_interface import MonitoringInterface
 from httomo.runner.pipeline import Pipeline
@@ -49,11 +50,13 @@ def __init__(
         comm: MPI.Comm,
         memory_limit_bytes: int = 0,
         monitor: Optional[MonitoringInterface] = None,
+        save_snapshots: bool = False,
     ):
         self.pipeline = pipeline
         self.reslice_dir = reslice_dir
         self.comm = comm
         self.monitor = monitor
+        self.save_snapshots = save_snapshots
 
         self.side_outputs: Dict[str, Any] = dict()
         self.source: Optional[DataSetSource] = None
@@ -145,6 +148,7 @@ def _execute_section(self, section: Section, section_index: int = 0):
 
         splitter = BlockSplitter(self.source, section.max_slices)
         no_of_blocks = len(splitter)
+        section_length = len(section)
 
         # Redirect tqdm progress bar output to /dev/null, and instead manually write block
         # processing progress to logfile within loop
@@ -160,8 +164,8 @@ def _execute_section(self, section: Section, section_index: int = 0):
             if self.monitor is not None:
                 self.monitor.report_source_block(
                     f"sec_{section_index}",
-                    section.methods[0].task_id if len(section) > 0 else "",
-                    _get_slicing_dim(section.pattern) - 1,
+                    section.methods[0].task_id if section_length > 0 else "",
+                    slicing_dim_section,
                     block.shape,
                     block.chunk_index,
                     block.global_index,
@@ -170,6 +174,23 @@ def _execute_section(self, section: Section, section_index: int = 0):
 
             log_once(f"   {str(progress)}", level=logging.INFO)
             block = self._execute_section_block(section, block)
+            if (
+                self.save_snapshots
+                and self.comm.rank == self.comm.size // 2
+                and idx == no_of_blocks // 2
+            ):
+                # save the 2D state-snapshot of the mid-data block from mid-chunk
+                snapshot_slicer = [slice(None)] * block.data.ndim
+                snapshot_slicer[slicing_dim_section] = (
+                    np.shape(block.data)[slicing_dim_section] // 2
+                )
+                snapshot_slice = block.data[tuple(snapshot_slicer)]
+                method_to_snapshot_name = self._get_methods_name_for_snapshot(section)
+                save_2d_snapshot(
+                    snapshot_slice,
+                    methods_name=method_to_snapshot_name,
+                    section_index=section_index,
+                )
             log_rank(
                 f"    Finished processing block {idx + 1} of {no_of_blocks}",
                 comm=self.comm,
@@ -181,7 +202,7 @@ def _execute_section(self, section: Section, section_index: int = 0):
             if self.monitor is not None:
                 self.monitor.report_sink_block(
                     f"sec_{section_index}",
-                    section.methods[-1].task_id if len(section) > 0 else "",
+                    section.methods[-1].task_id if section_length > 0 else "",
                     _get_slicing_dim(section.pattern) - 1,
                     block.shape,
                     block.chunk_index,
@@ -280,6 +301,21 @@ def _execute_section_block(
             if_previous_block_is_on_gpu = if_current_block_is_on_gpu
         return block
 
+    def _get_methods_name_for_snapshot(self, section: Section) -> str:
+        """Last non-excluded method name in ``section``; ``ValueError`` if none."""
+        excluded_from_snapshots = (
+            "data_checker",
+            "calculate_stats",
+            "find_center_360",
+            "find_center_pc",
+            "find_center_vo",
+            "save_intermediate_data",
+        )
+        for name in (m.method_name for m in reversed(section.methods)):
+            if name not in excluded_from_snapshots:
+                return name
+        raise ValueError("Unable to find method name in section for snapshot saving")
+
     def _log_pipeline(self, msg: Any, level: int = logging.INFO):
         log_once(msg, level=level)
 
diff --git a/httomo/utils.py b/httomo/utils.py
@@ -1,13 +1,17 @@
 import sys
 import logging
-from enum import Enum
 from time import perf_counter_ns
 from traceback import format_tb
 from typing import Any, Callable, Dict, List, Literal, Tuple
 
 from loguru import logger
 from mpi4py import MPI
 import numpy as np
+from PIL import Image
+import os
+import httomo.globals
+from pathlib import Path
+
 
 from httomo_backends.methods_database.query import Pattern
 
@@ -25,6 +29,33 @@
     import numpy as xp
 
 
+def save_2d_snapshot(
+    data_slice: xp.ndarray, methods_name: str, section_index: int
+) -> None:
+    """
+    Save a 2D slice as an 8-bit greyscale JPEG snapshot to help with debugging.
+
+    :param data_slice: a 2D array to save as a jpeg image (left unmodified)
+    :type data_slice: xp.ndarray
+    :param methods_name: the name of the image to be saved (e.g. method's name)
+    :type methods_name: str
+    :param section_index: the index of the section
+    :type section_index: int
+    """
+    snapshots_dir = Path(httomo.globals.run_out_dir) / "pipeline_stages_snapshots"
+    snapshots_dir.mkdir(parents=True, exist_ok=True)
+    # Copy (the default): data_slice may be a view of the pipeline's block, so
+    # an in-place nan_to_num would silently alter the data being processed.
+    image = np.nan_to_num(data_slice, nan=0.0, posinf=0, neginf=0)
+    # Stretch contrast between the 1st and 99th percentiles.
+    vmin, vmax = np.percentile(image, [1, 99])
+    # A constant slice gives vmax == vmin; avoid 0/0 and save it as black.
+    span = vmax - vmin if vmax > vmin else 1.0
+    image = ((np.clip(image, vmin, vmax) - vmin) / span * 255).astype(np.uint8)
+    filepath_name = os.path.join(snapshots_dir, f"0{section_index}{methods_name}.jpeg")
+    Image.fromarray(image, mode="L").save(filepath_name, quality=95)
+
+
 def log_once(output: Any, level: int = logging.INFO) -> None:
     """
     Log output to console and log file if the process' global rank is zero.
diff --git a/tests/runner/test_task_runner.py b/tests/runner/test_task_runner.py
@@ -457,6 +457,63 @@ def test_warns_with_multiple_reslices(
     assert "Data saving or/and reslicing operation will be performed 4 times" in args[0]
 
 
+def test_get_method_names_for_snapshot_saver(
+    mocker: MockerFixture,
+    dummy_block: DataSetBlock,
+    tmp_path: PathLike,
+):
+    """
+    Check the method name that ``TaskRunner`` picks for each section's snapshot.
+
+    The pipeline mixes projection and sinogram methods and is interleaved
+    with ``data_checker``, ``find_center_pc`` and ``calculate_stats``, none of
+    which appears among the expected snapshot names.
+    """
+    # (method name, pattern) of every method, in pipeline order
+    method_specs = [
+        ("m1", Pattern.projection),
+        ("m2_rec", Pattern.projection),
+        ("data_checker", Pattern.projection),
+        ("find_center_pc", Pattern.sinogram),
+        ("m5_rec", Pattern.sinogram),
+        ("data_checker", Pattern.sinogram),
+        ("m7_rec", Pattern.projection),
+        ("data_checker", Pattern.projection),
+        ("m9_rec", Pattern.sinogram),
+        ("data_checker", Pattern.sinogram),
+        ("calculate_stats", Pattern.all),
+    ]
+    # expected snapshot name of each section, in section order
+    METHODS_NAMES_EXPECTED = ["m2_rec", "m5_rec", "m7_rec", "m9_rec"]
+
+    loader = make_test_loader(mocker, block=dummy_block, pattern=Pattern.projection)
+    methods = [
+        make_test_method(mocker, method_name=name, pattern=pattern)
+        for name, pattern in method_specs
+    ]
+    runner = TaskRunner(
+        Pipeline(loader=loader, methods=methods),
+        reslice_dir=tmp_path,
+        comm=MPI.COMM_WORLD,
+    )
+
+    for ind, section in enumerate(runner._sectionize()):
+        assert (
+            runner._get_methods_name_for_snapshot(section)
+            == METHODS_NAMES_EXPECTED[ind]
+        )
+
+
 def test_warns_with_multiple_stores_from_side_outputs(
     mocker: MockerFixture,
     dummy_block: DataSetBlock,