
Commit 48f570c

Merge pull request #652 from DiamondLightSource/remove-stripe-fw-radway-58
Iterative search method for memory estimator
2 parents 0efd3ac + be1e8c0 commit 48f570c

File tree

6 files changed: +238 −25 lines

docs/source/pipelines/yaml.rst

Lines changed: 17 additions & 16 deletions
@@ -58,19 +58,6 @@ Those pipelines consist of methods from HTTomolibgpu (GPU) and HTTomolib (CPU) b
    .. literalinclude:: ../pipelines_full/FISTA3d_tomobar.yaml
       :language: yaml
 
-.. _tutorials_pl_templates_cpu:
-
-Pipelines using TomoPy library
-------------------------------
-
-One can build CPU-only pipelines by using mostly TomoPy methods. They are expected to be slower than the pipelines above.
-
-.. dropdown:: CPU pipeline using auto-centering and the gridrec reconstruction method from TomoPy.
-
-   .. literalinclude:: ../pipelines_full/tomopy_gridrec.yaml
-      :language: yaml
-
-
 .. _tutorials_pl_templates_dls:
 
 DLS-specific pipelines
@@ -95,17 +82,31 @@ Pipelines with parameter sweeps
 
 Here we demonstrate how to perform a sweep across multiple values of a single parameter (see :ref:`parameter_sweeping` for more details).
 
-.. note:: There is no need to add image saving plugin for sweep runs as it will be added automatically. It is also preferable to keep the `preview` small as the time of computation can be substantial.
+.. note:: There is no need to add image saving plugin for sweep runs as it will be added automatically.
 
 .. dropdown:: Parameter sweep using the :code:`!SweepRange` tag to do a sweep over several CoR values of the :code:`center` parameter in the reconstruction method.
 
    .. literalinclude:: ../pipelines_full/sweep_center_FBP3d_tomobar.yaml
       :language: yaml
-      :emphasize-lines: 34-37
+      :emphasize-lines: 36-39
 
 .. dropdown:: Parameter sweep using the :code:`!Sweep` tag over several particular values (not a range) of the :code:`ratio_delta_beta` parameter for the Paganin filter.
 
   .. literalinclude:: ../pipelines_full/sweep_paganin_FBP3d_tomobar.yaml
      :language: yaml
-     :emphasize-lines: 53-56
+     :emphasize-lines: 51-54
 
+
+.. _tutorials_pl_templates_cpu:
+
+Pipelines using TomoPy library
+------------------------------
+
+One can build CPU-only pipelines by using mostly TomoPy methods.
+
+.. note:: Methods from TomoPy are expected to be slower than the GPU-accelerated methods from the libraries above.
+
+.. dropdown:: CPU pipeline using auto-centering and the gridrec reconstruction method from TomoPy.
+
+   .. literalinclude:: ../pipelines_full/tomopy_gridrec.yaml
+      :language: yaml

httomo/method_wrappers/generic.py

Lines changed: 44 additions & 1 deletion
@@ -14,7 +14,14 @@
     MethodRepository,
 )
 from httomo.runner.output_ref import OutputRef
-from httomo.utils import catch_gputime, catchtime, gpu_enabled, log_rank, xp
+from httomo.utils import (
+    catch_gputime,
+    catchtime,
+    gpu_enabled,
+    log_rank,
+    xp,
+    search_max_slices_iterative,
+)
 
 
 import numpy as np
@@ -447,6 +454,17 @@ def calculate_max_slices(
                 * np.prod(non_slice_dims_shape)
                 * data_dtype.itemsize
             )
+        elif self.memory_gpu.method == "iterative":
+            # The iterative method may use the GPU
+            assert gpu_enabled, "GPU method used on a system without GPU support"
+            with xp.cuda.Device(self._gpu_id):
+                gpumem_cleanup()
+            return (
+                self._calculate_max_slices_iterative(
+                    data_dtype, non_slice_dims_shape, available_memory
+                ),
+                available_memory,
+            )
         else:
             (
                 memory_bytes_method,
@@ -462,6 +480,31 @@ def calculate_max_slices(
             available_memory - subtract_bytes
         ) // memory_bytes_method, available_memory
 
+    def _calculate_max_slices_iterative(
+        self,
+        data_dtype: np.dtype,
+        non_slice_dims_shape: Tuple[int, int],
+        available_memory: int,
+    ) -> int:
+        def get_mem_bytes(current_slices):
+            try:
+                memory_bytes = self._query.calculate_memory_bytes_for_slices(
+                    dims_shape=(
+                        current_slices,
+                        non_slice_dims_shape[0],
+                        non_slice_dims_shape[1],
+                    ),
+                    dtype=data_dtype,
+                    **self._unwrap_output_ref_values(),
+                )
+                return memory_bytes
+            except:
+                return 2**64
+            finally:
+                gpumem_cleanup()
+
+        return search_max_slices_iterative(available_memory, get_mem_bytes)
+
     def _unwrap_output_ref_values(self) -> Dict[str, Any]:
         """
         Iterate through params in `self.config_params` and, for any value of type `OutputRef`,
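The bare `except` returning `2**64` in the `get_mem_bytes` closure above makes a failed per-slice estimate behave as "does not fit", so the iterative search simply backs off instead of aborting. A small self-contained sketch of that guard pattern follows; the `flaky_estimator` toy model and the 1 GiB budget are illustrative assumptions, not httomo code.

from httomo.utils import search_max_slices_iterative


def flaky_estimator(slices: int) -> int:
    # Toy stand-in for a per-slice memory estimator that raises when asked
    # about blocks it cannot handle (illustrative assumption).
    if slices > 64:
        raise RuntimeError("estimator failed")
    return slices * 50 * 1024**2  # ~50 MB per slice


def get_mem_bytes(slices: int) -> int:
    try:
        return flaky_estimator(slices)
    except Exception:
        # Treat a failed estimate as "does not fit" so the search backs off.
        return 2**64


# With a 1 GiB budget the search settles on a slice count whose estimate
# stays within the budget despite the estimator failing for large requests.
print(search_max_slices_iterative(1024**3, get_mem_bytes))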

httomo/runner/methods_repository_interface.py

Lines changed: 6 additions & 0 deletions
@@ -50,6 +50,12 @@ def calculate_memory_bytes(
         """Calculate the memory required in bytes, returning bytes method and subtract bytes tuple"""
         ... # pragma: no cover
 
+    def calculate_memory_bytes_for_slices(
+        self, dims_shape: Tuple[int, int, int], dtype: np.dtype, **kwargs
+    ) -> int:
+        """Calculate the memory required in bytes for a given 3D grid"""
+        ... # pragma: no cover
+
     def calculate_output_dims(
         self, non_slice_dims_shape: Tuple[int, int], **kwargs
     ) -> Tuple[int, int]:
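For context, a minimal sketch of what an implementation of the new `calculate_memory_bytes_for_slices` hook could look like for a hypothetical element-wise GPU method that keeps one input and one output array resident at the same time; the function name and the factor of two are illustrative assumptions, not part of this commit.

from typing import Tuple

import numpy as np


def _calc_memory_bytes_for_slices_elementwise(
    dims_shape: Tuple[int, int, int], dtype: np.dtype, **kwargs
) -> int:
    """Estimate device bytes needed for a (slices, dim1, dim2) block."""
    slices, dim1, dim2 = dims_shape
    # Input plus output array of the same shape held on the GPU at once.
    return 2 * slices * dim1 * dim2 * np.dtype(dtype).itemsize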

httomo/sweep_runner/param_sweep_runner.py

Lines changed: 14 additions & 7 deletions
@@ -16,12 +16,12 @@
 from httomo.sweep_runner.param_sweep_block import ParamSweepBlock
 from httomo.sweep_runner.side_output_manager import SideOutputManager
 from httomo.sweep_runner.stages import NonSweepStage, Stages, SweepStage
-from httomo.utils import catchtime, log_exception, log_once
-from httomo.runner.gpu_utils import get_available_gpu_memory
+from httomo.utils import catchtime, log_exception, log_once, search_max_slices_iterative
+from httomo.runner.gpu_utils import get_available_gpu_memory, gpumem_cleanup
 from httomo.preview import PreviewConfig, PreviewDimConfig
 from httomo.runner.dataset_store_interfaces import DataSetSource
 from httomo_backends.methods_database.packages.backends.httomolibgpu.supporting_funcs.prep.phase import (
-    _calc_memory_bytes_paganin_filter,
+    _calc_memory_bytes_for_slices_paganin_filter,
 )
 
 
@@ -322,8 +322,15 @@ def _slices_to_fit_memory_Paganin(source: DataSetSource) -> int:
     angles_total = source.aux_data.angles_length
     det_X_length = source.chunk_shape[2]
 
-    (memory_bytes_method, subtract_bytes) = _calc_memory_bytes_paganin_filter(
-        (angles_total, det_X_length), dtype=np.float32()
-    )
+    def get_mem_bytes(slices):
+        try:
+            return _calc_memory_bytes_for_slices_paganin_filter(
+                (slices, angles_total, det_X_length), dtype=np.float32()
+            )
+        except:
+            return 2**64
+        finally:
+            gpumem_cleanup()
 
-    return (available_memory - subtract_bytes) // memory_bytes_method
+    gpumem_cleanup()
+    return search_max_slices_iterative(available_memory, get_mem_bytes)

httomo/utils.py

Lines changed: 57 additions & 0 deletions
@@ -299,3 +299,60 @@ def mpi_abort_excepthook(type, value, traceback):
     log_rank("\n".join(format_tb(traceback)), MPI.COMM_WORLD)
     MPI.COMM_WORLD.Abort()
     sys.__excepthook__(type, value, traceback)
+
+
+def search_max_slices_iterative(
+    available_memory: int, get_mem_bytes: Callable[[int], int]
+) -> int:
+    """
+    Approximates the maximum number of slices that fit into GPU memory for a given function.
+    The memory profile of the function must be increasing as a function of the number of slices.
+    First, a linear approximation of the memory profile is performed. If this is not accurate enough,
+    a binary search follows to determine the number of fitting slices. This function never returns a
+    number of slices for which `get_mem_bytes(slices) > available_memory`.
+
+    :param available_memory: Bytes of available device memory
+    :type available_memory: int
+    :param get_mem_bytes: A functor that produces the bytes of device memory needed for a given number of slices.
+    :type get_mem_bytes: Callable[[int], int]
+    :return: Returns the approximation of the maximum number of fitting slices.
+    :rtype: int
+    """
+    MEM_RATIO_THRESHOLD = 0.9  # 90% of the available device memory is the target
+
+    # Find a number of slices that does not fit
+    current_slices = 100
+    slices_high = None
+    memory_bytes = get_mem_bytes(current_slices)
+    if memory_bytes > available_memory:
+        # Found upper limit, continue to binary search
+        slices_high = current_slices
+    else:
+        # Linear approximation
+        current_slices = int(current_slices * available_memory / memory_bytes)
+        while True:
+            memory_bytes = get_mem_bytes(current_slices)
+            if memory_bytes > available_memory:
+                # Found upper limit, continue to binary search
+                break
+            elif memory_bytes >= available_memory * MEM_RATIO_THRESHOLD:
+                # This is "good enough", return
+                return current_slices
+
+            # If linear approximation is not enough, just double every iteration
+            current_slices *= 2
+        slices_high = current_slices
+
+    # Binary search between low and high
+    slices_low = 0
+    while slices_high - slices_low > 1:
+        current_slices = (slices_low + slices_high) // 2
+        memory_bytes = get_mem_bytes(current_slices)
+        if memory_bytes > available_memory:
+            slices_high = current_slices
+        elif memory_bytes >= available_memory * MEM_RATIO_THRESHOLD:
+            return current_slices
+        else:
+            slices_low = current_slices
+
+    return slices_low
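A short usage sketch of the new helper; the 2 GiB budget, the fixed 100 MB overhead, and the 40 MB-per-slice memory model below are illustrative assumptions, not values from this commit.

from httomo.utils import search_max_slices_iterative

AVAILABLE_BYTES = 2 * 1024**3  # assumed 2 GiB of free device memory


def get_mem_bytes(slices: int) -> int:
    # Toy memory model: a fixed 100 MB overhead plus ~40 MB per slice.
    return 100 * 1024**2 + slices * 40 * 1024**2


max_slices = search_max_slices_iterative(AVAILABLE_BYTES, get_mem_bytes)
# The returned slice count never exceeds the budget, per the docstring above.
assert get_mem_bytes(max_slices) <= AVAILABLE_BYTES
print(max_slices)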

tests/method_wrappers/test_generic.py

Lines changed: 100 additions & 1 deletion
@@ -1,4 +1,4 @@
-from typing import List, Optional, Union
+from typing import Callable, List, Optional, Union
 
 import numpy as np
 from httomo.method_wrappers import make_method_wrapper
@@ -689,6 +689,105 @@ def test_method(data):
     assert available_memory == 1_000_000_000
 
 
+def _linear_mem(*args, **kwargs):
+    proj, x, y = kwargs["dims_shape"]
+    dtype = kwargs["dtype"]
+    return proj * x * y * dtype.itemsize
+
+
+def _linear_offset_mem(*args, **kwargs):
+    proj, x, y = kwargs["dims_shape"]
+    dtype = kwargs["dtype"]
+    return (x * y + proj * x * y + proj * x**2) * dtype.itemsize
+
+
+def _quadratic_mem(*args, **kwargs):
+    proj, x, y = kwargs["dims_shape"]
+    dtype = kwargs["dtype"]
+    return (4 * x * y + proj * proj * x * y) * dtype.itemsize
+
+
+THROW_OVER_SLICES = 77
+
+
+def _quadratic_mem_throws(*args, **kwargs):
+    proj, x, y = kwargs["dims_shape"]
+    dtype = kwargs["dtype"]
+    if proj > THROW_OVER_SLICES:
+        raise Exception("Memory estimator failed")
+    return (4 * x * y + proj * proj * x * y) * dtype.itemsize
+
+
+@pytest.mark.cupy
+@pytest.mark.parametrize("available_memory", [0, 1_000, 1_000_000, 1_000_000_000])
+@pytest.mark.parametrize(
+    "memcalc_fn",
+    [_linear_mem, _linear_offset_mem, _quadratic_mem, _quadratic_mem_throws],
+)
+def test_generic_calculate_max_slices_iterative(
+    mocker: MockerFixture,
+    dummy_block: DataSetBlock,
+    available_memory: int,
+    memcalc_fn: Callable,
+):
+    class FakeModule:
+        def test_method(data):
+            return data
+
+    mocker.patch(
+        "httomo.method_wrappers.generic.import_module", return_value=FakeModule
+    )
+
+    memory_gpu = GpuMemoryRequirement(multiplier=None, method="iterative")
+    repo = make_mock_repo(
+        mocker,
+        pattern=Pattern.projection,
+        output_dims_change=True,
+        implementation="gpu_cupy",
+        memory_gpu=memory_gpu,
+    )
+
+    memcalc_mock = mocker.patch.object(
+        repo.query("", ""), "calculate_memory_bytes_for_slices", side_effect=memcalc_fn
+    )
+    wrp = make_method_wrapper(
+        repo,
+        "mocked_module_path",
+        "test_method",
+        MPI.COMM_WORLD,
+        make_mock_preview_config(mocker),
+    )
+    shape_t = list(dummy_block.chunk_shape)
+    shape_t.pop(0)
+    shape = (shape_t[0], shape_t[1])
+    max_slices, _ = wrp.calculate_max_slices(
+        dummy_block.data.dtype,
+        shape,
+        available_memory,
+    )
+
+    check_slices = lambda slices: memcalc_mock(
+        dims_shape=(slices, shape[0], shape[1]), dtype=dummy_block.data.dtype
+    )
+    threshold = 0.9
+    if check_slices(1) > available_memory:
+        # If zero slice fits
+        assert max_slices == 0
+    else:
+        # Computed slices must fit in the available memory
+        assert check_slices(max_slices) <= available_memory
+
+        if memcalc_fn == _quadratic_mem_throws and max_slices + 1 >= THROW_OVER_SLICES:
+            with pytest.raises(Exception):
+                check_slices(max_slices + 1)
+        else:
+            # And one more slice must not fit OR above threshold
+            assert (
+                check_slices(max_slices + 1) > available_memory
+                or check_slices(max_slices) >= available_memory * threshold
+            )
+
+
 @pytest.mark.cupy
 def test_generic_calculate_output_dims(mocker: MockerFixture):
     class FakeModule:
class FakeModule:
