Merge pull request #244 from DiamondLightSource/paganin-iterative-mem-est-radway-71

dkazanc · web-flow · commit e807952e1d83 · 2025-12-18T11:12:25.000Z
Inline memory peak calculator for Paganin filter
diff --git a/httomolibgpu/memory_estimator_helpers.py b/httomolibgpu/memory_estimator_helpers.py
@@ -0,0 +1,43 @@
+# Granularity in bytes: every tracked allocation size is rounded up to a multiple of this.
+ALLOCATION_UNIT_SIZE = 512
+
+
+class _DeviceMemStack:
+    """Bookkeeping model of device memory for peak-usage estimation.
+
+    Nothing is allocated on a device. Callers report the sizes they would
+    allocate and release; each size is rounded up to a multiple of
+    ``ALLOCATION_UNIT_SIZE`` and folded into a running total, and the
+    largest total reached is kept in ``highwater``.
+    """
+
+    def __init__(self) -> None:
+        # Requested (unrounded) sizes of the allocations that are still live.
+        self.allocations = []
+        # Sum of the rounded sizes of all live allocations.
+        self.current = 0
+        # Largest value ``current`` has reached so far.
+        self.highwater = 0
+
+    def malloc(self, bytes):
+        """Record an allocation of ``bytes`` bytes and refresh the peak."""
+        self.allocations.append(bytes)
+        self.current = self.current + self._round_up(bytes)
+        self.highwater = max(self.current, self.highwater)
+
+    def free(self, bytes):
+        """Record the release of an allocation previously passed to ``malloc``.
+
+        ``bytes`` must equal the size given to ``malloc``; the checks are
+        ``assert`` statements, so they only run when assertions are enabled.
+        """
+        live = self.allocations
+        assert bytes in live
+        live.remove(bytes)
+        self.current = self.current - self._round_up(bytes)
+        assert self.current >= 0
+
+    def _round_up(self, size):
+        """Return ``size`` rounded up to a multiple of ``ALLOCATION_UNIT_SIZE``."""
+        units = (size + ALLOCATION_UNIT_SIZE - 1) // ALLOCATION_UNIT_SIZE
+        return units * ALLOCATION_UNIT_SIZE
diff --git a/httomolibgpu/prep/phase.py b/httomolibgpu/prep/phase.py
@@ -22,6 +22,7 @@
 
 import numpy as np
 from httomolibgpu import cupywrapper
+from httomolibgpu.memory_estimator_helpers import _DeviceMemStack
 
 cp = cupywrapper.cp
 cupy_run = cupywrapper.cupy_run
@@ -30,13 +31,14 @@
 
 if cupy_run:
     from cupyx.scipy.fft import fft2, ifft2, fftshift
+    from cupyx.scipy.fftpack import get_fft_plan
 else:
     fft2 = Mock()
     ifft2 = Mock()
     fftshift = Mock()
 
 from numpy import float32
-from typing import Tuple
+from typing import Optional, Tuple
 import math
 
 __all__ = [
@@ -54,6 +56,7 @@ def paganin_filter(
     distance: float = 1.0,
     energy: float = 53.0,
     ratio_delta_beta: float = 250,
+    calc_peak_gpu_mem: bool = False,
 ) -> cp.ndarray:
     """
     Perform single-material phase retrieval from flats/darks corrected tomographic measurements. For more detailed information, see :ref:`phase_contrast_module`.
@@ -71,30 +74,50 @@ def paganin_filter(
         Beam energy in keV.
     ratio_delta_beta : float
         The ratio of delta/beta, where delta is the phase shift and real part of the complex material refractive index and beta is the absorption.
+    calc_peak_gpu_mem: bool
+        Parameter to support memory estimation in HTTomo. Irrelevant to the method itself and can be ignored by user.
 
     Returns
     -------
     cp.ndarray
         The 3D array of Paganin phase-filtered projection images.
     """
+    mem_stack = _DeviceMemStack() if calc_peak_gpu_mem else None
     # Check the input data is valid
-    if tomo.ndim != 3:
+    if not mem_stack and tomo.ndim != 3:
         raise ValueError(
             f"Invalid number of dimensions in data: {tomo.ndim},"
             " please provide a stack of 2D projections."
         )
-
-    dz_orig, dy_orig, dx_orig = tomo.shape
+    if mem_stack:
+        mem_stack.malloc(np.prod(tomo) * np.float32().itemsize)
+    dz_orig, dy_orig, dx_orig = tomo.shape if not mem_stack else tomo
 
     # Perform padding to the power of 2 as FFT is O(n*log(n)) complexity
     # TODO: adding other options of padding?
-    padded_tomo, pad_tup = _pad_projections_to_second_power(tomo)
+    padded_tomo, pad_tup = _pad_projections_to_second_power(tomo, mem_stack)
 
-    dz, dy, dx = padded_tomo.shape
+    dz, dy, dx = padded_tomo.shape if not mem_stack else padded_tomo
 
     # 3D FFT of tomo data
-    padded_tomo = cp.asarray(padded_tomo, dtype=cp.complex64)
-    fft_tomo = fft2(padded_tomo, axes=(-2, -1), overwrite_x=True)
+    if mem_stack:
+        mem_stack.malloc(np.prod(padded_tomo) * np.complex64().itemsize)
+        mem_stack.free(np.prod(padded_tomo) * np.float32().itemsize)
+        fft_input = cp.empty(padded_tomo, dtype=cp.complex64)
+    else:
+        padded_tomo = cp.asarray(padded_tomo, dtype=cp.complex64)
+        fft_input = padded_tomo
+
+    fft_plan = get_fft_plan(fft_input, axes=(-2, -1))
+    if mem_stack:
+        mem_stack.malloc(fft_plan.work_area.mem.size)
+        mem_stack.free(fft_plan.work_area.mem.size)
+    else:
+        with fft_plan:
+            fft_tomo = fft2(padded_tomo, axes=(-2, -1), overwrite_x=True)
+        del padded_tomo
+    del fft_input
+    del fft_plan
 
     # calculate alpha constant
     alpha = _calculate_alpha(energy, distance / 1e-6, ratio_delta_beta)
@@ -103,18 +126,56 @@ def paganin_filter(
     indx = _reciprocal_coord(pixel_size, dy)
     indy = _reciprocal_coord(pixel_size, dx)
 
-    # Build Lorentzian-type filter
-    phase_filter = fftshift(
-        1.0 / (1.0 + alpha * (cp.add.outer(cp.square(indx), cp.square(indy))))
-    )
+    if mem_stack:
+        mem_stack.malloc(indx.size * indx.dtype.itemsize)  # cp.asarray(indx)
+        mem_stack.malloc(indx.size * indx.dtype.itemsize)  # cp.square
+        mem_stack.free(indx.size * indx.dtype.itemsize)  # cp.asarray(indx)
+        mem_stack.malloc(indy.size * indy.dtype.itemsize)  # cp.asarray(indy)
+        mem_stack.malloc(indy.size * indy.dtype.itemsize)  # cp.square
+        mem_stack.free(indy.size * indy.dtype.itemsize)  # cp.asarray(indy)
+
+        mem_stack.malloc(indx.size * indy.size * indx.dtype.itemsize)  # cp.add.outer
+        mem_stack.free(indx.size * indx.dtype.itemsize)  # cp.square
+        mem_stack.free(indy.size * indy.dtype.itemsize)  # cp.square
+        mem_stack.malloc(indx.size * indy.size * indx.dtype.itemsize)  # phase_filter
+        mem_stack.free(indx.size * indy.size * indx.dtype.itemsize)  # cp.add.outer
+        mem_stack.free(indx.size * indy.size * indx.dtype.itemsize)  # phase_filter
+
+    else:
+        # Build Lorentzian-type filter
+        phase_filter = fftshift(
+            1.0
+            / (
+                1.0
+                + alpha
+                * (
+                    cp.add.outer(
+                        cp.square(cp.asarray(indx)), cp.square(cp.asarray(indy))
+                    )
+                )
+            )
+        )
 
-    phase_filter = phase_filter / phase_filter.max()  # normalisation
+        phase_filter = phase_filter / phase_filter.max()  # normalisation
 
-    # Filter projections
-    fft_tomo *= phase_filter
+        # Filter projections
+        fft_tomo *= phase_filter
+        del phase_filter
 
     # Apply filter and take inverse FFT
-    ifft_filtered_tomo = ifft2(fft_tomo, axes=(-2, -1), overwrite_x=True).real
+    ifft_input = (
+        fft_tomo if not mem_stack else cp.empty(padded_tomo, dtype=cp.complex64)
+    )
+    ifft_plan = get_fft_plan(ifft_input, axes=(-2, -1))
+    if mem_stack:
+        mem_stack.malloc(ifft_plan.work_area.mem.size)
+        mem_stack.free(ifft_plan.work_area.mem.size)
+    else:
+        with ifft_plan:
+            ifft_filtered_tomo = ifft2(fft_tomo, axes=(-2, -1), overwrite_x=True).real
+        del fft_tomo
+    del ifft_plan
+    del ifft_input
 
     # slicing indices for cropping
     slc_indices = (
@@ -123,8 +184,19 @@ def paganin_filter(
         slice(pad_tup[2][0], pad_tup[2][0] + dx_orig, 1),
     )
 
+    if mem_stack:
+        mem_stack.malloc(np.prod(tomo) * np.float32().itemsize)  # astype(cp.float32)
+        mem_stack.free(
+            np.prod(padded_tomo) * np.complex64().itemsize
+        )  # ifft_filtered_tomo
+        mem_stack.malloc(
+            np.prod(tomo) * np.float32().itemsize
+        )  # return _log_kernel(tomo)
+        return mem_stack.highwater
+
     # crop the padded filtered data:
     tomo = ifft_filtered_tomo[slc_indices].astype(cp.float32)
+    del ifft_filtered_tomo
 
     # taking the negative log
     _log_kernel = cp.ElementwiseKernel(
@@ -177,7 +249,7 @@ def _calculate_pad_size(datashape: tuple) -> list:
 
 
 def _pad_projections_to_second_power(
-    tomo: cp.ndarray,
+    tomo: cp.ndarray, mem_stack: Optional[_DeviceMemStack]
 ) -> Tuple[cp.ndarray, Tuple[int, int]]:
     """
     Performs padding of each projection to the next power of 2.
@@ -194,11 +266,17 @@ def _pad_projections_to_second_power(
     ndarray: padded 3d projection data
     tuple: a tuple with padding dimensions
     """
-    full_shape_tomo = cp.shape(tomo)
+    full_shape_tomo = cp.shape(tomo) if not mem_stack else tomo
 
     pad_list = _calculate_pad_size(full_shape_tomo)
 
-    padded_tomo = cp.pad(tomo, tuple(pad_list), "edge")
+    if mem_stack:
+        padded_tomo = [
+            sh + pad[0] + pad[1] for sh, pad in zip(full_shape_tomo, pad_list)
+        ]
+        mem_stack.malloc(np.prod(padded_tomo) * np.float32().itemsize)
+    else:
+        padded_tomo = cp.pad(tomo, tuple(pad_list), "edge")
 
     return padded_tomo, tuple(pad_list)
 
@@ -209,7 +287,7 @@ def _wavelength_micron(energy: float) -> float:
     return 2 * math.pi * PLANCK_CONSTANT * SPEED_OF_LIGHT / energy
 
 
-def _reciprocal_coord(pixel_size: float, num_grid: int) -> cp.ndarray:
+def _reciprocal_coord(pixel_size: float, num_grid: int) -> np.ndarray:
     """
     Calculate reciprocal grid coordinates for a given pixel size
     and discretization.
@@ -227,7 +305,7 @@ def _reciprocal_coord(pixel_size: float, num_grid: int) -> cp.ndarray:
         Grid coordinates.
     """
     n = num_grid - 1
-    rc = cp.arange(-n, num_grid, 2, dtype=cp.float32)
+    rc = np.arange(-n, num_grid, 2, dtype=cp.float32)
     rc *= 2 * math.pi / (n * pixel_size)
     return rc
 
@@ -238,6 +316,7 @@ def paganin_filter_savu_legacy(
     distance: float = 1.0,
     energy: float = 53.0,
     ratio_delta_beta: float = 250,
+    calc_peak_gpu_mem: bool = False,
 ) -> cp.ndarray:
     """
     Perform single-material phase retrieval from flats/darks corrected tomographic measurements. For more detailed information, see :ref:`phase_contrast_module`.
@@ -256,11 +335,20 @@ def paganin_filter_savu_legacy(
         Beam energy in keV.
     ratio_delta_beta : float
         The ratio of delta/beta, where delta is the phase shift and real part of the complex material refractive index and beta is the absorption.
+    calc_peak_gpu_mem: bool
+        Parameter to support memory estimation in HTTomo. Irrelevant to the method itself and can be ignored by user.
 
     Returns
     -------
     cp.ndarray
         The 3D array of Paganin phase-filtered projection images.
     """
 
-    return paganin_filter(tomo, pixel_size, distance, energy, ratio_delta_beta / 4)
+    return paganin_filter(
+        tomo,
+        pixel_size,
+        distance,
+        energy,
+        ratio_delta_beta / 4,
+        calc_peak_gpu_mem=calc_peak_gpu_mem,
+    )
diff --git a/httomolibgpu/recon/_phase_cross_correlation.py b/httomolibgpu/recon/_phase_cross_correlation.py
@@ -36,9 +36,8 @@
 import cupyx.scipy.ndimage as ndi
 import numpy as np
 
-def _upsampled_dft(
-    data, upsampled_region_size, upsample_factor=1, axis_offsets=None
-):
+
+def _upsampled_dft(data, upsampled_region_size, upsample_factor=1, axis_offsets=None):
     """
     Upsampled DFT by matrix multiplication.
 
@@ -148,9 +147,7 @@ def _compute_error(cross_correlation_max, src_amp, target_amp):
         )
 
     with np.errstate(invalid="ignore"):
-        error = 1.0 - cross_correlation_max * cross_correlation_max.conj() / (
-            amp
-        )
+        error = 1.0 - cross_correlation_max * cross_correlation_max.conj() / (amp)
 
     return cp.sqrt(cp.abs(error))
 
@@ -192,9 +189,7 @@ def _disambiguate_shift(reference_image, moving_image, shift):
     negative_shift = [shift_i - s for shift_i, s in zip(positive_shift, shape)]
     subpixel = any(s % 1 != 0 for s in shift)
     interp_order = 3 if subpixel else 0
-    shifted = ndi.shift(
-        moving_image, shift, mode="grid-wrap", order=interp_order
-    )
+    shifted = ndi.shift(moving_image, shift, mode="grid-wrap", order=interp_order)
     indices = tuple(round(s) for s in positive_shift)
     splits_per_dim = [(slice(0, i), slice(i, None)) for i in indices]
     max_corr = -1.0
@@ -217,9 +212,7 @@ def _disambiguate_shift(reference_image, moving_image, shift):
         )
         return shift
     real_shift_acc = []
-    for sl, pos_shift, neg_shift in zip(
-        max_slice, positive_shift, negative_shift
-    ):
+    for sl, pos_shift, neg_shift in zip(max_slice, positive_shift, negative_shift):
         real_shift_acc.append(pos_shift if sl.stop is None else neg_shift)
     if not subpixel:
         real_shift = tuple(map(int, real_shift_acc))
@@ -359,16 +352,12 @@ def phase_cross_correlation(
         # Initial shift estimate in upsampled grid
         # shift = cp.around(shift * upsample_factor) / upsample_factor
         upsample_factor = float(upsample_factor)
-        shift = tuple(
-            round(s * upsample_factor) / upsample_factor for s in shift
-        )
+        shift = tuple(round(s * upsample_factor) / upsample_factor for s in shift)
         upsampled_region_size = math.ceil(upsample_factor * 1.5)
         # Center of output array at dftshift + 1
         dftshift = float(upsampled_region_size // 2)
         # Matrix multiply DFT around the current shift estimate
-        sample_region_offset = tuple(
-            dftshift - s * upsample_factor for s in shift
-        )
+        sample_region_offset = tuple(dftshift - s * upsample_factor for s in shift)
         cross_correlation = _upsampled_dft(
             image_product.conj(),
             upsampled_region_size,
@@ -394,9 +383,7 @@ def phase_cross_correlation(
 
     # If its only one row or column the shift along that dimension has no
     # effect. We set to zero.
-    shift = tuple(
-        s if axis_size != 1 else 0 for s, axis_size in zip(shift, shape)
-    )
+    shift = tuple(s if axis_size != 1 else 0 for s, axis_size in zip(shift, shape))
 
     if disambiguate:
         if space.lower() != "real":
@@ -406,10 +393,7 @@ def phase_cross_correlation(
 
     # Redirect user to masked_phase_cross_correlation if NaNs are observed
     if cp.isnan(CCmax) or cp.isnan(src_amp) or cp.isnan(target_amp):
-        raise ValueError(
-            "NaN values found, please remove NaNs from your "
-            "input data"
-        )
+        raise ValueError("NaN values found, please remove NaNs from your " "input data")
 
     return (
         shift,
diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -60,9 +60,13 @@ def data_file(test_data_path):
 def ensure_clean_memory():
     cp.get_default_memory_pool().free_all_blocks()
     cp.get_default_pinned_memory_pool().free_all_blocks()
+    cache = cp.fft.config.get_plan_cache()
+    cache.clear()
     yield None
     cp.get_default_memory_pool().free_all_blocks()
     cp.get_default_pinned_memory_pool().free_all_blocks()
+    cache = cp.fft.config.get_plan_cache()
+    cache.clear()
 
 
 @pytest.fixture
@@ -135,3 +139,31 @@ def host_detector_x(data_file):
 @pytest.fixture
 def detector_x(host_detector_x, ensure_clean_memory):
     return cp.asarray(host_detector_x)
+
+
+class MaxMemoryHook(cp.cuda.MemoryHook):
+    """Memory hook recording the running and peak number of allocated bytes.
+
+    Each ``malloc_postprocess`` call adds its ``mem_size`` to ``current`` and
+    each ``free_postprocess`` call subtracts its ``mem_size``; ``max_mem``
+    keeps the largest value ``current`` has reached.
+    """
+
+    def __init__(self, initial=0):
+        # Both counters start from the same baseline value.
+        self.current = initial
+        self.max_mem = initial
+
+    def malloc_postprocess(
+        self, device_id: int, size: int, mem_size: int, mem_ptr: int, pmem_id: int
+    ):
+        """Add ``mem_size`` to the running total and refresh the peak."""
+        total = self.current + mem_size
+        self.current = total
+        self.max_mem = max(self.max_mem, total)
+
+    def free_postprocess(
+        self, device_id: int, mem_size: int, mem_ptr: int, pmem_id: int
+    ):
+        """Subtract ``mem_size`` from the running total."""
+        self.current = self.current - mem_size
diff --git a/tests/test_prep/test_phase.py b/tests/test_prep/test_phase.py