Add raven filter to stripe.py and add performance test

neon60 · neon60 · commit 2979406dc5dc · 2024-11-05T10:22:03.000+01:00
diff --git a/docs/source/examples/raven_filter_example.py b/docs/source/examples/raven_filter_example.py
@@ -8,6 +8,8 @@
 from httomolibgpu.misc.raven_filter import raven_filter
 
 import matplotlib.pyplot as plt
+import time
+
 
 # Load the sinogram data
 path_lib = os.path.dirname(httomolibgpu.__file__)
@@ -31,8 +33,12 @@
 # Make a numpy copy
 sinogram_padded = np.pad(sinogram.get(), 20, "edge")
 
+start_time = time.time()
 # GPU filter
 sinogram_gpu_filter = raven_filter(sinogram, u0, n, v0)
+print("--- %s seconds ---" % (time.time() - start_time))
+
+start_time = time.time()
 
 # Size
 width1 = sino_shape[1] + 2 * 20
@@ -61,6 +67,8 @@
 sino[row1:row2] = sino[row1:row2] * filtercomplex
 sino = ifft_object(fft.ifftshift(sino))
 
+print("--- %s seconds ---" % (time.time() - start_time))
+
 #subplot(r,c) provide the no. of rows and columns
 f, axarr = plt.subplots(2,2) 
 
diff --git a/httomolibgpu/prep/stripe.py b/httomolibgpu/prep/stripe.py
@@ -30,14 +30,15 @@
 
 if cupy_run:
     from cupyx.scipy.ndimage import median_filter, binary_dilation, uniform_filter1d
-    from httomolibgpu.misc.raven_filter import (
-        raven_filter,
-    )
+    from cupyx.scipy.fft import fft2, ifft2, fftshift
+    from httomolibgpu.cuda_kernels import load_cuda_module
 else:
     median_filter = Mock()
     binary_dilation = Mock()
     uniform_filter1d = Mock()
-    raven_filter = Mock()
+    fft2 = Mock()
+    ifft2 = Mock()
+    fftshift = Mock()
 
 from typing import Union
 
@@ -363,26 +364,76 @@ def _rs_dead(sinogram, snr, size, matindex, norm=True):
         sinogram = _rs_large(sinogram, snr, size, matindex)
     return sinogram
 
-def _raven_filter(sinogram, snr, size, matindex, vvalue=10, uvalue=10, nvalue=10 ):
+def raven_filter(
+        sinogram,
+        uvalue: int = 20,
+        nvalue: int = 4,
+        vvalue: int = 2,
+        pad_y: int = 20,
+        pad_x: int = 20,
+        pad_method: str = "edge"):
     """
-    Raven filter
+    Apply a Raven filter to a 3D stack in the Fourier domain.
+
+    The last two axes of ``sinogram`` are padded, transformed with a 2D FFT
+    and shifted, filtered by the ``raven_filter`` CUDA kernel and transformed
+    back with an inverse 2D FFT.
+
+    Parameters
+    ----------
+    sinogram : cp.ndarray
+        3D input array; axes 1 and 2 are the ones padded and transformed.
+    uvalue : int, optional
+        Filter parameter forwarded unchanged to the CUDA kernel.
+    nvalue : int, optional
+        Filter parameter forwarded unchanged to the CUDA kernel.
+    vvalue : int, optional
+        Filter parameter forwarded unchanged to the CUDA kernel.
+    pad_y : int, optional
+        Padding added on both sides of axis 1.
+    pad_x : int, optional
+        Padding added on both sides of axis 2.
+    pad_method : str, optional
+        Padding mode handed to ``cp.pad``.
+
+    Returns
+    -------
+    cp.ndarray
+        Output of the inverse FFT. The padding is not cropped in this
+        function and the result is not converted back to a real dtype.
     """
-    padding = 2
-    (nrow, ncol) = sinogram.shape
-    width1 =  nrow + 2 * padding #sino_shape[1] + 2 * self.pad
-    height1 = ncol + 2 * padding #sino_shape[0] + 2 * self.pad
-
-    # Create filter
-    centerx = np.ceil(width1 / 2.0) - 1.0
-    centery = np.int16(np.ceil(height1 / 2.0) - 1)
-    row1 = centery - vvalue
-    row2 = centery + vvalue + 1
-    listx = np.arange(width1) - centerx
-    filtershape = 1.0 / (1.0 + np.power(listx / uvalue, 2 * nvalue))
-    filtershapepad2d = np.zeros((self.row2 - self.row1, filtershape.size))
-    filtershapepad2d[:] = np.float64(filtershape)
-    filtercomplex = filtershapepad2d + filtershapepad2d * 1j
 
+    # Pad only the last two axes; the leading axis is left untouched.
+    padded_data = cp.pad(
+        sinogram, ((0, 0), (pad_y, pad_y), (pad_x, pad_x)), mode=pad_method
+    )
+
+    # FFT and shift of data, both restricted to the two padded axes.
+    fft_data = fft2(padded_data, axes=(-2, -1), overwrite_x=True)
+    # Without explicit axes fftshift would also roll the leading axis,
+    # which is not transformed by fft2 above.
+    fft_data_shifted = fftshift(fft_data, axes=(-2, -1))
+
+    # Setup various values for the filter
+    _, height, width = sinogram.shape
+    height1 = height + 2 * pad_y
+    width1 = width + 2 * pad_x
+
+    # setting grid/block parameters
+    block_x = 128
+    block_dims = (block_x, 1, 1)
+    grid_x = (width1 + block_x - 1) // block_x
+    grid_y = height1
+    grid_dims = (grid_x, grid_y, 1)
+    params = (fft_data_shifted, fft_data, width1, height1, uvalue, nvalue, vvalue)
+
+    raven_module = load_cuda_module("raven_filter")
+    raven_filt = raven_module.get_function("raven_filter")
+    raven_filt(grid_dims, block_dims, params)
+
+    # No ifftshift here: per the original note the kernel already writes
+    # its output to fft_data in un-shifted order (TODO confirm in kernel).
+    sinogram = ifft2(fft_data, axes=(-2, -1), overwrite_x=True)
 
     return sinogram
 
diff --git a/tests/test_prep/test_stripe.py b/tests/test_prep/test_stripe.py
@@ -3,11 +3,14 @@
 from cupy.cuda import nvtx
 import numpy as np
 import pytest
+import pyfftw
+import pyfftw.interfaces.numpy_fft as fft
 from httomolibgpu.prep.normalize import normalize
 from httomolibgpu.prep.stripe import (
     remove_stripe_based_sorting,
     remove_stripe_ti,
     remove_all_stripe,
+    raven_filter,
 )
 from numpy.testing import assert_allclose
 
@@ -51,7 +54,6 @@ def test_remove_stripe_ti_on_data(data, flats, darks):
 #         np.median(corrected_data), np.median(corrected_host_data), rtol=1e-6
 #     )
 
-
 def test_stripe_removal_sorting_cupy(data, flats, darks):
     # --- testing the CuPy port of TomoPy's implementation ---#
     data = normalize(data, flats, darks, cutoff=10, minus_log=True)
@@ -66,7 +68,6 @@ def test_stripe_removal_sorting_cupy(data, flats, darks):
     assert corrected_data.dtype == np.float32
     assert corrected_data.flags.c_contiguous
 
-
 @pytest.mark.perf
 def test_stripe_removal_sorting_cupy_performance(ensure_clean_memory):
     data_host = (
@@ -116,6 +117,29 @@ def test_remove_stripe_ti_performance(ensure_clean_memory):
 
     assert "performance in ms" == duration_ms
 
+@pytest.mark.perf
+def test_raven_filter_performance(ensure_clean_memory):
+    """Time ten runs of raven_filter after one untimed first call."""
+    runs = 10
+    shape = (1801, 5, 2560)
+    host = np.random.random_sample(size=shape).astype(np.float32) * 2.0 + 0.001
+    data = cp.asarray(host, dtype=np.float32)
+
+    # one untimed call first (the cold run)
+    raven_filter(cp.copy(data))
+
+    device = cp.cuda.Device()
+    device.synchronize()
+
+    t0 = time.perf_counter_ns()
+    nvtx.RangePush("Core")
+    for _ in range(runs):
+        # each timed call is handed its own copy of the input
+        raven_filter(cp.copy(data))
+    nvtx.RangePop()
+    device.synchronize()
+    duration_ms = float(time.perf_counter_ns() - t0) * 1e-6 / runs
+    assert "performance in ms" == duration_ms
 
 def test_remove_all_stripe_on_data(data, flats, darks):
     # --- testing the CuPy implementation from TomoCupy ---#