
Commit 694e9ca

removing erroneous downsample kernel, adding performance test to zenodo tests
1 parent 6ae3cb1 commit 694e9ca

File tree

3 files changed: +56 -111 lines changed

httomolibgpu/cuda_kernels/downsample_sino.cu
httomolibgpu/recon/rotation.py
zenodo-tests/test_recon/test_rotation.py

httomolibgpu/cuda_kernels/downsample_sino.cu

Lines changed: 0 additions & 36 deletions
This file was deleted.

httomolibgpu/recon/rotation.py

Lines changed: 9 additions & 59 deletions
@@ -66,7 +66,7 @@ def find_center_vo(
     step: float = 0.25,
     ratio: float = 0.5,
     drop: int = 20,
-) -> float:
+) -> np.float32:
     """
     Find the rotation axis location (aka the centre of rotation) using Nghia Vo's method. See the paper
     :cite:`vo2014reliable`.
@@ -99,7 +99,7 @@ def find_center_vo(

     Returns
     -------
-    float
+    float32
         Rotation axis location with a subpixel precision.
     """
     # if 2d sinogram is given it is extended into a 3D array along the vertical dimension
@@ -148,12 +148,6 @@ def find_center_vo(
     if dsp_angle > 1 or dsp_detX > 1:
         _sino_cs = _downsample(_sino_cs, dsp_angle, dsp_detX)

-    # NOTE: the gpu implementation of _downsample kernel bellow is erroneuos (different results with each run), needs to be re-written
-    # if dsp_angle > 1:
-    #     _sino_cs = _downsample_kernel(_sino_cs, level=dsp_angle, axis=0)
-    # if dsp_detX > 1:
-    #     _sino_cs = _downsample_kernel(_sino_cs, level=dsp_detX, axis=1)
-
     # NOTE: this is correct implementation that avoids running any CUDA kernels. The performance is suboptimal
     init_cen = _search_coarse(_sino_cs, start_cor, stop_cor, ratio, drop)

@@ -164,7 +158,7 @@ def find_center_vo(
     )
     cen_np = np.float32(cp.asnumpy(fine_cen))
     if cen_np == 0.0:
-        return cor_initialisation_value
+        return np.float32(cor_initialisation_value)
     else:
         return cen_np

@@ -174,22 +168,7 @@ def _search_coarse(sino, smin, smax, ratio, drop):
     flip_sino = cp.ascontiguousarray(cp.fliplr(sino))
     comp_sino = cp.ascontiguousarray(cp.flipud(sino))

-    # # NOTE: gpu code here, half a mask created to avoid sinofram concatenitation and save memory?
-    # mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop)
-    # # NOTE: old GPU code for the sizes with half data
-    # cen_fliplr = (ncol - 1.0) / 2.0
-    # smin_clip_val = max(min(smin + cen_fliplr, ncol - 1), 0)
-    # smin = smin_clip_val - cen_fliplr
-    # smax_clip_val = max(min(smax + cen_fliplr, ncol - 1), 0)
-    # smax = smax_clip_val - cen_fliplr
-    # start_cor = ncol // 2 + smin
-    # stop_cor = ncol // 2 + smax
-    # list_cor = cp.arange(start_cor, stop_cor + 0.5, 0.5, dtype=cp.float32)
-    # list_shift = 2.0 * (list_cor - cen_fliplr)
-    # list_metric = cp.empty(list_shift.shape, dtype=cp.float32)
-
     mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop)
-    # mask = cp.asarray(mask, dtype=cp.float32)
     cen_fliplr = (ncol - 1.0) / 2.0
     start_cor, stop_cor = np.sort((smin, smax))
     start_cor = np.int16(np.clip(start_cor, 0, ncol - 1))
@@ -198,10 +177,6 @@ def _search_coarse(sino, smin, smax, ratio, drop):
     list_shift = 2.0 * (list_cor - cen_fliplr)
     list_metric = cp.empty(list_shift.shape, dtype=cp.float32)

-    # NOTE: this gives a different result to the CPU code, also works with a half data and a half mask
-    # _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, list_metric)
-
-    # This essentially repeats the CPU code... probably not optimal but correct
     sino_sino = cp.vstack((sino, flip_sino))
     for i, shift in enumerate(list_shift):
         _sino = sino_sino[nrow:]
@@ -258,6 +233,7 @@ def _create_mask_numpy(nrow, ncol, radius, drop):
     mask[:, cen_col - 1 : cen_col + 2] = 0.0
     return mask

+
 def _create_mask_half(nrow, ncol, radius, drop):
     du = 1.0 / ncol
     dv = (nrow - 1.0) / (nrow * 2.0 * np.pi)
@@ -288,6 +264,7 @@ def _create_mask_half(nrow, ncol, radius, drop):
     kernel(grid_dims, block_dims, params)
     return mask

+
 def _create_mask(nrow, ncol, radius, drop):
     du = 1.0 / ncol
     dv = (nrow - 1.0) / (nrow * 2.0 * np.pi)
@@ -345,7 +322,8 @@ def _calculate_chunks(
     available_memory -= shift_size
     freq_domain_size = (
         # shift_size # it needs only half (RFFT), but complex64, so it's the same
-        shift_size * 2  # it needs full (FFT), with complex64, so it's double
+        shift_size
+        * 2  # it needs full (FFT), with complex64, so it's double
     )
     fft_plan_size = freq_domain_size
     size_per_shift = 2 * (fft_plan_size + freq_domain_size + shift_size)
@@ -450,6 +428,7 @@ def _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, out):
             mat_freq[:size, :, :], mask, out=out[start_idx:stop_idx], axis=(1, 2)
         )

+
 def _downsample(image, dsp_fact0, dsp_fact1):
     """Downsample an image by averaging.

@@ -480,35 +459,6 @@ def _downsample(image, dsp_fact0, dsp_fact1):
     return image_dsp


-def _downsample_kernel(sino, level, axis):
-    assert sino.dtype == cp.float32, "single precision floating point input required"
-    assert sino.flags["C_CONTIGUOUS"], "list_shift must be C-contiguous"
-
-    dx, dz = sino.shape
-    # Determine the new size, dim, of the downsampled dimension
-    # dim_new_size = int(sino.shape[axis] / math.pow(2, level))
-    dim_new_size = int(sino.shape[axis] / level)
-    shape = [dx, dz]
-    shape[axis] = dim_new_size
-    downsampled_data = cp.empty(shape, dtype="float32")
-
-    block_x = 8
-    block_y = 8
-    block_dims = (block_x, block_y)
-    grid_x = (sino.shape[1] + block_x - 1) // block_x
-    grid_y = (sino.shape[0] + block_y - 1) // block_y
-    grid_dims = (grid_x, grid_y)
-    # 8x8 thread-block, which means 16 "lots" of columns to downsample per
-    # thread-block; 4 bytes per float, so allocate 16*6 = 64 bytes of shared
-    # memeory per thread-block
-    shared_mem_bytes = 64
-    params = (sino, dx, dz, level, downsampled_data)
-    module = load_cuda_module("downsample_sino")
-    kernel = module.get_function("downsample_sino")
-    kernel(grid_dims, block_dims, params, shared_mem=shared_mem_bytes)
-    return downsampled_data
-
-
 ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # # %%%%%%%%%%%%%%%%%%%%%%%%%find_center_360%%%%%%%%%%%%%%%%%%%%%%%%%
 # --- Center of rotation (COR) estimation method ---#
@@ -858,7 +808,7 @@ def find_center_pc(
         Subpixel accuracy. Defaults to 0.5.
     rotc_guess : float, optional
         Initial guess value for the rotation center. Defaults to None.
-
+
     Returns
     ----------
     float
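For context, the averaging-based _downsample(image, dsp_fact0, dsp_fact1) helper that this commit keeps in place of the removed CUDA kernel can be pictured as a plain CuPy reshape-and-mean reduction, which is deterministic between runs (unlike the removed kernel, noted above as giving different results on each run). The sketch below is illustrative only, not the library's implementation; the function name and the edge-trimming choice are assumptions.

import cupy as cp


def downsample_by_averaging(image: cp.ndarray, dsp_fact0: int, dsp_fact1: int) -> cp.ndarray:
    # Trim rows/columns so each dimension divides evenly by its downsampling factor.
    h = (image.shape[0] // dsp_fact0) * dsp_fact0
    w = (image.shape[1] // dsp_fact1) * dsp_fact1
    trimmed = image[:h, :w]
    # Group pixels into (dsp_fact0 x dsp_fact1) blocks and average each block.
    blocks = trimmed.reshape(h // dsp_fact0, dsp_fact0, w // dsp_fact1, dsp_fact1)
    return blocks.mean(axis=(1, 3), dtype=cp.float32)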

zenodo-tests/test_recon/test_rotation.py

Lines changed: 47 additions & 16 deletions
@@ -1,11 +1,58 @@
 import cupy as cp
 import numpy as np
 import pytest
+from cupy.cuda import nvtx
+import time

 from httomolibgpu.prep.normalize import normalize
 from httomolibgpu.recon.rotation import find_center_vo


+def test_center_vo_i12_dataset1(i12_dataset1, ensure_clean_memory):
+    projdata = i12_dataset1[0]
+    flats = i12_dataset1[2]
+    darks = i12_dataset1[3]
+    del i12_dataset1
+
+    data_normalised = normalize(projdata, flats, darks, minus_log=True)
+    del flats, darks, projdata
+    ensure_clean_memory
+
+    mid_slice = data_normalised.shape[1] // 2
+    cor = find_center_vo(data_normalised[:, mid_slice, :])
+
+    assert cor == 1253.75
+    assert cor.dtype == np.float32
+
+
+@pytest.mark.perf
+def test_center_vo_i12_dataset1_performance(i12_dataset1, ensure_clean_memory):
+    dev = cp.cuda.Device()
+
+    projdata = i12_dataset1[0]
+    flats = i12_dataset1[2]
+    darks = i12_dataset1[3]
+    del i12_dataset1
+
+    data_normalised = normalize(projdata, flats, darks, minus_log=True)
+    del flats, darks, projdata
+    ensure_clean_memory
+
+    mid_slice = data_normalised.shape[1] // 2
+    # cold run first
+    cor = find_center_vo(data_normalised[:, mid_slice, :])
+
+    start = time.perf_counter_ns()
+    nvtx.RangePush("Core")
+    for _ in range(10):
+        find_center_vo(data_normalised[:, mid_slice, :])
+    nvtx.RangePop()
+    dev.synchronize()
+    duration_ms = float(time.perf_counter_ns() - start) * 1e-6 / 10
+
+    assert "performance in ms" == duration_ms
+
+
 def test_center_vo_i12_dataset2(i12_dataset2, ensure_clean_memory):
     projdata = i12_dataset2[0]
     flats = i12_dataset2[2]
@@ -37,22 +84,6 @@ def test_center_vo_average_i12_dataset2(i12_dataset2, ensure_clean_memory):
     assert cor.dtype == np.float32


-def test_center_vo_i12_dataset1(i12_dataset1, ensure_clean_memory):
-    projdata = i12_dataset1[0]
-    flats = i12_dataset1[2]
-    darks = i12_dataset1[3]
-    del i12_dataset1
-
-    data_normalised = normalize(projdata, flats, darks, minus_log=True)
-    del flats, darks, projdata
-
-    mid_slice = data_normalised.shape[1] // 2
-    cor = find_center_vo(data_normalised[:, mid_slice, :])
-
-    assert cor == 1253.75
-    assert cor.dtype == np.float32
-
-
 def test_center_vo_geant4_dataset1(geant4_dataset1, ensure_clean_memory):
     projdata = geant4_dataset1[0]
     flats = geant4_dataset1[2]
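The added performance test follows a common CuPy timing pattern: one cold run to exclude kernel compilation and plan caching, an NVTX range so the region shows up in profiler timelines, an explicit device synchronize before the stop timestamp, and a per-iteration average in milliseconds; the string-to-float assert at the end always fails, which appears intended to surface the measured duration in the pytest output. A self-contained sketch of that pattern is shown below; the helper name and the dummy workload are assumptions, not part of this commit.

import time

import cupy as cp
from cupy.cuda import nvtx


def time_gpu_call_ms(fn, repeats: int = 10) -> float:
    dev = cp.cuda.Device()
    fn()  # cold run: warm up kernels and cached plans so they do not skew the measurement
    start = time.perf_counter_ns()
    nvtx.RangePush("Core")
    for _ in range(repeats):
        fn()
    nvtx.RangePop()
    dev.synchronize()  # wait for all queued GPU work before reading the stop time
    return (time.perf_counter_ns() - start) * 1e-6 / repeats


# Hypothetical usage:
# duration_ms = time_gpu_call_ms(lambda: cp.fft.fft2(cp.ones((2048, 2048), dtype=cp.float32)))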
