
Commit 694e9ca

removing erroneous downsample kernel, adding performance test to zenodo tests
1 parent 6ae3cb1 commit 694e9ca

File tree

3 files changed: +56 -111 lines changed

httomolibgpu/cuda_kernels/downsample_sino.cu
httomolibgpu/recon/rotation.py
zenodo-tests/test_recon/test_rotation.py

httomolibgpu/cuda_kernels/downsample_sino.cu

Lines changed: 0 additions & 36 deletions
This file was deleted.

httomolibgpu/recon/rotation.py

Lines changed: 9 additions & 59 deletions
@@ -66,7 +66,7 @@ def find_center_vo(
     step: float = 0.25,
     ratio: float = 0.5,
     drop: int = 20,
-) -> float:
+) -> np.float32:
     """
     Find the rotation axis location (aka the centre of rotation) using Nghia Vo's method. See the paper
     :cite:`vo2014reliable`.
@@ -99,7 +99,7 @@ def find_center_vo(

     Returns
     -------
-    float
+    float32
         Rotation axis location with a subpixel precision.
     """
     # if 2d sinogram is given it is extended into a 3D array along the vertical dimension
@@ -148,12 +148,6 @@ def find_center_vo(
     if dsp_angle > 1 or dsp_detX > 1:
         _sino_cs = _downsample(_sino_cs, dsp_angle, dsp_detX)

-    # NOTE: the gpu implementation of _downsample kernel bellow is erroneuos (different results with each run), needs to be re-written
-    # if dsp_angle > 1:
-    #     _sino_cs = _downsample_kernel(_sino_cs, level=dsp_angle, axis=0)
-    # if dsp_detX > 1:
-    #     _sino_cs = _downsample_kernel(_sino_cs, level=dsp_detX, axis=1)
-
     # NOTE: this is correct implementation that avoids running any CUDA kernels. The performance is suboptimal
     init_cen = _search_coarse(_sino_cs, start_cor, stop_cor, ratio, drop)

@@ -164,7 +158,7 @@ def find_center_vo(
     )
     cen_np = np.float32(cp.asnumpy(fine_cen))
     if cen_np == 0.0:
-        return cor_initialisation_value
+        return np.float32(cor_initialisation_value)
     else:
         return cen_np

@@ -174,22 +168,7 @@ def _search_coarse(sino, smin, smax, ratio, drop):
     flip_sino = cp.ascontiguousarray(cp.fliplr(sino))
     comp_sino = cp.ascontiguousarray(cp.flipud(sino))

-    # # NOTE: gpu code here, half a mask created to avoid sinofram concatenitation and save memory?
-    # mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop)
-    # # NOTE: old GPU code for the sizes with half data
-    # cen_fliplr = (ncol - 1.0) / 2.0
-    # smin_clip_val = max(min(smin + cen_fliplr, ncol - 1), 0)
-    # smin = smin_clip_val - cen_fliplr
-    # smax_clip_val = max(min(smax + cen_fliplr, ncol - 1), 0)
-    # smax = smax_clip_val - cen_fliplr
-    # start_cor = ncol // 2 + smin
-    # stop_cor = ncol // 2 + smax
-    # list_cor = cp.arange(start_cor, stop_cor + 0.5, 0.5, dtype=cp.float32)
-    # list_shift = 2.0 * (list_cor - cen_fliplr)
-    # list_metric = cp.empty(list_shift.shape, dtype=cp.float32)
-
     mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop)
-    # mask = cp.asarray(mask, dtype=cp.float32)
     cen_fliplr = (ncol - 1.0) / 2.0
     start_cor, stop_cor = np.sort((smin, smax))
     start_cor = np.int16(np.clip(start_cor, 0, ncol - 1))
@@ -198,10 +177,6 @@ def _search_coarse(sino, smin, smax, ratio, drop):
     list_shift = 2.0 * (list_cor - cen_fliplr)
     list_metric = cp.empty(list_shift.shape, dtype=cp.float32)

-    # NOTE: this gives a different result to the CPU code, also works with a half data and a half mask
-    # _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, list_metric)
-
-    # This essentially repeats the CPU code... probably not optimal but correct
     sino_sino = cp.vstack((sino, flip_sino))
     for i, shift in enumerate(list_shift):
         _sino = sino_sino[nrow:]
@@ -258,6 +233,7 @@ def _create_mask_numpy(nrow, ncol, radius, drop):
     mask[:, cen_col - 1 : cen_col + 2] = 0.0
     return mask

+
 def _create_mask_half(nrow, ncol, radius, drop):
     du = 1.0 / ncol
     dv = (nrow - 1.0) / (nrow * 2.0 * np.pi)
@@ -288,6 +264,7 @@ def _create_mask_half(nrow, ncol, radius, drop):
     kernel(grid_dims, block_dims, params)
     return mask

+
 def _create_mask(nrow, ncol, radius, drop):
     du = 1.0 / ncol
     dv = (nrow - 1.0) / (nrow * 2.0 * np.pi)
@@ -345,7 +322,8 @@ def _calculate_chunks(
     available_memory -= shift_size
     freq_domain_size = (
         # shift_size # it needs only half (RFFT), but complex64, so it's the same
-        shift_size * 2  # it needs full (FFT), with complex64, so it's double
+        shift_size
+        * 2  # it needs full (FFT), with complex64, so it's double
     )
     fft_plan_size = freq_domain_size
     size_per_shift = 2 * (fft_plan_size + freq_domain_size + shift_size)
@@ -450,6 +428,7 @@ def _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, out):
             mat_freq[:size, :, :], mask, out=out[start_idx:stop_idx], axis=(1, 2)
         )

+
 def _downsample(image, dsp_fact0, dsp_fact1):
     """Downsample an image by averaging.

@@ -480,35 +459,6 @@ def _downsample(image, dsp_fact0, dsp_fact1):
     return image_dsp


-def _downsample_kernel(sino, level, axis):
-    assert sino.dtype == cp.float32, "single precision floating point input required"
-    assert sino.flags["C_CONTIGUOUS"], "list_shift must be C-contiguous"
-
-    dx, dz = sino.shape
-    # Determine the new size, dim, of the downsampled dimension
-    # dim_new_size = int(sino.shape[axis] / math.pow(2, level))
-    dim_new_size = int(sino.shape[axis] / level)
-    shape = [dx, dz]
-    shape[axis] = dim_new_size
-    downsampled_data = cp.empty(shape, dtype="float32")
-
-    block_x = 8
-    block_y = 8
-    block_dims = (block_x, block_y)
-    grid_x = (sino.shape[1] + block_x - 1) // block_x
-    grid_y = (sino.shape[0] + block_y - 1) // block_y
-    grid_dims = (grid_x, grid_y)
-    # 8x8 thread-block, which means 16 "lots" of columns to downsample per
-    # thread-block; 4 bytes per float, so allocate 16*6 = 64 bytes of shared
-    # memeory per thread-block
-    shared_mem_bytes = 64
-    params = (sino, dx, dz, level, downsampled_data)
-    module = load_cuda_module("downsample_sino")
-    kernel = module.get_function("downsample_sino")
-    kernel(grid_dims, block_dims, params, shared_mem=shared_mem_bytes)
-    return downsampled_data
-
-
 ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # # %%%%%%%%%%%%%%%%%%%%%%%%%find_center_360%%%%%%%%%%%%%%%%%%%%%%%%%
 # --- Center of rotation (COR) estimation method ---#
@@ -858,7 +808,7 @@ def find_center_pc(
         Subpixel accuracy. Defaults to 0.5.
     rotc_guess : float, optional
         Initial guess value for the rotation center. Defaults to None.
-
+
     Returns
     ----------
     float
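For context, the averaging-based _downsample(image, dsp_fact0, dsp_fact1) helper that this commit keeps in place of the removed CUDA kernel can be pictured as a plain CuPy reshape-and-mean reduction, which is deterministic between runs (unlike the removed kernel, noted above as giving different results on each run). The sketch below is illustrative only, not the library's implementation; the function name and the edge-trimming choice are assumptions.

import cupy as cp


def downsample_by_averaging(image: cp.ndarray, dsp_fact0: int, dsp_fact1: int) -> cp.ndarray:
    # Trim rows/columns so each dimension divides evenly by its downsampling factor.
    h = (image.shape[0] // dsp_fact0) * dsp_fact0
    w = (image.shape[1] // dsp_fact1) * dsp_fact1
    trimmed = image[:h, :w]
    # Group pixels into (dsp_fact0 x dsp_fact1) blocks and average each block.
    blocks = trimmed.reshape(h // dsp_fact0, dsp_fact0, w // dsp_fact1, dsp_fact1)
    return blocks.mean(axis=(1, 3), dtype=cp.float32)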

zenodo-tests/test_recon/test_rotation.py

Lines changed: 47 additions & 16 deletions
@@ -1,11 +1,58 @@
 import cupy as cp
 import numpy as np
 import pytest
+from cupy.cuda import nvtx
+import time

 from httomolibgpu.prep.normalize import normalize
 from httomolibgpu.recon.rotation import find_center_vo


+def test_center_vo_i12_dataset1(i12_dataset1, ensure_clean_memory):
+    projdata = i12_dataset1[0]
+    flats = i12_dataset1[2]
+    darks = i12_dataset1[3]
+    del i12_dataset1
+
+    data_normalised = normalize(projdata, flats, darks, minus_log=True)
+    del flats, darks, projdata
+    ensure_clean_memory
+
+    mid_slice = data_normalised.shape[1] // 2
+    cor = find_center_vo(data_normalised[:, mid_slice, :])
+
+    assert cor == 1253.75
+    assert cor.dtype == np.float32
+
+
+@pytest.mark.perf
+def test_center_vo_i12_dataset1_performance(i12_dataset1, ensure_clean_memory):
+    dev = cp.cuda.Device()
+
+    projdata = i12_dataset1[0]
+    flats = i12_dataset1[2]
+    darks = i12_dataset1[3]
+    del i12_dataset1
+
+    data_normalised = normalize(projdata, flats, darks, minus_log=True)
+    del flats, darks, projdata
+    ensure_clean_memory
+
+    mid_slice = data_normalised.shape[1] // 2
+    # cold run first
+    cor = find_center_vo(data_normalised[:, mid_slice, :])
+
+    start = time.perf_counter_ns()
+    nvtx.RangePush("Core")
+    for _ in range(10):
+        find_center_vo(data_normalised[:, mid_slice, :])
+    nvtx.RangePop()
+    dev.synchronize()
+    duration_ms = float(time.perf_counter_ns() - start) * 1e-6 / 10
+
+    assert "performance in ms" == duration_ms
+
+
 def test_center_vo_i12_dataset2(i12_dataset2, ensure_clean_memory):
     projdata = i12_dataset2[0]
     flats = i12_dataset2[2]
@@ -37,22 +84,6 @@ def test_center_vo_average_i12_dataset2(i12_dataset2, ensure_clean_memory):
     assert cor.dtype == np.float32


-def test_center_vo_i12_dataset1(i12_dataset1, ensure_clean_memory):
-    projdata = i12_dataset1[0]
-    flats = i12_dataset1[2]
-    darks = i12_dataset1[3]
-    del i12_dataset1
-
-    data_normalised = normalize(projdata, flats, darks, minus_log=True)
-    del flats, darks, projdata
-
-    mid_slice = data_normalised.shape[1] // 2
-    cor = find_center_vo(data_normalised[:, mid_slice, :])
-
-    assert cor == 1253.75
-    assert cor.dtype == np.float32
-
-
 def test_center_vo_geant4_dataset1(geant4_dataset1, ensure_clean_memory):
     projdata = geant4_dataset1[0]
     flats = geant4_dataset1[2]
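The added performance test follows a common CuPy timing pattern: one cold run to exclude kernel compilation and plan caching, an NVTX range so the region shows up in profiler timelines, an explicit device synchronize before the stop timestamp, and a per-iteration average in milliseconds; the string-to-float assert at the end always fails, which appears intended to surface the measured duration in the pytest output. A self-contained sketch of that pattern is shown below; the helper name and the dummy workload are assumptions, not part of this commit.

import time

import cupy as cp
from cupy.cuda import nvtx


def time_gpu_call_ms(fn, repeats: int = 10) -> float:
    dev = cp.cuda.Device()
    fn()  # cold run: warm up kernels and cached plans so they do not skew the measurement
    start = time.perf_counter_ns()
    nvtx.RangePush("Core")
    for _ in range(repeats):
        fn()
    nvtx.RangePop()
    dev.synchronize()  # wait for all queued GPU work before reading the stop time
    return (time.perf_counter_ns() - start) * 1e-6 / repeats


# Hypothetical usage:
# duration_ms = time_gpu_call_ms(lambda: cp.fft.fft2(cp.ones((2048, 2048), dtype=cp.float32)))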
