Skip to content

Commit d3cadb6

Browse files
authored
Merge pull request #239 from DiamondLightSource/remove-stripe-fw-radway-58
Add remove_stripe_fw with memory peak calculator
2 parents e807952 + d26c216 commit d3cadb6

File tree

7 files changed

+946
-1
lines changed

7 files changed

+946
-1
lines changed

docs/source/bibliography/references.bib

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,3 +160,14 @@ @article{beck2009fast
160160
year={2009},
161161
publisher={SIAM}
162162
}
163+
164+
@article{munch2009stripe,
165+
title={Stripe and ring artifact removal with combined wavelet—Fourier filtering},
166+
author={M{\"u}nch, Beat and Trtik, Pavel and Marone, Federica and Stampanoni, Marco},
167+
journal={Optics express},
168+
volume={17},
169+
number={10},
170+
pages={8567--8591},
171+
year={2009},
172+
publisher={Optical Society of America}
173+
}

httomolibgpu/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from httomolibgpu.prep.phase import paganin_filter, paganin_filter_savu_legacy
1010
from httomolibgpu.prep.stripe import (
1111
remove_stripe_based_sorting,
12+
remove_stripe_fw,
1213
remove_stripe_ti,
1314
remove_all_stripe,
1415
raven_filter,
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
template<int WSize>
2+
__global__ void grouped_convolution_x(
3+
int dim_x,
4+
int dim_y,
5+
int dim_z,
6+
const float* in,
7+
int in_stride_x,
8+
int in_stride_y,
9+
int in_stride_z,
10+
float* out,
11+
int out_stride_z,
12+
int out_stride_group,
13+
const float* w
14+
)
15+
{
16+
const int g_thd_x = blockDim.x * blockIdx.x + threadIdx.x;
17+
const int g_thd_y = blockDim.y * blockIdx.y + threadIdx.y;
18+
const int g_thd_z = blockDim.z * blockIdx.z + threadIdx.z;
19+
if (g_thd_x >= dim_x || g_thd_y >= dim_y || g_thd_z >= dim_z)
20+
{
21+
return;
22+
}
23+
24+
constexpr int out_groups = 2;
25+
for (int i = 0; i < out_groups; ++i)
26+
{
27+
float acc = 0.F;
28+
for (int j = 0; j < WSize; ++j)
29+
{
30+
const int w_idx = i * WSize + j;
31+
const int in_idx = (g_thd_x * in_stride_x + j) + g_thd_y * in_stride_y + g_thd_z * in_stride_z;
32+
acc += w[w_idx] * in[in_idx];
33+
}
34+
const int out_idx = g_thd_x + g_thd_y * dim_x + g_thd_z * out_stride_z + i * out_stride_group;
35+
out[out_idx] = acc;
36+
}
37+
}
38+
39+
template<int WSize>
40+
__global__ void grouped_convolution_y(
41+
int dim_x,
42+
int dim_y,
43+
int dim_z,
44+
const float* in,
45+
int in_stride_x,
46+
int in_stride_y,
47+
int in_stride_z,
48+
int in_stride_group,
49+
float* out,
50+
int out_stride_z,
51+
int out_stride_group,
52+
const float* w
53+
)
54+
{
55+
const int g_thd_x = blockDim.x * blockIdx.x + threadIdx.x;
56+
const int g_thd_y = blockDim.y * blockIdx.y + threadIdx.y;
57+
const int g_thd_z = blockDim.z * blockIdx.z + threadIdx.z;
58+
if (g_thd_x >= dim_x || g_thd_y >= dim_y || g_thd_z >= dim_z)
59+
{
60+
return;
61+
}
62+
63+
constexpr int in_groups = 2;
64+
constexpr int out_groups = 2;
65+
constexpr int item_stride_y = 2;
66+
for (int group = 0; group < in_groups; ++group)
67+
{
68+
for (int i = 0; i < out_groups; ++i)
69+
{
70+
float acc = 0.F;
71+
for (int j = 0; j < WSize; ++j)
72+
{
73+
const int w_idx = (out_groups * group + i) * WSize + j;
74+
const int in_idx = g_thd_x * in_stride_x + (item_stride_y * g_thd_y + j) * in_stride_y + group * in_stride_group + g_thd_z * in_stride_z;
75+
acc += w[w_idx] * in[in_idx];
76+
}
77+
const int out_idx = g_thd_x + g_thd_y * dim_x + g_thd_z * out_stride_z + (out_groups * group + i) * out_stride_group;
78+
out[out_idx] = acc;
79+
}
80+
}
81+
}
82+
83+
template<int WSize>
84+
__global__ void transposed_convolution_x(
85+
int dim_x,
86+
int dim_y,
87+
int dim_z,
88+
const float* in,
89+
int in_dim_x,
90+
int in_stride_y,
91+
int in_stride_z,
92+
const float* w,
93+
float* out
94+
)
95+
{
96+
const int g_thd_x = blockDim.x * blockIdx.x + threadIdx.x;
97+
const int g_thd_y = blockDim.y * blockIdx.y + threadIdx.y;
98+
const int g_thd_z = blockDim.z * blockIdx.z + threadIdx.z;
99+
if (g_thd_x >= dim_x || g_thd_y >= dim_y || g_thd_z >= dim_z)
100+
{
101+
return;
102+
}
103+
104+
constexpr int item_out_stride = 2;
105+
float acc = 0.F;
106+
for (int i = 0; i < WSize; ++i)
107+
{
108+
const int in_x = (g_thd_x - i) / item_out_stride;
109+
const int in_x_mod = (g_thd_x - i) % item_out_stride;
110+
if (in_x_mod == 0 && in_x >= 0 && in_x < in_dim_x)
111+
{
112+
const int in_idx = in_x + g_thd_y * in_stride_y + g_thd_z * in_stride_z;
113+
acc += in[in_idx] * w[i];
114+
}
115+
}
116+
const int out_idx = g_thd_x + dim_x * g_thd_y + dim_x * dim_y * g_thd_z;
117+
out[out_idx] = acc;
118+
}
119+
120+
template<int WSize>
121+
__global__ void transposed_convolution_y(
122+
int dim_x,
123+
int dim_y,
124+
int dim_z,
125+
const float* in,
126+
int in_dim_y,
127+
int in_stride_y,
128+
int in_stride_z,
129+
const float* w,
130+
float* out
131+
)
132+
{
133+
const int g_thd_x = blockDim.x * blockIdx.x + threadIdx.x;
134+
const int g_thd_y = blockDim.y * blockIdx.y + threadIdx.y;
135+
const int g_thd_z = blockDim.z * blockIdx.z + threadIdx.z;
136+
if (g_thd_x >= dim_x || g_thd_y >= dim_y || g_thd_z >= dim_z)
137+
{
138+
return;
139+
}
140+
141+
constexpr int item_out_stride = 2;
142+
float acc = 0.F;
143+
for (int i = 0; i < WSize; ++i)
144+
{
145+
const int in_y = (g_thd_y - i) / item_out_stride;
146+
const int in_y_mod = (g_thd_y - i) % item_out_stride;
147+
if (in_y_mod == 0 && in_y >= 0 && in_y < in_dim_y)
148+
{
149+
const int in_idx = g_thd_x + in_y * in_stride_y + g_thd_z * in_stride_z;
150+
acc += in[in_idx] * w[i];
151+
}
152+
}
153+
const int out_idx = g_thd_x + dim_x * g_thd_y + dim_x * dim_y * g_thd_z;
154+
out[out_idx] = acc;
155+
}

0 commit comments

Comments
 (0)