Skip to content

Commit 73412b4

Browse files
authored
Removing alloca on GPU (#618)
When building cuFINUFFT natively on your machine, you observed a performance regression but lower memory utilization. This was caused by the use of dynamic stack allocation to allocate memory for the kernel tensors. Previously, the approach was to allocate MAX_ARRAY on the stack. However, since the GPU stack is relatively small, this would spill over into global memory, leading to higher memory utilization. In my benchmarks, I did not observe a performance regression when using alloca (dynamic stack allocation), which differs from your experience. To achieve the best of both worlds, this PR now avoids alloca and instead uses a template recursion trick to generate different CUDA kernels based on varying spreading width values. This approach eliminates the need for alloca while ensuring that no more memory is used than necessary.
1 parent e217279 commit 73412b4

16 files changed

+825
-872
lines changed

CHANGELOG

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ List of features / changes made / release notes, in reverse chronological order.
22
If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
33

44
Master, using release name V 2.4.0 (1/7/25)
5+
* PR618: removing alloca and using kernel dispatch on the GPU.
56
* PR617: Caching pip dependencies in github actions.
67
Forcing Ninja when building python on Windows.
78
* PR614: Added support for sccache in github actions.

devel/gen_all_horner_C_code.m

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@
1616
ws = 2:16;
1717
opts.wpad = false; % pad kernel eval to multiple of 4
1818

19-
if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc','w');
20-
else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc','w');
19+
if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop.inc','w');
20+
else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc','w');
2121
end
2222
fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n'));
2323
fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n'));
@@ -27,9 +27,9 @@
2727
fprintf('w=%d\td=%d\tbeta=%.3g\n',w,d,beta);
2828
str = gen_ker_horner_loop_C_code(w,d,beta,opts);
2929
if j==1 % write switch statement
30-
fwrite(fid,sprintf(' if (w==%d) {\n',w));
30+
fwrite(fid,sprintf(' if constexpr (w==%d) {\n',w));
3131
else
32-
fwrite(fid,sprintf(' } else if (w==%d) {\n',w));
32+
fwrite(fid,sprintf(' } else if constexpr (w==%d) {\n',w));
3333
end
3434
for i=1:numel(str); fwrite(fid,[' ',str{i}]); end
3535
end

include/cufinufft/contrib/ker_horner_allw_loop.inc

Lines changed: 185 additions & 185 deletions
Large diffs are not rendered by default.

include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc

Lines changed: 151 additions & 151 deletions
Large diffs are not rendered by default.

include/cufinufft/defs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
// upper bound on w, ie nspread, even when padded (see evaluate_kernel_vector); also for
77
// common
88
#define MAX_NSPREAD 16
9+
#define MIN_NSPREAD 2
910

1011
// max number of positive quadr nodes
1112
#define MAX_NQUAD 100

include/cufinufft/spreadinterp.h

Lines changed: 20 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,8 @@ static inline T evaluate_kernel(T x, const finufft_spread_opts &opts)
7474
template<typename T>
7575
int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmeth);
7676

77-
template<typename T>
78-
static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int ns)
77+
template<typename T, int ns>
78+
static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta)
7979
/* ES ("exp sqrt") kernel evaluation at single real argument:
8080
phi(x) = exp(beta.sqrt(1 - (2x/n_s)^2)), for |x| < nspread/2
8181
related to an asymptotic approximation to the Kaiser--Bessel, itself an
@@ -88,9 +88,8 @@ static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int
8888
: 0.0;
8989
}
9090

91-
template<typename T>
92-
static __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w,
93-
const double upsampfac)
91+
template<typename T, int w>
92+
static __device__ void eval_kernel_vec_horner(T *ker, const T x, const double upsampfac)
9493
/* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at
9594
x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns.
9695
This is the current evaluation method, since it's faster (except i7 w=16).
@@ -109,11 +108,11 @@ static __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w,
109108
}
110109
}
111110

112-
template<typename T>
113-
static __inline__ __device__ void eval_kernel_vec(T *ker, const T x, const int w,
114-
const T es_c, const T es_beta) {
111+
template<typename T, int w>
112+
static __inline__ __device__ void eval_kernel_vec(T *ker, const T x, const T es_c,
113+
const T es_beta) {
115114
for (int i = 0; i < w; i++) {
116-
ker[i] = evaluate_kernel(abs(x + i), es_c, es_beta, w);
115+
ker[i] = evaluate_kernel<T, w>(abs(x + i), es_c, es_beta);
117116
}
118117
}
119118

@@ -129,53 +128,53 @@ template<typename T> int cuinterp3d(cufinufft_plan_t<T> *d_plan, int blksize);
129128
// Wrappers for methods of spreading
130129
template<typename T>
131130
int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan);
132-
template<typename T>
131+
template<typename T, int ns>
133132
int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize);
134133
template<typename T>
135134
int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan);
136-
template<typename T>
135+
template<typename T, int ns>
137136
int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize);
138137

139138
template<typename T>
140139
int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan);
141-
template<typename T>
140+
template<typename T, int ns>
142141
int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
143142
int blksize);
144143
template<typename T>
145144
int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan);
146-
template<typename T>
145+
template<typename T, int ns>
147146
int cuspread2d_subprob(int nf1, int nf2, int m, cufinufft_plan_t<T> *d_plan, int blksize);
148147
template<typename T>
149148
int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M,
150149
cufinufft_plan_t<T> *d_plan);
151-
template<typename T>
150+
template<typename T, int ns>
152151
int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
153152
int blksize);
154153
template<typename T>
155154
int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M,
156155
cufinufft_plan_t<T> *d_plan);
157-
template<typename T>
156+
template<typename T, int ns>
158157
int cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
159158
int blksize);
160159
template<typename T>
161160
int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M,
162161
cufinufft_plan_t<T> *d_plan);
163-
template<typename T>
162+
template<typename T, int ns>
164163
int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
165164
int blksize);
166165

167166
// Wrappers for methods of interpolation
168-
template<typename T>
167+
template<typename T, int ns>
169168
int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize);
170-
template<typename T>
169+
template<typename T, int ns>
171170
int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
172171
int blksize);
173-
template<typename T>
172+
template<typename T, int ns>
174173
int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan, int blksize);
175-
template<typename T>
174+
template<typename T, int ns>
176175
int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
177176
int blksize);
178-
template<typename T>
177+
template<typename T, int ns>
179178
int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
180179
int blksize);
181180

include/cufinufft/utils.h

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,12 @@
1010
#include <cuda.h>
1111
#include <cuda_runtime.h>
1212
#include <type_traits>
13+
#include <utility> // for std::forward
1314

1415
#include <thrust/extrema.h>
1516

17+
#include <finufft_errors.h>
18+
1619
#ifndef _USE_MATH_DEFINES
1720
#define _USE_MATH_DEFINES
1821
#endif
@@ -47,18 +50,6 @@ template<typename T> __forceinline__ __device__ auto interval(const int ns, cons
4750
return int2{xstart, xend};
4851
}
4952

50-
// Define a macro to check if NVCC version is >= 11.3
51-
#if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__)
52-
#if (__CUDACC_VER_MAJOR__ > 11) || \
53-
(__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 3 && __CUDA_ARCH__ >= 600)
54-
#define ALLOCA_SUPPORTED 1
55-
// windows compatibility
56-
#if __has_include(<malloc.h>)
57-
#include <malloc.h>
58-
#endif
59-
#endif
60-
#endif
61-
6253
#if defined(__CUDA_ARCH__)
6354
#if __CUDA_ARCH__ >= 900
6455
#define COMPUTE_CAPABILITY_90_OR_HIGHER 1
@@ -191,6 +182,28 @@ auto set_nhg_type3(T S, T X, const cufinufft_opts &opts,
191182
return std::make_tuple(nf, h, gam);
192183
}
193184

185+
// Generalized dispatcher for any function requiring ns-based dispatch
186+
template<typename Func, typename T, int ns, typename... Args>
187+
int dispatch_ns(Func &&func, int target_ns, Args &&...args) {
188+
if constexpr (ns > MAX_NSPREAD) {
189+
return FINUFFT_ERR_METHOD_NOTVALID; // Stop recursion
190+
} else {
191+
if (target_ns == ns) {
192+
return std::forward<Func>(func).template operator()<ns>(
193+
std::forward<Args>(args)...);
194+
}
195+
return dispatch_ns<Func, T, ns + 1>(std::forward<Func>(func), target_ns,
196+
std::forward<Args>(args)...);
197+
}
198+
}
199+
200+
// Wrapper function that starts the dispatch recursion
201+
template<typename Func, typename T, typename... Args>
202+
int launch_dispatch_ns(Func &&func, int target_ns, Args &&...args) {
203+
return dispatch_ns<Func, T, MIN_NSPREAD>(std::forward<Func>(func), target_ns,
204+
std::forward<Args>(args)...);
205+
}
206+
194207
} // namespace utils
195208
} // namespace cufinufft
196209

src/cuda/1d/interp1d_wrapper.cu

Lines changed: 36 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -10,41 +10,46 @@
1010
namespace cufinufft {
1111
namespace spreadinterp {
1212

13-
template<typename T>
14-
int cuinterp1d(cufinufft_plan_t<T> *d_plan, int blksize)
15-
/*
16-
A wrapper for different interpolation methods.
17-
18-
Methods available:
19-
(1) Non-uniform points driven
20-
(2) Subproblem
21-
22-
Melody Shih 11/21/21
23-
*/
24-
{
25-
int nf1 = d_plan->nf1;
26-
int M = d_plan->M;
27-
28-
int ier;
29-
switch (d_plan->opts.gpu_method) {
30-
case 1: {
31-
ier = cuinterp1d_nuptsdriven<T>(nf1, M, d_plan, blksize);
32-
} break;
33-
default:
34-
std::cerr << "[cuinterp1d] error: incorrect method, should be 1" << std::endl;
35-
ier = FINUFFT_ERR_METHOD_NOTVALID;
13+
// Functor to handle function selection (nuptsdriven vs subprob)
14+
struct Interp1DDispatcher {
15+
template<int ns, typename T>
16+
int operator()(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize) const {
17+
switch (d_plan->opts.gpu_method) {
18+
case 1:
19+
return cuinterp1d_nuptsdriven<T, ns>(nf1, M, d_plan, blksize);
20+
default:
21+
std::cerr << "[cuinterp1d] error: incorrect method, should be 1\n";
22+
return FINUFFT_ERR_METHOD_NOTVALID;
23+
}
3624
}
37-
38-
return ier;
25+
};
26+
27+
// Updated cuinterp1d using generic dispatch
28+
template<typename T> int cuinterp1d(cufinufft_plan_t<T> *d_plan, int blksize) {
29+
/*
30+
A wrapper for different interpolation methods.
31+
32+
Methods available:
33+
(1) Non-uniform points driven
34+
(2) Subproblem
35+
36+
Melody Shih 11/21/21
37+
38+
Now the function is updated to dispatch based on ns. This is to avoid alloca which
39+
it seems slower according to the MRI community.
40+
Marco Barbone 01/30/25
41+
*/
42+
return launch_dispatch_ns<Interp1DDispatcher, T>(Interp1DDispatcher(),
43+
d_plan->spopts.nspread, d_plan->nf1,
44+
d_plan->M, d_plan, blksize);
3945
}
4046

41-
template<typename T>
47+
template<typename T, int ns>
4248
int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
4349
auto &stream = d_plan->stream;
4450
dim3 threadsPerBlock;
4551
dim3 blocks;
4652

47-
int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells
4853
T es_c = d_plan->spopts.ES_c;
4954
T es_beta = d_plan->spopts.ES_beta;
5055
T sigma = d_plan->opts.upsampfac;
@@ -61,16 +66,14 @@ int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blks
6166

6267
if (d_plan->opts.gpu_kerevalmeth) {
6368
for (int t = 0; t < blksize; t++) {
64-
interp_1d_nuptsdriven<T, 1><<<blocks, threadsPerBlock, 0, stream>>>(
65-
d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma,
66-
d_idxnupts);
69+
interp_1d_nuptsdriven<T, 1, ns><<<blocks, threadsPerBlock, 0, stream>>>(
70+
d_kx, d_c + t * M, d_fw + t * nf1, M, nf1, es_c, es_beta, sigma, d_idxnupts);
6771
RETURN_IF_CUDA_ERROR
6872
}
6973
} else {
7074
for (int t = 0; t < blksize; t++) {
71-
interp_1d_nuptsdriven<T, 0><<<blocks, threadsPerBlock, 0, stream>>>(
72-
d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma,
73-
d_idxnupts);
75+
interp_1d_nuptsdriven<T, 0, ns><<<blocks, threadsPerBlock, 0, stream>>>(
76+
d_kx, d_c + t * M, d_fw + t * nf1, M, nf1, es_c, es_beta, sigma, d_idxnupts);
7477
RETURN_IF_CUDA_ERROR
7578
}
7679
}

0 commit comments

Comments
 (0)