1 change: 1 addition & 0 deletions CHANGELOG
@@ -2,6 +2,7 @@ List of features / changes made / release notes, in reverse chronological order.
If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).

Master, using release name V 2.4.0 (1/7/25)
* PR618: removing alloca and using kernel dispatch on the GPU.
* PR617: Caching pip dependencies in github actions.
Forcing Ninja when building python on Windows.
* PR614: Added support for sccache in github actions.
8 changes: 4 additions & 4 deletions devel/gen_all_horner_C_code.m
@@ -16,8 +16,8 @@
ws = 2:16;
opts.wpad = false; % pad kernel eval to multiple of 4

if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc','w');
else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc','w');
if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop.inc','w');
else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc','w');
end
fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n'));
fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n'));
@@ -27,9 +27,9 @@
fprintf('w=%d\td=%d\tbeta=%.3g\n',w,d,beta);
str = gen_ker_horner_loop_C_code(w,d,beta,opts);
if j==1 % write switch statement
fwrite(fid,sprintf(' if (w==%d) {\n',w));
fwrite(fid,sprintf(' if constexpr (w==%d) {\n',w));
else
fwrite(fid,sprintf(' } else if (w==%d) {\n',w));
fwrite(fid,sprintf(' } else if constexpr (w==%d) {\n',w));
end
for i=1:numel(str); fwrite(fid,[' ',str{i}]); end
end
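
For context, the regenerated .inc files emit one if constexpr (w==...) branch per supported width, so a caller that has w as a template parameter only instantiates the matching branch. A minimal sketch of that pattern follows; the function name, coefficient values, and loop bodies are placeholders, not the actual generated code.

```cpp
// Sketch of the generated switch-over-width pattern; coefficients are placeholders.
template<typename T, int w>
__device__ void horner_sketch(T *ker, T z) {
  if constexpr (w == 2) {
    // width-2 case: one Horner polynomial per output sample
    ker[0] = T(0.5) + z * (T(0.25) + z * T(0.125));
    ker[1] = T(0.5) + z * (T(-0.25) + z * T(0.125));
  } else if constexpr (w == 3) {
    // analogous width-3 case with its own coefficient tables
    for (int i = 0; i < 3; i++) ker[i] = T(0);
  }
  // ... one branch per width, up to MAX_NSPREAD ...
}
```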
370 changes: 185 additions & 185 deletions include/cufinufft/contrib/ker_horner_allw_loop.inc

Large diffs are not rendered by default.

302 changes: 151 additions & 151 deletions include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions include/cufinufft/defs.h
@@ -6,6 +6,7 @@
// upper bound on w, ie nspread, even when padded (see evaluate_kernel_vector); also for
// common
#define MAX_NSPREAD 16
#define MIN_NSPREAD 2

// max number of positive quadr nodes
#define MAX_NQUAD 100
41 changes: 20 additions & 21 deletions include/cufinufft/spreadinterp.h
@@ -74,8 +74,8 @@ static inline T evaluate_kernel(T x, const finufft_spread_opts &opts)
template<typename T>
int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmeth);

template<typename T>
static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int ns)
template<typename T, int ns>
static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta)
/* ES ("exp sqrt") kernel evaluation at single real argument:
phi(x) = exp(beta.sqrt(1 - (2x/n_s)^2)), for |x| < nspread/2
related to an asymptotic approximation to the Kaiser--Bessel, itself an
@@ -88,9 +88,8 @@ static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int
: 0.0;
}

template<typename T>
static __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w,
const double upsampfac)
template<typename T, int w>
static __device__ void eval_kernel_vec_horner(T *ker, const T x, const double upsampfac)
/* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at
x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns.
This is the current evaluation method, since it's faster (except i7 w=16).
@@ -109,11 +108,11 @@ static __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w,
}
}

template<typename T>
static __inline__ __device__ void eval_kernel_vec(T *ker, const T x, const int w,
const T es_c, const T es_beta) {
template<typename T, int w>
static __inline__ __device__ void eval_kernel_vec(T *ker, const T x, const T es_c,
const T es_beta) {
for (int i = 0; i < w; i++) {
ker[i] = evaluate_kernel(abs(x + i), es_c, es_beta, w);
ker[i] = evaluate_kernel<T, w>(abs(x + i), es_c, es_beta);
}
}
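
Making the width a compile-time template parameter is what removes the need for alloca on the caller side: the kernel-value buffer can be a fixed-size local array and the width loop can be fully unrolled. Below is a minimal sketch of the calling pattern, reusing eval_kernel_vec above; the kernel name, launch configuration, and the grid-offset arithmetic are illustrative, not the library's actual spreader.

```cpp
// Sketch: per-thread kernel values in a compile-time-sized array (no alloca).
template<typename T, int ns>
__global__ void interp_sketch(const T *d_kx, int M, T es_c, T es_beta) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= M) return;
  T ker[ns];                      // size known at compile time
  T x1 = d_kx[i] - T(0.5) * ns;   // placeholder for the usual offset computation
  eval_kernel_vec<T, ns>(ker, x1, es_c, es_beta);
  // ... ker[0..ns-1] would then be used to read/accumulate grid values ...
}
```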

@@ -129,53 +128,53 @@ template<typename T> int cuinterp3d(cufinufft_plan_t<T> *d_plan, int blksize);
// Wrappers for methods of spreading
template<typename T>
int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize);
template<typename T>
int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize);

template<typename T>
int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
int blksize);
template<typename T>
int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread2d_subprob(int nf1, int nf2, int m, cufinufft_plan_t<T> *d_plan, int blksize);
template<typename T>
int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M,
cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
int blksize);
template<typename T>
int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M,
cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
int blksize);
template<typename T>
int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M,
cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
int blksize);

// Wrappers for methods of interpolation
template<typename T>
template<typename T, int ns>
int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize);
template<typename T>
template<typename T, int ns>
int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
int blksize);
template<typename T>
template<typename T, int ns>
int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan, int blksize);
template<typename T>
template<typename T, int ns>
int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
int blksize);
template<typename T>
template<typename T, int ns>
int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
int blksize);

37 changes: 25 additions & 12 deletions include/cufinufft/utils.h
@@ -10,9 +10,12 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <type_traits>
#include <utility> // for std::forward

#include <thrust/extrema.h>

#include <finufft_errors.h>

#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES
#endif
@@ -47,18 +50,6 @@ template<typename T> __forceinline__ __device__ auto interval(const int ns, cons
return int2{xstart, xend};
}

// Define a macro to check if NVCC version is >= 11.3
#if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__)
#if (__CUDACC_VER_MAJOR__ > 11) || \
(__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 3 && __CUDA_ARCH__ >= 600)
#define ALLOCA_SUPPORTED 1
// windows compatibility
#if __has_include(<malloc.h>)
#include <malloc.h>
#endif
#endif
#endif

#if defined(__CUDA_ARCH__)
#if __CUDA_ARCH__ >= 900
#define COMPUTE_CAPABILITY_90_OR_HIGHER 1
@@ -191,6 +182,28 @@ auto set_nhg_type3(T S, T X, const cufinufft_opts &opts,
return std::make_tuple(nf, h, gam);
}

// Generalized dispatcher for any function requiring ns-based dispatch
template<typename Func, typename T, int ns, typename... Args>
int dispatch_ns(Func &&func, int target_ns, Args &&...args) {
if constexpr (ns > MAX_NSPREAD) {
return FINUFFT_ERR_METHOD_NOTVALID; // Stop recursion
} else {
if (target_ns == ns) {
return std::forward<Func>(func).template operator()<ns>(
std::forward<Args>(args)...);
}
return dispatch_ns<Func, T, ns + 1>(std::forward<Func>(func), target_ns,
std::forward<Args>(args)...);
}
}

// Wrapper function that starts the dispatch recursion
template<typename Func, typename T, typename... Args>
int launch_dispatch_ns(Func &&func, int target_ns, Args &&...args) {
return dispatch_ns<Func, T, MIN_NSPREAD>(std::forward<Func>(func), target_ns,
std::forward<Args>(args)...);
}
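
The dispatcher expects a functor whose call operator is templated on the width: launch_dispatch_ns instantiates the comparison chain from MIN_NSPREAD to MAX_NSPREAD at compile time and invokes the matching instantiation for the runtime value. A minimal host-side usage sketch follows; the functor and demo function are illustrative, not part of the library.

```cpp
#include <cstdio>
// #include <cufinufft/utils.h>   // assumed include path for the header above

// Illustrative functor: ns arrives as a compile-time constant.
struct PrintWidth {
  template<int ns> int operator()(int extra) const {
    std::printf("dispatched to ns=%d, extra=%d\n", ns, extra);
    return 0;
  }
};

int demo(int runtime_ns) {
  // Returns FINUFFT_ERR_METHOD_NOTVALID if runtime_ns falls outside
  // [MIN_NSPREAD, MAX_NSPREAD].
  return cufinufft::utils::launch_dispatch_ns<PrintWidth, float>(PrintWidth(),
                                                                 runtime_ns, 42);
}
```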

} // namespace utils
} // namespace cufinufft

69 changes: 36 additions & 33 deletions src/cuda/1d/interp1d_wrapper.cu
@@ -10,41 +10,46 @@
namespace cufinufft {
namespace spreadinterp {

template<typename T>
int cuinterp1d(cufinufft_plan_t<T> *d_plan, int blksize)
/*
A wrapper for different interpolation methods.

Methods available:
(1) Non-uniform points driven
(2) Subproblem

Melody Shih 11/21/21
*/
{
int nf1 = d_plan->nf1;
int M = d_plan->M;

int ier;
switch (d_plan->opts.gpu_method) {
case 1: {
ier = cuinterp1d_nuptsdriven<T>(nf1, M, d_plan, blksize);
} break;
default:
std::cerr << "[cuinterp1d] error: incorrect method, should be 1" << std::endl;
ier = FINUFFT_ERR_METHOD_NOTVALID;
// Functor to handle function selection (nuptsdriven vs subprob)
struct Interp1DDispatcher {
template<int ns, typename T>
int operator()(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize) const {
switch (d_plan->opts.gpu_method) {
case 1:
return cuinterp1d_nuptsdriven<T, ns>(nf1, M, d_plan, blksize);
default:
std::cerr << "[cuinterp1d] error: incorrect method, should be 1\n";
return FINUFFT_ERR_METHOD_NOTVALID;
}
}

return ier;
};

// Updated cuinterp1d using generic dispatch
template<typename T> int cuinterp1d(cufinufft_plan_t<T> *d_plan, int blksize) {
/*
A wrapper for different interpolation methods.

Methods available:
(1) Non-uniform points driven
(2) Subproblem
[Inline review comment] Maybe get rid of the unsupported "subproblem" method comment while we're touching this code. We're already refactoring some of this stuff, so it seems like a good time to clean the comments to be up to date.


Melody Shih 11/21/21

Now the function is updated to dispatch based on ns. This avoids alloca, which
seems to be slower according to the MRI community.
Marco Barbone 01/30/25
*/
return launch_dispatch_ns<Interp1DDispatcher, T>(Interp1DDispatcher(),
d_plan->spopts.nspread, d_plan->nf1,
d_plan->M, d_plan, blksize);
}
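
The same functor-plus-dispatch pattern presumably carries over to the other wrappers declared in spreadinterp.h. Below is a hedged sketch for the 1D spread wrapper; it is not the PR's actual code, and the gpu_method numbering for the subproblem path is an assumption.

```cpp
// Sketch only: mirrors Interp1DDispatcher for the spread-side wrapper.
struct Spread1DDispatcherSketch {
  template<int ns, typename T>
  int operator()(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize) const {
    switch (d_plan->opts.gpu_method) {
    case 1: // nonuniform-points-driven
      return cuspread1d_nuptsdriven<T, ns>(nf1, M, d_plan, blksize);
    case 2: // subproblem (method id assumed)
      return cuspread1d_subprob<T, ns>(nf1, M, d_plan, blksize);
    default:
      return FINUFFT_ERR_METHOD_NOTVALID;
    }
  }
};

// A cuspread1d wrapper would then call, analogously to cuinterp1d above:
//   launch_dispatch_ns<Spread1DDispatcherSketch, T>(
//       Spread1DDispatcherSketch(), d_plan->spopts.nspread,
//       d_plan->nf1, d_plan->M, d_plan, blksize);
```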

template<typename T>
template<typename T, int ns>
int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
auto &stream = d_plan->stream;
dim3 threadsPerBlock;
dim3 blocks;

int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells
T es_c = d_plan->spopts.ES_c;
T es_beta = d_plan->spopts.ES_beta;
T sigma = d_plan->opts.upsampfac;
@@ -61,16 +66,14 @@ int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blks

if (d_plan->opts.gpu_kerevalmeth) {
for (int t = 0; t < blksize; t++) {
interp_1d_nuptsdriven<T, 1><<<blocks, threadsPerBlock, 0, stream>>>(
d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma,
d_idxnupts);
interp_1d_nuptsdriven<T, 1, ns><<<blocks, threadsPerBlock, 0, stream>>>(
d_kx, d_c + t * M, d_fw + t * nf1, M, nf1, es_c, es_beta, sigma, d_idxnupts);
RETURN_IF_CUDA_ERROR
}
} else {
for (int t = 0; t < blksize; t++) {
interp_1d_nuptsdriven<T, 0><<<blocks, threadsPerBlock, 0, stream>>>(
d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma,
d_idxnupts);
interp_1d_nuptsdriven<T, 0, ns><<<blocks, threadsPerBlock, 0, stream>>>(
d_kx, d_c + t * M, d_fw + t * nf1, M, nf1, es_c, es_beta, sigma, d_idxnupts);
RETURN_IF_CUDA_ERROR
}
}