1 change: 1 addition & 0 deletions CHANGELOG
@@ -2,6 +2,7 @@ List of features / changes made / release notes, in reverse chronological order.
If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).

Master, using release name V 2.4.0 (1/7/25)
* PR618: removing alloca and using kernel dispatch on the GPU.
* PR617: Caching pip dependencies in github actions.
Forcing Ninja when building python on Windows.
* PR614: Added support for sccache in github actions.
8 changes: 4 additions & 4 deletions devel/gen_all_horner_C_code.m
@@ -16,8 +16,8 @@
ws = 2:16;
opts.wpad = false; % pad kernel eval to multiple of 4

if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc','w');
else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc','w');
if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop.inc','w');
else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc','w');
end
fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n'));
fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n'));
@@ -27,9 +27,9 @@
fprintf('w=%d\td=%d\tbeta=%.3g\n',w,d,beta);
str = gen_ker_horner_loop_C_code(w,d,beta,opts);
if j==1 % write switch statement
fwrite(fid,sprintf(' if (w==%d) {\n',w));
fwrite(fid,sprintf(' if constexpr (w==%d) {\n',w));
else
fwrite(fid,sprintf(' } else if (w==%d) {\n',w));
fwrite(fid,sprintf(' } else if constexpr (w==%d) {\n',w));
end
for i=1:numel(str); fwrite(fid,[' ',str{i}]); end
end
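
For context, the regenerated .inc files emit one if constexpr (w==...) branch per supported width, so a caller that has w as a template parameter only instantiates the matching branch. A minimal sketch of that pattern follows; the function name, coefficient values, and loop bodies are placeholders, not the actual generated code.

```cpp
// Sketch of the generated switch-over-width pattern; coefficients are placeholders.
template<typename T, int w>
__device__ void horner_sketch(T *ker, T z) {
  if constexpr (w == 2) {
    // width-2 case: one Horner polynomial per output sample
    ker[0] = T(0.5) + z * (T(0.25) + z * T(0.125));
    ker[1] = T(0.5) + z * (T(-0.25) + z * T(0.125));
  } else if constexpr (w == 3) {
    // analogous width-3 case with its own coefficient tables
    for (int i = 0; i < 3; i++) ker[i] = T(0);
  }
  // ... one branch per width, up to MAX_NSPREAD ...
}
```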
370 changes: 185 additions & 185 deletions include/cufinufft/contrib/ker_horner_allw_loop.inc

Large diffs are not rendered by default.

302 changes: 151 additions & 151 deletions include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions include/cufinufft/defs.h
@@ -6,6 +6,7 @@
// upper bound on w, ie nspread, even when padded (see evaluate_kernel_vector); also for
// common
#define MAX_NSPREAD 16
#define MIN_NSPREAD 2

// max number of positive quadr nodes
#define MAX_NQUAD 100
41 changes: 20 additions & 21 deletions include/cufinufft/spreadinterp.h
@@ -74,8 +74,8 @@ static inline T evaluate_kernel(T x, const finufft_spread_opts &opts)
template<typename T>
int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmeth);

template<typename T>
static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int ns)
template<typename T, int ns>
static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta)
/* ES ("exp sqrt") kernel evaluation at single real argument:
phi(x) = exp(beta.sqrt(1 - (2x/n_s)^2)), for |x| < nspread/2
related to an asymptotic approximation to the Kaiser--Bessel, itself an
@@ -88,9 +88,8 @@ static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int
: 0.0;
}

template<typename T>
static __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w,
const double upsampfac)
template<typename T, int w>
static __device__ void eval_kernel_vec_horner(T *ker, const T x, const double upsampfac)
/* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at
x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns.
This is the current evaluation method, since it's faster (except i7 w=16).
@@ -109,11 +108,11 @@ static __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w,
}
}

template<typename T>
static __inline__ __device__ void eval_kernel_vec(T *ker, const T x, const int w,
const T es_c, const T es_beta) {
template<typename T, int w>
static __inline__ __device__ void eval_kernel_vec(T *ker, const T x, const T es_c,
const T es_beta) {
for (int i = 0; i < w; i++) {
ker[i] = evaluate_kernel(abs(x + i), es_c, es_beta, w);
ker[i] = evaluate_kernel<T, w>(abs(x + i), es_c, es_beta);
}
}
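
Making the width a compile-time template parameter is what removes the need for alloca on the caller side: the kernel-value buffer can be a fixed-size local array and the width loop can be fully unrolled. Below is a minimal sketch of the calling pattern, reusing eval_kernel_vec above; the kernel name, launch configuration, and the grid-offset arithmetic are illustrative, not the library's actual spreader.

```cpp
// Sketch: per-thread kernel values in a compile-time-sized array (no alloca).
template<typename T, int ns>
__global__ void interp_sketch(const T *d_kx, int M, T es_c, T es_beta) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= M) return;
  T ker[ns];                      // size known at compile time
  T x1 = d_kx[i] - T(0.5) * ns;   // placeholder for the usual offset computation
  eval_kernel_vec<T, ns>(ker, x1, es_c, es_beta);
  // ... ker[0..ns-1] would then be used to read/accumulate grid values ...
}
```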

@@ -129,53 +128,53 @@ template<typename T> int cuinterp3d(cufinufft_plan_t<T> *d_plan, int blksize);
// Wrappers for methods of spreading
template<typename T>
int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize);
template<typename T>
int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize);

template<typename T>
int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
int blksize);
template<typename T>
int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread2d_subprob(int nf1, int nf2, int m, cufinufft_plan_t<T> *d_plan, int blksize);
template<typename T>
int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M,
cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
int blksize);
template<typename T>
int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M,
cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
int blksize);
template<typename T>
int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M,
cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
int blksize);

// Wrappers for methods of interpolation
template<typename T>
template<typename T, int ns>
int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize);
template<typename T>
template<typename T, int ns>
int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
int blksize);
template<typename T>
template<typename T, int ns>
int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan, int blksize);
template<typename T>
template<typename T, int ns>
int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
int blksize);
template<typename T>
template<typename T, int ns>
int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
int blksize);

37 changes: 25 additions & 12 deletions include/cufinufft/utils.h
@@ -10,9 +10,12 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <type_traits>
#include <utility> // for std::forward

#include <thrust/extrema.h>

#include <finufft_errors.h>

#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES
#endif
@@ -47,18 +50,6 @@ template<typename T> __forceinline__ __device__ auto interval(const int ns, cons
return int2{xstart, xend};
}

// Define a macro to check if NVCC version is >= 11.3
#if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__)
#if (__CUDACC_VER_MAJOR__ > 11) || \
(__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 3 && __CUDA_ARCH__ >= 600)
#define ALLOCA_SUPPORTED 1
// windows compatibility
#if __has_include(<malloc.h>)
#include <malloc.h>
#endif
#endif
#endif

#if defined(__CUDA_ARCH__)
#if __CUDA_ARCH__ >= 900
#define COMPUTE_CAPABILITY_90_OR_HIGHER 1
@@ -191,6 +182,28 @@ auto set_nhg_type3(T S, T X, const cufinufft_opts &opts,
return std::make_tuple(nf, h, gam);
}

// Generalized dispatcher for any function requiring ns-based dispatch
template<typename Func, typename T, int ns, typename... Args>
int dispatch_ns(Func &&func, int target_ns, Args &&...args) {
if constexpr (ns > MAX_NSPREAD) {
return FINUFFT_ERR_METHOD_NOTVALID; // Stop recursion
} else {
if (target_ns == ns) {
return std::forward<Func>(func).template operator()<ns>(
std::forward<Args>(args)...);
}
return dispatch_ns<Func, T, ns + 1>(std::forward<Func>(func), target_ns,
std::forward<Args>(args)...);
}
}

// Wrapper function that starts the dispatch recursion
template<typename Func, typename T, typename... Args>
int launch_dispatch_ns(Func &&func, int target_ns, Args &&...args) {
return dispatch_ns<Func, T, MIN_NSPREAD>(std::forward<Func>(func), target_ns,
std::forward<Args>(args)...);
}
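
The dispatcher expects a functor whose call operator is templated on the width: launch_dispatch_ns instantiates the comparison chain from MIN_NSPREAD to MAX_NSPREAD at compile time and invokes the matching instantiation for the runtime value. A minimal host-side usage sketch follows; the functor and demo function are illustrative, not part of the library.

```cpp
#include <cstdio>
// #include <cufinufft/utils.h>   // assumed include path for the header above

// Illustrative functor: ns arrives as a compile-time constant.
struct PrintWidth {
  template<int ns> int operator()(int extra) const {
    std::printf("dispatched to ns=%d, extra=%d\n", ns, extra);
    return 0;
  }
};

int demo(int runtime_ns) {
  // Returns FINUFFT_ERR_METHOD_NOTVALID if runtime_ns falls outside
  // [MIN_NSPREAD, MAX_NSPREAD].
  return cufinufft::utils::launch_dispatch_ns<PrintWidth, float>(PrintWidth(),
                                                                 runtime_ns, 42);
}
```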

} // namespace utils
} // namespace cufinufft

69 changes: 36 additions & 33 deletions src/cuda/1d/interp1d_wrapper.cu
@@ -10,41 +10,46 @@
namespace cufinufft {
namespace spreadinterp {

template<typename T>
int cuinterp1d(cufinufft_plan_t<T> *d_plan, int blksize)
/*
A wrapper for different interpolation methods.

Methods available:
(1) Non-uniform points driven
(2) Subproblem

Melody Shih 11/21/21
*/
{
int nf1 = d_plan->nf1;
int M = d_plan->M;

int ier;
switch (d_plan->opts.gpu_method) {
case 1: {
ier = cuinterp1d_nuptsdriven<T>(nf1, M, d_plan, blksize);
} break;
default:
std::cerr << "[cuinterp1d] error: incorrect method, should be 1" << std::endl;
ier = FINUFFT_ERR_METHOD_NOTVALID;
// Functor to handle function selection (nuptsdriven vs subprob)
struct Interp1DDispatcher {
template<int ns, typename T>
int operator()(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize) const {
switch (d_plan->opts.gpu_method) {
case 1:
return cuinterp1d_nuptsdriven<T, ns>(nf1, M, d_plan, blksize);
default:
std::cerr << "[cuinterp1d] error: incorrect method, should be 1\n";
return FINUFFT_ERR_METHOD_NOTVALID;
}
}

return ier;
};

// Updated cuinterp1d using generic dispatch
template<typename T> int cuinterp1d(cufinufft_plan_t<T> *d_plan, int blksize) {
/*
A wrapper for different interpolation methods.

Methods available:
(1) Non-uniform points driven
(2) Subproblem
[Inline review comment] Maybe get rid of the unsupported "subproblem" method comment while we're touching this code. We're already refactoring some of this stuff, so it seems like a good time to clean the comments to be up to date.


Melody Shih 11/21/21

Now the function is updated to dispatch based on ns. This avoids alloca, which
seems to be slower according to the MRI community.
Marco Barbone 01/30/25
*/
return launch_dispatch_ns<Interp1DDispatcher, T>(Interp1DDispatcher(),
d_plan->spopts.nspread, d_plan->nf1,
d_plan->M, d_plan, blksize);
}
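
The same functor-plus-dispatch pattern presumably carries over to the other wrappers declared in spreadinterp.h. Below is a hedged sketch for the 1D spread wrapper; it is not the PR's actual code, and the gpu_method numbering for the subproblem path is an assumption.

```cpp
// Sketch only: mirrors Interp1DDispatcher for the spread-side wrapper.
struct Spread1DDispatcherSketch {
  template<int ns, typename T>
  int operator()(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize) const {
    switch (d_plan->opts.gpu_method) {
    case 1: // nonuniform-points-driven
      return cuspread1d_nuptsdriven<T, ns>(nf1, M, d_plan, blksize);
    case 2: // subproblem (method id assumed)
      return cuspread1d_subprob<T, ns>(nf1, M, d_plan, blksize);
    default:
      return FINUFFT_ERR_METHOD_NOTVALID;
    }
  }
};

// A cuspread1d wrapper would then call, analogously to cuinterp1d above:
//   launch_dispatch_ns<Spread1DDispatcherSketch, T>(
//       Spread1DDispatcherSketch(), d_plan->spopts.nspread,
//       d_plan->nf1, d_plan->M, d_plan, blksize);
```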

template<typename T>
template<typename T, int ns>
int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
auto &stream = d_plan->stream;
dim3 threadsPerBlock;
dim3 blocks;

int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells
T es_c = d_plan->spopts.ES_c;
T es_beta = d_plan->spopts.ES_beta;
T sigma = d_plan->opts.upsampfac;
@@ -61,16 +66,14 @@ int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blks

if (d_plan->opts.gpu_kerevalmeth) {
for (int t = 0; t < blksize; t++) {
interp_1d_nuptsdriven<T, 1><<<blocks, threadsPerBlock, 0, stream>>>(
d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma,
d_idxnupts);
interp_1d_nuptsdriven<T, 1, ns><<<blocks, threadsPerBlock, 0, stream>>>(
d_kx, d_c + t * M, d_fw + t * nf1, M, nf1, es_c, es_beta, sigma, d_idxnupts);
RETURN_IF_CUDA_ERROR
}
} else {
for (int t = 0; t < blksize; t++) {
interp_1d_nuptsdriven<T, 0><<<blocks, threadsPerBlock, 0, stream>>>(
d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma,
d_idxnupts);
interp_1d_nuptsdriven<T, 0, ns><<<blocks, threadsPerBlock, 0, stream>>>(
d_kx, d_c + t * M, d_fw + t * nf1, M, nf1, es_c, es_beta, sigma, d_idxnupts);
RETURN_IF_CUDA_ERROR
}
}