@@ -59,10 +59,11 @@ static FINUFFT_ALWAYS_INLINE auto ker_eval(FLT *FINUFFT_RESTRICT ker,
5959 const finufft_spread_opts &opts,
6060 const V... elems) noexcept ;
6161static FINUFFT_ALWAYS_INLINE FLT fold_rescale (FLT x, BIGINT N) noexcept ;
62- static FINUFFT_ALWAYS_INLINE void set_kernel_args (
63- FLT *args, FLT x, const finufft_spread_opts &opts) noexcept ;
62+ template <uint8_t ns>
63+ static FINUFFT_ALWAYS_INLINE void set_kernel_args (FLT *args, FLT x) noexcept ;
64+ template <uint8_t N>
6465static FINUFFT_ALWAYS_INLINE void evaluate_kernel_vector (
65- FLT *ker, FLT *args, const finufft_spread_opts &opts, int N ) noexcept ;
66+ FLT *ker, FLT *args, const finufft_spread_opts &opts) noexcept ;
6667template <uint8_t w, class simd_type = xsimd::make_sized_batch_t <
6768 FLT, find_optimal_simd_width<FLT, w>()>> // aka ns
6869static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner (
@@ -703,16 +704,15 @@ FLT evaluate_kernel(FLT x, const finufft_spread_opts &opts)
703704 return exp ((FLT)opts.ES_beta * sqrt ((FLT)1.0 - (FLT)opts.ES_c * x * x));
704705}
705706
706- void set_kernel_args (FLT *args, FLT x, const finufft_spread_opts &opts) noexcept
707+ template <uint8_t ns>
708+ void set_kernel_args (FLT *args, FLT x) noexcept
707709// Fills args[0], ..., args[ns-1] with the kernel arguments x, x+1, ..., x+ns-1 (args must hold at least ns entries).
708710// Needed for the vectorized kernel evaluation by Ludvig af K.
709711{
710- int ns = opts.nspread ;
711712 for (int i = 0 ; i < ns; i++) args[i] = x + (FLT)i;
712713}
713-
714- void evaluate_kernel_vector (FLT *ker, FLT *args, const finufft_spread_opts &opts,
715- const int N) noexcept
714+ template <uint8_t N>
715+ void evaluate_kernel_vector (FLT *ker, FLT *args, const finufft_spread_opts &opts) noexcept
716716/* Evaluate ES kernel for a vector of N arguments; by Ludvig af K.
717717 If opts.kerpad true, args and ker must be allocated for Npad, and args is
718718 written to (to pad to length Npad), only first N outputs are correct.
@@ -742,8 +742,7 @@ void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts &opts
742742 if (opts.kerpad ) {
743743 // padded part should be zero, in spread_subproblem_nd_kernels, there are
744744 // out of bound writes to trg arrays
745- for (int i = N; i < Npad; ++i)
746- ker[i] = 0.0 ;
745+ for (int i = N; i < Npad; ++i) ker[i] = 0.0 ;
747746 }
748747 } else {
749748 for (int i = 0 ; i < N; i++) // dummy for timing only
@@ -1798,8 +1797,8 @@ auto ker_eval(FLT *FINUFFT_RESTRICT ker, const finufft_spread_opts &opts,
17981797 }
17991798 if constexpr (kerevalmeth == 0 ) {
18001799 alignas (simd_type::arch_type::alignment ()) std::array<T, MAX_NSPREAD> kernel_args{};
1801- set_kernel_args (kernel_args.data (), inputs[i], opts );
1802- evaluate_kernel_vector (ker + (i * MAX_NSPREAD), kernel_args.data (), opts, ns );
1800+ set_kernel_args<ns> (kernel_args.data (), inputs[i]);
1801+ evaluate_kernel_vector<ns> (ker + (i * MAX_NSPREAD), kernel_args.data (), opts);
18031802 }
18041803 }
18051804 return ker;
0 commit comments