Merge branch 'master' into interp-vectorization

DiamonDinoia · web-flow · commit b77334975ac6 · 2024-07-11T12:21:58.000-04:00
diff --git a/CHANGELOG b/CHANGELOG
@@ -7,6 +7,7 @@ V 2.3.0beta (6/21/24)
   kernel evaluation, templating by ns with AVX-width-dependent decisions.
   Up to 80% faster, dep on compiler. (Marco Barbone with help from Libin Lu).
     NOTE: introduces new dependency (XSIMD), added to cMake and makefile.
+* new test/finufft3dkernel_test checks kerevalmeth=0,1 agree to tol (M Barbone).
 * new perftest/compare_spreads.jl compares two spreadinterp libs (A Barnett).
 * new benchmarker perftest/spreadtestndall sweeps all kernel widths (M Barbone).
 * cufinufft now supports modeord(type 1,2 only): 0 CMCL-style increasing mode
@@ -26,15 +27,15 @@ V 2.3.0beta (6/21/24)
 * improved GPU python docs: migration guide; usage from cupy, numba, torch,
   pycuda. PyPI pkg still at 2.2.0beta.
 * Added a clang-format pre-commit hook to ensure consistent code style.
-  Created a .clang-format file to define the style similar to the existing style.
+  Created a .clang-format file to define a style similar to the existing style.
   Applied clang-format to all cmake, C, C++, and CUDA code. Ignored the blame
   using .git-blame-ignore-revs. Added a contributing.md for developers.
-* cuFINUFFT interface update: number of nonuniform points M is now a 64-bit integer
-as opposed to 32-bit. While this does modify the ABI, most code will just need to
-recompile against the new library as compilers will silently upcast any 32-bit
-integers to 64-bit when calling cufinufft(f)_setpts. Note that internally, 32-bit
-integers are still used, so calling cufinufft with more than 2e9 points will fail.
-This restriction may be lifted in the future.
+* cuFINUFFT interface update: number of nonuniform points M is now a 64-bit int
+  as opposed to 32-bit. While this does modify the ABI, most code will just
+  need to recompile against the new library as compilers will silently upcast
+  any 32-bit integers to 64-bit when calling cufinufft(f)_setpts. Note that
+  internally, 32-bit integers are still used, so calling cufinufft with more
+  than 2e9 points will fail. This restriction may be lifted in the future.
 
 V 2.2.0 (12/12/23)
 
@@ -52,7 +53,7 @@ V 2.2.0 (12/12/23)
 * CMake build structure (thanks: Wenda Zhou, Marco Barbone, Libin Lu)
   - Note: the plan is to continue to support GNU makefile and make.inc.* but
     to transition to CMake as the main build system.
-  - CI workflow using CMake on 3 OSes, 2 compilers each, PR #382 (Libin Lu)	
+  - CI workflow using CMake on 3 OSes, 2 compilers each, PR #382 (Libin Lu)
 * Docs: new tutorial content on iterative inverse NUFFTs; troubleshooting.
 * GitHub-facing badges
 * include/finufft/finufft_eitherprec.h moved up directory to be public (bea316c)
diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
@@ -75,7 +75,7 @@ FINUFFT_ALWAYS_INLINE static simd_type fold_rescale(const simd_type &x,
 static FINUFFT_ALWAYS_INLINE void set_kernel_args(
     FLT *args, FLT x, const finufft_spread_opts &opts) noexcept;
 static FINUFFT_ALWAYS_INLINE void evaluate_kernel_vector(
-    FLT *ker, FLT *args, const finufft_spread_opts &opts, int N) noexcept;
+    FLT *ker, FLT *args, const finufft_spread_opts &opts) noexcept;
 template<uint8_t w, class simd_type = xsimd::make_sized_batch_t<
                         FLT, find_optimal_simd_width<FLT, w>()>> // aka ns
 static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner(
@@ -716,16 +716,15 @@ FLT evaluate_kernel(FLT x, const finufft_spread_opts &opts)
     return exp((FLT)opts.ES_beta * sqrt((FLT)1.0 - (FLT)opts.ES_c * x * x));
 }
 
-void set_kernel_args(FLT *args, FLT x, const finufft_spread_opts &opts) noexcept
+template<uint8_t ns>
+void set_kernel_args(FLT *args, FLT x) noexcept
 // Fills vector args[] with kernel arguments x, x+1, ..., x+ns-1.
 // needed for the vectorized kernel eval of Ludvig af K.
 {
-  int ns = opts.nspread;
   for (int i = 0; i < ns; i++) args[i] = x + (FLT)i;
 }
-
-void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts &opts,
-                            const int N) noexcept
+template<uint8_t N>
+void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts &opts) noexcept
 /* Evaluate ES kernel for a vector of N arguments; by Ludvig af K.
    If opts.kerpad true, args and ker must be allocated for Npad, and args is
    written to (to pad to length Npad), only first N outputs are correct.
@@ -755,8 +754,7 @@ void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts &opts
     if (opts.kerpad) {
       // padded part should be zero, in spread_subproblem_nd_kernels, there are
       // out of bound writes to trg arrays
-      for (int i = N; i < Npad; ++i)
-        ker[i] = 0.0;
+      for (int i = N; i < Npad; ++i) ker[i] = 0.0;
     }
   } else {
     for (int i = 0; i < N; i++) // dummy for timing only
@@ -2034,8 +2032,8 @@ auto ker_eval(FLT *FINUFFT_RESTRICT ker, const finufft_spread_opts &opts,
     }
     if constexpr (kerevalmeth == 0) {
       alignas(simd_type::arch_type::alignment()) std::array<T, MAX_NSPREAD> kernel_args{};
-      set_kernel_args(kernel_args.data(), inputs[i], opts);
-      evaluate_kernel_vector(ker + (i * MAX_NSPREAD), kernel_args.data(), opts, ns);
+      set_kernel_args<ns>(kernel_args.data(), inputs[i]);
+      evaluate_kernel_vector<ns>(ker + (i * MAX_NSPREAD), kernel_args.data(), opts);
     }
   }
   return ker;
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
@@ -1,5 +1,5 @@
 # Each of these source test files is instantiated in single and double precision
-set(TESTS basicpassfail dumbinputs finufft1d_test finufft1dmany_test finufft2d_test finufft2dmany_test finufft3d_test finufft3dmany_test)
+set(TESTS basicpassfail dumbinputs finufft1d_test finufft1dmany_test finufft2d_test finufft2dmany_test finufft3d_test finufft3dmany_test finufft3dkernel_test)
 
 foreach(TEST ${TESTS})
   add_executable(${TEST} ${TEST}.cpp)
diff --git a/test/README b/test/README
@@ -9,7 +9,7 @@ OpenMP static scheduling for rand-# generation means that the test data should
 be reproducible (non-stochastic). Reordering of thread ops in FINUFFT itself
 leads to machine-rounding sized variations only.
 
-These CPU test executables have suffix "f" fior single precision, else double.
+These CPU test executables have suffix "f" for single precision, else double.
 The source codes do not have the suffix:
 
 basicpassfail{f} : basic double and single-prec smoke tests of the math.
@@ -22,6 +22,9 @@ finufft{1,2,3}dmany_test{f}: accuracy/speed tests for vectorized transforms,
                    in a given dimension. Types 1, 2, and 3 are tested.
                    (exit code 0 is a pass).
                    Call with no arguments for argument documentation.
+finufft3dkernel_test{f} : test that kerevalmeth=0,1 give same answer within tol.
+                   Types 1, 2, and 3 are tested, in d=3 only.
+                   (exit code 0 is a pass).
 dumbinputs{f} :    test of edge cases, invalid inputs, and plan interface.
                    No arguments needed (exit code 0 is a pass).
 testutils{f} :     test of utils module.
diff --git a/test/check_finufft.sh b/test/check_finufft.sh
@@ -14,7 +14,7 @@
 
 # Barnett 3/14/17. numdiff-free option 3/16/17. simpler, dual-prec 7/3/20,
 # execs now have exit codes, removed any numdiff dep 8/18/20
-# removed diff 6/16/23.
+# removed diff 6/16/23. Added kerevalmeth=0 vs 1 test 7/8/24.
 
 # precision-specific settings
 if [[ $1 == "SINGLE" ]]; then
@@ -93,6 +93,12 @@ T=finufft3dmany_test$PRECSUF
 E=${PIPESTATUS[0]}
 if [[ $E -eq 0 ]]; then echo passed; elif [[ $E -eq $SIGSEGV ]]; then echo crashed; ((CRASHES++)); else echo failed; ((FAILS++)); fi
 
+((N++))
+T=finufft3dkernel_test$PRECSUF
+./$T$FEX 20 50 30 1e3 $FINUFFT_REQ_TOL 2>$DIR/$T.err.out | tee $DIR/$T.out
+E=${PIPESTATUS[0]}
+if [[ $E -eq 0 ]]; then echo passed; elif [[ $E -eq $SIGSEGV ]]; then echo crashed; ((CRASHES++)); else echo failed; ((FAILS++)); fi
+
 ((N++))
 T=dumbinputs$PRECSUF
 ./$T$FEX 2>$DIR/$T.err.out | tee $DIR/$T.out
diff --git a/test/finufft3dkernel_test.cpp b/test/finufft3dkernel_test.cpp
@@ -0,0 +1,183 @@
+#include <finufft/test_defs.h>
+// this enforces recompilation, responding to SINGLE...
+#include "directft/dirft3d.cpp"
+using namespace std;
+using namespace finufft::utils;
+
+const char *help[] = { // usage text; main prints it to stderr when argc is out of range
+    "Test spread_kerevalmeth=0 & 1 match, for 3 types of 3D transf, either prec.",
+    "Usage: finufft3dkernel_test Nmodes1 Nmodes2 Nmodes3 Nsrc",
+    "\t[tol] error tolerance (default 1e-6)",
+    "\t[debug] (default 0) 0: silent, 1: text, 2: as 1 but also spreader",
+    "\t[spread_sort] (default 2) 0: don't sort NU pts, 1: do, 2: auto",
+    "\t[upsampfac] (default 2.0)",
+    "\teg: finufft3dkernel_test 100 200 50 1e6 1e-12 0 2 0.0",
+    "\tnotes: exit code 1 if any error > tol",
+    nullptr}; // terminator: the print loop in main stops at the first null entry
+/**
+ * @brief Check that spread_kerevalmeth=0 and 1 agree for 3D types 1, 2 and 3.
+ * Each transform is run once per kernel evaluation method on the same inputs.
+ * It uses err(a,b)=||a-b||_2 / ||a||_2 as the error metric.
+ * It returns 2 on bad usage, or the FINUFFT error code as soon as one is > 1.
+ * It returns 1 if any error exceeds tol.
+ * It returns 0 if test passes.
+ */
+int main(int argc, char *argv[]) {
+  BIGINT M, N1, N2, N3;      // M = # srcs, N1,N2,N3 = # modes
+  double w, tol      = 1e-6; // w = scratch for argument parsing, tol = default
+  double err, errmax = 0;    // errmax = largest relative error seen over all types
+  finufft_opts opts0, opts1;
+  FINUFFT_DEFAULT_OPTS(&opts0);
+  FINUFFT_DEFAULT_OPTS(&opts1);
+  opts0.spread_kerevalmeth = 0;
+  opts1.spread_kerevalmeth = 1;
+  // opts.fftw = FFTW_MEASURE;  // change from usual FFTW_ESTIMATE
+  // opts.spread_max_sp_size = 3e4; // override test
+  // opts.spread_nthr_atomic = 15;  // "
+  int isign = +1; // choose which exponential sign to test
+  if (argc < 5 || argc > 10) { // NOTE(review): only argv[1..8] are read below
+    for (int i = 0; help[i]; ++i) fprintf(stderr, "%s\n", help[i]);
+    return 2;
+  }
+  sscanf(argv[1], "%lf", &w); // sizes are parsed as doubles, then cast to BIGINT
+  N1 = (BIGINT)w;
+  sscanf(argv[2], "%lf", &w);
+  N2 = (BIGINT)w;
+  sscanf(argv[3], "%lf", &w);
+  N3 = (BIGINT)w;
+  sscanf(argv[4], "%lf", &w);
+  M = (BIGINT)w;
+  if (argc > 5) sscanf(argv[5], "%lf", &tol);
+  if (argc > 6) sscanf(argv[6], "%d", &opts0.debug); // can be 0,1 or 2
+  opts0.spread_debug = (opts0.debug > 1) ? 1 : 0;    // see output from spreader
+  if (argc > 7) sscanf(argv[7], "%d", &opts0.spread_sort);
+  if (argc > 8) {
+    sscanf(argv[8], "%lf", &w);
+    opts0.upsampfac = (FLT)w;
+  }
+
+  opts1                    = opts0; // opts1 inherits every command-line setting
+  opts0.spread_kerevalmeth = 0;     // the two option sets then differ only in
+  opts1.spread_kerevalmeth = 1;     // the kernel evaluation method
+
+  cout << scientific << setprecision(15);
+  const BIGINT N = N1 * N2 * N3;
+
+  std::vector<FLT> x(M);         // NU pts x coords
+  std::vector<FLT> y(M);         // NU pts y coords
+  std::vector<FLT> z(M);         // NU pts z coords
+  std::vector<CPX> c0(M), c1(N); // strengths; c1 is overwritten by a copy of c0 below
+  std::vector<CPX> F0(N);        // mode ampls kereval 0
+  std::vector<CPX> F1(N);        // mode ampls kereval 1
+#pragma omp parallel
+  {
+    unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT j = 0; j < M; ++j) {
+      x[j]  = M_PI * randm11r(&se);
+      y[j]  = M_PI * randm11r(&se);
+      z[j]  = M_PI * randm11r(&se);
+      c0[j] = crandm11r(&se);
+    }
+  }
+  c1 = c0;                     // copy strengths
+  printf("test 3d type 1:\n"); // -------------- type 1
+  printf("kerevalmeth 0:\n");
+  CNTime timer{};
+  timer.start();
+  int ier = FINUFFT3D1(M, x.data(), y.data(), z.data(), c0.data(), isign, tol, N1, N2, N3,
+                       F0.data(), &opts0);
+  double ti = timer.elapsedsec();
+  if (ier > 1) { // only codes above 1 abort the test; 0 and 1 continue
+    printf("error (ier=%d)!\n", ier);
+    return ier;
+  } else
+    printf("     %lld NU pts to (%lld,%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n",
+           (long long)M, (long long)N1, (long long)N2, (long long)N3, ti, M / ti);
+  printf("kerevalmeth 1:\n");
+  timer.restart();
+  ier = FINUFFT3D1(M, x.data(), y.data(), z.data(), c0.data(), isign, tol, N1, N2, N3,
+                   F1.data(), &opts1);
+  ti  = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
+    return ier;
+  } else
+    printf("     %lld NU pts to (%lld,%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n",
+           (long long)M, (long long)N1, (long long)N2, (long long)N3, ti, M / ti);
+
+  err    = relerrtwonorm(N, F0.data(), F1.data());
+  errmax = max(err, errmax);
+  printf("\ttype 1 rel l2-err in F is %.3g\n", err);
+  // NOTE(review): both type 2 calls below pass F0; this F1 copy is not read by them
+  F1 = F0;
+  printf("kerevalmeth 0:\n");
+  timer.restart();
+  ier = FINUFFT3D2(M, x.data(), y.data(), z.data(), c0.data(), isign, tol, N1, N2, N3,
+                   F0.data(), &opts0);
+  ti  = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
+    return ier;
+  } else
+    printf("     (%lld,%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n",
+           (long long)N1, (long long)N2, (long long)N3, (long long)M, ti, M / ti);
+  printf("kerevalmeth 1:\n");
+  timer.restart();
+  ier = FINUFFT3D2(M, x.data(), y.data(), z.data(), c1.data(), isign, tol, N1, N2, N3,
+                   F0.data(), &opts1);
+  ti  = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
+    return ier;
+  } else
+    printf("     (%lld,%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n",
+           (long long)N1, (long long)N2, (long long)N3, (long long)M, ti, M / ti);
+  err    = relerrtwonorm(M, c0.data(), c1.data());
+  errmax = std::max(err, errmax);
+  printf("\ttype 2 rel l2-err in c is %.3g\n", err);
+
+  printf("test 3d type 3:\n"); // -------------- type 3
+#pragma omp parallel
+  {
+    unsigned int se = MY_OMP_GET_THREAD_NUM();
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT j = 0; j < M; ++j) {
+      x[j] = 2.0 + M_PI * randm11r(&se);  // new x_j srcs, offset from origin
+      y[j] = -3.0 + M_PI * randm11r(&se); // " y_j
+      z[j] = 1.0 + M_PI * randm11r(&se);  // " z_j
+    }
+  }
+  std::vector<FLT> s(N); // targ freqs (1-cmpt); NOTE(review): s, t, u are never
+  std::vector<FLT> t(N); // targ freqs (2-cmpt); filled in before the type 3 calls,
+  std::vector<FLT> u(N); // targ freqs (3-cmpt); so they keep their initial zeros
+
+  timer.restart();
+  printf("kerevalmeth 0:\n");
+  ier = FINUFFT3D3(M, x.data(), y.data(), z.data(), c0.data(), isign, tol, N, s.data(),
+                   t.data(), u.data(), F0.data(), &opts0);
+  ti  = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
+    return ier;
+  } else
+    printf("\t%lld NU to %lld NU in %.3g s         \t%.3g tot NU pts/s\n", (long long)M,
+           (long long)N, ti, (M + N) / ti);
+  timer.restart();
+  printf("kerevalmeth 1:\n");
+  ier = FINUFFT3D3(M, x.data(), y.data(), z.data(), c0.data(), isign, tol, N, s.data(),
+                   t.data(), u.data(), F1.data(), &opts1);
+  ti  = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
+    return ier;
+  } else
+    printf("\t%lld NU to %lld NU in %.3g s         \t%.3g tot NU pts/s\n", (long long)M,
+           (long long)N, ti, (M + N) / ti);
+  err    = relerrtwonorm(N, F0.data(), F1.data());
+  errmax = max(err, errmax);
+  printf("\ttype 3 rel l2-err in F is %.3g\n", err);
+  // return 1 if the largest relative error exceeds tol, else 0
+  // (any FINUFFT error code > 1 has already been returned above)
+  return (errmax > tol);
+}