Skip to content

Commit b773349

Browse files
authored
Merge branch 'master' into interp-vectorization
2 parents ca9c243 + 3110b60 commit b773349

File tree

6 files changed

+212
-21
lines changed

6 files changed

+212
-21
lines changed

CHANGELOG

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ V 2.3.0beta (6/21/24)
77
kernel evaluation, templating by ns with AVX-width-dependent decisions.
88
Up to 80% faster, dep on compiler. (Marco Barbone with help from Libin Lu).
99
NOTE: introduces new dependency (XSIMD), added to cMake and makefile.
10+
* new test/finufft3dkernel_test checks kerevalmeth=0,1 same to tol (M Barbone).
1011
* new perftest/compare_spreads.jl compares two spreadinterp libs (A Barnett).
1112
* new benchmarker perftest/spreadtestndall sweeps all kernel widths (M Barbone).
1213
* cufinufft now supports modeord(type 1,2 only): 0 CMCL-style increasing mode
@@ -26,15 +27,15 @@ V 2.3.0beta (6/21/24)
2627
* improved GPU python docs: migration guide; usage from cupy, numba, torch,
2728
pycuda. PyPI pkg still at 2.2.0beta.
2829
* Added a clang-format pre-commit hook to ensure consistent code style.
29-
Created a .clang-format file to define the style similar to the existing style.
30+
Created a .clang-format file to define a style similar to the existing style.
3031
Applied clang-format to all cmake, C, C++, and CUDA code. Ignored the blame
3132
using .git-blame-ignore-revs. Added a contributing.md for developers.
32-
* cuFINUFFT interface update: number of nonuniform points M is now a 64-bit integer
33-
as opposed to 32-bit. While this does modify the ABI, most code will just need to
34-
recompile against the new library as compilers will silently upcast any 32-bit
35-
integers to 64-bit when calling cufinufft(f)_setpts. Note that internally, 32-bit
36-
integers are still used, so calling cufinufft with more than 2e9 points will fail.
37-
This restriction may be lifted in the future.
33+
* cuFINUFFT interface update: number of nonuniform points M is now a 64-bit int
34+
as opposed to 32-bit. While this does modify the ABI, most code will just
35+
need to recompile against the new library as compilers will silently upcast
36+
any 32-bit integers to 64-bit when calling cufinufft(f)_setpts. Note that
37+
internally, 32-bit integers are still used, so calling cufinufft with more
38+
than 2e9 points will fail. This restriction may be lifted in the future.
3839

3940
V 2.2.0 (12/12/23)
4041

@@ -52,7 +53,7 @@ V 2.2.0 (12/12/23)
5253
* CMake build structure (thanks: Wenda Zhou, Marco Barbone, Libin Lu)
5354
- Note: the plan is to continue to support GNU makefile and make.inc.* but
5455
to transition to CMake as the main build system.
55-
- CI workflow using CMake on 3 OSes, 2 compilers each, PR #382 (Libin Lu)
56+
- CI workflow using CMake on 3 OSes, 2 compilers each, PR #382 (Libin Lu)
5657
* Docs: new tutorial content on iterative inverse NUFFTs; troubleshooting.
5758
* GitHub-facing badges
5859
* include/finufft/finufft_eitherprec.h moved up directory to be public (bea316c)

src/spreadinterp.cpp

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ FINUFFT_ALWAYS_INLINE static simd_type fold_rescale(const simd_type &x,
7575
static FINUFFT_ALWAYS_INLINE void set_kernel_args(
7676
FLT *args, FLT x, const finufft_spread_opts &opts) noexcept;
7777
static FINUFFT_ALWAYS_INLINE void evaluate_kernel_vector(
78-
FLT *ker, FLT *args, const finufft_spread_opts &opts, int N) noexcept;
78+
FLT *ker, FLT *args, const finufft_spread_opts &opts) noexcept;
7979
template<uint8_t w, class simd_type = xsimd::make_sized_batch_t<
8080
FLT, find_optimal_simd_width<FLT, w>()>> // aka ns
8181
static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner(
@@ -716,16 +716,15 @@ FLT evaluate_kernel(FLT x, const finufft_spread_opts &opts)
716716
return exp((FLT)opts.ES_beta * sqrt((FLT)1.0 - (FLT)opts.ES_c * x * x));
717717
}
718718

719-
void set_kernel_args(FLT *args, FLT x, const finufft_spread_opts &opts) noexcept
719+
template<uint8_t ns>
720+
void set_kernel_args(FLT *args, FLT x) noexcept
720721
// Fills vector args[] with kernel arguments x, x+1, ..., x+ns-1.
721722
// needed for the vectorized kernel eval of Ludvig af K.
722723
{
723-
int ns = opts.nspread;
724724
for (int i = 0; i < ns; i++) args[i] = x + (FLT)i;
725725
}
726-
727-
void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts &opts,
728-
const int N) noexcept
726+
template<uint8_t N>
727+
void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts &opts) noexcept
729728
/* Evaluate ES kernel for a vector of N arguments; by Ludvig af K.
730729
If opts.kerpad true, args and ker must be allocated for Npad, and args is
731730
written to (to pad to length Npad), only first N outputs are correct.
@@ -755,8 +754,7 @@ void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts &opts
755754
if (opts.kerpad) {
756755
// padded part should be zero, in spread_subproblem_nd_kernels, there are
757756
// out of bound writes to trg arrays
758-
for (int i = N; i < Npad; ++i)
759-
ker[i] = 0.0;
757+
for (int i = N; i < Npad; ++i) ker[i] = 0.0;
760758
}
761759
} else {
762760
for (int i = 0; i < N; i++) // dummy for timing only
@@ -2034,8 +2032,8 @@ auto ker_eval(FLT *FINUFFT_RESTRICT ker, const finufft_spread_opts &opts,
20342032
}
20352033
if constexpr (kerevalmeth == 0) {
20362034
alignas(simd_type::arch_type::alignment()) std::array<T, MAX_NSPREAD> kernel_args{};
2037-
set_kernel_args(kernel_args.data(), inputs[i], opts);
2038-
evaluate_kernel_vector(ker + (i * MAX_NSPREAD), kernel_args.data(), opts, ns);
2035+
set_kernel_args<ns>(kernel_args.data(), inputs[i]);
2036+
evaluate_kernel_vector<ns>(ker + (i * MAX_NSPREAD), kernel_args.data(), opts);
20392037
}
20402038
}
20412039
return ker;

test/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Each of these source test files is instantiated in single and double precision
2-
set(TESTS basicpassfail dumbinputs finufft1d_test finufft1dmany_test finufft2d_test finufft2dmany_test finufft3d_test finufft3dmany_test)
2+
set(TESTS basicpassfail dumbinputs finufft1d_test finufft1dmany_test finufft2d_test finufft2dmany_test finufft3d_test finufft3dmany_test finufft3dkernel_test)
33

44
foreach(TEST ${TESTS})
55
add_executable(${TEST} ${TEST}.cpp)

test/README

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ OpenMP static scheduling for rand-# generation means that the test data should
99
be reproducible (non-stochastic). Reordering of thread ops in FINUFFT itself
1010
leads to machine-rounding sized variations only.
1111

12-
These CPU test executables have suffix "f" fior single precision, else double.
12+
These CPU test executables have suffix "f" for single precision, else double.
1313
The source codes do not have the suffix:
1414

1515
basicpassfail{f} : basic double and single-prec smoke tests of the math.
@@ -22,6 +22,9 @@ finufft{1,2,3}dmany_test{f}: accuracy/speed tests for vectorized transforms,
2222
in a given dimension. Types 1, 2, and 3 are tested.
2323
(exit code 0 is a pass).
2424
Call with no arguments for argument documentation.
25+
finufft3dkernel_test{f} : test that kerevalmeth=0 and 1 give the same answer to within tol.
26+
Types 1, 2, and 3 are tested, in d=3 only.
27+
(exit code 0 is a pass).
2528
dumbinputs{f} : test of edge cases, invalid inputs, and plan interface.
2629
No arguments needed (exit code 0 is a pass).
2730
testutils{f} : test of utils module.

test/check_finufft.sh

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
# Barnett 3/14/17. numdiff-free option 3/16/17. simpler, dual-prec 7/3/20,
1616
# execs now have exit codes, removed any numdiff dep 8/18/20
17-
# removed diff 6/16/23.
17+
# removed diff 6/16/23. Added kerevalmeth=0 vs 1 test 7/8/24.
1818

1919
# precision-specific settings
2020
if [[ $1 == "SINGLE" ]]; then
@@ -93,6 +93,12 @@ T=finufft3dmany_test$PRECSUF
9393
E=${PIPESTATUS[0]}
9494
if [[ $E -eq 0 ]]; then echo passed; elif [[ $E -eq $SIGSEGV ]]; then echo crashed; ((CRASHES++)); else echo failed; ((FAILS++)); fi
9595

96+
((N++))
97+
T=finufft3dkernel_test$PRECSUF
98+
./$T$FEX 20 50 30 1e3 $FINUFFT_REQ_TOL 2>$DIR/$T.err.out | tee $DIR/$T.out
99+
E=${PIPESTATUS[0]}
100+
if [[ $E -eq 0 ]]; then echo passed; elif [[ $E -eq $SIGSEGV ]]; then echo crashed; ((CRASHES++)); else echo failed; ((FAILS++)); fi
101+
96102
((N++))
97103
T=dumbinputs$PRECSUF
98104
./$T$FEX 2>$DIR/$T.err.out | tee $DIR/$T.out

test/finufft3dkernel_test.cpp

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
#include <finufft/test_defs.h>
2+
// this enforces recompilation, responding to SINGLE...
3+
#include "directft/dirft3d.cpp"
4+
using namespace std;
5+
using namespace finufft::utils;
6+
7+
const char *help[] = {
8+
"Test spread_kerevalmeth=0 & 1 match, for 3 types of 3D transf, either prec.",
9+
"Usage: finufft3dkernel_test Nmodes1 Nmodes2 Nmodes3 Nsrc",
10+
"\t[tol] error tolerance (default 1e-6)",
11+
"\t[debug] (default 0) 0: silent, 1: text, 2: as 1 but also spreader",
12+
"\t[spread_sort] (default 2) 0: don't sort NU pts, 1: do, 2: auto",
13+
"\t[upsampfac] (default 2.0)",
14+
"\teg: finufft3dkernel_test 100 200 50 1e6 1e-12 0 2 0.0",
15+
"\tnotes: exit code 1 if any error > tol",
16+
nullptr};
17+
/**
18+
* @brief Test the 3D NUFFT of type 1, 2, and 3.
19+
* It evaluates the error of the kernel evaluation methods.
20+
* It uses err(a,b)=||a-b||_2 / ||a||_2 as the error metric.
21+
* It returns the FINUFFT error code if that code is greater than 1.
22+
* It returns 1 if any error exceeds tol.
23+
* It returns 0 if test passes.
24+
*/
25+
int main(int argc, char *argv[]) {
26+
BIGINT M, N1, N2, N3; // M = # srcs, N1,N2,N3 = # modes
27+
double w, tol = 1e-6; // default
28+
double err, errmax = 0;
29+
finufft_opts opts0, opts1;
30+
FINUFFT_DEFAULT_OPTS(&opts0);
31+
FINUFFT_DEFAULT_OPTS(&opts1);
32+
opts0.spread_kerevalmeth = 0;
33+
opts1.spread_kerevalmeth = 1;
34+
// opts.fftw = FFTW_MEASURE; // change from usual FFTW_ESTIMATE
35+
// opts.spread_max_sp_size = 3e4; // override test
36+
// opts.spread_nthr_atomic = 15; // "
37+
int isign = +1; // choose which exponential sign to test
38+
if (argc < 5 || argc > 10) {
39+
for (int i = 0; help[i]; ++i) fprintf(stderr, "%s\n", help[i]);
40+
return 2;
41+
}
42+
sscanf(argv[1], "%lf", &w);
43+
N1 = (BIGINT)w;
44+
sscanf(argv[2], "%lf", &w);
45+
N2 = (BIGINT)w;
46+
sscanf(argv[3], "%lf", &w);
47+
N3 = (BIGINT)w;
48+
sscanf(argv[4], "%lf", &w);
49+
M = (BIGINT)w;
50+
if (argc > 5) sscanf(argv[5], "%lf", &tol);
51+
if (argc > 6) sscanf(argv[6], "%d", &opts0.debug); // can be 0,1 or 2
52+
opts0.spread_debug = (opts0.debug > 1) ? 1 : 0; // see output from spreader
53+
if (argc > 7) sscanf(argv[7], "%d", &opts0.spread_sort);
54+
if (argc > 8) {
55+
sscanf(argv[8], "%lf", &w);
56+
opts0.upsampfac = (FLT)w;
57+
}
58+
59+
opts1 = opts0;
60+
opts0.spread_kerevalmeth = 0;
61+
opts1.spread_kerevalmeth = 1;
62+
63+
cout << scientific << setprecision(15);
64+
const BIGINT N = N1 * N2 * N3;
65+
66+
std::vector<FLT> x(M); // NU pts x coords
67+
std::vector<FLT> y(M); // NU pts y coords
68+
std::vector<FLT> z(M); // NU pts z coords
69+
std::vector<CPX> c0(M), c1(N); // strengths
70+
std::vector<CPX> F0(N); // mode ampls kereval 0
71+
std::vector<CPX> F1(N); // mode ampls kereval 1
72+
#pragma omp parallel
73+
{
74+
unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s
75+
#pragma omp for schedule(static, TEST_RANDCHUNK)
76+
for (BIGINT j = 0; j < M; ++j) {
77+
x[j] = M_PI * randm11r(&se);
78+
y[j] = M_PI * randm11r(&se);
79+
z[j] = M_PI * randm11r(&se);
80+
c0[j] = crandm11r(&se);
81+
}
82+
}
83+
c1 = c0; // copy strengths
84+
printf("test 3d type 1:\n"); // -------------- type 1
85+
printf("kerevalmeth 0:\n");
86+
CNTime timer{};
87+
timer.start();
88+
int ier = FINUFFT3D1(M, x.data(), y.data(), z.data(), c0.data(), isign, tol, N1, N2, N3,
89+
F0.data(), &opts0);
90+
double ti = timer.elapsedsec();
91+
if (ier > 1) {
92+
printf("error (ier=%d)!\n", ier);
93+
return ier;
94+
} else
95+
printf(" %lld NU pts to (%lld,%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n",
96+
(long long)M, (long long)N1, (long long)N2, (long long)N3, ti, M / ti);
97+
printf("kerevalmeth 1:\n");
98+
timer.restart();
99+
ier = FINUFFT3D1(M, x.data(), y.data(), z.data(), c0.data(), isign, tol, N1, N2, N3,
100+
F1.data(), &opts1);
101+
ti = timer.elapsedsec();
102+
if (ier > 1) {
103+
printf("error (ier=%d)!\n", ier);
104+
return ier;
105+
} else
106+
printf(" %lld NU pts to (%lld,%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n",
107+
(long long)M, (long long)N1, (long long)N2, (long long)N3, ti, M / ti);
108+
109+
err = relerrtwonorm(N, F0.data(), F1.data());
110+
errmax = max(err, errmax);
111+
printf("\ttype 1 rel l2-err in F is %.3g\n", err);
112+
// copy F0 to F1; NOTE(review): both type 2 calls below read F0, so this copy looks unused
113+
F1 = F0;
114+
printf("kerevalmeth 0:\n");
115+
timer.restart();
116+
ier = FINUFFT3D2(M, x.data(), y.data(), z.data(), c0.data(), isign, tol, N1, N2, N3,
117+
F0.data(), &opts0);
118+
ti = timer.elapsedsec();
119+
if (ier > 1) {
120+
printf("error (ier=%d)!\n", ier);
121+
return ier;
122+
} else
123+
printf(" (%lld,%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n",
124+
(long long)N1, (long long)N2, (long long)N3, (long long)M, ti, M / ti);
125+
printf("kerevalmeth 1:\n");
126+
timer.restart();
127+
ier = FINUFFT3D2(M, x.data(), y.data(), z.data(), c1.data(), isign, tol, N1, N2, N3,
128+
F0.data(), &opts1);
129+
ti = timer.elapsedsec();
130+
if (ier > 1) {
131+
printf("error (ier=%d)!\n", ier);
132+
return ier;
133+
} else
134+
printf(" (%lld,%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n",
135+
(long long)N1, (long long)N2, (long long)N3, (long long)M, ti, M / ti);
136+
err = relerrtwonorm(M, c0.data(), c1.data());
137+
errmax = std::max(err, errmax);
138+
printf("\ttype 2 rel l2-err in c is %.3g\n", err);
139+
140+
printf("test 3d type 3:\n"); // -------------- type 3
141+
#pragma omp parallel
142+
{
143+
unsigned int se = MY_OMP_GET_THREAD_NUM();
144+
#pragma omp for schedule(static, TEST_RANDCHUNK)
145+
for (BIGINT j = 0; j < M; ++j) {
146+
x[j] = 2.0 + M_PI * randm11r(&se); // new x_j srcs, offset from origin
147+
y[j] = -3.0 + M_PI * randm11r(&se); // " y_j
148+
z[j] = 1.0 + M_PI * randm11r(&se); // " z_j
149+
}
150+
}
151+
std::vector<FLT> s(N); // targ freqs (1-cmpt)
152+
std::vector<FLT> t(N); // targ freqs (2-cmpt)
153+
std::vector<FLT> u(N); // targ freqs (3-cmpt)
154+
155+
timer.restart();
156+
printf("kerevalmeth 0:\n");
157+
ier = FINUFFT3D3(M, x.data(), y.data(), z.data(), c0.data(), isign, tol, N, s.data(),
158+
t.data(), u.data(), F0.data(), &opts0);
159+
ti = timer.elapsedsec();
160+
if (ier > 1) {
161+
printf("error (ier=%d)!\n", ier);
162+
return ier;
163+
} else
164+
printf("\t%lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n", (long long)M,
165+
(long long)N, ti, (M + N) / ti);
166+
timer.restart();
167+
printf("kerevalmeth 1:\n");
168+
ier = FINUFFT3D3(M, x.data(), y.data(), z.data(), c0.data(), isign, tol, N, s.data(),
169+
t.data(), u.data(), F1.data(), &opts1);
170+
ti = timer.elapsedsec();
171+
if (ier > 1) {
172+
printf("error (ier=%d)!\n", ier);
173+
return ier;
174+
} else
175+
printf("\t%lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n", (long long)M,
176+
(long long)N, ti, (M + N) / ti);
177+
err = relerrtwonorm(N, F0.data(), F1.data());
178+
errmax = max(err, errmax);
179+
printf("\ttype 3 rel l2-err in F is %.3g\n", err);
180+
// return 1 if any error exceeds tol
181+
// (any FINUFFT error code > 1 was already returned early, right after the failing call)
182+
return (errmax > tol);
183+
}

0 commit comments

Comments
 (0)