Skip to content

Commit a8c3257

Browse files
committed
merge master
2 parents 10374e3 + 50e797a commit a8c3257

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

69 files changed

+1008
-457
lines changed

.github/workflows/cmake_ci.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ jobs:
4242
if: runner.os == 'macOS'
4343
run: |
4444
brew unlink gcc
45+
continue-on-error: true
4546

4647
- name: Setup Cpp
4748
uses: aminya/setup-cpp@v1

.github/workflows/python_wheel.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,13 @@ jobs:
4444
steps:
4545
- uses: actions/checkout@v4
4646

47-
- name: Install gcc and fftw
47+
- name: Unlink gcc
4848
run: |
4949
brew unlink gcc
50+
continue-on-error: true
51+
52+
- name: Install gcc@13 and fftw
53+
run: |
5054
brew install gcc@13 fftw
5155
cp make.inc.macosx_gcc-12 make.inc
5256
echo "FC=gfortran-13" >> make.inc

CHANGELOG

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@ If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
33

44
* cufinufft now supports modeord(type 1,2 only): 0 CMCL-style increasing mode
55
order, 1 FFT-style mode order.
6+
* New foldrescale, removes [-3pi,3pi) restriction on NU points, and slight
7+
speedup at large tols. Deprecates both opts.chkbnds and error code
8+
FINUFFT_ERR_SPREAD_PTS_OUT_RANGE. PR #440 (Marco Barbone + Martin Reinecke)
69
* CPU plan stage allows any # threads, warns if > omp_get_max_threads(); or
710
if single-threaded fixes nthr=1 and warns opts.nthreads>1 attempt.
811
Sort now respects spread_opts.sort_threads not nthreads. Supersedes PR 431.

CMakeLists.txt

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ option(FINUFFT_USE_OPENMP "Whether to use OpenMP for parallelization. If disable
3131
option(FINUFFT_USE_CUDA "Whether to build CUDA accelerated FINUFFT library (libcufinufft). This is completely independent of the main FINUFFT library" OFF)
3232
option(FINUFFT_USE_CPU "Whether to build the ordinary FINUFFT library (libfinufft)." ON)
3333
option(FINUFFT_STATIC_LINKING "Whether to link the static FINUFFT library (libfinufft_static)." ON)
34+
option(FINUFFT_BUILD_DEVEL "Whether to build development executables" OFF)
3435
# sphinx tag (don't remove): @cmake_opts_end
3536

3637
if(FINUFFT_USE_CPU)
@@ -45,10 +46,11 @@ if(FINUFFT_USE_CPU)
4546
endif()
4647

4748
set(CPM_DOWNLOAD_VERSION 0.38.0)
48-
include(cmake/setupCPM.cmake)
49-
5049
set(FFTW_VERSION 3.3.10)
50+
51+
include(cmake/setupCPM.cmake)
5152
include(cmake/setupFFTW.cmake)
53+
5254
endif()
5355

5456
if (FINUFFT_BUILD_MATLAB)
@@ -246,6 +248,10 @@ if (FINUFFT_BUILD_MATLAB)
246248
add_subdirectory(matlab)
247249
endif ()
248250

251+
if (FINUFFT_BUILD_DEVEL)
252+
add_subdirectory(devel)
253+
endif ()
254+
249255
include(GNUInstallDirs)
250256
install(TARGETS ${INSTALL_TARGETS} PUBLIC_HEADER)
251257
install(FILES ${PROJECT_SOURCE_DIR}/LICENSE

CMakePresets.json

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,22 @@
3131
"generator": "Ninja Multi-Config",
3232
"cacheVariables": {
3333
"FINUFFT_BUILD_TESTS": "ON",
34-
"FINUFFT_BUILD_EXAMPLES": "ON"
34+
"FINUFFT_BUILD_EXAMPLES": "ON",
35+
"FINUFFT_BUILD_DEVEL": "ON"
36+
}
37+
},
38+
{
39+
"name": "benchmark",
40+
"binaryDir": "build/benchmark",
41+
"displayName": "Benchmark",
42+
"description": "Benchmark release configuration (ninja)",
43+
"generator": "Ninja",
44+
"cacheVariables": {
45+
"CMAKE_BUILD_TYPE": "RelWithDebInfo",
46+
"FINUFFT_BUILD_TESTS": "ON",
47+
"FINUFFT_BUILD_EXAMPLES": "ON",
48+
"FINUFFT_FFTW_SUFFIX": "",
49+
"FINUFFT_USE_OPENMP": "OFF"
3550
}
3651
},
3752
{
@@ -104,7 +119,7 @@
104119
{
105120
"name": "dev",
106121
"configurePreset": "dev",
107-
"configuration": "Debug"
122+
"configuration": "RelWithDebInfo"
108123
},
109124
{
110125
"name": "ninja-multi",

devel/CMakeLists.txt

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Set the minimum required version of CMake; this must come before project() so policies are set first
2+
cmake_minimum_required(VERSION 3.5)
3+
project(finufft_devel)
4+
5+
6+
# fetch Google Benchmark through CPM (CPMAddPackage must already be provided by the including project)
7+
CPMAddPackage(
8+
NAME benchmark
9+
GITHUB_REPOSITORY google/benchmark
10+
VERSION 1.8.3
11+
OPTIONS "BENCHMARK_ENABLE_TESTING OFF" # passes BENCHMARK_ENABLE_TESTING=OFF to the benchmark project
12+
13+
)
14+
15+
if (benchmark_ADDED)
16+
# patch benchmark target: set its CXX_STANDARD property to 17 (only when benchmark_ADDED is true)
17+
set_target_properties(benchmark PROPERTIES CXX_STANDARD 17)
18+
endif()
19+
20+
add_executable(foldrescale foldrescale.cpp) # benchmark executable for foldrescale
21+
target_link_libraries(foldrescale PRIVATE finufft benchmark) # PRIVATE: an executable has no consumers to propagate to
22+
target_compile_options(foldrescale PRIVATE -mavx2) # NOTE(review): GCC/Clang x86-only flag, applied unconditionally - confirm target platforms
Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
PR #440 tests on AMD laptop 5700U CPU (8-core)
2+
3+
4+
We pick tests in 1D at very poor tol (so that the spreading cost is negligible)
5+
6+
MASTER branch 79de0847 : ........................................
7+
8+
(base) alex@ross /home/alex/numerics/finufft> OMP_NUM_THREADS=1 perftest/spreadtestnd 1 1e7 1e6 1e-1 1 0 1
9+
setup_spreader (kerevalmeth=1) eps=0.1 sigma=2: chose ns=2 beta=4.4
10+
sorted (1 threads): 0.000317 s
11+
spread 1D (M=1; N1=1000000,N2=1,N3=1; pir=0), nthr=1
12+
zero output array 0.00144 s
13+
using low-density speed rescue nb=M...
14+
t1 fancy spread: 2.3e-05 s (1 subprobs)
15+
making random data...
16+
spreadinterp 1D, 1e+06 U pts, dir=1, tol=0.1: nspread=2
17+
sorted (1 threads): 0.136 s
18+
spread 1D (M=10000000; N1=1000000,N2=1,N3=1; pir=0), nthr=1
19+
zero output array 0.00144 s
20+
capping subproblem sizes to max of 10000
21+
t1 fancy spread: 0.237 s (1000 subprobs)
22+
1e+07 NU pts in 0.382 s 2.62e+07 pts/s 5.24e+07 spread pts/s
23+
rel err in total over grid: 0.04
24+
making more random NU pts...
25+
spreadinterp 1D, 1e+06 U pts, dir=2, tol=0.1: nspread=2
26+
sorted (1 threads): 0.133 s
27+
interp 1D (M=10000000; N1=1000000,N2=1,N3=1; pir=0), nthr=1
28+
t2 spreading loop: 0.339 s
29+
1e+07 NU pts in 0.478 s 2.09e+07 pts/s 4.18e+07 spread pts/s
30+
max rel err in values at NU pts: 0.0954
31+
32+
[note for single-thread t2: sorting helps, but default opt=2 doesn't choose it]
33+
34+
(base) alex@ross /home/alex/numerics/finufft> OMP_NUM_THREADS=8 perftest/spreadtestnd 1 1e8 1e6 1e-1 2 0 1
35+
setup_spreader (kerevalmeth=1) eps=0.1 sigma=2: chose ns=2 beta=4.4
36+
sorted (1 threads): 0.000287 s
37+
spread 1D (M=1; N1=1000000,N2=1,N3=1; pir=0), nthr=8
38+
zero output array 0.00139 s
39+
using low-density speed rescue nb=M...
40+
t1 fancy spread: 0.000771 s (1 subprobs)
41+
making random data...
42+
spreadinterp 1D, 1e+06 U pts, dir=1, tol=0.1: nspread=2
43+
sorted (8 threads): 0.631 s
44+
spread 1D (M=100000000; N1=1000000,N2=1,N3=1; pir=0), nthr=8
45+
zero output array 0.00154 s
46+
capping subproblem sizes to max of 10000
47+
t1 fancy spread: 1.04 s (10000 subprobs)
48+
1e+08 NU pts in 1.77 s 5.66e+07 pts/s 1.13e+08 spread pts/s
49+
rel err in total over grid: 0.0303
50+
making more random NU pts...
51+
spreadinterp 1D, 1e+06 U pts, dir=2, tol=0.1: nspread=2
52+
not sorted (sort=2): 0.0647 s
53+
interp 1D (M=100000000; N1=1000000,N2=1,N3=1; pir=0), nthr=8
54+
t2 spreading loop: 0.769 s
55+
1e+08 NU pts in 0.905 s 1.1e+08 pts/s 2.21e+08 spread pts/s
56+
max rel err in values at NU pts: 0.0954
57+
58+
[note for multi-thread t2: sorting doesn't help and default opt=2 doesn't choose it... good]
59+
60+
fold PR #440 ..........................................
61+
62+
(base) alex@ross /home/alex/numerics/finufft> OMP_NUM_THREADS=1 perftest/spreadtestnd 1 1e7 1e6 1e-1 1 0 1
63+
setup_spreader (kerevalmeth=1) eps=0.1 sigma=2: chose ns=2 beta=4.4
64+
sorted (1 threads): 0.000316 s
65+
spread 1D (M=1; N1=1000000,N2=1,N3=1), nthr=1
66+
zero output array 0.00142 s
67+
using low-density speed rescue nb=M...
68+
t1 fancy spread: 3.4e-05 s (1 subprobs)
69+
making random data...
70+
spreadinterp 1D, 1e+06 U pts, dir=1, tol=0.1: nspread=2
71+
sorted (1 threads): 0.136 s
72+
spread 1D (M=10000000; N1=1000000,N2=1,N3=1), nthr=1
73+
zero output array 0.00145 s
74+
capping subproblem sizes to max of 10000
75+
t1 fancy spread: 0.223 s (1000 subprobs)
76+
1e+07 NU pts in 0.367 s 2.72e+07 pts/s 5.44e+07 spread pts/s
77+
rel err in total over grid: 0.0475
78+
making more random NU pts...
79+
spreadinterp 1D, 1e+06 U pts, dir=2, tol=0.1: nspread=2
80+
sorted (1 threads): 0.134 s
81+
interp 1D (M=10000000; N1=1000000,N2=1,N3=1), nthr=1
82+
t2 spreading loop: 0.308 s
83+
1e+07 NU pts in 0.448 s 2.23e+07 pts/s 4.46e+07 spread pts/s
84+
max rel err in values at NU pts: 0.0954
85+
86+
(base) alex@ross /home/alex/numerics/finufft> OMP_NUM_THREADS=8 perftest/spreadtestnd 1 1e8 1e6 1e-1 2 0 1
87+
setup_spreader (kerevalmeth=1) eps=0.1 sigma=2: chose ns=2 beta=4.4
88+
sorted (1 threads): 0.00028 s
89+
spread 1D (M=1; N1=1000000,N2=1,N3=1), nthr=8
90+
zero output array 0.00137 s
91+
using low-density speed rescue nb=M...
92+
t1 fancy spread: 0.000328 s (1 subprobs)
93+
making random data...
94+
spreadinterp 1D, 1e+06 U pts, dir=1, tol=0.1: nspread=2
95+
sorted (8 threads): 0.634 s
96+
spread 1D (M=100000000; N1=1000000,N2=1,N3=1), nthr=8
97+
zero output array 0.00137 s
98+
capping subproblem sizes to max of 10000
99+
t1 fancy spread: 1.04 s (10000 subprobs)
100+
1e+08 NU pts in 1.77 s 5.65e+07 pts/s 1.13e+08 spread pts/s
101+
rel err in total over grid: 0.0477
102+
making more random NU pts...
103+
spreadinterp 1D, 1e+06 U pts, dir=2, tol=0.1: nspread=2
104+
not sorted (sort=2): 0.064 s
105+
interp 1D (M=100000000; N1=1000000,N2=1,N3=1), nthr=8
106+
t2 spreading loop: 0.759 s
107+
1e+08 NU pts in 0.895 s 1.12e+08 pts/s 2.24e+08 spread pts/s
108+
max rel err in values at NU pts: 0.0954
109+
110+
............................
111+
112+
1D Concl: single-thread 7% speedup interp (dir=2) - nothing to do with sorting
113+
5% speedup spread dir=1.
114+
multi-thread no significant change (~1% level).
115+
116+
Also noted: PR #440 compile time for spreadinterp.o is 10x longer than before (~5 sec)
117+
118+
119+
=================================================
120+
3D tests: (poor tol to give foldrescale a chance to shine; 3 coords done each NU pt):
121+
122+
MASTER:
123+
124+
(base) alex@ross /home/alex/numerics/finufft> OMP_NUM_THREADS=1 perftest/spreadtestnd 3 1e7 1e6 1e-1 1 0 1
125+
setup_spreader (kerevalmeth=1) eps=0.1 sigma=2: chose ns=2 beta=4.4
126+
sorted (1 threads): 2.9e-05 s
127+
spread 3D (M=1; N1=100,N2=100,N3=100; pir=0), nthr=1
128+
zero output array 0.00141 s
129+
using low-density speed rescue nb=M...
130+
t1 fancy spread: 2.5e-05 s (1 subprobs)
131+
making random data...
132+
spreadinterp 3D, 1e+06 U pts, dir=1, tol=0.1: nspread=2
133+
sorted (1 threads): 0.137 s
134+
spread 3D (M=10000000; N1=100,N2=100,N3=100; pir=0), nthr=1
135+
zero output array 0.00136 s
136+
capping subproblem sizes to max of 100000
137+
t1 fancy spread: 0.782 s (100 subprobs)
138+
1e+07 NU pts in 0.927 s 1.08e+07 pts/s 8.63e+07 spread pts/s
139+
rel err in total over grid: 0.189
140+
making more random NU pts...
141+
spreadinterp 3D, 1e+06 U pts, dir=2, tol=0.1: nspread=2
142+
sorted (1 threads): 0.134 s
143+
interp 3D (M=10000000; N1=100,N2=100,N3=100; pir=0), nthr=1
144+
t2 spreading loop: 0.752 s
145+
1e+07 NU pts in 0.892 s 1.12e+07 pts/s 8.97e+07 spread pts/s
146+
max rel err in values at NU pts: 0.315
147+
148+
(base) alex@ross /home/alex/numerics/finufft> OMP_NUM_THREADS=8 perftest/spreadtestnd 3 1e8 1e6 1e-1 2 0 1
149+
setup_spreader (kerevalmeth=1) eps=0.1 sigma=2: chose ns=2 beta=4.4
150+
sorted (1 threads): 1.7e-05 s
151+
spread 3D (M=1; N1=100,N2=100,N3=100; pir=0), nthr=8
152+
zero output array 0.00147 s
153+
using low-density speed rescue nb=M...
154+
t1 fancy spread: 0.000397 s (1 subprobs)
155+
making random data...
156+
spreadinterp 3D, 1e+06 U pts, dir=1, tol=0.1: nspread=2
157+
sorted (8 threads): 0.315 s
158+
spread 3D (M=100000000; N1=100,N2=100,N3=100; pir=0), nthr=8
159+
zero output array 0.00138 s
160+
capping subproblem sizes to max of 100000
161+
t1 fancy spread: 1.91 s (1000 subprobs)
162+
1e+08 NU pts in 2.32 s 4.31e+07 pts/s 3.45e+08 spread pts/s
163+
rel err in total over grid: 0.165
164+
making more random NU pts...
165+
spreadinterp 3D, 1e+06 U pts, dir=2, tol=0.1: nspread=2
166+
sorted (8 threads): 0.311 s
167+
interp 3D (M=100000000; N1=100,N2=100,N3=100; pir=0), nthr=8
168+
t2 spreading loop: 2.04 s
169+
1e+08 NU pts in 2.45 s 4.08e+07 pts/s 3.26e+08 spread pts/s
170+
max rel err in values at NU pts: 0.315
171+
172+
173+
PR #440:
174+
175+
(base) alex@ross /home/alex/numerics/finufft> OMP_NUM_THREADS=1 perftest/spreadtestnd 3 1e7 1e6 1e-1 1 0 1
176+
setup_spreader (kerevalmeth=1) eps=0.1 sigma=2: chose ns=2 beta=4.4
177+
sorted (1 threads): 2e-05 s
178+
spread 3D (M=1; N1=100,N2=100,N3=100), nthr=1
179+
zero output array 0.00142 s
180+
using low-density speed rescue nb=M...
181+
t1 fancy spread: 3.3e-05 s (1 subprobs)
182+
making random data...
183+
spreadinterp 3D, 1e+06 U pts, dir=1, tol=0.1: nspread=2
184+
sorted (1 threads): 0.136 s
185+
spread 3D (M=10000000; N1=100,N2=100,N3=100), nthr=1
186+
zero output array 0.00135 s
187+
capping subproblem sizes to max of 100000
188+
t1 fancy spread: 0.794 s (100 subprobs)
189+
1e+07 NU pts in 0.937 s 1.07e+07 pts/s 8.53e+07 spread pts/s
190+
rel err in total over grid: 0.143
191+
making more random NU pts...
192+
spreadinterp 3D, 1e+06 U pts, dir=2, tol=0.1: nspread=2
193+
sorted (1 threads): 0.135 s
194+
interp 3D (M=10000000; N1=100,N2=100,N3=100), nthr=1
195+
t2 spreading loop: 0.687 s
196+
1e+07 NU pts in 0.829 s 1.21e+07 pts/s 9.65e+07 spread pts/s
197+
max rel err in values at NU pts: 0.315
198+
199+
(base) alex@ross /home/alex/numerics/finufft> OMP_NUM_THREADS=8 perftest/spreadtestnd 3 1e8 1e6 1e-1 2 0 1
200+
setup_spreader (kerevalmeth=1) eps=0.1 sigma=2: chose ns=2 beta=4.4
201+
sorted (1 threads): 1.8e-05 s
202+
spread 3D (M=1; N1=100,N2=100,N3=100), nthr=8
203+
zero output array 0.0014 s
204+
using low-density speed rescue nb=M...
205+
t1 fancy spread: 0.000358 s (1 subprobs)
206+
making random data...
207+
spreadinterp 3D, 1e+06 U pts, dir=1, tol=0.1: nspread=2
208+
sorted (8 threads): 0.31 s
209+
spread 3D (M=100000000; N1=100,N2=100,N3=100), nthr=8
210+
zero output array 0.00132 s
211+
capping subproblem sizes to max of 100000
212+
t1 fancy spread: 1.92 s (1000 subprobs)
213+
1e+08 NU pts in 2.33 s 4.29e+07 pts/s 3.43e+08 spread pts/s
214+
rel err in total over grid: 0.167
215+
making more random NU pts...
216+
spreadinterp 3D, 1e+06 U pts, dir=2, tol=0.1: nspread=2
217+
sorted (8 threads): 0.319 s
218+
interp 3D (M=100000000; N1=100,N2=100,N3=100), nthr=8
219+
t2 spreading loop: 2.02 s
220+
1e+08 NU pts in 2.44 s 4.1e+07 pts/s 3.28e+08 spread pts/s
221+
max rel err in values at NU pts: 0.315
222+
223+
concl: single-thread: spread no change; interp is 9% faster
224+
8-thread : spread no change; interp no change.
225+
226+
Overall: only affects single-core perf, and by 9% or less.
227+
228+
(Of course, advantage of no 3pi-restriction is good too)
229+

0 commit comments

Comments
 (0)