flatironinstitute
diff --git a/‎.github/workflows/cmake_ci.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/cmake_ci.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/python_wheel.yml‎
Lines changed: 5 additions & 1 deletion b/‎.github/workflows/python_wheel.yml‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎CHANGELOG‎
Lines changed: 3 additions & 0 deletions b/‎CHANGELOG‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 8 additions & 2 deletions b/‎CMakeLists.txt‎
Lines changed: 8 additions & 2 deletions
diff --git a/‎CMakePresets.json‎
Lines changed: 17 additions & 2 deletions b/‎CMakePresets.json‎
Lines changed: 17 additions & 2 deletions
diff --git a/‎devel/CMakeLists.txt‎
Lines changed: 22 additions & 0 deletions b/‎devel/CMakeLists.txt‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎devel/compare_foldrescale_PR440_laptop5700U.txt‎
Lines changed: 229 additions & 0 deletions b/‎devel/compare_foldrescale_PR440_laptop5700U.txt‎
Lines changed: 229 additions & 0 deletions
@@ -42,6 +42,7 @@ jobs:
         if: runner.os == 'macOS'
         run: |
           brew unlink gcc
+        continue-on-error: true
 
       - name: Setup Cpp
         uses: aminya/setup-cpp@v1
 
@@ -44,9 +44,13 @@ jobs:
     steps:
     - uses: actions/checkout@v4
 
-    - name: Install gcc and fftw
+    - name: Unlink gcc
       run: |
         brew unlink gcc
+      continue-on-error: true
+
+    - name: Install gcc@13 and fftw
+      run: |
         brew install gcc@13 fftw
         cp make.inc.macosx_gcc-12 make.inc
         echo "FC=gfortran-13" >> make.inc
 
@@ -3,6 +3,9 @@ If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
 
 * cufinufft now supports modeord(type 1,2 only): 0 CMCL-style increasing mode
   order, 1 FFT-style mode order.
+* New foldrescale, removes [-3pi,3pi) restriction on NU points, and slight
+  speedup at large tols. Deprecates both opts.chkbnds and error code
+  FINUFFT_ERR_SPREAD_PTS_OUT_RANGE.  PR #440 (Marco Barbone + Martin Reinecke)
 * CPU plan stage allows any # threads, warns if > omp_get_max_threads(); or
   if single-threaded fixes nthr=1 and warns opts.nthreads>1 attempt.
   Sort now respects spread_opts.sort_threads not nthreads. Supersedes PR 431.
 
@@ -31,6 +31,7 @@ option(FINUFFT_USE_OPENMP "Whether to use OpenMP for parallelization. If disable
 option(FINUFFT_USE_CUDA "Whether to build CUDA accelerated FINUFFT library (libcufinufft). This is completely independent of the main FINUFFT library" OFF)
 option(FINUFFT_USE_CPU "Whether to build the ordinary FINUFFT library (libfinufft)." ON)
 option(FINUFFT_STATIC_LINKING "Whether to link the static FINUFFT library (libfinufft_static)." ON)
+option(FINUFFT_BUILD_DEVEL "Whether to build development executables" OFF)
 # sphinx tag (don't remove): @cmake_opts_end
 
 if(FINUFFT_USE_CPU)
@@ -45,10 +46,11 @@ if(FINUFFT_USE_CPU)
     endif()
 
     set(CPM_DOWNLOAD_VERSION 0.38.0)
-    include(cmake/setupCPM.cmake)
-
     set(FFTW_VERSION 3.3.10)
+
+    include(cmake/setupCPM.cmake)
     include(cmake/setupFFTW.cmake)
+
 endif()
 
 if (FINUFFT_BUILD_MATLAB)
@@ -246,6 +248,10 @@ if (FINUFFT_BUILD_MATLAB)
     add_subdirectory(matlab)
 endif ()
 
+if (FINUFFT_BUILD_DEVEL AND FINUFFT_USE_CPU) # devel/ uses CPMAddPackage and links finufft
+    add_subdirectory(devel)
+endif ()
+
 include(GNUInstallDirs)
 install(TARGETS ${INSTALL_TARGETS} PUBLIC_HEADER)
 install(FILES ${PROJECT_SOURCE_DIR}/LICENSE
 
@@ -31,7 +31,22 @@
             "generator": "Ninja Multi-Config",
             "cacheVariables": {
                 "FINUFFT_BUILD_TESTS": "ON",
-                "FINUFFT_BUILD_EXAMPLES": "ON"
+                "FINUFFT_BUILD_EXAMPLES": "ON",
+                "FINUFFT_BUILD_DEVEL": "ON"
+            }
+        },
+        {
+            "name": "benchmark",
+            "binaryDir": "build/benchmark",
+            "displayName": "Benchmark",
+            "description": "Benchmark release configuration (ninja)",
+            "generator": "Ninja",
+            "cacheVariables": {
+                "CMAKE_BUILD_TYPE": "RelWithDebInfo",
+                "FINUFFT_BUILD_TESTS": "ON",
+                "FINUFFT_BUILD_EXAMPLES": "ON",
+                "FINUFFT_FFTW_SUFFIX": "",
+                "FINUFFT_USE_OPENMP": "OFF"
             }
         },
         {
@@ -104,7 +119,7 @@
         {
             "name": "dev",
             "configurePreset": "dev",
-            "configuration": "Debug"
+            "configuration": "RelWithDebInfo"
         },
         {
             "name": "ninja-multi",
 
@@ -0,0 +1,22 @@
+# Policies must be set before project(), so cmake_minimum_required() goes first.
+cmake_minimum_required(VERSION 3.5)
+project(finufft_devel)
+include(CheckCXXCompilerFlag)
+
+# Fetch Google Benchmark; CPMAddPackage is presumably provided by the parent's setupCPM.
+CPMAddPackage(
+    NAME benchmark
+    GITHUB_REPOSITORY google/benchmark
+    VERSION 1.8.3
+    OPTIONS "BENCHMARK_ENABLE_TESTING OFF"
+)
+if (benchmark_ADDED)
+    # Patch the benchmark target only when CPM added it to this build tree.
+    set_target_properties(benchmark PROPERTIES CXX_STANDARD 17)
+endif()
+
+add_executable(foldrescale foldrescale.cpp)
+target_link_libraries(foldrescale PRIVATE finufft benchmark::benchmark)
+# -mavx2 is not accepted by every compiler/architecture: probe it, add it only if usable.
+check_cxx_compiler_flag(-mavx2 FINUFFT_DEVEL_HAS_MAVX2)
+target_compile_options(foldrescale PRIVATE "$<$<BOOL:${FINUFFT_DEVEL_HAS_MAVX2}>:-mavx2>")
@@ -0,0 +1,229 @@
+PR #440 tests on AMD laptop 5700U CPU (8-core)
+
+
+We pick tests in 1D with very poor tol (so that spreading cost is negligible)
+
+MASTER branch 79de0847 :  ........................................
+
+(base) alex@ross /home/alex/numerics/finufft> OMP_NUM_THREADS=1 perftest/spreadtestnd 1 1e7 1e6 1e-1 1 0 1
+setup_spreader (kerevalmeth=1) eps=0.1 sigma=2: chose ns=2 beta=4.4
+	sorted (1 threads):	0.000317 s
+	spread 1D (M=1; N1=1000000,N2=1,N3=1; pir=0), nthr=1
+	zero output array	0.00144 s
+	using low-density speed rescue nb=M...
+	t1 fancy spread: 	2.3e-05 s (1 subprobs)
+making random data...
+spreadinterp 1D, 1e+06 U pts, dir=1, tol=0.1: nspread=2
+	sorted (1 threads):	0.136 s
+	spread 1D (M=10000000; N1=1000000,N2=1,N3=1; pir=0), nthr=1
+	zero output array	0.00144 s
+	capping subproblem sizes to max of 10000
+	t1 fancy spread: 	0.237 s (1000 subprobs)
+    1e+07 NU pts in 0.382 s 	2.62e+07 pts/s 	5.24e+07 spread pts/s
+    rel err in total over grid:      0.04
+making more random NU pts...
+spreadinterp 1D, 1e+06 U pts, dir=2, tol=0.1: nspread=2
+	sorted (1 threads):	0.133 s
+	interp 1D (M=10000000; N1=1000000,N2=1,N3=1; pir=0), nthr=1
+	t2 spreading loop: 	0.339 s
+    1e+07 NU pts in 0.478 s 	2.09e+07 pts/s 	4.18e+07 spread pts/s
+    max rel err in values at NU pts: 0.0954
+
+[note for single-thread t2: sorting helps, but default opt=2 doesn't choose it]
+
+(base) alex@ross /home/alex/numerics/finufft> OMP_NUM_THREADS=8 perftest/spreadtestnd 1 1e8 1e6 1e-1 2 0 1
+setup_spreader (kerevalmeth=1) eps=0.1 sigma=2: chose ns=2 beta=4.4
+	sorted (1 threads):	0.000287 s
+	spread 1D (M=1; N1=1000000,N2=1,N3=1; pir=0), nthr=8
+	zero output array	0.00139 s
+	using low-density speed rescue nb=M...
+	t1 fancy spread: 	0.000771 s (1 subprobs)
+making random data...
+spreadinterp 1D, 1e+06 U pts, dir=1, tol=0.1: nspread=2
+	sorted (8 threads):	0.631 s
+	spread 1D (M=100000000; N1=1000000,N2=1,N3=1; pir=0), nthr=8
+	zero output array	0.00154 s
+	capping subproblem sizes to max of 10000
+	t1 fancy spread: 	1.04 s (10000 subprobs)
+    1e+08 NU pts in 1.77 s 	5.66e+07 pts/s 	1.13e+08 spread pts/s
+    rel err in total over grid:      0.0303
+making more random NU pts...
+spreadinterp 1D, 1e+06 U pts, dir=2, tol=0.1: nspread=2
+	not sorted (sort=2): 	0.0647 s
+	interp 1D (M=100000000; N1=1000000,N2=1,N3=1; pir=0), nthr=8
+	t2 spreading loop: 	0.769 s
+    1e+08 NU pts in 0.905 s 	1.1e+08 pts/s 	2.21e+08 spread pts/s
+    max rel err in values at NU pts: 0.0954
+
+[note for multi-thread t2: sorting doesn't help and default opt=2 doesn't choose it... good]
+
+fold PR #440 ..........................................
+
+(base) alex@ross /home/alex/numerics/finufft> OMP_NUM_THREADS=1 perftest/spreadtestnd 1 1e7 1e6 1e-1 1 0 1
+setup_spreader (kerevalmeth=1) eps=0.1 sigma=2: chose ns=2 beta=4.4
+	sorted (1 threads):	0.000316 s
+	spread 1D (M=1; N1=1000000,N2=1,N3=1), nthr=1
+	zero output array	0.00142 s
+	using low-density speed rescue nb=M...
+	t1 fancy spread: 	3.4e-05 s (1 subprobs)
+making random data...
+spreadinterp 1D, 1e+06 U pts, dir=1, tol=0.1: nspread=2
+	sorted (1 threads):	0.136 s
+	spread 1D (M=10000000; N1=1000000,N2=1,N3=1), nthr=1
+	zero output array	0.00145 s
+	capping subproblem sizes to max of 10000
+	t1 fancy spread: 	0.223 s (1000 subprobs)
+    1e+07 NU pts in 0.367 s 	2.72e+07 pts/s 	5.44e+07 spread pts/s
+    rel err in total over grid:      0.0475
+making more random NU pts...
+spreadinterp 1D, 1e+06 U pts, dir=2, tol=0.1: nspread=2
+	sorted (1 threads):	0.134 s
+	interp 1D (M=10000000; N1=1000000,N2=1,N3=1), nthr=1
+	t2 spreading loop: 	0.308 s
+    1e+07 NU pts in 0.448 s 	2.23e+07 pts/s 	4.46e+07 spread pts/s
+    max rel err in values at NU pts: 0.0954
+
+(base) alex@ross /home/alex/numerics/finufft>  OMP_NUM_THREADS=8 perftest/spreadtestnd 1 1e8 1e6 1e-1 2 0 1
+setup_spreader (kerevalmeth=1) eps=0.1 sigma=2: chose ns=2 beta=4.4
+	sorted (1 threads):	0.00028 s
+	spread 1D (M=1; N1=1000000,N2=1,N3=1), nthr=8
+	zero output array	0.00137 s
+	using low-density speed rescue nb=M...
+	t1 fancy spread: 	0.000328 s (1 subprobs)
+making random data...
+spreadinterp 1D, 1e+06 U pts, dir=1, tol=0.1: nspread=2
+	sorted (8 threads):	0.634 s
+	spread 1D (M=100000000; N1=1000000,N2=1,N3=1), nthr=8
+	zero output array	0.00137 s
+	capping subproblem sizes to max of 10000
+	t1 fancy spread: 	1.04 s (10000 subprobs)
+    1e+08 NU pts in 1.77 s 	5.65e+07 pts/s 	1.13e+08 spread pts/s
+    rel err in total over grid:      0.0477
+making more random NU pts...
+spreadinterp 1D, 1e+06 U pts, dir=2, tol=0.1: nspread=2
+	not sorted (sort=2): 	0.064 s
+	interp 1D (M=100000000; N1=1000000,N2=1,N3=1), nthr=8
+	t2 spreading loop: 	0.759 s
+    1e+08 NU pts in 0.895 s 	1.12e+08 pts/s 	2.24e+08 spread pts/s
+    max rel err in values at NU pts: 0.0954
+
+............................
+
+1D Concl: single-thread 7% speedup interp (dir=2) - nothing to do with sorting
+                        5% speedup spread dir=1.
+          multi-thread  no significant change (~1% level).       
+
+Also noted: PR #440 compile time for spreadinterp.o is 10x longer than before (~5 sec)
+
+
+=================================================
+3D tests: (poor tol to give foldrescale a chance to shine; 3 coords done each NU pt):
+
+MASTER:
+
+(base) alex@ross /home/alex/numerics/finufft> OMP_NUM_THREADS=1 perftest/spreadtestnd 3 1e7 1e6 1e-1 1 0 1
+setup_spreader (kerevalmeth=1) eps=0.1 sigma=2: chose ns=2 beta=4.4
+	sorted (1 threads):	2.9e-05 s
+	spread 3D (M=1; N1=100,N2=100,N3=100; pir=0), nthr=1
+	zero output array	0.00141 s
+	using low-density speed rescue nb=M...
+	t1 fancy spread: 	2.5e-05 s (1 subprobs)
+making random data...
+spreadinterp 3D, 1e+06 U pts, dir=1, tol=0.1: nspread=2
+	sorted (1 threads):	0.137 s
+	spread 3D (M=10000000; N1=100,N2=100,N3=100; pir=0), nthr=1
+	zero output array	0.00136 s
+	capping subproblem sizes to max of 100000
+	t1 fancy spread: 	0.782 s (100 subprobs)
+    1e+07 NU pts in 0.927 s 	1.08e+07 pts/s 	8.63e+07 spread pts/s
+    rel err in total over grid:      0.189
+making more random NU pts...
+spreadinterp 3D, 1e+06 U pts, dir=2, tol=0.1: nspread=2
+	sorted (1 threads):	0.134 s
+	interp 3D (M=10000000; N1=100,N2=100,N3=100; pir=0), nthr=1
+	t2 spreading loop: 	0.752 s
+    1e+07 NU pts in 0.892 s 	1.12e+07 pts/s 	8.97e+07 spread pts/s
+    max rel err in values at NU pts: 0.315
+    
+(base) alex@ross /home/alex/numerics/finufft> OMP_NUM_THREADS=8 perftest/spreadtestnd 3 1e8 1e6 1e-1 2 0 1
+setup_spreader (kerevalmeth=1) eps=0.1 sigma=2: chose ns=2 beta=4.4
+	sorted (1 threads):	1.7e-05 s
+	spread 3D (M=1; N1=100,N2=100,N3=100; pir=0), nthr=8
+	zero output array	0.00147 s
+	using low-density speed rescue nb=M...
+	t1 fancy spread: 	0.000397 s (1 subprobs)
+making random data...
+spreadinterp 3D, 1e+06 U pts, dir=1, tol=0.1: nspread=2
+	sorted (8 threads):	0.315 s
+	spread 3D (M=100000000; N1=100,N2=100,N3=100; pir=0), nthr=8
+	zero output array	0.00138 s
+	capping subproblem sizes to max of 100000
+	t1 fancy spread: 	1.91 s (1000 subprobs)
+    1e+08 NU pts in 2.32 s 	4.31e+07 pts/s 	3.45e+08 spread pts/s
+    rel err in total over grid:      0.165
+making more random NU pts...
+spreadinterp 3D, 1e+06 U pts, dir=2, tol=0.1: nspread=2
+	sorted (8 threads):	0.311 s
+	interp 3D (M=100000000; N1=100,N2=100,N3=100; pir=0), nthr=8
+	t2 spreading loop: 	2.04 s
+    1e+08 NU pts in 2.45 s 	4.08e+07 pts/s 	3.26e+08 spread pts/s
+    max rel err in values at NU pts: 0.315
+
+
+PR #440:
+
+(base) alex@ross /home/alex/numerics/finufft> OMP_NUM_THREADS=1 perftest/spreadtestnd 3 1e7 1e6 1e-1 1 0 1
+setup_spreader (kerevalmeth=1) eps=0.1 sigma=2: chose ns=2 beta=4.4
+	sorted (1 threads):	2e-05 s
+	spread 3D (M=1; N1=100,N2=100,N3=100), nthr=1
+	zero output array	0.00142 s
+	using low-density speed rescue nb=M...
+	t1 fancy spread: 	3.3e-05 s (1 subprobs)
+making random data...
+spreadinterp 3D, 1e+06 U pts, dir=1, tol=0.1: nspread=2
+	sorted (1 threads):	0.136 s
+	spread 3D (M=10000000; N1=100,N2=100,N3=100), nthr=1
+	zero output array	0.00135 s
+	capping subproblem sizes to max of 100000
+	t1 fancy spread: 	0.794 s (100 subprobs)
+    1e+07 NU pts in 0.937 s 	1.07e+07 pts/s 	8.53e+07 spread pts/s
+    rel err in total over grid:      0.143
+making more random NU pts...
+spreadinterp 3D, 1e+06 U pts, dir=2, tol=0.1: nspread=2
+	sorted (1 threads):	0.135 s
+	interp 3D (M=10000000; N1=100,N2=100,N3=100), nthr=1
+	t2 spreading loop: 	0.687 s
+    1e+07 NU pts in 0.829 s 	1.21e+07 pts/s 	9.65e+07 spread pts/s
+    max rel err in values at NU pts: 0.315
+    
+(base) alex@ross /home/alex/numerics/finufft> OMP_NUM_THREADS=8 perftest/spreadtestnd 3 1e8 1e6 1e-1 2 0 1
+setup_spreader (kerevalmeth=1) eps=0.1 sigma=2: chose ns=2 beta=4.4
+	sorted (1 threads):	1.8e-05 s
+	spread 3D (M=1; N1=100,N2=100,N3=100), nthr=8
+	zero output array	0.0014 s
+	using low-density speed rescue nb=M...
+	t1 fancy spread: 	0.000358 s (1 subprobs)
+making random data...
+spreadinterp 3D, 1e+06 U pts, dir=1, tol=0.1: nspread=2
+	sorted (8 threads):	0.31 s
+	spread 3D (M=100000000; N1=100,N2=100,N3=100), nthr=8
+	zero output array	0.00132 s
+	capping subproblem sizes to max of 100000
+	t1 fancy spread: 	1.92 s (1000 subprobs)
+    1e+08 NU pts in 2.33 s 	4.29e+07 pts/s 	3.43e+08 spread pts/s
+    rel err in total over grid:      0.167
+making more random NU pts...
+spreadinterp 3D, 1e+06 U pts, dir=2, tol=0.1: nspread=2
+	sorted (8 threads):	0.319 s
+	interp 3D (M=100000000; N1=100,N2=100,N3=100), nthr=8
+	t2 spreading loop: 	2.02 s
+    1e+08 NU pts in 2.44 s 	4.1e+07 pts/s 	3.28e+08 spread pts/s
+    max rel err in values at NU pts: 0.315
+
+concl: single-thread: spread no change; interp is 9% faster
+       8-thread :    spread no change; interp no change.
+
+Overall: only affects single-core perf, and by 9% or less.
+
+(Of course, advantage of no 3pi-restriction is good too)
+