flatironinstitute
diff --git a/‎CHANGELOG‎
Lines changed: 9 additions & 8 deletions b/‎CHANGELOG‎
Lines changed: 9 additions & 8 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 19 additions & 1 deletion b/‎CMakeLists.txt‎
Lines changed: 19 additions & 1 deletion
diff --git a/‎include/finufft/defs.h‎
Lines changed: 7 additions & 2 deletions b/‎include/finufft/defs.h‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎include/finufft/spreadinterp.h‎
Lines changed: 12 additions & 12 deletions b/‎include/finufft/spreadinterp.h‎
Lines changed: 12 additions & 12 deletions
diff --git a/‎makefile‎
Lines changed: 5 additions & 1 deletion b/‎makefile‎
Lines changed: 5 additions & 1 deletion
@@ -7,6 +7,7 @@ V 2.3.0beta (6/21/24)
   kernel evaluation, templating by ns with AVX-width-dependent decisions.
   Up to 80% faster, dep on compiler. (Marco Barbone with help from Libin Lu).
     NOTE: introduces new dependency (XSIMD), added to cMake and makefile.
+* new test/finufft3dkernel_test checks kerevalmeth=0,1 agree to tol (M Barbone).
 * new perftest/compare_spreads.jl compares two spreadinterp libs (A Barnett).
 * new benchmarker perftest/spreadtestndall sweeps all kernel widths (M Barbone).
 * cufinufft now supports modeord(type 1,2 only): 0 CMCL-style increasing mode
@@ -26,15 +27,15 @@ V 2.3.0beta (6/21/24)
 * improved GPU python docs: migration guide; usage from cupy, numba, torch,
   pycuda. PyPI pkg still at 2.2.0beta.
 * Added a clang-format pre-commit hook to ensure consistent code style.
-  Created a .clang-format file to define the style similar to the existing style.
+  Created a .clang-format file to define a style similar to the existing style.
   Applied clang-format to all cmake, C, C++, and CUDA code. Ignored the blame
   using .git-blame-ignore-revs. Added a contributing.md for developers.
-* cuFINUFFT interface update: number of nonuniform points M is now a 64-bit integer
-as opposed to 32-bit. While this does modify the ABI, most code will just need to
-recompile against the new library as compilers will silently upcast any 32-bit
-integers to 64-bit when calling cufinufft(f)_setpts. Note that internally, 32-bit
-integers are still used, so calling cufinufft with more than 2e9 points will fail.
-This restriction may be lifted in the future.
+* cuFINUFFT interface update: number of nonuniform points M is now a 64-bit int
+  as opposed to 32-bit. While this does modify the ABI, most code will just
+  need to recompile against the new library as compilers will silently upcast
+  any 32-bit integers to 64-bit when calling cufinufft(f)_setpts. Note that
+  internally, 32-bit integers are still used, so calling cufinufft with more
+  than 2e9 points will fail. This restriction may be lifted in the future.
 
 V 2.2.0 (12/12/23)
 
@@ -52,7 +53,7 @@ V 2.2.0 (12/12/23)
 * CMake build structure (thanks: Wenda Zhou, Marco Barbone, Libin Lu)
   - Note: the plan is to continue to support GNU makefile and make.inc.* but
     to transition to CMake as the main build system.
-  - CI workflow using CMake on 3 OSes, 2 compilers each, PR #382 (Libin Lu)	
+  - CI workflow using CMake on 3 OSes, 2 compilers each, PR #382 (Libin Lu)
 * Docs: new tutorial content on iterative inverse NUFFTs; troubleshooting.
 * GitHub-facing badges
 * include/finufft/finufft_eitherprec.h moved up directory to be public (bea316c)
 
@@ -4,10 +4,27 @@ project(finufft VERSION 2.2.0 LANGUAGES C CXX)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
+include(CheckCXXCompilerFlag)
+
 set(GNU_LIKE_FRONTENDS AppleClang Clang GNU)
 if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS)
     # Set custom compiler flags for gcc-compatible compilers
-    set(FINUFFT_CXX_FLAGS_RELEASE -O3 -funroll-loops -ffp-contract=fast)
+    set(FINUFFT_CXX_FLAGS_RELEASE -O3
+            -funroll-loops
+            -ffp-contract=fast
+            -fno-math-errno
+            -fno-signed-zeros
+            -fno-trapping-math
+            -fassociative-math
+            -freciprocal-math
+            -fmerge-all-constants
+            -ftree-vectorize
+    )
+    # if -fimplicit-constexpr is supported, add it to the list of flags
+    check_cxx_compiler_flag("-fimplicit-constexpr" COMPILER_SUPPORTS_FIMPLICIT_CONSTEXPR)
+    if (COMPILER_SUPPORTS_FIMPLICIT_CONSTEXPR)
+        list(APPEND FINUFFT_CXX_FLAGS_RELEASE -fimplicit-constexpr)
+    endif ()
     set(FINUFFT_CXX_FLAGS_RELWITHDEBINFO -g ${FINUFFT_CXX_FLAGS_RELEASE})
 endif ()
 
@@ -100,6 +117,7 @@ function(enable_asan target)
         target_compile_options(${target} PRIVATE $<$<CONFIG:DEBUG>:-fsanitize=address -fsanitize=undefined -fsanitize=bounds-strict>)
         target_link_options(${target} PRIVATE $<$<CONFIG:DEBUG>:-fsanitize=address -fsanitize=undefined -fsanitize=bounds-strict>)
     endif ()
+    target_compile_options(${target} PRIVATE "$<$<CONFIG:DEBUG>:-Wall;-Wno-sign-compare>")
 endfunction()
 
 # Utility function to link static/dynamic lib
 
@@ -40,21 +40,26 @@
 // inline macro, to force inlining of small functions
 // this avoids the use of macros to implement functions
 #if defined(_MSC_VER)
-#define FINUFFT_ALWAYS_INLINE __forceinline
+#define FINUFFT_ALWAYS_INLINE __forceinline inline
 #define FINUFFT_NEVER_INLINE  __declspec(noinline)
 #define FINUFFT_RESTRICT      __restrict
 #define FINUFFT_UNREACHABLE   __assume(0)
-
+#define FINUFFT_UNLIKELY(x)   (x)
+#define FINUFFT_LIKELY(x)     (x)
 #elif defined(__GNUC__) || defined(__clang__)
 #define FINUFFT_ALWAYS_INLINE __attribute__((always_inline)) inline
 #define FINUFFT_NEVER_INLINE  __attribute__((noinline))
 #define FINUFFT_RESTRICT      __restrict__
 #define FINUFFT_UNREACHABLE   __builtin_unreachable()
+#define FINUFFT_UNLIKELY(x)   __builtin_expect(!!(x), 0)
+#define FINUFFT_LIKELY(x)     __builtin_expect(!!(x), 1)
 #else
 #define FINUFFT_ALWAYS_INLINE inline
 #define FINUFFT_NEVER_INLINE
 #define FINUFFT_RESTRICT
 #define FINUFFT_UNREACHABLE
+#define FINUFFT_UNLIKELY(x) (x)
+#define FINUFFT_LIKELY(x)   (x)
 #endif
 
 // ------------- Library-wide algorithm parameter settings ----------------
 
@@ -30,26 +30,26 @@ namespace spreadinterp {
 
 // things external (spreadinterp) interface needs...
 FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp(
-    BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, FLT *kx, FLT *ky,
+    UBIGINT N1, UBIGINT N2, UBIGINT N3, FLT *data_uniform, UBIGINT N, FLT *kx, FLT *ky,
     FLT *kz, FLT *data_nonuniform, const finufft_spread_opts &opts);
-FINUFFT_EXPORT int FINUFFT_CDECL spreadcheck(BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M,
-                                             FLT *kx, FLT *ky, FLT *kz,
+FINUFFT_EXPORT int FINUFFT_CDECL spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3,
+                                             UBIGINT N, FLT *kx, FLT *ky, FLT *kz,
                                              const finufft_spread_opts &opts);
-FINUFFT_EXPORT int FINUFFT_CDECL indexSort(BIGINT *sort_indices, BIGINT N1, BIGINT N2,
-                                           BIGINT N3, BIGINT M, FLT *kx, FLT *ky, FLT *kz,
-                                           const finufft_spread_opts &opts);
+FINUFFT_EXPORT int FINUFFT_CDECL indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2,
+                                           UBIGINT N3, UBIGINT N, FLT *kx, FLT *ky,
+                                           FLT *kz, const finufft_spread_opts &opts);
 FINUFFT_EXPORT int FINUFFT_CDECL interpSorted(
-    const BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3,
-    FLT *FINUFFT_RESTRICT data_uniform, BIGINT M, FLT *FINUFFT_RESTRICT kx,
+    const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3,
+    FLT *FINUFFT_RESTRICT data_uniform, UBIGINT N, FLT *FINUFFT_RESTRICT kx,
     FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz,
     FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts);
 FINUFFT_EXPORT int FINUFFT_CDECL spreadSorted(
-    const BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform,
-    BIGINT M, FLT *kx, FLT *ky, FLT *kz, const FLT *data_nonuniform,
+    const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, FLT *data_uniform,
+    UBIGINT N, FLT *kx, FLT *ky, FLT *kz, const FLT *data_nonuniform,
     const finufft_spread_opts &opts, int did_sort);
 FINUFFT_EXPORT int FINUFFT_CDECL spreadinterpSorted(
-    const BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3,
-    FLT *FINUFFT_RESTRICT data_uniform, BIGINT M, FLT *FINUFFT_RESTRICT kx,
+    const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3,
+    FLT *FINUFFT_RESTRICT data_uniform, UBIGINT N, FLT *FINUFFT_RESTRICT kx,
     FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz,
     FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts, int did_sort);
 FINUFFT_EXPORT FLT FINUFFT_CDECL evaluate_kernel(FLT x, const finufft_spread_opts &opts);
 
@@ -27,7 +27,11 @@ PYTHON = python3
 # Notes: 1) -Ofast breaks isfinite() & isnan(), so use -O3 which now is as fast
 #        2) -fcx-limited-range for fortran-speed complex arith in C++
 #        3) we use simply-expanded (:=) makefile variables, otherwise confusing
-CFLAGS := -O3 -funroll-loops -march=native -fcx-limited-range -ffp-contract=fast $(CFLAGS)
+#        4) the extra math flags are for speed; they do not impact accuracy,
+#           and they allow gcc to vectorize the code more effectively
+CFLAGS := -O3 -funroll-loops -march=native -fcx-limited-range -ffp-contract=fast\
+		  -fno-math-errno -fno-signed-zeros -fno-trapping-math -fassociative-math\
+		  -freciprocal-math -fmerge-all-constants -ftree-vectorize $(CFLAGS)
 FFLAGS := $(CFLAGS) $(FFLAGS)
 CXXFLAGS := $(CFLAGS) $(CXXFLAGS)
 # FFTW base name, and math linking...