flatironinstitute
diff --git a/‎CHANGELOG‎
Lines changed: 18 additions & 6 deletions b/‎CHANGELOG‎
Lines changed: 18 additions & 6 deletions
diff --git a/‎devel/CMakeLists.txt‎
Lines changed: 13 additions & 11 deletions b/‎devel/CMakeLists.txt‎
Lines changed: 13 additions & 11 deletions
diff --git a/‎devel/gen_all_horner_C_code.m‎
Lines changed: 6 additions & 6 deletions b/‎devel/gen_all_horner_C_code.m‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎devel/gen_ker_horner_loop_C_code.m‎
Lines changed: 2 additions & 2 deletions b/‎devel/gen_ker_horner_loop_C_code.m‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎examples/CMakeLists.txt‎
Lines changed: 3 additions & 0 deletions b/‎examples/CMakeLists.txt‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎examples/cuda/CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion b/‎examples/cuda/CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/cufinufft/common.h‎
Lines changed: 33 additions & 0 deletions b/‎include/cufinufft/common.h‎
Lines changed: 33 additions & 0 deletions
diff --git a/‎include/cufinufft/contrib/helper_cuda.h‎
Lines changed: 8 additions & 7 deletions b/‎include/cufinufft/contrib/helper_cuda.h‎
Lines changed: 8 additions & 7 deletions
@@ -1,13 +1,13 @@
 List of features / changes made / release notes, in reverse chronological order.
 If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
 
-V 2.3.0-rc1 (8/2/24)
+V 2.3.0-rc1 (8/6/24)
 
 * Switched C++ standards from C++14 to C++17, allowing various templating
   improvements (Barbone).
-* python build modernized to pyproject.toml (for both CPU and GPU).
-  PR 507 (Anden, Lu, Barbone)
-* switchable FFT: either FFTW or DUCC0 (latter needs no plan stage; also it is
+* Python build modernized to pyproject.toml (for both CPU and GPU).
+  PR 507 (Anden, Lu, Barbone). Compiles from source for the local build.
+* Switchable FFT: either FFTW or DUCC0 (latter needs no plan stage; also it is
   used to exploit sparsity pattern to achieve FFT speedups 1-3x in 2D and 3D).
   PR463, Martin Reinecke. Both CMake and makefile include this DUCC0 option
   (makefile PR511 by Barnett; CMake by Barbone).
@@ -54,8 +54,20 @@ V 2.3.0-rc1 (8/2/24)
   It now auto-selects compiler flags based on those supported on all OSes, and
   has support for Windows (llvm, msvc), Linux (llvm, gcc) and MacOS (llvm, gcc).
 * CMake added nvcc and msvc optimization flags.
-* sphinx local doc build also using CMake.
-* updated install docs, including for DUCC0 FFT.
+* sphinx local doc build also using CMake. (Barbone)
+* updated install docs, including for DUCC0 FFT and new python build.
+* updated install docs (Barnett)
+* Major acceleration effort for the GPU library cufinufft (M Barbone, PR488):
+  - binsize is now a function of the shared memory available where possible.
+  - GM 1D sorts using thrust::sort instead of bin-sort.
+  - uses the new normalized Horner coefficients and adds support for
+    upsampfac=1.25 on GPU, for the first time.
+  - new compile flags for extra-vectorization, flushing single
+    precision denormals to 0 and using fma where possible.
+  - using intrinsics (eg FMA) in foldrescale and other places to increase
+    performance.
+  - using SM90 float2 vector atomicAdd where supported
+  - make default binsize = 0
 
 V 2.2.0 (12/12/23)
 
 
@@ -2,23 +2,25 @@ project(finufft_devel)
 # Set the minimum required version of CMake
 cmake_minimum_required(VERSION 3.5)
 
-
 # include cpm cmake, downloading it
-CPMAddPackage(
-    NAME benchmark
-    GITHUB_REPOSITORY google/benchmark
-    VERSION 1.8.3
-    OPTIONS "BENCHMARK_ENABLE_TESTING OFF"
-
-)
+cpmaddpackage(
+  NAME
+  benchmark
+  GITHUB_REPOSITORY
+  google/benchmark
+  VERSION
+  1.8.3
+  OPTIONS
+  "BENCHMARK_ENABLE_TESTING OFF")
 
-if (benchmark_ADDED)
-    # patch benchmark target
-    set_target_properties(benchmark PROPERTIES CXX_STANDARD 17)
+if(benchmark_ADDED)
+  # patch benchmark target
+  set_target_properties(benchmark PROPERTIES CXX_STANDARD 17)
 endif()
 
 add_executable(foldrescale foldrescale.cpp)
 target_link_libraries(foldrescale finufft benchmark xsimd)
 add_executable(padding padding.cpp)
+target_compile_features(padding PRIVATE cxx_std_17)
 target_link_libraries(padding finufft xsimd)
 target_compile_options(padding PRIVATE -march=native)
@@ -12,12 +12,12 @@
 
 for upsampfac = [2.0, 1.25];   % sigma: either 2 (default) or low (eg 5/4)
   fprintf('upsampfac = %g...\n',upsampfac)
-  
+
   ws = 2:16;
-  opts.wpad = true;    % pad kernel eval to multiple of 4
+  opts.wpad = false;    % pad kernel eval to multiple of 4
 
-  if upsampfac==2, fid = fopen('../src/ker_horner_allw_loop_constexpr.c','w');
-  else, fid = fopen('../src/ker_lowupsampfac_horner_allw_loop_constexpr.c','w');
+  if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc','w');
+  else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc','w');
   end
   fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n'));
   fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n'));
@@ -27,9 +27,9 @@
     fprintf('w=%d\td=%d\tbeta=%.3g\n',w,d,beta);
     str = gen_ker_horner_loop_C_code(w,d,beta,opts);
     if j==1                                % write switch statement
-      fwrite(fid,sprintf('  if constexpr(w==%d) {\n',w));
+      fwrite(fid,sprintf('  if (w==%d) {\n',w));
     else
-      fwrite(fid,sprintf('  } else if constexpr(w==%d) {\n',w));
+      fwrite(fid,sprintf('  } else if (w==%d) {\n',w));
     end
     for i=1:numel(str); fwrite(fid,['    ',str{i}]); end
   end
 
@@ -38,9 +38,9 @@
   width = w;
 end
 for n=1:d+1                 % loop over poly coeff powers
-  s = sprintf('FLT c%d[] = {%.16E',n-1, C(n,1));
+  s = sprintf('constexpr FLT c%d[] = {%.16E',n-1, C(n,1));
   for i=2:width            % loop over segments
-    s = sprintf('%s, %.16E', s, C(n,i));      
+    s = sprintf('%s, %.16E', s, C(n,i));
   end
   str{n} = [s sprintf('};\n')];
 end
 
@@ -11,20 +11,23 @@ set(EXAMPLES_C guru1d1c simple1d1c simple1d1cf)
 
 foreach(EXAMPLE ${EXAMPLES})
   add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
+  target_compile_features(${EXAMPLE} PRIVATE cxx_std_17)
   target_link_libraries(${EXAMPLE} PRIVATE finufft)
   enable_asan(${EXAMPLE})
 endforeach()
 
 foreach(EXAMPLE ${EXAMPLES_C})
   add_executable(${EXAMPLE} ${EXAMPLE}.c)
   target_link_libraries(${EXAMPLE} PRIVATE finufft)
+  target_compile_features(${EXAMPLE} PRIVATE cxx_std_17)
   enable_asan(${EXAMPLE})
 endforeach()
 
 if(FINUFFT_USE_OPENMP)
   foreach(EXAMPLE ${EXAMPLES_OPENMP})
     add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
     target_link_libraries(${EXAMPLE} PRIVATE finufft OpenMP::OpenMP_CXX)
+    target_compile_features(${EXAMPLE} PRIVATE cxx_std_17)
     enable_asan(${EXAMPLE})
   endforeach()
 endif()
@@ -1,4 +1,3 @@
-
 file(GLOB example_src "*.cpp")
 
 foreach(srcfile ${example_src})
@@ -7,4 +6,5 @@ foreach(srcfile ${example_src})
   add_executable(${executable} ${srcfile})
   target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
   target_link_libraries(${executable} cufinufft)
+  target_compile_features(${executable} PRIVATE cxx_std_17)
 endforeach()
@@ -4,6 +4,7 @@
 #include <cufft.h>
 #include <cufinufft/types.h>
 #include <cufinufft_opts.h>
+#include <finufft_errors.h>
 #include <finufft_spread_opts.h>
 
 #include <complex.h>
@@ -32,6 +33,38 @@ template<typename T>
 void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
                                    T *fwkerhalf, finufft_spread_opts opts);
 
+template<typename T>
+std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y,
+                                   int bin_size_z);
+
+template<typename T>
+void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts);
+
+template<typename T, typename V>
+auto cufinufft_set_shared_memory(V *kernel, const int dim,
+                                 const cufinufft_plan_t<T> &d_plan) {
+  /**
+   * Sets the maximum dynamic shared-memory size of `kernel` to the amount
+   * returned by shared_memory_required<T>() for the plan's spreading width
+   * (spopts.nspread) and GPU bin sizes (opts.gpu_binsizex/y/z).
+   *
+   * @param kernel  kernel function pointer handed to cudaFuncSetAttribute
+   * @param dim     dimension forwarded to shared_memory_required<T>()
+   * @param d_plan  plan from which nspread and the three bin sizes are read
+   * @return 0 on success, or FINUFFT_ERR_INSUFFICIENT_SHMEM when the required
+   *         amount exceeds the value queried through
+   *         cudaDevAttrMaxSharedMemoryPerBlockOptin for the current device
+   *         (a message is printed to stderr in that case).
+   *
+   * WARNING: this function does not handle cuda errors. The caller should check them.
+   *
+   * NOTE(review): shared_mem_per_block starts at 0; if cudaDeviceGetAttribute
+   * fails and (presumably) leaves it untouched, any non-zero requirement is
+   * reported as insufficient shared memory rather than as a CUDA failure --
+   * confirm callers inspect the CUDA error state.
+   * NOTE(review): the comparison below mixes the std::size_t requirement with an
+   * int limit, so the int is converted to unsigned before comparing.
+   */
+  int device_id{}, shared_mem_per_block{}; // both value-initialised to 0
+  cudaGetDevice(&device_id); // return code intentionally not checked (see WARNING)
+  const auto shared_mem_required =
+      shared_memory_required<T>(dim, d_plan.spopts.nspread, d_plan.opts.gpu_binsizex,
+                                d_plan.opts.gpu_binsizey, d_plan.opts.gpu_binsizez);
+  cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin,
+                         device_id);
+  if (shared_mem_required > shared_mem_per_block) {
+    // Refuse before touching the kernel attribute: the request cannot be met.
+    fprintf(stderr,
+            "Error: Shared memory required per block is %zu bytes, but the device "
+            "supports only %d bytes.\n",
+            shared_mem_required, shared_mem_per_block);
+    return FINUFFT_ERR_INSUFFICIENT_SHMEM;
+  }
+  cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
+                       shared_mem_required);
+  return 0;
+}
+
 } // namespace common
 } // namespace cufinufft
 #endif
@@ -58,13 +58,14 @@ static inline cudaError_t cudaFreeWrapper(T *devPtr, cudaStream_t stream,
   return pool_supported ? cudaFreeAsync(devPtr, stream) : cudaFree(devPtr);
 }
 
-#define RETURN_IF_CUDA_ERROR                                         \
-  {                                                                  \
-    cudaError_t err = cudaGetLastError();                            \
-    if (err != cudaSuccess) {                                        \
-      printf("[%s] Error: %s\n", __func__, cudaGetErrorString(err)); \
-      return FINUFFT_ERR_CUDA_FAILURE;                               \
-    }                                                                \
+#define RETURN_IF_CUDA_ERROR /* Bail out of the caller on a pending CUDA error. */   \
+  { /* Braces scope `err`, so the macro can be used repeatedly in one function. */   \
+    cudaError_t err = cudaGetLastError(); /* reads and clears the error state */     \
+    if (err != cudaSuccess) {                                                        \
+      printf("[%s] Error: %s in %s at line %d\n", __func__, cudaGetErrorString(err), \
+             __FILE__, __LINE__);                                                    \
+      return FINUFFT_ERR_CUDA_FAILURE; /* exits the enclosing function */            \
+    }                                                                                \
+  }
 
 #define CUDA_FREE_AND_NULL(val, stream, pool_supported)                              \