
Commit 92b4def

Merge branch 'flatironinstitute:master' into interp-vectorization
2 parents 0454f4e + 5dde122 commit 92b4def

12 files changed, +206 -136 lines

CHANGELOG

Lines changed: 6 additions & 0 deletions
@@ -29,6 +29,12 @@ V 2.3.0beta (6/21/24)
 Created a .clang-format file to define the style similar to the existing style.
 Applied clang-format to all cmake, C, C++, and CUDA code. Ignored the blame
 using .git-blame-ignore-revs. Added a contributing.md for developers.
+* cuFINUFFT interface update: number of nonuniform points M is now a 64-bit integer
+  as opposed to 32-bit. While this does modify the ABI, most code will just need to
+  recompile against the new library as compilers will silently upcast any 32-bit
+  integers to 64-bit when calling cufinufft(f)_setpts. Note that internally, 32-bit
+  integers are still used, so calling cufinufft with more than 2e9 points will fail.
+  This restriction may be lifted in the future.
 
 V 2.2.0 (12/12/23)
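As a rough illustration of the new calling convention (not part of this commit), here is a minimal double-precision 1D type-1 sketch against the updated headers; the point count, mode count, tolerance, and flags are arbitrary, and error handling is reduced to early returns:

```cpp
#include <cufinufft.h>
#include <cuComplex.h>
#include <cuda_runtime.h>
#include <vector>

int main() {
  const double PI = 3.141592653589793;
  const int64_t M = 100000;                // number of nonuniform points, now a 64-bit count
  const int64_t n_modes[3] = {256, 1, 1};  // Fourier modes per dimension

  // Host data: points in [-pi, pi) with unit strengths.
  std::vector<double> x(M);
  std::vector<cuDoubleComplex> c(M, make_cuDoubleComplex(1.0, 0.0));
  for (int64_t i = 0; i < M; ++i) x[i] = -PI + 2.0 * PI * double(i) / double(M);

  double *d_x;
  cuDoubleComplex *d_c, *d_fk;
  cudaMalloc(&d_x, M * sizeof(double));
  cudaMalloc(&d_c, M * sizeof(cuDoubleComplex));
  cudaMalloc(&d_fk, n_modes[0] * sizeof(cuDoubleComplex));
  cudaMemcpy(d_x, x.data(), M * sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(d_c, c.data(), M * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);

  cufinufft_plan plan;
  if (cufinufft_makeplan(1, 1, n_modes, +1, 1, 1e-9, &plan, nullptr)) return 1;
  // M is declared int64_t now; a 32-bit int here would be widened implicitly.
  if (cufinufft_setpts(plan, M, d_x, nullptr, nullptr, 0, nullptr, nullptr, nullptr))
    return 1;
  if (cufinufft_execute(plan, d_c, d_fk)) return 1;  // type 1: nonuniform points -> modes
  cufinufft_destroy(plan);

  cudaFree(d_x);
  cudaFree(d_c);
  cudaFree(d_fk);
  return 0;
}
```

An existing caller that still passes a 32-bit `int` for M should recompile unchanged, since the argument is implicitly widened to `int64_t`; counts above roughly 2e9 still fail because the internals remain 32-bit, as the changelog entry notes.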

Jenkinsfile

Lines changed: 5 additions & 5 deletions
@@ -46,11 +46,11 @@ pipeline {
                 sh '${PYBIN}/python3 -m venv $HOME'
                 sh '''#!/bin/bash -ex
                     source $HOME/bin/activate
-                    python3 -m pip install --upgrade pip
-                    python3 -m pip install --upgrade pycuda cupy-cuda112 numba
-                    python3 -m pip install torch==1.10.2+cu111 -f https://download.pytorch.org/whl/torch_stable.html
-                    python3 -m pip install python/cufinufft
-                    python3 -m pip install pytest
+                    python3 -m pip install --no-cache-dir --upgrade pip
+                    python3 -m pip install --no-cache-dir --upgrade pycuda cupy-cuda112 numba
+                    python3 -m pip install --no-cache-dir torch==1.10.2+cu111 -f https://download.pytorch.org/whl/torch_stable.html
+                    python3 -m pip install --no-cache-dir python/cufinufft
+                    python3 -m pip install --no-cache-dir pytest
                     python -c "from numba import cuda; cuda.cudadrv.libs.test()"
                     python3 -m pytest --framework=pycuda python/cufinufft
                     python3 -m pytest --framework=numba python/cufinufft

include/cufinufft.h

Lines changed: 4 additions & 4 deletions
@@ -19,10 +19,10 @@ int cufinufft_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int
 int cufinufftf_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int ntr,
                         float eps, cufinufftf_plan *d_plan_ptr, cufinufft_opts *opts);
 
-int cufinufft_setpts(cufinufft_plan d_plan, int M, double *d_x, double *d_y, double *d_z,
-                     int N, double *d_s, double *d_t, double *d_u);
-int cufinufftf_setpts(cufinufftf_plan d_plan, int M, float *d_x, float *d_y, float *d_z,
-                      int N, float *d_s, float *d_t, float *d_u);
+int cufinufft_setpts(cufinufft_plan d_plan, int64_t M, double *d_x, double *d_y,
+                     double *d_z, int N, double *d_s, double *d_t, double *d_u);
+int cufinufftf_setpts(cufinufftf_plan d_plan, int64_t M, float *d_x, float *d_y,
+                      float *d_z, int N, float *d_s, float *d_t, float *d_u);
 
 int cufinufft_execute(cufinufft_plan d_plan, cuDoubleComplex *d_c, cuDoubleComplex *d_fk);
 int cufinufftf_execute(cufinufftf_plan d_plan, cuFloatComplex *d_c, cuFloatComplex *d_fk);

include/cufinufft/contrib/helper_cuda.h

Lines changed: 19 additions & 6 deletions
@@ -45,6 +45,19 @@ static const char *_cudaGetErrorEnum(cudaError_t error) {
 // that a CUDA host call returns an error
 #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
 
+template<typename T>
+static inline cudaError_t cudaMallocWrapper(T **devPtr, size_t size, cudaStream_t stream,
+                                            int pool_supported) {
+  return pool_supported ? cudaMallocAsync(devPtr, size, stream)
+                        : cudaMalloc(devPtr, size);
+}
+
+template<typename T>
+static inline cudaError_t cudaFreeWrapper(T *devPtr, cudaStream_t stream,
+                                          int pool_supported) {
+  return pool_supported ? cudaFreeAsync(devPtr, stream) : cudaFree(devPtr);
+}
+
 #define RETURN_IF_CUDA_ERROR                                                    \
   {                                                                             \
     cudaError_t err = cudaGetLastError();                                       \
@@ -54,12 +67,12 @@ static const char *_cudaGetErrorEnum(cudaError_t error) {
     }                                                                           \
   }
 
-#define CUDA_FREE_AND_NULL(val, stream)                                         \
-  {                                                                             \
-    if (val != nullptr) {                                                       \
-      check(cudaFreeAsync(val, stream), #val, __FILE__, __LINE__);              \
-      val = nullptr;                                                            \
-    }                                                                           \
+#define CUDA_FREE_AND_NULL(val, stream, pool_supported)                         \
+  {                                                                             \
+    if (val != nullptr) {                                                       \
+      check(cudaFreeWrapper(val, stream, pool_supported), #val, __FILE__, __LINE__); \
+      val = nullptr;                                                            \
+    }                                                                           \
   }
 
 static const char *cufftGetErrorString(cufftResult error) {
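To make the intent of the new helpers concrete, a hedged usage sketch (not from the commit) follows. The `pool_supported` flag is typically the plan's new `supports_pools` field; `demo_alloc` is a made-up name, and the include path for the internal header is an assumption:

```cpp
#include <cufinufft/contrib/helper_cuda.h>  // assumed include path for the wrappers above
#include <cuda_runtime.h>

// Hypothetical helper: allocate, use, and release a scratch buffer through the wrappers.
static int demo_alloc(cudaStream_t stream, int pool_supported) {
  float *d_buf = nullptr;
  // Uses cudaMallocAsync on devices with memory-pool support, plain cudaMalloc otherwise.
  if (checkCudaErrors(
          cudaMallocWrapper(&d_buf, 1024 * sizeof(float), stream, pool_supported)))
    return 1;
  // ... launch kernels that use d_buf on `stream` ...
  CUDA_FREE_AND_NULL(d_buf, stream, pool_supported);  // note the new third argument
  return 0;
}
```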

include/cufinufft/impl.h

Lines changed: 18 additions & 5 deletions
@@ -125,6 +125,18 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
     d_plan->opts = *opts; // keep a deep copy; changing *opts now has no effect
   }
 
+  // cudaMallocAsync isn't supported for all devices, regardless of cuda version. Check
+  // for support
+  cudaDeviceGetAttribute(&d_plan->supports_pools, cudaDevAttrMemoryPoolsSupported,
+                         device_id);
+  static bool warned = false;
+  if (!warned && !d_plan->supports_pools && d_plan->opts.gpu_stream != nullptr) {
+    fprintf(stderr,
+            "[cufinufft] Warning: cudaMallocAsync not supported on this device. Use of "
+            "CUDA streams may not perform optimally.\n");
+    warned = true;
+  }
+
   auto &stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream;
 
   /* Automatically set GPU method. */
@@ -246,10 +258,11 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
                                 d_plan->spopts);
 
     if ((ier = checkCudaErrors(
-             cudaMallocAsync(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), stream))))
+             cudaMallocWrapper(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), stream,
+                               d_plan->supports_pools))))
       goto finalize;
-    if ((ier =
-             checkCudaErrors(cudaMallocAsync(&d_f, dim * MAX_NQUAD * sizeof(T), stream))))
+    if ((ier = checkCudaErrors(cudaMallocWrapper(&d_f, dim * MAX_NQUAD * sizeof(T),
+                                                 stream, d_plan->supports_pools))))
       goto finalize;
     if ((ier = checkCudaErrors(
              cudaMemcpyAsync(d_a, a, dim * MAX_NQUAD * sizeof(cuDoubleComplex),
@@ -265,8 +278,8 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
   }
 
 finalize:
-  cudaFreeAsync(d_a, stream);
-  cudaFreeAsync(d_f, stream);
+  cudaFreeWrapper(d_a, stream, d_plan->supports_pools);
+  cudaFreeWrapper(d_f, stream, d_plan->supports_pools);
 
   if (ier > 1) {
     delete *d_plan_ptr;
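For reference, a standalone sketch (not part of the commit) of the same device query the plan now performs at makeplan time; it only reports whether stream-ordered allocation via cudaMallocAsync is available on the current device:

```cpp
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  int device_id = 0;
  cudaGetDevice(&device_id);

  // Same attribute cufinufft_makeplan_impl now stores in d_plan->supports_pools.
  int supports_pools = 0;
  cudaDeviceGetAttribute(&supports_pools, cudaDevAttrMemoryPoolsSupported, device_id);

  std::printf("device %d: cudaMallocAsync %s supported\n", device_id,
              supports_pools ? "is" : "is not");
  return 0;
}
```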

include/cufinufft/types.h

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@ template<typename T> struct cufinufft_plan_t {
   int ntransf;
   int maxbatchsize;
   int iflag;
+  int supports_pools;
 
   int totalnumsubprob;
   T *fwkerhalf1;

python/cufinufft/cufinufft/_cufinufft.py

Lines changed: 2 additions & 2 deletions
@@ -102,13 +102,13 @@ class NufftOpts(ctypes.Structure):
 
 _set_pts = lib.cufinufft_setpts
 _set_pts.argtypes = [
-    c_void_p, c_int, c_void_p, c_void_p, c_void_p, ctypes.c_int, c_double_p,
+    c_void_p, c_int64, c_void_p, c_void_p, c_void_p, ctypes.c_int, c_double_p,
     c_double_p, c_double_p]
 _set_pts.restype = c_int
 
 _set_ptsf = lib.cufinufftf_setpts
 _set_ptsf.argtypes = [
-    c_void_p, c_int, c_void_p, c_void_p, c_void_p, ctypes.c_int, c_float_p,
+    c_void_p, c_int64, c_void_p, c_void_p, c_void_p, ctypes.c_int, c_float_p,
     c_float_p, c_float_p]
 _set_ptsf.restype = c_int

src/cuda/1d/spread1d_wrapper.cu

Lines changed: 3 additions & 2 deletions
@@ -203,7 +203,8 @@ int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan)
     return ier;
   cudaStreamSynchronize(stream);
   if ((ier = checkCudaErrors(
-          cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream))))
+          cudaMallocWrapper(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream,
+                            d_plan->supports_pools))))
     return ier;
   map_b_into_subprob_1d<<<(numbins + 1024 - 1) / 1024, 1024, 0, stream>>>(
       d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins);
@@ -215,7 +216,7 @@
   }
 
   assert(d_subprob_to_bin != NULL);
-  cudaFreeAsync(d_plan->subprob_to_bin, stream);
+  cudaFreeWrapper(d_plan->subprob_to_bin, stream, d_plan->supports_pools);
   d_plan->subprob_to_bin = d_subprob_to_bin;
   d_plan->totalnumsubprob = totalnumsubprob;

src/cuda/2d/spread2d_wrapper.cu

Lines changed: 3 additions & 2 deletions
@@ -220,7 +220,8 @@ int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan
     return ier;
   cudaStreamSynchronize(stream);
   if ((ier = checkCudaErrors(
-          cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream))))
+          cudaMallocWrapper(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream,
+                            d_plan->supports_pools))))
     return ier;
   map_b_into_subprob_2d<<<(numbins[0] * numbins[1] + 1024 - 1) / 1024, 1024, 0, stream>>>(
       d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins[0] * numbins[1]);
@@ -232,7 +233,7 @@ int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan
   }
 
   assert(d_subprob_to_bin != NULL);
-  cudaFreeAsync(d_plan->subprob_to_bin, stream);
+  cudaFreeWrapper(d_plan->subprob_to_bin, stream, d_plan->supports_pools);
   d_plan->subprob_to_bin = d_subprob_to_bin;
   d_plan->totalnumsubprob = totalnumsubprob;

src/cuda/3d/spread3d_wrapper.cu

Lines changed: 6 additions & 5 deletions
@@ -260,8 +260,8 @@ int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M,
                                  cudaMemcpyDeviceToHost, stream))))
     return ier;
   cudaStreamSynchronize(stream);
-  if ((ier = checkCudaErrors(
-          cudaMallocAsync(&d_idxnupts, totalNUpts * sizeof(int), stream))))
+  if ((ier = checkCudaErrors(cudaMallocWrapper(&d_idxnupts, totalNUpts * sizeof(int),
+                                               stream, d_plan->supports_pools))))
     return ier;
 
   calc_inverse_of_global_sort_index_ghost<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
@@ -320,7 +320,8 @@ int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M,
     return ier;
   cudaStreamSynchronize(stream);
   if ((ier = checkCudaErrors(
-          cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream))))
+          cudaMallocWrapper(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream,
+                            d_plan->supports_pools))))
     return ier;
   map_b_into_subprob_3d_v1<<<(n + 1024 - 1) / 1024, 1024, 0, stream>>>(
       d_subprob_to_bin, d_subprobstartpts, d_numsubprob, n);
@@ -474,8 +475,8 @@ int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M,
                              sizeof(int), cudaMemcpyDeviceToHost, stream)))
     return FINUFFT_ERR_CUDA_FAILURE;
   cudaStreamSynchronize(stream);
-  if (checkCudaErrors(
-          cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream)))
+  if (checkCudaErrors(cudaMallocWrapper(&d_subprob_to_bin, totalnumsubprob * sizeof(int),
+                                        stream, d_plan->supports_pools)))
     return FINUFFT_ERR_CUDA_FAILURE;
 
   map_b_into_subprob_3d_v2<<<(numbins[0] * numbins[1] + 1024 - 1) / 1024, 1024, 0,
