
Commit 92b4def

Merge branch 'flatironinstitute:master' into interp-vectorization
2 parents 0454f4e + 5dde122 commit 92b4def

12 files changed, +206 -136 lines

CHANGELOG

Lines changed: 6 additions & 0 deletions
@@ -29,6 +29,12 @@ V 2.3.0beta (6/21/24)
 Created a .clang-format file to define the style similar to the existing style.
 Applied clang-format to all cmake, C, C++, and CUDA code. Ignored the blame
 using .git-blame-ignore-revs. Added a contributing.md for developers.
+* cuFINUFFT interface update: number of nonuniform points M is now a 64-bit integer
+  as opposed to 32-bit. While this does modify the ABI, most code will just need to
+  recompile against the new library as compilers will silently upcast any 32-bit
+  integers to 64-bit when calling cufinufft(f)_setpts. Note that internally, 32-bit
+  integers are still used, so calling cufinufft with more than 2e9 points will fail.
+  This restriction may be lifted in the future.
 
 V 2.2.0 (12/12/23)
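As a rough illustration of the new calling convention (not part of this commit), here is a minimal double-precision 1D type-1 sketch against the updated headers; the point count, mode count, tolerance, and flags are arbitrary, and error handling is reduced to early returns:

```cpp
#include <cufinufft.h>
#include <cuComplex.h>
#include <cuda_runtime.h>
#include <vector>

int main() {
  const double PI = 3.141592653589793;
  const int64_t M = 100000;                // number of nonuniform points, now a 64-bit count
  const int64_t n_modes[3] = {256, 1, 1};  // Fourier modes per dimension

  // Host data: points in [-pi, pi) with unit strengths.
  std::vector<double> x(M);
  std::vector<cuDoubleComplex> c(M, make_cuDoubleComplex(1.0, 0.0));
  for (int64_t i = 0; i < M; ++i) x[i] = -PI + 2.0 * PI * double(i) / double(M);

  double *d_x;
  cuDoubleComplex *d_c, *d_fk;
  cudaMalloc(&d_x, M * sizeof(double));
  cudaMalloc(&d_c, M * sizeof(cuDoubleComplex));
  cudaMalloc(&d_fk, n_modes[0] * sizeof(cuDoubleComplex));
  cudaMemcpy(d_x, x.data(), M * sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(d_c, c.data(), M * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);

  cufinufft_plan plan;
  if (cufinufft_makeplan(1, 1, n_modes, +1, 1, 1e-9, &plan, nullptr)) return 1;
  // M is declared int64_t now; a 32-bit int here would be widened implicitly.
  if (cufinufft_setpts(plan, M, d_x, nullptr, nullptr, 0, nullptr, nullptr, nullptr))
    return 1;
  if (cufinufft_execute(plan, d_c, d_fk)) return 1;  // type 1: nonuniform points -> modes
  cufinufft_destroy(plan);

  cudaFree(d_x);
  cudaFree(d_c);
  cudaFree(d_fk);
  return 0;
}
```

An existing caller that still passes a 32-bit `int` for M should recompile unchanged, since the argument is implicitly widened to `int64_t`; counts above roughly 2e9 still fail because the internals remain 32-bit, as the changelog entry notes.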

Jenkinsfile

Lines changed: 5 additions & 5 deletions
@@ -46,11 +46,11 @@ pipeline {
                 sh '${PYBIN}/python3 -m venv $HOME'
                 sh '''#!/bin/bash -ex
                     source $HOME/bin/activate
-                    python3 -m pip install --upgrade pip
-                    python3 -m pip install --upgrade pycuda cupy-cuda112 numba
-                    python3 -m pip install torch==1.10.2+cu111 -f https://download.pytorch.org/whl/torch_stable.html
-                    python3 -m pip install python/cufinufft
-                    python3 -m pip install pytest
+                    python3 -m pip install --no-cache-dir --upgrade pip
+                    python3 -m pip install --no-cache-dir --upgrade pycuda cupy-cuda112 numba
+                    python3 -m pip install --no-cache-dir torch==1.10.2+cu111 -f https://download.pytorch.org/whl/torch_stable.html
+                    python3 -m pip install --no-cache-dir python/cufinufft
+                    python3 -m pip install --no-cache-dir pytest
                     python -c "from numba import cuda; cuda.cudadrv.libs.test()"
                     python3 -m pytest --framework=pycuda python/cufinufft
                     python3 -m pytest --framework=numba python/cufinufft

include/cufinufft.h

Lines changed: 4 additions & 4 deletions
@@ -19,10 +19,10 @@ int cufinufft_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int
 int cufinufftf_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int ntr,
                         float eps, cufinufftf_plan *d_plan_ptr, cufinufft_opts *opts);
 
-int cufinufft_setpts(cufinufft_plan d_plan, int M, double *d_x, double *d_y, double *d_z,
-                     int N, double *d_s, double *d_t, double *d_u);
-int cufinufftf_setpts(cufinufftf_plan d_plan, int M, float *d_x, float *d_y, float *d_z,
-                      int N, float *d_s, float *d_t, float *d_u);
+int cufinufft_setpts(cufinufft_plan d_plan, int64_t M, double *d_x, double *d_y,
+                     double *d_z, int N, double *d_s, double *d_t, double *d_u);
+int cufinufftf_setpts(cufinufftf_plan d_plan, int64_t M, float *d_x, float *d_y,
+                      float *d_z, int N, float *d_s, float *d_t, float *d_u);
 
 int cufinufft_execute(cufinufft_plan d_plan, cuDoubleComplex *d_c, cuDoubleComplex *d_fk);
 int cufinufftf_execute(cufinufftf_plan d_plan, cuFloatComplex *d_c, cuFloatComplex *d_fk);

include/cufinufft/contrib/helper_cuda.h

Lines changed: 19 additions & 6 deletions
@@ -45,6 +45,19 @@ static const char *_cudaGetErrorEnum(cudaError_t error) {
 // that a CUDA host call returns an error
 #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
 
+template<typename T>
+static inline cudaError_t cudaMallocWrapper(T **devPtr, size_t size, cudaStream_t stream,
+                                            int pool_supported) {
+  return pool_supported ? cudaMallocAsync(devPtr, size, stream)
+                        : cudaMalloc(devPtr, size);
+}
+
+template<typename T>
+static inline cudaError_t cudaFreeWrapper(T *devPtr, cudaStream_t stream,
+                                          int pool_supported) {
+  return pool_supported ? cudaFreeAsync(devPtr, stream) : cudaFree(devPtr);
+}
+
 #define RETURN_IF_CUDA_ERROR                                                    \
   {                                                                             \
     cudaError_t err = cudaGetLastError();                                       \
@@ -54,12 +67,12 @@ static const char *_cudaGetErrorEnum(cudaError_t error) {
     }                                                                           \
   }
 
-#define CUDA_FREE_AND_NULL(val, stream)                                         \
-  {                                                                             \
-    if (val != nullptr) {                                                       \
-      check(cudaFreeAsync(val, stream), #val, __FILE__, __LINE__);              \
-      val = nullptr;                                                            \
-    }                                                                           \
+#define CUDA_FREE_AND_NULL(val, stream, pool_supported)                         \
+  {                                                                             \
+    if (val != nullptr) {                                                       \
+      check(cudaFreeWrapper(val, stream, pool_supported), #val, __FILE__, __LINE__); \
+      val = nullptr;                                                            \
+    }                                                                           \
   }
 
 static const char *cufftGetErrorString(cufftResult error) {
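To make the intent of the new helpers concrete, a hedged usage sketch (not from the commit) follows. The `pool_supported` flag is typically the plan's new `supports_pools` field; `demo_alloc` is a made-up name, and the include path for the internal header is an assumption:

```cpp
#include <cufinufft/contrib/helper_cuda.h>  // assumed include path for the wrappers above
#include <cuda_runtime.h>

// Hypothetical helper: allocate, use, and release a scratch buffer through the wrappers.
static int demo_alloc(cudaStream_t stream, int pool_supported) {
  float *d_buf = nullptr;
  // Uses cudaMallocAsync on devices with memory-pool support, plain cudaMalloc otherwise.
  if (checkCudaErrors(
          cudaMallocWrapper(&d_buf, 1024 * sizeof(float), stream, pool_supported)))
    return 1;
  // ... launch kernels that use d_buf on `stream` ...
  CUDA_FREE_AND_NULL(d_buf, stream, pool_supported);  // note the new third argument
  return 0;
}
```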

include/cufinufft/impl.h

Lines changed: 18 additions & 5 deletions
@@ -125,6 +125,18 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
     d_plan->opts = *opts; // keep a deep copy; changing *opts now has no effect
   }
 
+  // cudaMallocAsync isn't supported for all devices, regardless of cuda version. Check
+  // for support
+  cudaDeviceGetAttribute(&d_plan->supports_pools, cudaDevAttrMemoryPoolsSupported,
+                         device_id);
+  static bool warned = false;
+  if (!warned && !d_plan->supports_pools && d_plan->opts.gpu_stream != nullptr) {
+    fprintf(stderr,
+            "[cufinufft] Warning: cudaMallocAsync not supported on this device. Use of "
+            "CUDA streams may not perform optimally.\n");
+    warned = true;
+  }
+
   auto &stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream;
 
   /* Automatically set GPU method. */
@@ -246,10 +258,11 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
                                 d_plan->spopts);
 
     if ((ier = checkCudaErrors(
-             cudaMallocAsync(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), stream))))
+             cudaMallocWrapper(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), stream,
+                               d_plan->supports_pools))))
       goto finalize;
-    if ((ier =
-             checkCudaErrors(cudaMallocAsync(&d_f, dim * MAX_NQUAD * sizeof(T), stream))))
+    if ((ier = checkCudaErrors(cudaMallocWrapper(&d_f, dim * MAX_NQUAD * sizeof(T),
+                                                 stream, d_plan->supports_pools))))
       goto finalize;
     if ((ier = checkCudaErrors(
              cudaMemcpyAsync(d_a, a, dim * MAX_NQUAD * sizeof(cuDoubleComplex),
@@ -265,8 +278,8 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
   }
 
 finalize:
-  cudaFreeAsync(d_a, stream);
-  cudaFreeAsync(d_f, stream);
+  cudaFreeWrapper(d_a, stream, d_plan->supports_pools);
+  cudaFreeWrapper(d_f, stream, d_plan->supports_pools);
 
   if (ier > 1) {
     delete *d_plan_ptr;
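For reference, a standalone sketch (not part of the commit) of the same device query the plan now performs at makeplan time; it only reports whether stream-ordered allocation via cudaMallocAsync is available on the current device:

```cpp
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  int device_id = 0;
  cudaGetDevice(&device_id);

  // Same attribute cufinufft_makeplan_impl now stores in d_plan->supports_pools.
  int supports_pools = 0;
  cudaDeviceGetAttribute(&supports_pools, cudaDevAttrMemoryPoolsSupported, device_id);

  std::printf("device %d: cudaMallocAsync %s supported\n", device_id,
              supports_pools ? "is" : "is not");
  return 0;
}
```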

include/cufinufft/types.h

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@ template<typename T> struct cufinufft_plan_t {
   int ntransf;
   int maxbatchsize;
   int iflag;
+  int supports_pools;
 
   int totalnumsubprob;
   T *fwkerhalf1;

python/cufinufft/cufinufft/_cufinufft.py

Lines changed: 2 additions & 2 deletions
@@ -102,13 +102,13 @@ class NufftOpts(ctypes.Structure):
 
 _set_pts = lib.cufinufft_setpts
 _set_pts.argtypes = [
-    c_void_p, c_int, c_void_p, c_void_p, c_void_p, ctypes.c_int, c_double_p,
+    c_void_p, c_int64, c_void_p, c_void_p, c_void_p, ctypes.c_int, c_double_p,
     c_double_p, c_double_p]
 _set_pts.restype = c_int
 
 _set_ptsf = lib.cufinufftf_setpts
 _set_ptsf.argtypes = [
-    c_void_p, c_int, c_void_p, c_void_p, c_void_p, ctypes.c_int, c_float_p,
+    c_void_p, c_int64, c_void_p, c_void_p, c_void_p, ctypes.c_int, c_float_p,
     c_float_p, c_float_p]
 _set_ptsf.restype = c_int

src/cuda/1d/spread1d_wrapper.cu

Lines changed: 3 additions & 2 deletions
@@ -203,7 +203,8 @@ int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan)
     return ier;
   cudaStreamSynchronize(stream);
   if ((ier = checkCudaErrors(
-          cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream))))
+          cudaMallocWrapper(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream,
+                            d_plan->supports_pools))))
     return ier;
   map_b_into_subprob_1d<<<(numbins + 1024 - 1) / 1024, 1024, 0, stream>>>(
       d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins);
@@ -215,7 +216,7 @@
   }
 
   assert(d_subprob_to_bin != NULL);
-  cudaFreeAsync(d_plan->subprob_to_bin, stream);
+  cudaFreeWrapper(d_plan->subprob_to_bin, stream, d_plan->supports_pools);
   d_plan->subprob_to_bin = d_subprob_to_bin;
   d_plan->totalnumsubprob = totalnumsubprob;

src/cuda/2d/spread2d_wrapper.cu

Lines changed: 3 additions & 2 deletions
@@ -220,7 +220,8 @@ int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan
     return ier;
   cudaStreamSynchronize(stream);
   if ((ier = checkCudaErrors(
-          cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream))))
+          cudaMallocWrapper(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream,
+                            d_plan->supports_pools))))
     return ier;
   map_b_into_subprob_2d<<<(numbins[0] * numbins[1] + 1024 - 1) / 1024, 1024, 0, stream>>>(
       d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins[0] * numbins[1]);
@@ -232,7 +233,7 @@ int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan
   }
 
   assert(d_subprob_to_bin != NULL);
-  cudaFreeAsync(d_plan->subprob_to_bin, stream);
+  cudaFreeWrapper(d_plan->subprob_to_bin, stream, d_plan->supports_pools);
   d_plan->subprob_to_bin = d_subprob_to_bin;
   d_plan->totalnumsubprob = totalnumsubprob;

src/cuda/3d/spread3d_wrapper.cu

Lines changed: 6 additions & 5 deletions
@@ -260,8 +260,8 @@ int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M,
                                  cudaMemcpyDeviceToHost, stream))))
     return ier;
   cudaStreamSynchronize(stream);
-  if ((ier = checkCudaErrors(
-          cudaMallocAsync(&d_idxnupts, totalNUpts * sizeof(int), stream))))
+  if ((ier = checkCudaErrors(cudaMallocWrapper(&d_idxnupts, totalNUpts * sizeof(int),
+                                               stream, d_plan->supports_pools))))
     return ier;
 
   calc_inverse_of_global_sort_index_ghost<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
@@ -320,7 +320,8 @@ int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M,
     return ier;
   cudaStreamSynchronize(stream);
   if ((ier = checkCudaErrors(
-          cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream))))
+          cudaMallocWrapper(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream,
+                            d_plan->supports_pools))))
     return ier;
   map_b_into_subprob_3d_v1<<<(n + 1024 - 1) / 1024, 1024, 0, stream>>>(
       d_subprob_to_bin, d_subprobstartpts, d_numsubprob, n);
@@ -474,8 +475,8 @@ int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M,
                              sizeof(int), cudaMemcpyDeviceToHost, stream)))
     return FINUFFT_ERR_CUDA_FAILURE;
   cudaStreamSynchronize(stream);
-  if (checkCudaErrors(
-          cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream)))
+  if (checkCudaErrors(cudaMallocWrapper(&d_subprob_to_bin, totalnumsubprob * sizeof(int),
+                                        stream, d_plan->supports_pools)))
     return FINUFFT_ERR_CUDA_FAILURE;
 
   map_b_into_subprob_3d_v2<<<(numbins[0] * numbins[1] + 1024 - 1) / 1024, 1024, 0,
