Skip to content

Commit 162b3c2

Browse files
blackwerjanden
authored andcommitted
cuda: use sync malloc/free when async not supported #445
1 parent 000f8dc commit 162b3c2

File tree

7 files changed

+172
-119
lines changed

7 files changed

+172
-119
lines changed

include/cufinufft/contrib/helper_cuda.h

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,19 @@ static const char *_cudaGetErrorEnum(cudaError_t error) {
4545
// that a CUDA host call returns an error
4646
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
4747

48+
template<typename T>
49+
static inline cudaError_t cudaMallocWrapper(T **devPtr, size_t size, cudaStream_t stream,
50+
int pool_supported) {
51+
return pool_supported ? cudaMallocAsync(devPtr, size, stream)
52+
: cudaMalloc(devPtr, size);
53+
}
54+
55+
template<typename T>
56+
static inline cudaError_t cudaFreeWrapper(T *devPtr, cudaStream_t stream,
57+
int pool_supported) {
58+
return pool_supported ? cudaFreeAsync(devPtr, stream) : cudaFree(devPtr);
59+
}
60+
4861
#define RETURN_IF_CUDA_ERROR \
4962
{ \
5063
cudaError_t err = cudaGetLastError(); \
@@ -54,12 +67,12 @@ static const char *_cudaGetErrorEnum(cudaError_t error) {
5467
} \
5568
}
5669

57-
#define CUDA_FREE_AND_NULL(val, stream) \
58-
{ \
59-
if (val != nullptr) { \
60-
check(cudaFreeAsync(val, stream), #val, __FILE__, __LINE__); \
61-
val = nullptr; \
62-
} \
70+
#define CUDA_FREE_AND_NULL(val, stream, pool_supported) \
71+
{ \
72+
if (val != nullptr) { \
73+
check(cudaFreeWrapper(val, stream, pool_supported), #val, __FILE__, __LINE__); \
74+
val = nullptr; \
75+
} \
6376
}
6477

6578
static const char *cufftGetErrorString(cufftResult error) {

include/cufinufft/impl.h

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,11 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
125125
d_plan->opts = *opts; // keep a deep copy; changing *opts now has no effect
126126
}
127127

128+
// cudaMallocAsync isn't supported for all devices, regardless of cuda version. Check
129+
// for support
130+
cudaDeviceGetAttribute(&d_plan->supports_pools, cudaDevAttrMemoryPoolsSupported,
131+
device_id);
132+
128133
auto &stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream;
129134

130135
/* Automatically set GPU method. */
@@ -246,10 +251,11 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
246251
d_plan->spopts);
247252

248253
if ((ier = checkCudaErrors(
249-
cudaMallocAsync(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), stream))))
254+
cudaMallocWrapper(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), stream,
255+
d_plan->supports_pools))))
250256
goto finalize;
251-
if ((ier =
252-
checkCudaErrors(cudaMallocAsync(&d_f, dim * MAX_NQUAD * sizeof(T), stream))))
257+
if ((ier = checkCudaErrors(cudaMallocWrapper(&d_f, dim * MAX_NQUAD * sizeof(T),
258+
stream, d_plan->supports_pools))))
253259
goto finalize;
254260
if ((ier = checkCudaErrors(
255261
cudaMemcpyAsync(d_a, a, dim * MAX_NQUAD * sizeof(cuDoubleComplex),
@@ -265,8 +271,8 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
265271
}
266272

267273
finalize:
268-
cudaFreeAsync(d_a, stream);
269-
cudaFreeAsync(d_f, stream);
274+
cudaFreeWrapper(d_a, stream, d_plan->supports_pools);
275+
cudaFreeWrapper(d_f, stream, d_plan->supports_pools);
270276

271277
if (ier > 1) {
272278
delete *d_plan_ptr;

include/cufinufft/types.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ template<typename T> struct cufinufft_plan_t {
3939
int ntransf;
4040
int maxbatchsize;
4141
int iflag;
42+
int supports_pools;
4243

4344
int totalnumsubprob;
4445
T *fwkerhalf1;

src/cuda/1d/spread1d_wrapper.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,8 @@ int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan)
203203
return ier;
204204
cudaStreamSynchronize(stream);
205205
if ((ier = checkCudaErrors(
206-
cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream))))
206+
cudaMallocWrapper(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream,
207+
d_plan->supports_pools))))
207208
return ier;
208209
map_b_into_subprob_1d<<<(numbins + 1024 - 1) / 1024, 1024, 0, stream>>>(
209210
d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins);
@@ -215,7 +216,7 @@ int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan)
215216
}
216217

217218
assert(d_subprob_to_bin != NULL);
218-
cudaFreeAsync(d_plan->subprob_to_bin, stream);
219+
cudaFreeWrapper(d_plan->subprob_to_bin, stream, d_plan->supports_pools);
219220
d_plan->subprob_to_bin = d_subprob_to_bin;
220221
d_plan->totalnumsubprob = totalnumsubprob;
221222

src/cuda/2d/spread2d_wrapper.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,8 @@ int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan
220220
return ier;
221221
cudaStreamSynchronize(stream);
222222
if ((ier = checkCudaErrors(
223-
cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream))))
223+
cudaMallocWrapper(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream,
224+
d_plan->supports_pools))))
224225
return ier;
225226
map_b_into_subprob_2d<<<(numbins[0] * numbins[1] + 1024 - 1) / 1024, 1024, 0, stream>>>(
226227
d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins[0] * numbins[1]);
@@ -232,7 +233,7 @@ int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan
232233
}
233234

234235
assert(d_subprob_to_bin != NULL);
235-
cudaFreeAsync(d_plan->subprob_to_bin, stream);
236+
cudaFreeWrapper(d_plan->subprob_to_bin, stream, d_plan->supports_pools);
236237
d_plan->subprob_to_bin = d_subprob_to_bin;
237238
d_plan->totalnumsubprob = totalnumsubprob;
238239

src/cuda/3d/spread3d_wrapper.cu

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -260,8 +260,8 @@ int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M,
260260
cudaMemcpyDeviceToHost, stream))))
261261
return ier;
262262
cudaStreamSynchronize(stream);
263-
if ((ier = checkCudaErrors(
264-
cudaMallocAsync(&d_idxnupts, totalNUpts * sizeof(int), stream))))
263+
if ((ier = checkCudaErrors(cudaMallocWrapper(&d_idxnupts, totalNUpts * sizeof(int),
264+
stream, d_plan->supports_pools))))
265265
return ier;
266266

267267
calc_inverse_of_global_sort_index_ghost<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
@@ -320,7 +320,8 @@ int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M,
320320
return ier;
321321
cudaStreamSynchronize(stream);
322322
if ((ier = checkCudaErrors(
323-
cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream))))
323+
cudaMallocWrapper(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream,
324+
d_plan->supports_pools))))
324325
return ier;
325326
map_b_into_subprob_3d_v1<<<(n + 1024 - 1) / 1024, 1024, 0, stream>>>(
326327
d_subprob_to_bin, d_subprobstartpts, d_numsubprob, n);
@@ -474,8 +475,8 @@ int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M,
474475
sizeof(int), cudaMemcpyDeviceToHost, stream)))
475476
return FINUFFT_ERR_CUDA_FAILURE;
476477
cudaStreamSynchronize(stream);
477-
if (checkCudaErrors(
478-
cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream)))
478+
if (checkCudaErrors(cudaMallocWrapper(&d_subprob_to_bin, totalnumsubprob * sizeof(int),
479+
stream, d_plan->supports_pools)))
479480
return FINUFFT_ERR_CUDA_FAILURE;
480481

481482
map_b_into_subprob_3d_v2<<<(numbins[0] * numbins[1] + 1024 - 1) / 1024, 1024, 0,

0 commit comments

Comments
 (0)