Skip to content

Commit c36a007

Browse files
committed
Init cuda10/11 differently w/ macro CUDART_VERSION
1 parent a674b3f commit c36a007

File tree

1 file changed

+42
-37
lines changed

1 file changed

+42
-37
lines changed

src/cudart/gpucard.cpp

Lines changed: 42 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
#include "tool/argkey.h"
33
#include "tool/error.h"
44
#include "tool/exec.h"
5-
#include <algorithm>
5+
66
#include <cuda_runtime.h>
77
#include <thrust/version.h>
88

9+
#include <algorithm>
10+
911
extern "C" int tinkerGpuUtilizationInt32_macos(int);
1012

1113
namespace tinker {
@@ -142,7 +144,6 @@ static void getDeviceAttribute(DeviceAttribute& a, int device = 0)
142144
a.clock_rate_kHz = prop.clockRate;
143145

144146
check_rt(cudaDeviceReset());
145-
cudaDeviceSynchronize();
146147

147148
if (not found_cc) {
148149
TINKER_THROW(format("The source code should be updated for compute capability %d; "
@@ -268,58 +269,62 @@ void gpuData(RcOp op)
268269
#if 1
269270
cuda_device_flags |= cudaDeviceScheduleBlockingSync;
270271
#elif 0
271-
// Using this flag may reduce the latency
272-
// for cudaStreamSynchronize() calls.
272+
// Using this flag may reduce the latency for the cudaStreamSynchronize() calls.
273273
cuda_device_flags |= cudaDeviceScheduleSpin;
274274
#endif
275+
276+
// cudaError_t cudaSetDeviceFlags (unsigned int flags);
277+
//
278+
// v10.2.89
279+
//
280+
// Records flags as the flags to use when initializing the current device.
281+
// If no device has been made current to the calling thread, then flags will
282+
// be applied to the initialization of any device initialized by the calling
283+
// host thread, unless that device has had its initialization flags set
284+
// explicitly by this or any host thread.
285+
//
286+
// If the current device has been set and that device has already been
287+
// initialized then this call will fail with the error cudaErrorSetOnActiveProcess.
288+
// In this case it is necessary to reset device using cudaDeviceReset()
289+
// before the device's initialization flags may be set.
290+
//
291+
// v11.0.3
292+
//
293+
// Records flags as the flags for the current device. If the current device
294+
// has been set and that device has already been initialized, the previous
295+
// flags are overwritten. If the current device has not been initialized,
296+
// it is initialized with the provided flags. If no device has been made
297+
// current to the calling thread, a default device is selected and initialized
298+
// with the provided flags.
299+
//
300+
// Conclusion
301+
//
302+
// Since CUDA 11, cudaSetDeviceFlags should be called after cudaSetDevice.
303+
// Prior to CUDA 11, cudaSetDeviceFlags must be called before cudaSetDevice.
304+
305+
#if CUDART_VERSION < 11000
275306
always_check_rt(cudaSetDeviceFlags(cuda_device_flags));
307+
#endif
276308

277309
always_check_rt(cudaGetDeviceCount(&ndevice));
278310
auto& all = gpuDeviceAttributes();
279311
all.resize(ndevice);
280312
for (int i = 0; i < ndevice; ++i)
281313
getDeviceAttribute(all[i], i);
282-
283314
idevice = recommendDevice(ndevice);
315+
284316
check_rt(cudaSetDevice(idevice));
317+
#if CUDART_VERSION >= 11000
318+
check_rt(cudaSetDeviceFlags(cuda_device_flags));
319+
#endif
320+
285321
int kdevice = -1;
286322
check_rt(cudaGetDevice(&kdevice));
287323
if (kdevice != idevice)
288324
TINKER_THROW(
289325
format("Device %d in use is different than the selected Device %d.", kdevice, idevice));
290326

291-
unsigned int kflags;
292-
// BEGIN HACK
293-
// I failed to just call cudaGetDeviceFlags() only once in the code.
294-
// I think this is due to an undocumented change in the Cuda runtime,
295-
// e.g., 10.1 vs. 11.2, at least I didn't find anything related
296-
// on the internet. Consider the following logic
297-
//
298-
// #A
299-
// loop k over all devices (even if all == 1)
300-
// cudaSetDevice(k)
301-
// check the properties of device k
302-
// cudaDeviceReset()
303-
// cudaDeviceSynchronize()
304-
// end loop
305-
// idevice = select one device
306-
// #B
307-
// cudaSetDevice(idevice)
308-
// #C
309-
//
310-
// where there are 3 lines (#A, #B, and #C) we can put the function
311-
// cudaGetDeviceFlags() in. For Cuda 10.1, #A is the only option,
312-
// otherwise, errno 708 (cudaErrorSetOnActiveProcess) will be returned.
313-
// What it seems to me is that cudaDeviceReset() didn't actually reset
314-
// the device. For Cuda 11.2, no error is returned, but the desired flags
315-
// are not set correctly if cudaGetDeviceFlags() is put in #A. Therefore,
316-
// I need to put cudaGetDeviceFlags() in #A and add a hack like this.
317-
kflags = 0;
318-
check_rt(cudaGetDeviceFlags(&kflags));
319-
if (kflags != cuda_device_flags)
320-
always_check_rt(cudaSetDeviceFlags(cuda_device_flags));
321-
// END HACK
322-
kflags = 0;
327+
unsigned int kflags = 0;
323328
check_rt(cudaGetDeviceFlags(&kflags));
324329
if (kflags != cuda_device_flags)
325330
TINKER_THROW(

0 commit comments

Comments
 (0)