|
2 | 2 | #include "tool/argkey.h"
|
3 | 3 | #include "tool/error.h"
|
4 | 4 | #include "tool/exec.h"
|
5 |
| -#include <algorithm> |
| 5 | + |
6 | 6 | #include <cuda_runtime.h>
|
7 | 7 | #include <thrust/version.h>
|
8 | 8 |
|
| 9 | +#include <algorithm> |
| 10 | + |
9 | 11 | extern "C" int tinkerGpuUtilizationInt32_macos(int);
|
10 | 12 |
|
11 | 13 | namespace tinker {
|
@@ -142,7 +144,6 @@ static void getDeviceAttribute(DeviceAttribute& a, int device = 0)
|
142 | 144 | a.clock_rate_kHz = prop.clockRate;
|
143 | 145 |
|
144 | 146 | check_rt(cudaDeviceReset());
|
145 |
| - cudaDeviceSynchronize(); |
146 | 147 |
|
147 | 148 | if (not found_cc) {
|
148 | 149 | TINKER_THROW(format("The source code should be updated for compute capability %d; "
|
@@ -268,58 +269,62 @@ void gpuData(RcOp op)
|
268 | 269 | #if 1
|
269 | 270 | cuda_device_flags |= cudaDeviceScheduleBlockingSync;
|
270 | 271 | #elif 0
|
271 |
| - // Using this flag may reduce the latency |
272 |
| - // for cudaStreamSynchronize() calls. |
| 272 | + // Using this flag may reduce the latency for the cudaStreamSynchronize() calls. |
273 | 273 | cuda_device_flags |= cudaDeviceScheduleSpin;
|
274 | 274 | #endif
|
| 275 | + |
| 276 | + // cudaError_t cudaSetDeviceFlags (unsigned int flags); |
| 277 | + // |
| 278 | + // v10.2.89 |
| 279 | + // |
| 280 | + // Records flags as the flags to use when initializing the current device. |
| 281 | + // If no device has been made current to the calling thread, then flags will |
| 282 | + // be applied to the initialization of any device initialized by the calling |
| 283 | + // host thread, unless that device has had its initialization flags set |
| 284 | + // explicitly by this or any host thread. |
| 285 | + // |
| 286 | + // If the current device has been set and that device has already been |
| 287 | + // initialized then this call will fail with the error cudaErrorSetOnActiveProcess. |
| 288 | + // In this case it is necessary to reset device using cudaDeviceReset() |
| 289 | + // before the device's initialization flags may be set. |
| 290 | + // |
| 291 | + // v11.0.3 |
| 292 | + // |
| 293 | + // Records flags as the flags for the current device. If the current device |
| 294 | + // has been set and that device has already been initialized, the previous |
| 295 | + // flags are overwritten. If the current device has not been initialized, |
| 296 | + // it is initialized with the provided flags. If no device has been made |
| 297 | + // current to the calling thread, a default device is selected and initialized |
| 298 | + // with the provided flags. |
| 299 | + // |
| 300 | + // Conclusion |
| 301 | + // |
| 302 | + // Since CUDA 11, cudaSetDeviceFlags should be called after cudaSetDevice. |
| 303 | + // Prior to CUDA 11, cudaSetDeviceFlags must be called before cudaSetDevice. |
| 304 | + |
| 305 | +#if CUDART_VERSION < 11000 |
275 | 306 | always_check_rt(cudaSetDeviceFlags(cuda_device_flags));
|
| 307 | +#endif |
276 | 308 |
|
277 | 309 | always_check_rt(cudaGetDeviceCount(&ndevice));
|
278 | 310 | auto& all = gpuDeviceAttributes();
|
279 | 311 | all.resize(ndevice);
|
280 | 312 | for (int i = 0; i < ndevice; ++i)
|
281 | 313 | getDeviceAttribute(all[i], i);
|
282 |
| - |
283 | 314 | idevice = recommendDevice(ndevice);
|
| 315 | + |
284 | 316 | check_rt(cudaSetDevice(idevice));
|
| 317 | +#if CUDART_VERSION >= 11000 |
| 318 | + check_rt(cudaSetDeviceFlags(cuda_device_flags)); |
| 319 | +#endif |
| 320 | + |
285 | 321 | int kdevice = -1;
|
286 | 322 | check_rt(cudaGetDevice(&kdevice));
|
287 | 323 | if (kdevice != idevice)
|
288 | 324 | TINKER_THROW(
|
289 | 325 | format("Device %d in use is different than the selected Device %d.", kdevice, idevice));
|
290 | 326 |
|
291 |
| - unsigned int kflags; |
292 |
| - // BEGIN HACK |
293 |
| - // I failed to just call cudaGetDeviceFlags() only once in the code. |
294 |
| - // I think this is due to an undocumented change in the Cuda runtime, |
295 |
| - // e.g., 10.1 vs. 11.2, at least I didn't find anything related |
296 |
| - // on the internet. Consider the following logic |
297 |
| - // |
298 |
| - // #A |
299 |
| - // loop k over all devices (even if all == 1) |
300 |
| - // cudaSetDevice(k) |
301 |
| - // check the properties of device k |
302 |
| - // cudaDeviceReset() |
303 |
| - // cudaDeviceSynchronize() |
304 |
| - // end loop |
305 |
| - // idevice = select one device |
306 |
| - // #B |
307 |
| - // cudaSetDevice(idevice) |
308 |
| - // #C |
309 |
| - // |
310 |
| - // where there are 3 lines (#A, #B, and #C) we can put the function |
311 |
| - // cudaGetDeviceFlags() in. For Cuda 10.1, #A is the only option, |
312 |
| - // otherwise, errno 708 (cudaErrorSetOnActiveProcess) will be returned. |
313 |
| - // What it seems to me is that cudaDeviceReset() didn't actually reset |
314 |
| - // the device. For Cuda 11.2, no error is returned, but the desired flags |
315 |
| - // are not set correctly if cudaGetDeviceFlags() is put in #A. Therefore, |
316 |
| - // I need to put cudaGetDeviceFlags() in #A and add a hack like this. |
317 |
| - kflags = 0; |
318 |
| - check_rt(cudaGetDeviceFlags(&kflags)); |
319 |
| - if (kflags != cuda_device_flags) |
320 |
| - always_check_rt(cudaSetDeviceFlags(cuda_device_flags)); |
321 |
| - // END HACK |
322 |
| - kflags = 0; |
| 327 | + unsigned int kflags = 0; |
323 | 328 | check_rt(cudaGetDeviceFlags(&kflags));
|
324 | 329 | if (kflags != cuda_device_flags)
|
325 | 330 | TINKER_THROW(
|
|
0 commit comments