Commit 37e222e

patched broken CUDA state after malloc failure
Danny Hindson discovered a bug wherein the failing cudaMalloc() call deliberately induced by the 'out-of-memory' unit test of createFullStateDiagMatr() breaks subsequent GPU simulation. This is because a failing cudaMalloc() corrupts the CUDA API error state until it is explicitly cleared using the undocumented facility of cudaGetLastError() (which clears "non-sticky" errors). We correct this, and defensively check for irrecoverable "sticky" errors.
1 parent 378ad71 commit 37e222e
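
The failure mode described in the commit message can be reproduced outside QuEST. The following is a minimal hypothetical sketch (plain CUDA, not part of this commit; dummyKernel is an invented name) showing how a failed cudaMalloc() records a non-sticky error which makes a later, unrelated check appear to fail, until cudaGetLastError() both reports and resets it:

#include <cstdio>
#include <cuda_runtime.h>

__global__ void dummyKernel() { }

int main() {

    // deliberately impossible allocation; cudaMalloc() fails and records
    // a non-sticky cudaErrorMemoryAllocation in the thread's error state
    void* ptr;
    cudaError_t err = cudaMalloc(&ptr, 1ULL << 60);
    printf("malloc:      %s\n", cudaGetErrorString(err));

    // an unrelated kernel launch now *appears* to fail, because checking
    // it via cudaGetLastError() surfaces the stale allocation error...
    dummyKernel<<<1, 1>>>();
    printf("launch:      %s\n", cudaGetErrorString(cudaGetLastError()));

    // ...but that same call also reset the non-sticky error, so
    // subsequent checks report success
    dummyKernel<<<1, 1>>>();
    printf("after clear: %s\n", cudaGetErrorString(cudaGetLastError()));
    return 0;
}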

File tree

quest/src/core/errors.cpp
quest/src/core/errors.hpp
quest/src/gpu/gpu_config.cpp

3 files changed: +58 -6 lines changed


quest/src/core/errors.cpp

Lines changed: 5 additions & 0 deletions

@@ -698,6 +698,11 @@ void error_cudaCallFailed(const char* msg, const char* func, const char* caller,
     raiseInternalError(err);
 }
 
+void error_cudaEncounteredIrrecoverableError() {
+
+    raiseInternalError("The CUDA API encountered an irrecoverable \"sticky\" error which we attempted to clear as if it were non-sticky.");
+}
+
 
 
 /*

quest/src/core/errors.hpp

Lines changed: 2 additions & 0 deletions

@@ -266,6 +266,8 @@ void assert_applyFullStateDiagMatrTempGpuAllocSucceeded(qcomp* gpuPtr);
 
 void error_cudaCallFailed(const char* msg, const char* func, const char* caller, const char* file, int line);
 
+void error_cudaEncounteredIrrecoverableError();
+
 
 
 /*

quest/src/gpu/gpu_config.cpp

Lines changed: 51 additions & 6 deletions

@@ -46,14 +46,16 @@
 /*
  * CUDA ERROR HANDLING
  *
- * which is only defined when CUDA-compiling, since it is invoked only a macro (defined
- * in gpu_config.hpp) which wraps CUDA API calls
+ * which are only defined when CUDA-compiling, since only ever invoked
+ * when encountering issues through use of the CUDA API
  */
 
 #if COMPILE_CUDA
 
 void assertCudaCallSucceeded(int result, const char* call, const char* caller, const char* file, int line) {
 
+    // this function is only invoked by the CUDA_CHECK macro defined in gpu_config.hpp header
+
     // result (int) is actually type cudaError_t but we cannot use this CUDA-defined type
     // in gpu_config.hpp (since it's included by non-CUDA-compiled files), and we wish to keep
     // the signature consistent.
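
For context, the CUDA_CHECK macro mentioned in the added comment wraps CUDA API calls and forwards failures to assertCudaCallSucceeded(). A plausible sketch of such a wrapper, assuming the signature above (the actual definition lives in gpu_config.hpp and may differ):

// hypothetical sketch of a CUDA_CHECK-style wrapper; not the committed definition
#define CUDA_CHECK(cmd) \
    assertCudaCallSucceeded((int) (cmd), #cmd, __func__, __FILE__, __LINE__)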
@@ -63,6 +65,32 @@ void assertCudaCallSucceeded(int result, const char* call, const char* caller, c
     error_cudaCallFailed(cudaGetErrorString(code), call, caller, file, line);
 }
 
+void clearPossibleCudaError() {
+
+    // beware that in addition to clearing anticipated CUDA errors (like
+    // cudaMalloc failing), this function will check that the CUDA API is
+    // generally working (i.e. has not encountered an irrecoverable error),
+    // including whether e.g. the CUDA drivers match the runtime version. It
+    // should ergo never be called in settings where GPU is compiled but not
+    // runtime activated, since in such settings CUDA is in an acceptably
+    // broken state - calling this function would throw an internal error
+
+    // clear "non-sticky" errors so that future CUDA API use is not corrupted
+    cudaError_t initialCode = cudaGetLastError();
+
+    // nothing to do if no error had occurred
+    if (initialCode == cudaSuccess)
+        return;
+
+    // sync and re-check if error code is erroneously unchanged, which
+    // indicates that CUDA encountered an irrecoverable "sticky" error
+    CUDA_CHECK( cudaDeviceSynchronize() );
+
+    cudaError_t finalCode = cudaGetLastError();
+    if (initialCode == finalCode)
+        error_cudaEncounteredIrrecoverableError();
+}
+
 #endif
 
 
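The sync-and-recheck idiom above relies on cudaGetLastError() resetting only non-sticky errors; a sticky error (such as an illegal device memory access, which corrupts the CUDA context) is returned again by every subsequent query. A hypothetical standalone demonstration (plain CUDA, not part of this commit; faultyKernel is an invented name):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void faultyKernel(int* ptr) { ptr[0] = 1; }   // writes through nullptr

int main() {

    faultyKernel<<<1, 1>>>(nullptr);   // launch succeeds; the fault occurs asynchronously
    cudaDeviceSynchronize();           // surfaces cudaErrorIllegalAddress

    // a sticky error is reported but NOT reset by cudaGetLastError(),
    // so consecutive queries keep returning the same error code
    cudaError_t first  = cudaGetLastError();
    cudaError_t second = cudaGetLastError();
    printf("first:  %s\n", cudaGetErrorString(first));
    printf("second: %s\n", cudaGetErrorString(second));   // same as first
    return 0;
}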

@@ -153,6 +181,12 @@ int gpu_getNumberOfLocalGpus() {
     // is called but no devices exist, which we handle
     int num;
     auto status = cudaGetDeviceCount(&num);
+
+    // treat query failure as indication of no local GPUs
+    // so do not call clearPossibleCudaError(). This is
+    // necessary because cudaGetDeviceCount() can report
+    // driver version errors when QuEST is GPU-compiled
+    // on a platform without a GPU, which we tolerate
     return (status == cudaSuccess)? num : 0;
 
 #else
@@ -176,8 +210,12 @@ bool gpu_isGpuAvailable() {
         struct cudaDeviceProp props;
         auto status = cudaGetDeviceProperties(&props, deviceInd);
 
-        // if the query failed, device is anyway unusable
-        if (status != cudaSuccess)
+        // if the query failed, device is anyway unusable; we do not
+        // clear the error with clearPossibleCudaError() since this
+        // can trigger an internal error when QuEST is GPU-compiled
+        // but no valid GPU exists (hence no valid driver), like
+        // occurs on cluster submission nodes
+        if (status != cudaSuccess)
             continue;
 
         // if the device is a real GPU, it's 'major' compute capability is != 9999 (meaning emulation)
@@ -405,9 +443,16 @@ qcomp* gpu_allocArray(qindex length) {
     qcomp* ptr;
    cudaError_t errCode = cudaMalloc(&ptr, numBytes);
 
-    // intercept memory-alloc error and merely return nullptr pointer (to be handled by validation)
-    if (errCode == cudaErrorMemoryAllocation)
+    // intercept memory-alloc error (handled by caller's validation)
+    if (errCode == cudaErrorMemoryAllocation) {
+
+        // malloc failure can break CUDA API state, so recover it in
+        // case execution is continuing (e.g. by unit tests)
+        clearPossibleCudaError();
+
+        // indicate alloc failure
         return nullptr;
+    }
 
     // pass all other unexpected errors to internal error handling
     CUDA_CHECK(errCode);
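
In effect, the change allows execution to continue past a deliberate allocation failure. A hypothetical caller-side sketch (not the actual unit test; assumes QuEST's internal declarations of qcomp and gpu_allocArray are in scope):

// before this commit, the second allocation could spuriously fail, since
// the first left a stale non-sticky error in the CUDA API state
qcomp* bad = gpu_allocArray(1LL << 50);   // absurd length; cudaMalloc() fails
// bad == nullptr, to be reported to the user by caller validation

qcomp* good = gpu_allocArray(1024);       // succeeds; the error was cleared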
