Fix for safe process teardown (#633)

chhwang · web-flow · commit 43f160c8e6a0 · 2025-09-10T20:28:04.000-07:00
* `gpuFree*()` functions are usually called during process teardown, so
they now ignore errors that only indicate the runtime/driver is shutting down.
* `AvoidCudaGraphCaptureGuard` is constructed in `gpuFree*()` functions,
so it needs the same fix.
diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp
@@ -26,6 +26,8 @@ using CUmemAccessDesc = hipMemAccessDesc;
 using CUmemAllocationHandleType = hipMemAllocationHandleType;
 
 constexpr auto cudaErrorPeerAccessAlreadyEnabled = hipErrorPeerAccessAlreadyEnabled;
+constexpr auto cudaErrorContextIsDestroyed = hipErrorContextIsDestroyed;
+constexpr auto cudaErrorInvalidDevice = hipErrorInvalidDevice;
 constexpr auto cudaSuccess = hipSuccess;
 constexpr auto cudaStreamNonBlocking = hipStreamNonBlocking;
 constexpr auto cudaStreamCaptureModeGlobal = hipStreamCaptureModeGlobal;
@@ -46,6 +48,8 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri
 #ifndef CUDA_SUCCESS
 #define CUDA_SUCCESS hipSuccess
 #endif  // CUDA_SUCCESS
+#define CUDA_ERROR_DEINITIALIZED hipErrorDeinitialized
+#define CUDA_ERROR_CONTEXT_IS_DESTROYED hipErrorContextIsDestroyed
 
 #define cudaEventCreate(...) hipEventCreate(__VA_ARGS__)
 #define cudaEventCreateWithFlags(...) hipEventCreateWithFlags(__VA_ARGS__)
diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp
@@ -42,6 +42,7 @@ struct AvoidCudaGraphCaptureGuard {
   AvoidCudaGraphCaptureGuard();
   ~AvoidCudaGraphCaptureGuard();
   cudaStreamCaptureMode mode_;
+  bool active_;
 };
 
 /// A RAII wrapper around cudaStream_t that will call cudaStreamDestroy on destruction.
diff --git a/src/gpu_utils.cc b/src/gpu_utils.cc
@@ -5,13 +5,54 @@
 #include <mscclpp/gpu.hpp>
 #include <mscclpp/gpu_utils.hpp>
 
+static inline bool isCudaTeardownError(cudaError_t err) {
+#if defined(__HIP_PLATFORM_AMD__)
+  return err == cudaErrorContextIsDestroyed || err == cudaErrorInvalidDevice;
+#else   // !defined(__HIP_PLATFORM_AMD__)
+  return err == cudaErrorCudartUnloading || err == cudaErrorContextIsDestroyed || err == cudaErrorInitializationError ||
+        err == cudaErrorInvalidDevice;
+#endif  // !defined(__HIP_PLATFORM_AMD__)
+}
+
+static inline bool isCuTeardownError(CUresult r) {
+  return r == CUDA_ERROR_DEINITIALIZED || r == CUDA_ERROR_CONTEXT_IS_DESTROYED;
+}
+
+#define MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cmd) \
+  do {                                         \
+    cudaError_t teardownErr_ = (cmd);          \
+    if (isCudaTeardownError(teardownErr_)) {   \
+      (void)cudaGetLastError();                \
+    } else {                                   \
+      MSCCLPP_CUDATHROW(teardownErr_);         \
+    }                                          \
+  } while (false)  // teardown errors are cleared and dropped; any other result is handed to MSCCLPP_CUDATHROW
+
+#define MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cmd) \
+  do {                                       \
+    CUresult teardownRes_ = (cmd);           \
+    if (!isCuTeardownError(teardownRes_)) {  \
+      MSCCLPP_CUTHROW(teardownRes_);         \
+    }                                        \
+  } while (false)  // teardown results are dropped; any other result is handed to MSCCLPP_CUTHROW
+
 namespace mscclpp {
 
-AvoidCudaGraphCaptureGuard::AvoidCudaGraphCaptureGuard() : mode_(cudaStreamCaptureModeRelaxed) {
-  MSCCLPP_CUDATHROW(cudaThreadExchangeStreamCaptureMode(&mode_));
+AvoidCudaGraphCaptureGuard::AvoidCudaGraphCaptureGuard() : mode_(cudaStreamCaptureModeRelaxed), active_(true) {
+  cudaError_t res = cudaThreadExchangeStreamCaptureMode(&mode_);
+  if (isCudaTeardownError(res)) {
+    // Runtime is going away; just mark inactive so destructor skips restoring.
+    active_ = false;
+    (void)cudaGetLastError();
+  } else {
+    MSCCLPP_CUDATHROW(res);
+  }
 }
 
-AvoidCudaGraphCaptureGuard::~AvoidCudaGraphCaptureGuard() { (void)cudaThreadExchangeStreamCaptureMode(&mode_); }
+AvoidCudaGraphCaptureGuard::~AvoidCudaGraphCaptureGuard() {
+  if (!active_) return;
+  (void)cudaThreadExchangeStreamCaptureMode(&mode_);
+}
 
 CudaStreamWithFlags::CudaStreamWithFlags() : stream_(nullptr) { MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId_)); }
 
@@ -185,25 +226,25 @@ void* gpuCallocPhysical(size_t bytes, size_t gran, size_t align) {
 
 void gpuFree(void* ptr) {
   AvoidCudaGraphCaptureGuard cgcGuard;
-  MSCCLPP_CUDATHROW(cudaFree(ptr));
+  MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cudaFree(ptr));
 }
 
 void gpuFreeHost(void* ptr) {
   AvoidCudaGraphCaptureGuard cgcGuard;
-  MSCCLPP_CUDATHROW(cudaFreeHost(ptr));
+  MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cudaFreeHost(ptr));
 }
 
 #if (CUDA_NVLS_API_AVAILABLE)
 void gpuFreePhysical(void* ptr) {
   AvoidCudaGraphCaptureGuard cgcGuard;
   CUmemGenericAllocationHandle handle;
   size_t size = 0;
-  MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&handle, ptr));
-  MSCCLPP_CUTHROW(cuMemRelease(handle));
-  MSCCLPP_CUTHROW(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
-  MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, size));
-  MSCCLPP_CUTHROW(cuMemRelease(handle));
-  MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, size));
+  MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cuMemRetainAllocationHandle(&handle, ptr));
+  MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cuMemRelease(handle));
+  MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
+  MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cuMemUnmap((CUdeviceptr)ptr, size));
+  MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cuMemRelease(handle));
+  MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cuMemAddressFree((CUdeviceptr)ptr, size));
 }
 #endif  // CUDA_NVLS_API_AVAILABLE