Skip to content

Commit 43f160c

Browse files
authored
Fix for safe process teardown (#633)
* `gpuFree*()` functions are usually called during process teardown, so we let them ignore teardown-related errors. * `AvoidCudaGraphCaptureGuard` is constructed inside the `gpuFree*()` functions, so it needs the same fix.
1 parent d946c45 commit 43f160c

File tree

3 files changed

+57
-11
lines changed

3 files changed

+57
-11
lines changed

include/mscclpp/gpu.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ using CUmemAccessDesc = hipMemAccessDesc;
2626
using CUmemAllocationHandleType = hipMemAllocationHandleType;
2727

2828
constexpr auto cudaErrorPeerAccessAlreadyEnabled = hipErrorPeerAccessAlreadyEnabled;
29+
constexpr auto cudaErrorContextIsDestroyed = hipErrorContextIsDestroyed;
30+
constexpr auto cudaErrorInvalidDevice = hipErrorInvalidDevice;
2931
constexpr auto cudaSuccess = hipSuccess;
3032
constexpr auto cudaStreamNonBlocking = hipStreamNonBlocking;
3133
constexpr auto cudaStreamCaptureModeGlobal = hipStreamCaptureModeGlobal;
@@ -46,6 +48,8 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri
4648
#ifndef CUDA_SUCCESS
4749
#define CUDA_SUCCESS hipSuccess
4850
#endif // CUDA_SUCCESS
51+
#define CUDA_ERROR_DEINITIALIZED hipErrorDeinitialized
52+
#define CUDA_ERROR_CONTEXT_IS_DESTROYED hipErrorContextIsDestroyed
4953

5054
#define cudaEventCreate(...) hipEventCreate(__VA_ARGS__)
5155
#define cudaEventCreateWithFlags(...) hipEventCreateWithFlags(__VA_ARGS__)

include/mscclpp/gpu_utils.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ struct AvoidCudaGraphCaptureGuard {
4242
AvoidCudaGraphCaptureGuard();
4343
~AvoidCudaGraphCaptureGuard();
4444
cudaStreamCaptureMode mode_;
45+
bool active_;
4546
};
4647

4748
/// A RAII wrapper around cudaStream_t that will call cudaStreamDestroy on destruction.

src/gpu_utils.cc

Lines changed: 52 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,54 @@
55
#include <mscclpp/gpu.hpp>
66
#include <mscclpp/gpu_utils.hpp>
77

8+
/// Tells whether a runtime-API status code is one of the codes this file treats as
/// "the runtime is being torn down" instead of as a genuine failure.
///
/// @param err Status code returned by a runtime API call.
/// @return true when @p err appears in the per-platform list below, false otherwise.
static inline bool isCudaTeardownError(cudaError_t err) {
  // Codes checked on every platform.
  if (err == cudaErrorContextIsDestroyed) return true;
  if (err == cudaErrorInvalidDevice) return true;
#if !defined(__HIP_PLATFORM_AMD__)
  // Codes that are only checked on the non-HIP build.
  if (err == cudaErrorCudartUnloading) return true;
  if (err == cudaErrorInitializationError) return true;
#endif  // !defined(__HIP_PLATFORM_AMD__)
  return false;
}
16+
17+
/// Driver-API counterpart of the teardown check: tells whether a `CUresult` is one of
/// the two codes this file ignores during teardown.
///
/// @param r Status code returned by a driver API call.
/// @return true for CUDA_ERROR_DEINITIALIZED or CUDA_ERROR_CONTEXT_IS_DESTROYED.
static inline bool isCuTeardownError(CUresult r) {
  if (r == CUDA_ERROR_DEINITIALIZED) return true;
  return r == CUDA_ERROR_CONTEXT_IS_DESTROYED;
}
20+
21+
// Runs the runtime-API expression `cmd` exactly once and inspects its status:
//  - a status accepted by isCudaTeardownError() is swallowed, and cudaGetLastError() is
//    called so the runtime's last-error slot is read and cleared;
//  - any other status (including success) is handed to MSCCLPP_CUDATHROW unchanged.
// The temporary uses a non-reserved, macro-specific name so it cannot shadow a caller's
// variable or a local declared inside MSCCLPP_CUDATHROW, and `cmd` is parenthesized so
// any expression can be passed safely.
#define MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cmd)              \
  do {                                                      \
    cudaError_t mscclppTeardownStatus_ = (cmd);             \
    if (isCudaTeardownError(mscclppTeardownStatus_)) {      \
      (void)cudaGetLastError();                             \
    } else {                                                \
      MSCCLPP_CUDATHROW(mscclppTeardownStatus_);            \
    }                                                       \
  } while (false)
30+
31+
// Runs the driver-API expression `cmd` exactly once and inspects its status:
//  - a status accepted by isCuTeardownError() is silently ignored;
//  - any other status (including success) is handed to MSCCLPP_CUTHROW unchanged.
// The temporary uses a non-reserved, macro-specific name so it cannot shadow a caller's
// variable or a local declared inside MSCCLPP_CUTHROW, and `cmd` is parenthesized so
// any expression can be passed safely.
#define MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cmd)                \
  do {                                                      \
    CUresult mscclppTeardownStatus_ = (cmd);                \
    if (!isCuTeardownError(mscclppTeardownStatus_)) {       \
      MSCCLPP_CUTHROW(mscclppTeardownStatus_);              \
    }                                                       \
  } while (false)
38+
839
namespace mscclpp {
940

10-
AvoidCudaGraphCaptureGuard::AvoidCudaGraphCaptureGuard() : mode_(cudaStreamCaptureModeRelaxed) {
11-
MSCCLPP_CUDATHROW(cudaThreadExchangeStreamCaptureMode(&mode_));
41+
/// Exchanges the calling thread's stream-capture mode with cudaStreamCaptureModeRelaxed,
/// leaving the previous mode in `mode_`.
///
/// If the exchange reports a teardown status (see isCudaTeardownError), the guard marks
/// itself inactive and clears the runtime's last error instead of throwing. Every other
/// status goes through MSCCLPP_CUDATHROW.
AvoidCudaGraphCaptureGuard::AvoidCudaGraphCaptureGuard() : mode_(cudaStreamCaptureModeRelaxed), active_(true) {
  const cudaError_t res = cudaThreadExchangeStreamCaptureMode(&mode_);
  if (!isCudaTeardownError(res)) {
    MSCCLPP_CUDATHROW(res);
    return;
  }
  // The runtime is going away: nothing to restore later, so the destructor must skip it.
  (void)cudaGetLastError();
  active_ = false;
}
1351

14-
AvoidCudaGraphCaptureGuard::~AvoidCudaGraphCaptureGuard() { (void)cudaThreadExchangeStreamCaptureMode(&mode_); }
52+
/// Swaps the saved capture mode back in, unless the constructor marked the guard inactive.
/// The status of the exchange is deliberately discarded.
AvoidCudaGraphCaptureGuard::~AvoidCudaGraphCaptureGuard() {
  if (active_) {
    (void)cudaThreadExchangeStreamCaptureMode(&mode_);
  }
}
1556

1657
/// Default constructor: starts with a null stream and stores the device id reported by
/// cudaGetDevice in `deviceId_`. A failing query is raised through MSCCLPP_CUDATHROW.
CudaStreamWithFlags::CudaStreamWithFlags() : stream_(nullptr) {
  MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId_));
}
1758

@@ -185,25 +226,25 @@ void* gpuCallocPhysical(size_t bytes, size_t gran, size_t align) {
185226

186227
void gpuFree(void* ptr) {
187228
AvoidCudaGraphCaptureGuard cgcGuard;
188-
MSCCLPP_CUDATHROW(cudaFree(ptr));
229+
MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cudaFree(ptr));
189230
}
190231

191232
void gpuFreeHost(void* ptr) {
192233
AvoidCudaGraphCaptureGuard cgcGuard;
193-
MSCCLPP_CUDATHROW(cudaFreeHost(ptr));
234+
MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cudaFreeHost(ptr));
194235
}
195236

196237
#if (CUDA_NVLS_API_AVAILABLE)
197238
void gpuFreePhysical(void* ptr) {
198239
AvoidCudaGraphCaptureGuard cgcGuard;
199240
CUmemGenericAllocationHandle handle;
200241
size_t size = 0;
201-
MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&handle, ptr));
202-
MSCCLPP_CUTHROW(cuMemRelease(handle));
203-
MSCCLPP_CUTHROW(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
204-
MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, size));
205-
MSCCLPP_CUTHROW(cuMemRelease(handle));
206-
MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, size));
242+
MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cuMemRetainAllocationHandle(&handle, ptr));
243+
MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cuMemRelease(handle));
244+
MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
245+
MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cuMemUnmap((CUdeviceptr)ptr, size));
246+
MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cuMemRelease(handle));
247+
MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cuMemAddressFree((CUdeviceptr)ptr, size));
207248
}
208249
#endif // CUDA_NVLS_API_AVAILABLE
209250

0 commit comments

Comments
 (0)