Skip to content

Commit a1e8344

Browse files
committed
TL/CUDA: fix NVLS rank-0 error propagation via handle status field
When cuMulticastCreate failed on rank 0, the error was stored only in the process-local nvls->status_supported field. Non-root ranks checked their own copy (always UCC_OK after memset) and proceeded to call cudaIpcOpenEventHandle on an uninitialised handle, relying on the import failure as the error signal, which is fragile and emits confusing CUDA error messages. Fix: add a status field to ucc_tl_cuda_nvls_handle_t so that rank 0 can embed the error code in the allgathered handle. Non-root ranks read share_data[0].status in STATE_IMPORT_HANDLE and bail out immediately with a clear warning when rank 0 reports a failure.
1 parent 607a507 commit a1e8344

File tree

2 files changed

+20
-5
lines changed

2 files changed

+20
-5
lines changed

src/components/tl/cuda/tl_cuda_nvls.c

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -421,14 +421,16 @@ ucc_status_t ucc_tl_cuda_nvls_init(
421421
"failed to create multicast object. status (%d) %s",
422422
status,
423423
ucc_status_string(status));
424-
// goto cleanup;
425-
// Store the error status to the caller
426-
// We need to share invalid handle to unblock peers
427-
// we will propagate the error status to the caller later
424+
/* Keep going to unblock peers waiting in the allgather;
425+
* propagate the error via the status field of the shared
426+
* handle so non-root ranks detect the failure explicitly. */
428427
nvls->status_supported = UCC_ERR_NOT_SUPPORTED;
429428
} else {
430429
nvls->status_supported = UCC_OK;
431430
}
431+
/* Embed the error status into the handle that will be allgathered
432+
* so every non-root rank can detect rank-0 failure directly. */
433+
nvls->local_handle.status = nvls->status_supported;
432434
// Store PID for POSIX handles
433435
if (!nvls->is_multinode) {
434436
nvls->local_handle.data.posix.pid = getpid();
@@ -467,8 +469,17 @@ ucc_status_t ucc_tl_cuda_nvls_init(
467469
team->state = UCC_TL_CUDA_NVLS_STATE_IMPORT_HANDLE;
468470
// fall through
469471
case UCC_TL_CUDA_NVLS_STATE_IMPORT_HANDLE:
470-
// Import handle on non-root ranks
472+
/* Non-root ranks check the status field allgathered from rank 0 before
473+
* attempting to import a potentially garbage handle. */
471474
if (UCC_TL_TEAM_RANK(team) != 0) {
475+
if (nvls->share_data[0].status != UCC_OK) {
476+
tl_warn(UCC_TL_TEAM_LIB(team),
477+
"NVLS: rank 0 failed to create multicast object "
478+
"(status=%d); disabling NVLS for this team",
479+
nvls->share_data[0].status);
480+
nvls->status_supported = nvls->share_data[0].status;
481+
goto cleanup;
482+
}
472483
if (nvls->is_multinode) {
473484
status = ucc_tl_cuda_nvls_import_handle_fabric(
474485
team, &nvls->share_data[0], &mc_handle);

src/components/tl/cuda/tl_cuda_nvls.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ typedef enum {
2222

2323
typedef struct {
2424
ucc_tl_cuda_nvls_handle_type_t type;
25+
/* Rank 0 sets this to UCC_ERR_NOT_SUPPORTED when cuMulticastCreate fails.
26+
* Non-root ranks read share_data[0].status to propagate the error instead
27+
* of relying on garbage-handle import failure. */
28+
ucc_status_t status;
2529
union {
2630
struct {
2731
pid_t pid;

0 commit comments

Comments
 (0)