TL/CUDA: fix NVLS rank-0 error propagation via handle status field

ikryukov · ikryukov · commit a1e83445548e · 2026-03-23T19:36:02.000+01:00
When cuMulticastCreate fails on rank 0, the error was stored in the
per-process-local nvls->status_supported field only.  Non-root ranks
checked their own copy (always UCC_OK after memset) and proceeded to
call cudaIpcOpenEventHandle on an uninitialised handle, relying on the
import failure as the error signal. That approach is fragile and emits
confusing CUDA error messages.

Fix: add a status field to ucc_tl_cuda_nvls_handle_t so rank 0 can
embed the error code in the allgathered handle.  Non-root ranks read
share_data[0].status in STATE_IMPORT_HANDLE and bail out immediately
with a clear warning when rank 0 reported a failure.
diff --git a/src/components/tl/cuda/tl_cuda_nvls.c b/src/components/tl/cuda/tl_cuda_nvls.c
@@ -421,14 +421,16 @@ ucc_status_t ucc_tl_cuda_nvls_init(
                     "failed to create multicast object. status (%d) %s",
                     status,
                     ucc_status_string(status));
-                // goto cleanup;
-                // Store the error status to the caller
-                // We need to share invalid handle to unblock peers
-                // we will propagate the error status to the caller later
+                /* Keep going to unblock peers waiting in the allgather;
+                 * propagate the error via the status field of the shared
+                 * handle so non-root ranks detect the failure explicitly. */
                 nvls->status_supported = UCC_ERR_NOT_SUPPORTED;
             } else {
                 nvls->status_supported = UCC_OK;
             }
+            /* Embed the error status into the handle that will be allgathered
+             * so every non-root rank can detect rank-0 failure directly. */
+            nvls->local_handle.status = nvls->status_supported;
             // Store PID for POSIX handles
             if (!nvls->is_multinode) {
                 nvls->local_handle.data.posix.pid = getpid();
@@ -467,8 +469,17 @@ ucc_status_t ucc_tl_cuda_nvls_init(
         team->state = UCC_TL_CUDA_NVLS_STATE_IMPORT_HANDLE;
         // fall through
     case UCC_TL_CUDA_NVLS_STATE_IMPORT_HANDLE:
-        // Import handle on non-root ranks
+        /* Non-root ranks check the status field that rank 0 shared via the
+         * allgather before trying to import a potentially garbage handle. */
         if (UCC_TL_TEAM_RANK(team) != 0) {
+            if (nvls->share_data[0].status != UCC_OK) {
+                tl_warn(UCC_TL_TEAM_LIB(team),
+                        "NVLS: rank 0 failed to create multicast object "
+                        "(status=%d); disabling NVLS for this team",
+                        nvls->share_data[0].status);
+                nvls->status_supported = nvls->share_data[0].status;
+                goto cleanup;
+            }
             if (nvls->is_multinode) {
                 status = ucc_tl_cuda_nvls_import_handle_fabric(
                     team, &nvls->share_data[0], &mc_handle);
diff --git a/src/components/tl/cuda/tl_cuda_nvls.h b/src/components/tl/cuda/tl_cuda_nvls.h
@@ -22,6 +22,10 @@ typedef enum {
 
 typedef struct {
     ucc_tl_cuda_nvls_handle_type_t type;
+    /* Rank 0 sets this to UCC_ERR_NOT_SUPPORTED when cuMulticastCreate fails.
+     * Non-root ranks read share_data[0].status to propagate the error instead
+     * of relying on garbage-handle import failure. */
+    ucc_status_t status;
     union {
         struct {
             pid_t pid;