TL/CUDA: fix control region init and add OOB barrier for NVLS

ikryukov · ikryukov · commit 25514672dd72 · 2026-03-23T19:58:05.000+01:00
Two critical correctness fixes in ucc_tl_cuda_nvls_init (the first in
STATE_ADD_DEVICE, the second as a new STATE_BARRIER that follows it):

1. All ranks must zero-initialise their control region via UC VA BEFORE
   cuMulticastAddDevice, not only rank 0 after cuMulticastBindAddr.
   Each rank's uc_va maps its OWN physical pages; only that rank can
   zero its arrival_counter.  Previously, ranks 1..N-1 started with
   uninitialised counters, causing incorrect allreduce results.
   The fix exploits the fact that cuMulticastBindAddr blocks until all
   ranks have called cuMulticastAddDevice, so the control-region memset
   (done BEFORE addDevice) is guaranteed complete on every rank when
   the bind returns.

2. Add UCC_TL_CUDA_NVLS_STATE_BARRIER: a final OOB allgather barrier
   after the CUDA setup is complete.  This ensures all ranks have
   finished NVLS initialisation (including any async fabric operations)
   before any collective uses the team.

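Also reorder STATE_ADD_DEVICE to: allocate physical memory → map UC VA
→ allocate coll_ids → init control regions → addDevice → bind → map
MC VA.  Because coll_ids is now allocated up front, storing mc_va,
uc_va and mc_memhandle in nvls is the last step of the state and
happens only after full success; this keeps nvls_destroy idempotent
and avoids a double-free on init failure.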
diff --git a/src/components/tl/cuda/tl_cuda_nvls.c b/src/components/tl/cuda/tl_cuda_nvls.c
@@ -512,19 +512,19 @@ ucc_status_t ucc_tl_cuda_nvls_init(
         // fall through
     case UCC_TL_CUDA_NVLS_STATE_ADD_DEVICE:
     {
-        // Add device to multicast object
-        status = CUDADRV_FUNC(
-            cuMulticastAddDevice(nvls->mc_handle, nvls->device));
-        if (status != UCC_OK) {
-            tl_error(
-                UCC_TL_TEAM_LIB(team), "failed to add device to multicast");
-            goto cleanup;
-        }
-        tl_debug(
-            UCC_TL_TEAM_LIB(team),
-            "RANK %d: added device %d to multicast\n",
-            UCC_TL_TEAM_RANK(team),
-            nvls->device);
+        /* Correct ordering for NVLS memory setup:
+         *
+         * 1. Allocate physical memory and map UC VA first.
+         * 2. Zero-initialise the control region on UC VA on ALL ranks BEFORE
+         *    calling cuMulticastAddDevice.  cuMulticastBindAddr blocks until
+         *    every rank has called cuMulticastAddDevice, so when it returns
+         *    ALL ranks have already zeroed their control regions — no extra
+         *    barrier needed to protect the counters.
+         * 3. Add device to multicast → bind (collective barrier point) → map MC VA.
+         *
+         * Doing the memset only on rank 0 (or after bind) leaves arrival_counter
+         * uninitialised on non-root ranks which causes incorrect collective results.
+         */
 
         // Allocate physical memory
         prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
@@ -537,9 +537,8 @@ ucc_status_t ucc_tl_cuda_nvls_init(
         mc_size = nvls->mc_size;
         status  = CUDADRV_FUNC(cuMemCreate(&mem_handle, mc_size, &prop, 0));
         if (status != UCC_OK) {
-            tl_error(
-                UCC_TL_TEAM_LIB(team),
-                "failed to create memory allocation for multicast");
+            tl_error(UCC_TL_TEAM_LIB(team),
+                     "failed to create memory allocation for multicast");
             goto cleanup;
         }
 
@@ -549,12 +548,11 @@ ucc_status_t ucc_tl_cuda_nvls_init(
         accessDesc.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
 
         // Reserve and map unicast virtual address space
-        status                   = CUDADRV_FUNC(
+        status = CUDADRV_FUNC(
             cuMemAddressReserve(&uc_va, mc_size, nvls->minGran, 0U, 0));
         if (status != UCC_OK) {
-            tl_error(
-                UCC_TL_TEAM_LIB(team),
-                "failed to reserve virtual address space");
+            tl_error(UCC_TL_TEAM_LIB(team),
+                     "failed to reserve virtual address space");
             goto cleanup;
         }
 
@@ -570,22 +568,67 @@ ucc_status_t ucc_tl_cuda_nvls_init(
             goto cleanup;
         }
 
+        // Allocate coll_ids
+        nvls->coll_ids = (size_t *)ucc_malloc(
+            lib->cfg.max_concurrent * sizeof(size_t), "coll_ids");
+        if (!nvls->coll_ids) {
+            status = UCC_ERR_NO_MEMORY;
+            goto cleanup;
+        }
+        // Initialize the coll_ids to 0
+        memset(nvls->coll_ids, 0, lib->cfg.max_concurrent * sizeof(size_t));
+
+        /* ALL ranks zero-initialise their own control region via UC VA
+         * BEFORE cuMulticastAddDevice.  cuMulticastBindAddr acts as a
+         * collective barrier: when it returns every rank has already
+         * completed its memset, so arrival_counter starts at 0 everywhere. */
+        // Initialize control regions on uc_va BEFORE adding device to multicast.
+        // This is critical: cuMulticastBindAddr blocks until all devices are added,
+        // so by doing memset BEFORE cuMulticastAddDevice, we guarantee that when
+        // cuMulticastBindAddr unblocks, ALL ranks have initialized their control regions.
+        tl_debug(UCC_TL_TEAM_LIB(team),
+                 "NVLS init: rank %d initialising control regions "
+                 "uc_va=%p symm_size=%zu ctrl_size=%d slots=%u",
+                 UCC_TL_TEAM_RANK(team), (void *)uc_va,
+                 lib->cfg.nvls_symmetric_size, NVLS_CONTROL_SIZE,
+                 lib->cfg.max_concurrent);
+        CUDA_CHECK(cudaMemset2D(
+            (void *)(uc_va + lib->cfg.nvls_symmetric_size),
+            lib->cfg.nvls_symmetric_size + NVLS_CONTROL_SIZE,
+            0,
+            NVLS_CONTROL_SIZE,
+            lib->cfg.max_concurrent));
+
+        // Add device to multicast object
+        status = CUDADRV_FUNC(
+            cuMulticastAddDevice(nvls->mc_handle, nvls->device));
+        if (status != UCC_OK) {
+            tl_error(UCC_TL_TEAM_LIB(team),
+                     "failed to add device to multicast");
+            goto cleanup;
+        }
+        tl_debug(UCC_TL_TEAM_LIB(team),
+                 "RANK %d: added device %d to multicast",
+                 UCC_TL_TEAM_RANK(team), nvls->device);
+
         // Bind memory to multicast object
+        // This BLOCKS until all devices have called cuMulticastAddDevice.
+        // Since we initialized control regions BEFORE cuMulticastAddDevice,
+        // when this returns, ALL ranks have completed their control region init.
         status = CUDADRV_FUNC(cuMulticastBindAddr(
             nvls->mc_handle, 0 /*mcOffset*/, uc_va, mc_size, 0));
         if (status != UCC_OK) {
-            tl_error(
-                UCC_TL_TEAM_LIB(team), "failed to bind memory to multicast");
+            tl_error(UCC_TL_TEAM_LIB(team),
+                     "failed to bind memory to multicast");
             goto cleanup;
         }
 
         // Reserve and map multicast virtual address space
         status = CUDADRV_FUNC(
             cuMemAddressReserve(&mc_va, mc_size, nvls->minGran, 0U, 0));
         if (status != UCC_OK) {
-            tl_error(
-                UCC_TL_TEAM_LIB(team),
-                "failed to reserve multicast virtual address space");
+            tl_error(UCC_TL_TEAM_LIB(team),
+                     "failed to reserve multicast virtual address space");
             goto cleanup;
         }
 
@@ -597,42 +640,74 @@ ucc_status_t ucc_tl_cuda_nvls_init(
 
         status = CUDADRV_FUNC(cuMemSetAccess(mc_va, mc_size, &accessDesc, 1));
         if (status != UCC_OK) {
-            tl_error(
-                UCC_TL_TEAM_LIB(team), "failed to set multicast memory access");
+            tl_error(UCC_TL_TEAM_LIB(team),
+                     "failed to set multicast memory access");
             goto cleanup;
         }
 
-        tl_debug(
-            UCC_TL_TEAM_LIB(team),
-            "Rank: %d symmetric memory is set: %p [%ld bytes]\n",
-            UCC_TL_TEAM_RANK(team),
-            (void *)mc_va,
-            mc_size);
+        tl_debug(UCC_TL_TEAM_LIB(team),
+                 "Rank: %d symmetric memory is set: %p [%ld bytes]",
+                 UCC_TL_TEAM_RANK(team), (void *)mc_va, mc_size);
 
         // Store the handles for cleanup in team destroy
         nvls->mc_va        = mc_va;
         nvls->uc_va        = uc_va;
         nvls->mc_memhandle = mem_handle;
         nvls->mc_offset    = 0; // mcOffset;
-        nvls->coll_ids     = (size_t *)ucc_malloc(
-            lib->cfg.max_concurrent * sizeof(size_t), "coll_ids");
-        if (!nvls->coll_ids) {
-            status = UCC_ERR_NO_MEMORY;
-            goto cleanup;
+
+        team->state = UCC_TL_CUDA_NVLS_STATE_BARRIER;
+        // fall through
+    }
+    case UCC_TL_CUDA_NVLS_STATE_BARRIER:
+    {
+        /* Final OOB barrier — ensures all ranks have completed the NVLS
+         * memory setup (including any async fabric operations) before any
+         * collective can use the team. */
+        if (nvls->barrier_data == NULL) {
+            nvls->barrier_data = (char *)ucc_malloc(
+                UCC_TL_TEAM_SIZE(team), "nvls_barrier");
+            if (!nvls->barrier_data) {
+                status = UCC_ERR_NO_MEMORY;
+                goto cleanup;
+            }
+            nvls->barrier_data[UCC_TL_TEAM_RANK(team)] = 1;
         }
-        // Initialize the coll_ids to 0
-        memset(nvls->coll_ids, 0, lib->cfg.max_concurrent * sizeof(size_t));
 
-        if (UCC_TL_TEAM_RANK(team) == 0) {
-            // root rank zero-initializes the control region for each task slot
-            CUDA_CHECK(cudaMemset2D(
-                (void *)(uc_va + lib->cfg.nvls_symmetric_size),
-                lib->cfg.nvls_symmetric_size + NVLS_CONTROL_SIZE,
-                0,
-                NVLS_CONTROL_SIZE,
-                lib->cfg.max_concurrent));
+        if (team->oob_req == NULL) {
+            status = team->oob.allgather(
+                &nvls->barrier_data[UCC_TL_TEAM_RANK(team)],
+                nvls->barrier_data,
+                1,
+                team->oob.coll_info,
+                &team->oob_req);
+            if (status != UCC_OK) {
+                tl_error(UCC_TL_TEAM_LIB(team),
+                         "failed to initiate NVLS barrier");
+                ucc_free(nvls->barrier_data);
+                nvls->barrier_data = NULL;
+                goto cleanup;
+            }
         }
 
+        status = team->oob.req_test(team->oob_req);
+        if (status > 0) {
+            return UCC_INPROGRESS;
+        }
+        if (status < 0) {
+            tl_error(UCC_TL_TEAM_LIB(team), "NVLS barrier failed");
+            ucc_free(nvls->barrier_data);
+            nvls->barrier_data = NULL;
+            goto cleanup;
+        }
+
+        team->oob.req_free(team->oob_req);
+        team->oob_req = NULL;
+        ucc_free(nvls->barrier_data);
+        nvls->barrier_data = NULL;
+
+        tl_debug(UCC_TL_TEAM_LIB(team),
+                 "NVLS init: rank %d OOB barrier complete — team ready",
+                 UCC_TL_TEAM_RANK(team));
         break;
     }
     default:
@@ -647,47 +722,49 @@ ucc_status_t ucc_tl_cuda_nvls_init(
         ucc_free(nvls->share_data);
         nvls->share_data = NULL;
     }
+    if (nvls->barrier_data) {
+        ucc_free(nvls->barrier_data);
+        nvls->barrier_data = NULL;
+    }
 
     // Clean up CUDA resources - check local variables for partial allocations
     // Unmap and free multicast VA if it was reserved/mapped
     if (mc_va != 0) {
         if (CUDADRV_FUNC(cuMemUnmap(mc_va, mc_size)) != UCC_OK) {
-            tl_error(
-                UCC_TL_TEAM_LIB(team), "failed to unmap mc_va during cleanup");
+            tl_error(UCC_TL_TEAM_LIB(team),
+                     "failed to unmap mc_va during cleanup");
         }
         if (CUDADRV_FUNC(cuMemAddressFree(mc_va, mc_size)) != UCC_OK) {
-            tl_error(
-                UCC_TL_TEAM_LIB(team), "failed to free mc_va during cleanup");
+            tl_error(UCC_TL_TEAM_LIB(team),
+                     "failed to free mc_va during cleanup");
         }
     }
 
     // Unmap and free unicast VA if it was reserved/mapped
     if (uc_va != 0) {
         if (CUDADRV_FUNC(cuMemUnmap(uc_va, mc_size)) != UCC_OK) {
-            tl_error(
-                UCC_TL_TEAM_LIB(team), "failed to unmap uc_va during cleanup");
+            tl_error(UCC_TL_TEAM_LIB(team),
+                     "failed to unmap uc_va during cleanup");
         }
         if (CUDADRV_FUNC(cuMemAddressFree(uc_va, mc_size)) != UCC_OK) {
-            tl_error(
-                UCC_TL_TEAM_LIB(team), "failed to free uc_va during cleanup");
+            tl_error(UCC_TL_TEAM_LIB(team),
+                     "failed to free uc_va during cleanup");
         }
     }
 
     // Release memory handle if it was created
     if (mem_handle != 0) {
         if (CUDADRV_FUNC(cuMemRelease(mem_handle)) != UCC_OK) {
-            tl_error(
-                UCC_TL_TEAM_LIB(team),
-                "failed to release mem_handle during cleanup");
+            tl_error(UCC_TL_TEAM_LIB(team),
+                     "failed to release mem_handle during cleanup");
         }
     }
 
     // Release multicast handle if it was created or imported
     if (nvls->mc_handle != 0) {
         if (CUDADRV_FUNC(cuMemRelease(nvls->mc_handle)) != UCC_OK) {
-            tl_error(
-                UCC_TL_TEAM_LIB(team),
-                "failed to release mc_handle during cleanup");
+            tl_error(UCC_TL_TEAM_LIB(team),
+                     "failed to release mc_handle during cleanup");
         }
         nvls->mc_handle = 0;
     }
@@ -742,5 +819,9 @@ ucc_status_t ucc_tl_cuda_nvls_destroy(ucc_tl_cuda_team_t *team)
         ucc_free(team->nvls.share_data);
         team->nvls.share_data = NULL;
     }
+    if (team->nvls.barrier_data) {
+        ucc_free(team->nvls.barrier_data);
+        team->nvls.barrier_data = NULL;
+    }
     return UCC_OK;
 }
diff --git a/src/components/tl/cuda/tl_cuda_nvls.h b/src/components/tl/cuda/tl_cuda_nvls.h
@@ -40,6 +40,7 @@ typedef enum {
     UCC_TL_CUDA_NVLS_STATE_SHARE_HANDLES,
     UCC_TL_CUDA_NVLS_STATE_IMPORT_HANDLE,
     UCC_TL_CUDA_NVLS_STATE_ADD_DEVICE,
+    UCC_TL_CUDA_NVLS_STATE_BARRIER,
 } ucc_tl_cuda_nvls_state_t;
 
 typedef struct ucc_tl_cuda_nvls {
@@ -71,6 +72,8 @@ typedef struct ucc_tl_cuda_nvls {
     size_t                       minGran;
     // Granularity
     size_t                       gran;
+    // Temporary buffer for final OOB barrier (STATE_BARRIER)
+    char                        *barrier_data;
 } ucc_tl_cuda_nvls_t;
 
 typedef struct ucc_tl_cuda_nvls_control {