Skip to content

Commit 2551467

Browse files
committed
TL/CUDA: fix control region init and add OOB barrier for NVLS
Two critical correctness fixes in ucc_tl_cuda_nvls_init STATE_ADD_DEVICE:

1. All ranks must zero-initialise their control region via the UC VA BEFORE cuMulticastAddDevice, not only rank 0 after cuMulticastBindAddr. Each rank's uc_va maps its OWN physical pages, so only that rank can zero its arrival_counter. Previously, ranks 1..N-1 started with uninitialised counters, causing incorrect allreduce results. The fix exploits the fact that cuMulticastBindAddr blocks until all ranks have called cuMulticastAddDevice, so the control-region memset (done BEFORE addDevice) is guaranteed to be complete on every rank when the bind returns.

2. Add UCC_TL_CUDA_NVLS_STATE_BARRIER: a final OOB allgather barrier after the CUDA setup is complete. This ensures all ranks have finished NVLS initialisation (including any async fabric operations) before any collective uses the team.

Also reorder STATE_ADD_DEVICE to: allocate physical memory → map UC VA → init control regions → addDevice → bind → map MC VA. Store NVLS handles only after full success to keep nvls_destroy idempotent and avoid double-free on init failure.
1 parent 78a8d13 commit 2551467

File tree

2 files changed

+146
-62
lines changed

2 files changed

+146
-62
lines changed

src/components/tl/cuda/tl_cuda_nvls.c

Lines changed: 143 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -512,19 +512,19 @@ ucc_status_t ucc_tl_cuda_nvls_init(
512512
// fall through
513513
case UCC_TL_CUDA_NVLS_STATE_ADD_DEVICE:
514514
{
515-
// Add device to multicast object
516-
status = CUDADRV_FUNC(
517-
cuMulticastAddDevice(nvls->mc_handle, nvls->device));
518-
if (status != UCC_OK) {
519-
tl_error(
520-
UCC_TL_TEAM_LIB(team), "failed to add device to multicast");
521-
goto cleanup;
522-
}
523-
tl_debug(
524-
UCC_TL_TEAM_LIB(team),
525-
"RANK %d: added device %d to multicast\n",
526-
UCC_TL_TEAM_RANK(team),
527-
nvls->device);
515+
/* Correct ordering for NVLS memory setup:
516+
*
517+
* 1. Allocate physical memory and map UC VA first.
518+
* 2. Zero-initialise the control region on UC VA on ALL ranks BEFORE
519+
* calling cuMulticastAddDevice. cuMulticastBindAddr blocks until
520+
* every rank has called cuMulticastAddDevice, so when it returns
521+
* ALL ranks have already zeroed their control regions — no extra
522+
* barrier needed to protect the counters.
523+
* 3. Add device to multicast → bind (collective barrier point) → map MC VA.
524+
*
525+
* Doing the memset only on rank 0 (or after bind) leaves arrival_counter
526+
* uninitialised on non-root ranks which causes incorrect collective results.
527+
*/
528528

529529
// Allocate physical memory
530530
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
@@ -537,9 +537,8 @@ ucc_status_t ucc_tl_cuda_nvls_init(
537537
mc_size = nvls->mc_size;
538538
status = CUDADRV_FUNC(cuMemCreate(&mem_handle, mc_size, &prop, 0));
539539
if (status != UCC_OK) {
540-
tl_error(
541-
UCC_TL_TEAM_LIB(team),
542-
"failed to create memory allocation for multicast");
540+
tl_error(UCC_TL_TEAM_LIB(team),
541+
"failed to create memory allocation for multicast");
543542
goto cleanup;
544543
}
545544

@@ -549,12 +548,11 @@ ucc_status_t ucc_tl_cuda_nvls_init(
549548
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
550549

551550
// Reserve and map unicast virtual address space
552-
status = CUDADRV_FUNC(
551+
status = CUDADRV_FUNC(
553552
cuMemAddressReserve(&uc_va, mc_size, nvls->minGran, 0U, 0));
554553
if (status != UCC_OK) {
555-
tl_error(
556-
UCC_TL_TEAM_LIB(team),
557-
"failed to reserve virtual address space");
554+
tl_error(UCC_TL_TEAM_LIB(team),
555+
"failed to reserve virtual address space");
558556
goto cleanup;
559557
}
560558

@@ -570,22 +568,67 @@ ucc_status_t ucc_tl_cuda_nvls_init(
570568
goto cleanup;
571569
}
572570

571+
// Allocate coll_ids
572+
nvls->coll_ids = (size_t *)ucc_malloc(
573+
lib->cfg.max_concurrent * sizeof(size_t), "coll_ids");
574+
if (!nvls->coll_ids) {
575+
status = UCC_ERR_NO_MEMORY;
576+
goto cleanup;
577+
}
578+
// Initialize the coll_ids to 0
579+
memset(nvls->coll_ids, 0, lib->cfg.max_concurrent * sizeof(size_t));
580+
581+
/* ALL ranks zero-initialise their own control region via UC VA
582+
* BEFORE cuMulticastAddDevice. cuMulticastBindAddr acts as a
583+
* collective barrier: when it returns every rank has already
584+
* completed its memset, so arrival_counter starts at 0 everywhere. */
585+
// Initialize control regions on uc_va BEFORE adding device to multicast.
586+
// This is critical: cuMulticastBindAddr blocks until all devices are added,
587+
// so by doing memset BEFORE cuMulticastAddDevice, we guarantee that when
588+
// cuMulticastBindAddr unblocks, ALL ranks have initialized their control regions.
589+
tl_debug(UCC_TL_TEAM_LIB(team),
590+
"NVLS init: rank %d initialising control regions "
591+
"uc_va=%p symm_size=%zu ctrl_size=%d slots=%u",
592+
UCC_TL_TEAM_RANK(team), (void *)uc_va,
593+
lib->cfg.nvls_symmetric_size, NVLS_CONTROL_SIZE,
594+
lib->cfg.max_concurrent);
595+
CUDA_CHECK(cudaMemset2D(
596+
(void *)(uc_va + lib->cfg.nvls_symmetric_size),
597+
lib->cfg.nvls_symmetric_size + NVLS_CONTROL_SIZE,
598+
0,
599+
NVLS_CONTROL_SIZE,
600+
lib->cfg.max_concurrent));
601+
602+
// Add device to multicast object
603+
status = CUDADRV_FUNC(
604+
cuMulticastAddDevice(nvls->mc_handle, nvls->device));
605+
if (status != UCC_OK) {
606+
tl_error(UCC_TL_TEAM_LIB(team),
607+
"failed to add device to multicast");
608+
goto cleanup;
609+
}
610+
tl_debug(UCC_TL_TEAM_LIB(team),
611+
"RANK %d: added device %d to multicast",
612+
UCC_TL_TEAM_RANK(team), nvls->device);
613+
573614
// Bind memory to multicast object
615+
// This BLOCKS until all devices have called cuMulticastAddDevice.
616+
// Since we initialized control regions BEFORE cuMulticastAddDevice,
617+
// when this returns, ALL ranks have completed their control region init.
574618
status = CUDADRV_FUNC(cuMulticastBindAddr(
575619
nvls->mc_handle, 0 /*mcOffset*/, uc_va, mc_size, 0));
576620
if (status != UCC_OK) {
577-
tl_error(
578-
UCC_TL_TEAM_LIB(team), "failed to bind memory to multicast");
621+
tl_error(UCC_TL_TEAM_LIB(team),
622+
"failed to bind memory to multicast");
579623
goto cleanup;
580624
}
581625

582626
// Reserve and map multicast virtual address space
583627
status = CUDADRV_FUNC(
584628
cuMemAddressReserve(&mc_va, mc_size, nvls->minGran, 0U, 0));
585629
if (status != UCC_OK) {
586-
tl_error(
587-
UCC_TL_TEAM_LIB(team),
588-
"failed to reserve multicast virtual address space");
630+
tl_error(UCC_TL_TEAM_LIB(team),
631+
"failed to reserve multicast virtual address space");
589632
goto cleanup;
590633
}
591634

@@ -597,42 +640,74 @@ ucc_status_t ucc_tl_cuda_nvls_init(
597640

598641
status = CUDADRV_FUNC(cuMemSetAccess(mc_va, mc_size, &accessDesc, 1));
599642
if (status != UCC_OK) {
600-
tl_error(
601-
UCC_TL_TEAM_LIB(team), "failed to set multicast memory access");
643+
tl_error(UCC_TL_TEAM_LIB(team),
644+
"failed to set multicast memory access");
602645
goto cleanup;
603646
}
604647

605-
tl_debug(
606-
UCC_TL_TEAM_LIB(team),
607-
"Rank: %d symmetric memory is set: %p [%ld bytes]\n",
608-
UCC_TL_TEAM_RANK(team),
609-
(void *)mc_va,
610-
mc_size);
648+
tl_debug(UCC_TL_TEAM_LIB(team),
649+
"Rank: %d symmetric memory is set: %p [%ld bytes]",
650+
UCC_TL_TEAM_RANK(team), (void *)mc_va, mc_size);
611651

612652
// Store the handles for cleanup in team destroy
613653
nvls->mc_va = mc_va;
614654
nvls->uc_va = uc_va;
615655
nvls->mc_memhandle = mem_handle;
616656
nvls->mc_offset = 0; // mcOffset;
617-
nvls->coll_ids = (size_t *)ucc_malloc(
618-
lib->cfg.max_concurrent * sizeof(size_t), "coll_ids");
619-
if (!nvls->coll_ids) {
620-
status = UCC_ERR_NO_MEMORY;
621-
goto cleanup;
657+
658+
team->state = UCC_TL_CUDA_NVLS_STATE_BARRIER;
659+
// fall through
660+
}
661+
case UCC_TL_CUDA_NVLS_STATE_BARRIER:
662+
{
663+
/* Final OOB barrier — ensures all ranks have completed the NVLS
664+
* memory setup (including any async fabric operations) before any
665+
* collective can use the team. */
666+
if (nvls->barrier_data == NULL) {
667+
nvls->barrier_data = (char *)ucc_malloc(
668+
UCC_TL_TEAM_SIZE(team), "nvls_barrier");
669+
if (!nvls->barrier_data) {
670+
status = UCC_ERR_NO_MEMORY;
671+
goto cleanup;
672+
}
673+
nvls->barrier_data[UCC_TL_TEAM_RANK(team)] = 1;
622674
}
623-
// Initialize the coll_ids to 0
624-
memset(nvls->coll_ids, 0, lib->cfg.max_concurrent * sizeof(size_t));
625675

626-
if (UCC_TL_TEAM_RANK(team) == 0) {
627-
// root rank zero-initializes the control region for each task slot
628-
CUDA_CHECK(cudaMemset2D(
629-
(void *)(uc_va + lib->cfg.nvls_symmetric_size),
630-
lib->cfg.nvls_symmetric_size + NVLS_CONTROL_SIZE,
631-
0,
632-
NVLS_CONTROL_SIZE,
633-
lib->cfg.max_concurrent));
676+
if (team->oob_req == NULL) {
677+
status = team->oob.allgather(
678+
&nvls->barrier_data[UCC_TL_TEAM_RANK(team)],
679+
nvls->barrier_data,
680+
1,
681+
team->oob.coll_info,
682+
&team->oob_req);
683+
if (status != UCC_OK) {
684+
tl_error(UCC_TL_TEAM_LIB(team),
685+
"failed to initiate NVLS barrier");
686+
ucc_free(nvls->barrier_data);
687+
nvls->barrier_data = NULL;
688+
goto cleanup;
689+
}
634690
}
635691

692+
status = team->oob.req_test(team->oob_req);
693+
if (status > 0) {
694+
return UCC_INPROGRESS;
695+
}
696+
if (status < 0) {
697+
tl_error(UCC_TL_TEAM_LIB(team), "NVLS barrier failed");
698+
ucc_free(nvls->barrier_data);
699+
nvls->barrier_data = NULL;
700+
goto cleanup;
701+
}
702+
703+
team->oob.req_free(team->oob_req);
704+
team->oob_req = NULL;
705+
ucc_free(nvls->barrier_data);
706+
nvls->barrier_data = NULL;
707+
708+
tl_debug(UCC_TL_TEAM_LIB(team),
709+
"NVLS init: rank %d OOB barrier complete — team ready",
710+
UCC_TL_TEAM_RANK(team));
636711
break;
637712
}
638713
default:
@@ -647,47 +722,49 @@ ucc_status_t ucc_tl_cuda_nvls_init(
647722
ucc_free(nvls->share_data);
648723
nvls->share_data = NULL;
649724
}
725+
if (nvls->barrier_data) {
726+
ucc_free(nvls->barrier_data);
727+
nvls->barrier_data = NULL;
728+
}
650729

651730
// Clean up CUDA resources - check local variables for partial allocations
652731
// Unmap and free multicast VA if it was reserved/mapped
653732
if (mc_va != 0) {
654733
if (CUDADRV_FUNC(cuMemUnmap(mc_va, mc_size)) != UCC_OK) {
655-
tl_error(
656-
UCC_TL_TEAM_LIB(team), "failed to unmap mc_va during cleanup");
734+
tl_error(UCC_TL_TEAM_LIB(team),
735+
"failed to unmap mc_va during cleanup");
657736
}
658737
if (CUDADRV_FUNC(cuMemAddressFree(mc_va, mc_size)) != UCC_OK) {
659-
tl_error(
660-
UCC_TL_TEAM_LIB(team), "failed to free mc_va during cleanup");
738+
tl_error(UCC_TL_TEAM_LIB(team),
739+
"failed to free mc_va during cleanup");
661740
}
662741
}
663742

664743
// Unmap and free unicast VA if it was reserved/mapped
665744
if (uc_va != 0) {
666745
if (CUDADRV_FUNC(cuMemUnmap(uc_va, mc_size)) != UCC_OK) {
667-
tl_error(
668-
UCC_TL_TEAM_LIB(team), "failed to unmap uc_va during cleanup");
746+
tl_error(UCC_TL_TEAM_LIB(team),
747+
"failed to unmap uc_va during cleanup");
669748
}
670749
if (CUDADRV_FUNC(cuMemAddressFree(uc_va, mc_size)) != UCC_OK) {
671-
tl_error(
672-
UCC_TL_TEAM_LIB(team), "failed to free uc_va during cleanup");
750+
tl_error(UCC_TL_TEAM_LIB(team),
751+
"failed to free uc_va during cleanup");
673752
}
674753
}
675754

676755
// Release memory handle if it was created
677756
if (mem_handle != 0) {
678757
if (CUDADRV_FUNC(cuMemRelease(mem_handle)) != UCC_OK) {
679-
tl_error(
680-
UCC_TL_TEAM_LIB(team),
681-
"failed to release mem_handle during cleanup");
758+
tl_error(UCC_TL_TEAM_LIB(team),
759+
"failed to release mem_handle during cleanup");
682760
}
683761
}
684762

685763
// Release multicast handle if it was created or imported
686764
if (nvls->mc_handle != 0) {
687765
if (CUDADRV_FUNC(cuMemRelease(nvls->mc_handle)) != UCC_OK) {
688-
tl_error(
689-
UCC_TL_TEAM_LIB(team),
690-
"failed to release mc_handle during cleanup");
766+
tl_error(UCC_TL_TEAM_LIB(team),
767+
"failed to release mc_handle during cleanup");
691768
}
692769
nvls->mc_handle = 0;
693770
}
@@ -742,5 +819,9 @@ ucc_status_t ucc_tl_cuda_nvls_destroy(ucc_tl_cuda_team_t *team)
742819
ucc_free(team->nvls.share_data);
743820
team->nvls.share_data = NULL;
744821
}
822+
if (team->nvls.barrier_data) {
823+
ucc_free(team->nvls.barrier_data);
824+
team->nvls.barrier_data = NULL;
825+
}
745826
return UCC_OK;
746827
}

src/components/tl/cuda/tl_cuda_nvls.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ typedef enum {
4040
UCC_TL_CUDA_NVLS_STATE_SHARE_HANDLES,
4141
UCC_TL_CUDA_NVLS_STATE_IMPORT_HANDLE,
4242
UCC_TL_CUDA_NVLS_STATE_ADD_DEVICE,
43+
UCC_TL_CUDA_NVLS_STATE_BARRIER,
4344
} ucc_tl_cuda_nvls_state_t;
4445

4546
typedef struct ucc_tl_cuda_nvls {
@@ -71,6 +72,8 @@ typedef struct ucc_tl_cuda_nvls {
7172
size_t minGran;
7273
// Granularity
7374
size_t gran;
75+
// Temporary buffer for final OOB barrier (STATE_BARRIER)
76+
char *barrier_data;
7477
} ucc_tl_cuda_nvls_t;
7578

7679
typedef struct ucc_tl_cuda_nvls_control {

0 commit comments

Comments
 (0)