@@ -512,19 +512,19 @@ ucc_status_t ucc_tl_cuda_nvls_init(
512512 // fall through
513513 case UCC_TL_CUDA_NVLS_STATE_ADD_DEVICE :
514514 {
515- // Add device to multicast object
516- status = CUDADRV_FUNC (
517- cuMulticastAddDevice ( nvls -> mc_handle , nvls -> device ));
518- if ( status != UCC_OK ) {
519- tl_error (
520- UCC_TL_TEAM_LIB ( team ), "failed to add device to multicast" );
521- goto cleanup ;
522- }
523- tl_debug (
524- UCC_TL_TEAM_LIB ( team ),
525- "RANK %d: added device %d to multicast\n" ,
526- UCC_TL_TEAM_RANK ( team ),
527- nvls -> device );
515+ /* Correct ordering for NVLS memory setup:
516+ *
517+ * 1. Allocate physical memory and map UC VA first.
518+ * 2. Zero-initialise the control region on UC VA on ALL ranks BEFORE
519+ * calling cuMulticastAddDevice. cuMulticastBindAddr blocks until
520+ * every rank has called cuMulticastAddDevice, so when it returns
521+ * ALL ranks have already zeroed their control regions — no extra
522+ * barrier needed to protect the counters.
523+ * 3. Add device to multicast → bind (collective barrier point) → map MC VA.
524+ *
525+ * Doing the memset only on rank 0 (or after bind) leaves arrival_counter
526+ * uninitialised on non-root ranks which causes incorrect collective results.
527+ */
528528
529529 // Allocate physical memory
530530 prop .type = CU_MEM_ALLOCATION_TYPE_PINNED ;
@@ -537,9 +537,8 @@ ucc_status_t ucc_tl_cuda_nvls_init(
537537 mc_size = nvls -> mc_size ;
538538 status = CUDADRV_FUNC (cuMemCreate (& mem_handle , mc_size , & prop , 0 ));
539539 if (status != UCC_OK ) {
540- tl_error (
541- UCC_TL_TEAM_LIB (team ),
542- "failed to create memory allocation for multicast" );
540+ tl_error (UCC_TL_TEAM_LIB (team ),
541+ "failed to create memory allocation for multicast" );
543542 goto cleanup ;
544543 }
545544
@@ -549,12 +548,11 @@ ucc_status_t ucc_tl_cuda_nvls_init(
549548 accessDesc .flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE ;
550549
551550 // Reserve and map unicast virtual address space
552- status = CUDADRV_FUNC (
551+ status = CUDADRV_FUNC (
553552 cuMemAddressReserve (& uc_va , mc_size , nvls -> minGran , 0U , 0 ));
554553 if (status != UCC_OK ) {
555- tl_error (
556- UCC_TL_TEAM_LIB (team ),
557- "failed to reserve virtual address space" );
554+ tl_error (UCC_TL_TEAM_LIB (team ),
555+ "failed to reserve virtual address space" );
558556 goto cleanup ;
559557 }
560558
@@ -570,22 +568,67 @@ ucc_status_t ucc_tl_cuda_nvls_init(
570568 goto cleanup ;
571569 }
572570
571+ // Allocate coll_ids
572+ nvls -> coll_ids = (size_t * )ucc_malloc (
573+ lib -> cfg .max_concurrent * sizeof (size_t ), "coll_ids" );
574+ if (!nvls -> coll_ids ) {
575+ status = UCC_ERR_NO_MEMORY ;
576+ goto cleanup ;
577+ }
578+ // Initialize the coll_ids to 0
579+ memset (nvls -> coll_ids , 0 , lib -> cfg .max_concurrent * sizeof (size_t ));
580+
581+ /* ALL ranks zero-initialise their own control region via UC VA
582+ * BEFORE cuMulticastAddDevice. cuMulticastBindAddr acts as a
583+ * collective barrier: when it returns every rank has already
584+ * completed its memset, so arrival_counter starts at 0 everywhere. */
585+ // Initialize control regions on uc_va BEFORE adding device to multicast.
586+ // This is critical: cuMulticastBindAddr blocks until all devices are added,
587+ // so by doing memset BEFORE cuMulticastAddDevice, we guarantee that when
588+ // cuMulticastBindAddr unblocks, ALL ranks have initialized their control regions.
589+ tl_debug (UCC_TL_TEAM_LIB (team ),
590+ "NVLS init: rank %d initialising control regions "
591+ "uc_va=%p symm_size=%zu ctrl_size=%d slots=%u" ,
592+ UCC_TL_TEAM_RANK (team ), (void * )uc_va ,
593+ lib -> cfg .nvls_symmetric_size , NVLS_CONTROL_SIZE ,
594+ lib -> cfg .max_concurrent );
595+ CUDA_CHECK (cudaMemset2D (
596+ (void * )(uc_va + lib -> cfg .nvls_symmetric_size ),
597+ lib -> cfg .nvls_symmetric_size + NVLS_CONTROL_SIZE ,
598+ 0 ,
599+ NVLS_CONTROL_SIZE ,
600+ lib -> cfg .max_concurrent ));
601+
602+ // Add device to multicast object
603+ status = CUDADRV_FUNC (
604+ cuMulticastAddDevice (nvls -> mc_handle , nvls -> device ));
605+ if (status != UCC_OK ) {
606+ tl_error (UCC_TL_TEAM_LIB (team ),
607+ "failed to add device to multicast" );
608+ goto cleanup ;
609+ }
610+ tl_debug (UCC_TL_TEAM_LIB (team ),
611+ "RANK %d: added device %d to multicast" ,
612+ UCC_TL_TEAM_RANK (team ), nvls -> device );
613+
573614 // Bind memory to multicast object
615+ // This BLOCKS until all devices have called cuMulticastAddDevice.
616+ // Since we initialized control regions BEFORE cuMulticastAddDevice,
617+ // when this returns, ALL ranks have completed their control region init.
574618 status = CUDADRV_FUNC (cuMulticastBindAddr (
575619 nvls -> mc_handle , 0 /*mcOffset*/ , uc_va , mc_size , 0 ));
576620 if (status != UCC_OK ) {
577- tl_error (
578- UCC_TL_TEAM_LIB ( team ), "failed to bind memory to multicast" );
621+ tl_error (UCC_TL_TEAM_LIB ( team ),
622+ "failed to bind memory to multicast" );
579623 goto cleanup ;
580624 }
581625
582626 // Reserve and map multicast virtual address space
583627 status = CUDADRV_FUNC (
584628 cuMemAddressReserve (& mc_va , mc_size , nvls -> minGran , 0U , 0 ));
585629 if (status != UCC_OK ) {
586- tl_error (
587- UCC_TL_TEAM_LIB (team ),
588- "failed to reserve multicast virtual address space" );
630+ tl_error (UCC_TL_TEAM_LIB (team ),
631+ "failed to reserve multicast virtual address space" );
589632 goto cleanup ;
590633 }
591634
@@ -597,42 +640,74 @@ ucc_status_t ucc_tl_cuda_nvls_init(
597640
598641 status = CUDADRV_FUNC (cuMemSetAccess (mc_va , mc_size , & accessDesc , 1 ));
599642 if (status != UCC_OK ) {
600- tl_error (
601- UCC_TL_TEAM_LIB ( team ), "failed to set multicast memory access" );
643+ tl_error (UCC_TL_TEAM_LIB ( team ),
644+ "failed to set multicast memory access" );
602645 goto cleanup ;
603646 }
604647
605- tl_debug (
606- UCC_TL_TEAM_LIB (team ),
607- "Rank: %d symmetric memory is set: %p [%ld bytes]\n" ,
608- UCC_TL_TEAM_RANK (team ),
609- (void * )mc_va ,
610- mc_size );
648+ tl_debug (UCC_TL_TEAM_LIB (team ),
649+ "Rank: %d symmetric memory is set: %p [%ld bytes]" ,
650+ UCC_TL_TEAM_RANK (team ), (void * )mc_va , mc_size );
611651
612652 // Store the handles for cleanup in team destroy
613653 nvls -> mc_va = mc_va ;
614654 nvls -> uc_va = uc_va ;
615655 nvls -> mc_memhandle = mem_handle ;
616656 nvls -> mc_offset = 0 ; // mcOffset;
617- nvls -> coll_ids = (size_t * )ucc_malloc (
618- lib -> cfg .max_concurrent * sizeof (size_t ), "coll_ids" );
619- if (!nvls -> coll_ids ) {
620- status = UCC_ERR_NO_MEMORY ;
621- goto cleanup ;
657+
658+ team -> state = UCC_TL_CUDA_NVLS_STATE_BARRIER ;
659+ // fall through
660+ }
661+ case UCC_TL_CUDA_NVLS_STATE_BARRIER :
662+ {
663+ /* Final OOB barrier — ensures all ranks have completed the NVLS
664+ * memory setup (including any async fabric operations) before any
665+ * collective can use the team. */
666+ if (nvls -> barrier_data == NULL ) {
667+ nvls -> barrier_data = (char * )ucc_malloc (
668+ UCC_TL_TEAM_SIZE (team ), "nvls_barrier" );
669+ if (!nvls -> barrier_data ) {
670+ status = UCC_ERR_NO_MEMORY ;
671+ goto cleanup ;
672+ }
673+ nvls -> barrier_data [UCC_TL_TEAM_RANK (team )] = 1 ;
622674 }
623- // Initialize the coll_ids to 0
624- memset (nvls -> coll_ids , 0 , lib -> cfg .max_concurrent * sizeof (size_t ));
625675
626- if (UCC_TL_TEAM_RANK (team ) == 0 ) {
627- // root rank zero-initializes the control region for each task slot
628- CUDA_CHECK (cudaMemset2D (
629- (void * )(uc_va + lib -> cfg .nvls_symmetric_size ),
630- lib -> cfg .nvls_symmetric_size + NVLS_CONTROL_SIZE ,
631- 0 ,
632- NVLS_CONTROL_SIZE ,
633- lib -> cfg .max_concurrent ));
676+ if (team -> oob_req == NULL ) {
677+ status = team -> oob .allgather (
678+ & nvls -> barrier_data [UCC_TL_TEAM_RANK (team )],
679+ nvls -> barrier_data ,
680+ 1 ,
681+ team -> oob .coll_info ,
682+ & team -> oob_req );
683+ if (status != UCC_OK ) {
684+ tl_error (UCC_TL_TEAM_LIB (team ),
685+ "failed to initiate NVLS barrier" );
686+ ucc_free (nvls -> barrier_data );
687+ nvls -> barrier_data = NULL ;
688+ goto cleanup ;
689+ }
634690 }
635691
692+ status = team -> oob .req_test (team -> oob_req );
693+ if (status > 0 ) {
694+ return UCC_INPROGRESS ;
695+ }
696+ if (status < 0 ) {
697+ tl_error (UCC_TL_TEAM_LIB (team ), "NVLS barrier failed" );
698+ ucc_free (nvls -> barrier_data );
699+ nvls -> barrier_data = NULL ;
700+ goto cleanup ;
701+ }
702+
703+ team -> oob .req_free (team -> oob_req );
704+ team -> oob_req = NULL ;
705+ ucc_free (nvls -> barrier_data );
706+ nvls -> barrier_data = NULL ;
707+
708+ tl_debug (UCC_TL_TEAM_LIB (team ),
709+ "NVLS init: rank %d OOB barrier complete — team ready" ,
710+ UCC_TL_TEAM_RANK (team ));
636711 break ;
637712 }
638713 default :
@@ -647,47 +722,49 @@ ucc_status_t ucc_tl_cuda_nvls_init(
647722 ucc_free (nvls -> share_data );
648723 nvls -> share_data = NULL ;
649724 }
725+ if (nvls -> barrier_data ) {
726+ ucc_free (nvls -> barrier_data );
727+ nvls -> barrier_data = NULL ;
728+ }
650729
651730 // Clean up CUDA resources - check local variables for partial allocations
652731 // Unmap and free multicast VA if it was reserved/mapped
653732 if (mc_va != 0 ) {
654733 if (CUDADRV_FUNC (cuMemUnmap (mc_va , mc_size )) != UCC_OK ) {
655- tl_error (
656- UCC_TL_TEAM_LIB ( team ), "failed to unmap mc_va during cleanup" );
734+ tl_error (UCC_TL_TEAM_LIB ( team ),
735+ "failed to unmap mc_va during cleanup" );
657736 }
658737 if (CUDADRV_FUNC (cuMemAddressFree (mc_va , mc_size )) != UCC_OK ) {
659- tl_error (
660- UCC_TL_TEAM_LIB ( team ), "failed to free mc_va during cleanup" );
738+ tl_error (UCC_TL_TEAM_LIB ( team ),
739+ "failed to free mc_va during cleanup" );
661740 }
662741 }
663742
664743 // Unmap and free unicast VA if it was reserved/mapped
665744 if (uc_va != 0 ) {
666745 if (CUDADRV_FUNC (cuMemUnmap (uc_va , mc_size )) != UCC_OK ) {
667- tl_error (
668- UCC_TL_TEAM_LIB ( team ), "failed to unmap uc_va during cleanup" );
746+ tl_error (UCC_TL_TEAM_LIB ( team ),
747+ "failed to unmap uc_va during cleanup" );
669748 }
670749 if (CUDADRV_FUNC (cuMemAddressFree (uc_va , mc_size )) != UCC_OK ) {
671- tl_error (
672- UCC_TL_TEAM_LIB ( team ), "failed to free uc_va during cleanup" );
750+ tl_error (UCC_TL_TEAM_LIB ( team ),
751+ "failed to free uc_va during cleanup" );
673752 }
674753 }
675754
676755 // Release memory handle if it was created
677756 if (mem_handle != 0 ) {
678757 if (CUDADRV_FUNC (cuMemRelease (mem_handle )) != UCC_OK ) {
679- tl_error (
680- UCC_TL_TEAM_LIB (team ),
681- "failed to release mem_handle during cleanup" );
758+ tl_error (UCC_TL_TEAM_LIB (team ),
759+ "failed to release mem_handle during cleanup" );
682760 }
683761 }
684762
685763 // Release multicast handle if it was created or imported
686764 if (nvls -> mc_handle != 0 ) {
687765 if (CUDADRV_FUNC (cuMemRelease (nvls -> mc_handle )) != UCC_OK ) {
688- tl_error (
689- UCC_TL_TEAM_LIB (team ),
690- "failed to release mc_handle during cleanup" );
766+ tl_error (UCC_TL_TEAM_LIB (team ),
767+ "failed to release mc_handle during cleanup" );
691768 }
692769 nvls -> mc_handle = 0 ;
693770 }
@@ -742,5 +819,9 @@ ucc_status_t ucc_tl_cuda_nvls_destroy(ucc_tl_cuda_team_t *team)
742819 ucc_free (team -> nvls .share_data );
743820 team -> nvls .share_data = NULL ;
744821 }
822+ if (team -> nvls .barrier_data ) {
823+ ucc_free (team -> nvls .barrier_data );
824+ team -> nvls .barrier_data = NULL ;
825+ }
745826 return UCC_OK ;
746827}
0 commit comments