Skip to content

Commit 7deed23

Browse files
committed
TL/CUDA: raise MAX_NVLS_PEERS to 576 and guard nvls_init
GB200/NVL systems support up to 72 GPUs per NVSwitch domain with up to 8 NVLink partitions per domain, yielding a theoretical maximum of 576 participants in a single NVLS multicast group. Raise UCC_TL_CUDA_MAX_NVLS_PEERS from 144 to 72 * 8 = 576 to accommodate these configurations. Also add a guard in the STATE_INIT path of nvls_init that rejects teams exceeding this limit: it logs a clear warning and returns UCC_ERR_NOT_SUPPORTED, preventing out-of-bounds accesses in the NVLS allgather buffer.
1 parent e60cc40 commit 7deed23

File tree

2 files changed

+10
-1
lines changed

2 files changed

+10
-1
lines changed

src/components/tl/cuda/tl_cuda.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@
3232
#define UCC_TL_CUDA_MAX_RING_CHUNKS 8
3333

3434
#ifdef HAVE_NVLS
35-
#define UCC_TL_CUDA_MAX_NVLS_PEERS 144
35+
/* 72 GPUs per NVSwitch domain * 8 partitions = 576 max NVLS peers */
36+
#define UCC_TL_CUDA_MAX_NVLS_PEERS (72 * 8)
3637
#define UCC_TL_CUDA_MAX_NVLS_SM_COUNT 32
3738
#define UCC_TL_CUDA_MAX_NVLS_THREADS 1024
3839

src/components/tl/cuda/tl_cuda_nvls.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,14 @@ ucc_status_t ucc_tl_cuda_nvls_init(
329329
return status;
330330
}
331331

332+
if (UCC_TL_TEAM_SIZE(team) > UCC_TL_CUDA_MAX_NVLS_PEERS) {
333+
tl_warn(lib,
334+
"NVLS: team size %u exceeds maximum supported peers %d; "
335+
"disabling NVLS for this team",
336+
UCC_TL_TEAM_SIZE(team), UCC_TL_CUDA_MAX_NVLS_PEERS);
337+
return UCC_ERR_NOT_SUPPORTED;
338+
}
339+
332340
if (nvls->is_multinode) {
333341
ucc_team_t *ucc_team = UCC_TL_CORE_TEAM(team);
334342
ucc_rank_t min_ppn, max_ppn;

0 commit comments

Comments
 (0)