Skip to content

Commit 78a8d13

Browse files
committed
TL/CUDA: use nvmlDeviceGetGpuFabricInfoV for GB200+ NVL partition support
GB200 NVLink systems introduce NVL partitions (sub-fabric cliques): multiple logical NVLink domains may share the same cliqueId but have different partitionIds, and only GPUs within the same partition can form a multicast group. The older nvmlDeviceGetGpuFabricInfo API (v1) does not expose partitionId. Changes: - config/m4/cuda.m4: add AC_CHECK_DECL for nvmlDeviceGetGpuFabricInfoV to define HAVE_NVML_GPU_FABRIC_INFO_V when the versioned API is available (NVML r525+). - utils/ucc_proc_info.h: add fabric_partition_id field to ucc_gpu_info_t. - topo/cuda/ucc_sysinfo_cuda.c: use nvmlDeviceGetGpuFabricInfoV when HAVE_NVML_GPU_FABRIC_INFO_V is defined, populating partitionId; fall back to v1 (partitionId=0) when the new API is unavailable. Add debug-level logging so admins can diagnose fabric detection. - topo/ucc_topo.c: ucc_topo_is_single_nvlink_domain() now also checks that all ranks share the same non-zero partitionId when the v2 API populated it; a partitionId of 0 skips the partition check (v1 compat). Add per-rank debug messages for each failure case. - tl/cuda/tl_cuda_nvls.c: expand the NVLS domain warning to mention partition mismatch and direct users to DEBUG logging for details.
1 parent 4e47506 commit 78a8d13

File tree

5 files changed

+83
-12
lines changed

5 files changed

+83
-12
lines changed

config/m4/cuda.m4

Lines changed: 6 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -85,6 +85,12 @@ AS_IF([test "x$cuda_checked" != "xyes"],
8585
[AC_CHECK_LIB([nvidia-ml], [nvmlInit_v2],
8686
[NVML_LIBS="-lnvidia-ml"],
8787
[nvml_happy="no"])])
88+
AS_IF([test "x$cuda_happy" = "xyes" -a "x$nvml_happy" = "xyes"],
89+
[AC_CHECK_DECL([nvmlDeviceGetGpuFabricInfoV],
90+
[AC_DEFINE([HAVE_NVML_GPU_FABRIC_INFO_V], 1,
91+
["nvmlDeviceGetGpuFabricInfoV (versioned) is available"])],
92+
[],
93+
[[#include <nvml.h>]])])
8894
AS_IF([test "x$cuda_happy" = "xyes" -a "x$nvml_happy" = "xyes"],
8995
[AC_CHECK_DECL([nvmlDeviceGetNvLinkRemoteDeviceType],
9096
[AC_CHECK_LIB([nvidia-ml], [nvmlDeviceGetNvLinkRemoteDeviceType],

src/components/tl/cuda/tl_cuda_nvls.c

Lines changed: 5 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -343,8 +343,11 @@ ucc_status_t ucc_tl_cuda_nvls_init(
343343

344344
if (!ucc_topo_is_single_nvlink_domain(ucc_team->topo)) {
345345
tl_warn(lib,
346-
"NVLS: team spans multiple NVLink fabric domains or "
347-
"fabric info is unavailable; disabling multinode NVLS");
346+
"NVLS: multinode team does not share a single NVLink "
347+
"fabric domain (mismatched clique IDs, different NVL "
348+
"partitions, or fabric info unavailable); "
349+
"run with UCC_LOG_LEVEL=DEBUG for details; "
350+
"disabling multinode NVLS");
348351
return UCC_ERR_NOT_SUPPORTED;
349352
}
350353

src/components/topo/cuda/ucc_sysinfo_cuda.c

Lines changed: 35 additions & 7 deletions
Original file line number · Diff line number · Diff line change
@@ -498,21 +498,49 @@ static ucc_status_t ucc_sysinfo_cuda_get_info(void **info, int *n_info)
498498

499499
#ifdef HAVE_NVLS
500500
{
501+
#ifdef HAVE_NVML_GPU_FABRIC_INFO_V
502+
/* Use the versioned API (NVML r525+) which provides partitionId
503+
* needed for GB200+ NVL sub-fabric partition checks. */
504+
nvmlGpuFabricInfoV_t fabric_info;
505+
fabric_info.version = nvmlGpuFabricInfo_v2;
506+
nvml_st = nvmlDeviceGetGpuFabricInfoV(nvml_dev, &fabric_info);
507+
if (nvml_st == NVML_SUCCESS &&
508+
fabric_info.state == NVML_GPU_FABRIC_STATE_COMPLETED) {
509+
gpu_info->gpus[i].fabric_capable = 1;
510+
gpu_info->gpus[i].fabric_clique_id = fabric_info.cliqueId;
511+
gpu_info->gpus[i].fabric_partition_id = fabric_info.partitionId;
512+
ucc_debug("GPU %d: fabric_capable=1 clique=%u partition=%u",
513+
i, fabric_info.cliqueId, fabric_info.partitionId);
514+
} else {
515+
gpu_info->gpus[i].fabric_capable = 0;
516+
gpu_info->gpus[i].fabric_clique_id = 0;
517+
gpu_info->gpus[i].fabric_partition_id = 0;
518+
ucc_debug("GPU %d: fabric not ready (nvml_st=%d state=%d)",
519+
i, (int)nvml_st,
520+
nvml_st == NVML_SUCCESS ? (int)fabric_info.state : -1);
521+
}
522+
#else
523+
/* Fall back to unversioned API (no partitionId field). */
501524
nvmlGpuFabricInfo_t fabric_info;
502-
503525
nvml_st = nvmlDeviceGetGpuFabricInfo(nvml_dev, &fabric_info);
504526
if (nvml_st == NVML_SUCCESS &&
505527
fabric_info.state == NVML_GPU_FABRIC_STATE_COMPLETED) {
506-
gpu_info->gpus[i].fabric_capable = 1;
507-
gpu_info->gpus[i].fabric_clique_id = fabric_info.cliqueId;
528+
gpu_info->gpus[i].fabric_capable = 1;
529+
gpu_info->gpus[i].fabric_clique_id = fabric_info.cliqueId;
530+
gpu_info->gpus[i].fabric_partition_id = 0;
531+
ucc_debug("GPU %d: fabric_capable=1 clique=%u (no partition info)",
532+
i, fabric_info.cliqueId);
508533
} else {
509-
gpu_info->gpus[i].fabric_capable = 0;
510-
gpu_info->gpus[i].fabric_clique_id = 0;
534+
gpu_info->gpus[i].fabric_capable = 0;
535+
gpu_info->gpus[i].fabric_clique_id = 0;
536+
gpu_info->gpus[i].fabric_partition_id = 0;
511537
}
538+
#endif
512539
}
513540
#else
514-
gpu_info->gpus[i].fabric_capable = 0;
515-
gpu_info->gpus[i].fabric_clique_id = 0;
541+
gpu_info->gpus[i].fabric_capable = 0;
542+
gpu_info->gpus[i].fabric_clique_id = 0;
543+
gpu_info->gpus[i].fabric_partition_id = 0;
516544
#endif
517545
}
518546

src/components/topo/ucc_topo.c

Lines changed: 34 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -561,29 +561,60 @@ int ucc_topo_is_single_nvlink_domain(const ucc_topo_t *topo)
561561
const ucc_host_info_t *host;
562562
ucc_device_id_t dev;
563563
uint64_t ref_clique_id;
564+
uint32_t ref_partition_id;
564565
ucc_rank_t i;
565566

566567
if (size == 0) {
567568
return 0;
568569
}
569570

570571
if (!ucc_topo_rank_device_info(topo, 0, &host, &dev)) {
572+
ucc_debug("nvlink domain check: rank 0 has no device info");
571573
return 0;
572574
}
573575

574576
if (!host->gpus[dev].fabric_capable ||
575577
host->gpus[dev].fabric_clique_id == 0) {
578+
ucc_debug("nvlink domain check: rank 0 GPU %u not fabric-capable "
579+
"or clique_id=0 (fabric_capable=%u clique_id=%llu)",
580+
(unsigned)dev,
581+
(unsigned)host->gpus[dev].fabric_capable,
582+
(unsigned long long)host->gpus[dev].fabric_clique_id);
576583
return 0;
577584
}
578585

579-
ref_clique_id = host->gpus[dev].fabric_clique_id;
586+
ref_clique_id = host->gpus[dev].fabric_clique_id;
587+
ref_partition_id = host->gpus[dev].fabric_partition_id;
580588

581589
for (i = 1; i < size; i++) {
582590
if (!ucc_topo_rank_device_info(topo, i, &host, &dev)) {
591+
ucc_debug("nvlink domain check: rank %u has no device info",
592+
(unsigned)i);
583593
return 0;
584594
}
585-
if (!host->gpus[dev].fabric_capable ||
586-
host->gpus[dev].fabric_clique_id != ref_clique_id) {
595+
if (!host->gpus[dev].fabric_capable) {
596+
ucc_debug("nvlink domain check: rank %u GPU %u not fabric-capable",
597+
(unsigned)i, (unsigned)dev);
598+
return 0;
599+
}
600+
if (host->gpus[dev].fabric_clique_id != ref_clique_id) {
601+
ucc_debug("nvlink domain check: rank %u clique_id=%llu differs "
602+
"from rank 0 clique_id=%llu",
603+
(unsigned)i,
604+
(unsigned long long)host->gpus[dev].fabric_clique_id,
605+
(unsigned long long)ref_clique_id);
606+
return 0;
607+
}
608+
/* If partition IDs are populated (GB200+ NVL), require same partition.
609+
* partition_id==0 means single-partition or unpopulated (pre-r525 NVML). */
610+
if (ref_partition_id != 0 &&
611+
host->gpus[dev].fabric_partition_id != ref_partition_id) {
612+
ucc_debug("nvlink domain check: rank %u partition_id=%u differs "
613+
"from rank 0 partition_id=%u; "
614+
"ranks span different NVL partitions",
615+
(unsigned)i,
616+
(unsigned)host->gpus[dev].fabric_partition_id,
617+
(unsigned)ref_partition_id);
587618
return 0;
588619
}
589620
}

src/utils/ucc_proc_info.h

Lines changed: 3 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -57,6 +57,9 @@ typedef struct ucc_gpu_info {
5757
uint8_t nvswitch_connected;
5858
/**< NVSwitch fabric clique ID (0 if unknown) */
5959
uint64_t fabric_clique_id;
60+
/**< NVLink partition ID for GB200+ NVL sub-fabric partitions.
61+
* 0 means single partition or not populated (NVML < r525). */
62+
uint32_t fabric_partition_id;
6063
/**< Hash of GPU UUID for unique identification */
6164
uint64_t uuid;
6265
} ucc_gpu_info_t;

0 commit comments

Comments (0)