Skip to content

Commit e60cc40

Browse files
committed
TL/CUDA: use ucc_topo device map and NVLink matrix for team topo
Improvement 1: Replace the private NVML-based NVLink link-count query in ucc_tl_cuda_team_topo_init_matrix with the pre-gathered topology data already available in ucc_topo. For each pair of ranks, we look up each rank's CUDA device index via topo->device_map.device_ids[] and read the link count from host_info->nvlink_matrix[di][dj]. Cross-node pairs are assigned zero links directly, without any NVML round-trip.

Improvement 2: Replace the matrix zero-count heuristic for is_fully_connected with a call to ucc_topo_is_nvlink_fully_connected(). This correctly handles NVSwitch-connected and fabric-clique GPUs in addition to point-to-point NVLink, and skips the O(n^2) proxy search entirely when all ranks are fully connected.
1 parent 6112357 commit e60cc40

File tree

1 file changed

+43
-21
lines changed

1 file changed

+43
-21
lines changed

src/components/tl/cuda/tl_cuda_team_topo.c

Lines changed: 43 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include "tl_cuda_team_topo.h"
88
#include "tl_cuda.h"
9+
#include "core/ucc_team.h"
910

1011
#define UCC_TL_CUDA_TEAM_TOPO_SAME_DEVICE ((ucc_rank_t)(UCC_RANK_MAX))
1112

@@ -341,27 +342,32 @@ static ucc_status_t
341342
ucc_tl_cuda_team_topo_init_matrix(const ucc_tl_cuda_team_t *team,
342343
ucc_rank_t *matrix)
343344
{
344-
ucc_tl_cuda_topo_t *topo = UCC_TL_CUDA_TEAM_CTX(team)->topo;
345-
int size = UCC_TL_TEAM_SIZE(team);
346-
ucc_status_t status;
347-
int i, j;
345+
ucc_topo_t *topo = UCC_TL_CORE_TEAM(team)->topo;
346+
ucc_proc_info_t *procs = topo->topo->procs;
347+
ucc_device_id_t *dev_ids = topo->device_map.device_ids;
348+
int size = UCC_TL_TEAM_SIZE(team);
349+
int i, j;
350+
ucc_rank_t ci, cj;
351+
ucc_device_id_t di, dj;
352+
const ucc_host_info_t *host_i;
348353

349354
for (i = 0; i < size; i++) {
350-
matrix[i + i*size] = UCC_TL_CUDA_TEAM_TOPO_SAME_DEVICE;
355+
matrix[i + i * size] = UCC_TL_CUDA_TEAM_TOPO_SAME_DEVICE;
356+
ci = ucc_ep_map_eval(topo->set.map, i);
357+
di = dev_ids[i];
358+
host_i = &topo->topo->hosts[ci];
351359
for (j = i + 1; j < size; j++) {
352-
if (ucc_tl_cuda_topo_device_id_equal(&team->ids[i].pci_id,
353-
&team->ids[j].pci_id)) {
354-
matrix[i + j*size] = UCC_TL_CUDA_TEAM_TOPO_SAME_DEVICE;
360+
cj = ucc_ep_map_eval(topo->set.map, j);
361+
dj = dev_ids[j];
362+
if (procs[ci].host_hash != procs[cj].host_hash) {
363+
/* Cross-node pair: no intra-node NVLink */
364+
matrix[i + j * size] = 0;
365+
} else if (di == dj) {
366+
matrix[i + j * size] = UCC_TL_CUDA_TEAM_TOPO_SAME_DEVICE;
355367
} else {
356-
status = ucc_tl_cuda_topo_num_links(topo,
357-
&team->ids[i].pci_id,
358-
&team->ids[j].pci_id,
359-
&matrix[i + j*size]);
360-
if (status != UCC_OK) {
361-
return status;
362-
}
368+
matrix[i + j * size] = host_i->nvlink_matrix[di][dj];
363369
}
364-
matrix[j + i*size] = matrix[i +j*size];
370+
matrix[j + i * size] = matrix[i + j * size];
365371
}
366372
}
367373

@@ -394,12 +400,28 @@ ucc_status_t ucc_tl_cuda_team_topo_create(const ucc_tl_team_t *cuda_team,
394400
goto free_matrix;
395401
}
396402

397-
status = ucc_tl_cuda_team_topo_init_proxies(team, topo);
398-
if (status != UCC_OK) {
399-
if (status != UCC_ERR_NOT_SUPPORTED) {
400-
tl_error(UCC_TL_TEAM_LIB(team), "failed to init cuda topo proxy");
403+
/* Use the authoritative ucc_topo NVLink check to determine full
404+
* connectivity. This handles NVSwitch, fabric clique, and direct NVLink
405+
* connections consistently and avoids rescanning the matrix for zeros. */
406+
{
407+
ucc_topo_t *utopo = UCC_TL_CORE_TEAM(team)->topo;
408+
ucc_sbgp_t *node_sg = ucc_topo_get_sbgp(utopo, UCC_SBGP_NODE);
409+
topo->is_fully_connected =
410+
ucc_topo_is_nvlink_fully_connected(utopo, node_sg);
411+
}
412+
413+
if (topo->is_fully_connected) {
414+
topo->num_proxies = 0;
415+
topo->proxy_needed = 0;
416+
} else {
417+
status = ucc_tl_cuda_team_topo_init_proxies(team, topo);
418+
if (status != UCC_OK) {
419+
if (status != UCC_ERR_NOT_SUPPORTED) {
420+
tl_error(UCC_TL_TEAM_LIB(team),
421+
"failed to init cuda topo proxy");
422+
}
423+
goto free_matrix;
401424
}
402-
goto free_matrix;
403425
}
404426

405427
status = ucc_tl_cuda_team_topo_init_rings(team, topo);

0 commit comments

Comments
 (0)