TL/CUDA: guard team creation when device info is incomplete

ikryukov · ikryukov · commit 61123575db88 · 2026-03-23T19:36:31.000+01:00
ucc_tl_cuda_team_topo_create relies on per-rank GPU device information
(PCI IDs, NVLink matrices) that is populated only when every rank has
at least one visible GPU.  Without this check, the topo init code
dereferenced uninitialised or invalid device info, causing silent
failures or incorrect topology matrices.

Add a ucc_topo_has_device_info() guard before the topo_create call so
that TL/CUDA gracefully reports UCC_ERR_NOT_SUPPORTED and falls back to
another TL when device info is missing for any rank.
diff --git a/src/components/tl/cuda/tl_cuda_team.c b/src/components/tl/cuda/tl_cuda_team.c
@@ -341,6 +341,14 @@ ucc_status_t ucc_tl_cuda_team_create_test(ucc_base_team_t *tl_team)
         team->scratch.rem[i] = NULL;
     }
 
+    if (!ucc_topo_has_device_info(UCC_TL_CORE_TEAM(team)->topo)) {
+        tl_debug(tl_team->context->lib,
+                 "not all ranks have visible GPU device info; "
+                 "skipping TL/CUDA team creation");
+        status = UCC_ERR_NOT_SUPPORTED;
+        goto exit_err;
+    }
+
     status = ucc_tl_cuda_team_topo_create(&team->super, &team->topo);
     if (status != UCC_OK) {
         goto exit_err;