Skip to content

Commit 155ea45

Browse files
fengjica and aranadive authored
[libfabric] fix GPU NIC grouping in libfabric_topology. (#1024)
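When a topology node has more GPUs than NICs, share all of that node's NICs with every GPU instead of partitioning them; partitioning in that case leaves some GPUs with no NIC, so their groups are dropped.

Co-authored-by: Adit Ranadive <[email protected]>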
1 parent 1f85172 commit 155ea45

File tree

1 file changed

+15
-6
lines changed

1 file changed

+15
-6
lines changed

src/utils/libfabric/libfabric_topology.cpp

Lines changed: 15 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -681,8 +681,8 @@ nixlLibfabricTopology::groupNicsWithGpus(const std::vector<NicInfo> &discovered_
681681
});
682682

683683
// Split NICs among GPUs
684-
int nics_per_group = nics.size() / num_groups;
685-
int extra_nics = nics.size() % num_groups;
684+
const int nics_per_group = nics.size() / num_groups;
685+
const int extra_nics = nics.size() % num_groups;
686686

687687
size_t nic_idx = 0;
688688
for (int group_idx = 0; group_idx < num_groups && group_idx < (int)gpus.size();
@@ -691,11 +691,20 @@ nixlLibfabricTopology::groupNicsWithGpus(const std::vector<NicInfo> &discovered_
691691
group.has_gpu = true;
692692
group.closest_gpu = gpus[group_idx];
693693
group.common_ancestor = ancestor;
694-
// Assign NICs to this group
695-
int group_size = nics_per_group + (group_idx < extra_nics ? 1 : 0);
696-
for (int i = 0; i < group_size && nic_idx < nics.size(); ++i, ++nic_idx) {
697-
group.nics.push_back(nics[nic_idx]);
694+
695+
if (nics.size() < (size_t)num_groups) {
696+
// Give all NICs to this GPU
697+
NIXL_DEBUG << "Fewer NICs (" << nics.size() << ") than GPUs (" << num_groups
698+
<< ") at ancestor - sharing all NICs with each GPU";
699+
group.nics = nics;
700+
} else {
701+
// Assign NICs to this group via partitioning
702+
int group_size = nics_per_group + (group_idx < extra_nics ? 1 : 0);
703+
for (int i = 0; i < group_size && nic_idx < nics.size(); ++i, ++nic_idx) {
704+
group.nics.push_back(nics[nic_idx]);
705+
}
698706
}
707+
699708
if (!group.nics.empty()) {
700709
nic_groups.push_back(group);
701710
}

0 commit comments

Comments
 (0)