Skip to content

Commit f9800fd

Browse files
committed
opal/ofi: fix round-robin selection logic
This change fixes the current round-robin selection logic: - Only providers of the same type should be considered, i.e. providers that match the head of the list. The current implementation considers every provider in the list, which deviates from the documented behavior. - For an unbound process, the selection should be based on its local rank, i.e. its rank among the processes on the same node. Currently, only the first NIC is ever selected. Signed-off-by: Wenduo Wang <[email protected]> (cherry picked from commit b061f96)
1 parent d29a19b commit f9800fd

File tree

2 files changed

+48
-9
lines changed

2 files changed

+48
-9
lines changed

opal/mca/common/ofi/common_ofi.c

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -771,16 +771,46 @@ static int get_nearest_nic(hwloc_topology_t topology, struct fi_info *provider_l
771771
return ret;
772772
}
773773

774-
static struct fi_info *select_provider_round_robin(struct fi_info *provider_list, uint32_t rank,
775-
size_t num_providers)
774+
/**
775+
* @brief Selects a provider from the list in a round-robin fashion
776+
*
777+
* This function implements a round-robin algorithm to select a provider from
778+
* the provided list based on a rank. Only providers of the same type as the
779+
* first provider are eligible for selection.
780+
*
781+
* @param[in] provider_list A list of providers to select from.
782+
* @param[in] rank A rank metric for the current process, such as
783+
* the rank on the same node or CPU package.
784+
* @return Pointer to the selected provider
785+
*/
786+
static struct fi_info *select_provider_round_robin(struct fi_info *provider_list, uint32_t rank)
776787
{
777-
uint32_t provider_rank = rank % num_providers;
778-
struct fi_info *current_provider = provider_list;
788+
uint32_t provider_rank = 0, current_rank = 0;
789+
size_t num_providers = 0;
790+
struct fi_info *current_provider = NULL;
779791

780-
for (uint32_t i = 0; i < provider_rank; ++i) {
792+
for (current_provider = provider_list; NULL != current_provider;) {
793+
if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider)) {
794+
++num_providers;
795+
}
781796
current_provider = current_provider->next;
782797
}
783798

799+
current_provider = provider_list;
800+
if (2 > num_providers) {
801+
goto out;
802+
}
803+
804+
provider_rank = rank % num_providers;
805+
806+
while (NULL != current_provider) {
807+
if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider)
808+
&& provider_rank == current_rank++) {
809+
break;
810+
}
811+
current_provider = current_provider->next;
812+
}
813+
out:
784814
return current_provider;
785815
}
786816

@@ -888,7 +918,7 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
888918
{
889919
int ret, num_providers = 0;
890920
struct fi_info *provider = NULL;
891-
uint32_t package_rank = 0;
921+
uint32_t package_rank = process_info->my_local_rank;
892922

893923
num_providers = count_providers(provider_list);
894924
if (!process_info->proc_is_bound || 2 > num_providers) {
@@ -914,7 +944,12 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
914944
#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
915945

916946
round_robin:
917-
provider = select_provider_round_robin(provider_list, package_rank, num_providers);
947+
if (!process_info->proc_is_bound && 1 < num_providers
948+
&& opal_output_get_verbosity(opal_common_ofi.output) >= 1) {
949+
opal_show_help("help-common-ofi.txt", "unbound_process", true, 1);
950+
}
951+
952+
provider = select_provider_round_robin(provider_list, package_rank);
918953
out:
919954
#if OPAL_ENABLE_DEBUG
920955
opal_output_verbose(1, opal_common_ofi.output, "package rank: %d device: %s", package_rank,
@@ -988,5 +1023,3 @@ OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *add
9881023
}
9891024
return ret;
9901025
}
991-
992-

opal/mca/common/ofi/help-common-ofi.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@
77
#
88
# $HEADER$
99
#
10+
[unbound_process]
11+
Open MPI's OFI driver detected multiple NICs on the system but cannot select an
12+
optimal device because the current process is not bound. This may negatively
13+
impact performance. This can be resolved by specifying "--bind-to ..." on
14+
the command line.
15+
1016
[package_rank failed]
1117
Open MPI's OFI driver detected multiple equidistant NICs from the current process,
1218
but had insufficient information to ensure MPI processes fairly pick a NIC for use.

0 commit comments

Comments
 (0)