@@ -771,16 +771,46 @@ static int get_nearest_nic(hwloc_topology_t topology, struct fi_info *provider_l
771771 return ret ;
772772}
773773
774- static struct fi_info * select_provider_round_robin (struct fi_info * provider_list , uint32_t rank ,
775- size_t num_providers )
774+ /**
775+ * @brief Selects a provider from the list in a round-robin fashion
776+ *
777+ * This function implements a round-robin algorithm to select a provider from
778+ * the provided list based on a rank. Only providers of the same type as the
779+ * first provider are eligible for selection.
780+ *
781+ * @param[in] provider_list A list of providers to select from.
782+ * @param[in] rank A rank metric for the current process, such as
783+ * the rank on the same node or CPU package.
784+ * @return Pointer to the selected provider
785+ */
786+ static struct fi_info * select_provider_round_robin (struct fi_info * provider_list , uint32_t rank )
776787{
777- uint32_t provider_rank = rank % num_providers ;
778- struct fi_info * current_provider = provider_list ;
788+ uint32_t provider_rank = 0 , current_rank = 0 ;
789+ size_t num_providers = 0 ;
790+ struct fi_info * current_provider = NULL ;
779791
780- for (uint32_t i = 0 ; i < provider_rank ; ++ i ) {
792+ for (current_provider = provider_list ; NULL != current_provider ;) {
793+ if (OPAL_SUCCESS == check_provider_attr (provider_list , current_provider )) {
794+ ++ num_providers ;
795+ }
781796 current_provider = current_provider -> next ;
782797 }
783798
799+ current_provider = provider_list ;
800+ if (2 > num_providers ) {
801+ goto out ;
802+ }
803+
804+ provider_rank = rank % num_providers ;
805+
806+ while (NULL != current_provider ) {
807+ if (OPAL_SUCCESS == check_provider_attr (provider_list , current_provider )
808+ && provider_rank == current_rank ++ ) {
809+ break ;
810+ }
811+ current_provider = current_provider -> next ;
812+ }
813+ out :
784814 return current_provider ;
785815}
786816
@@ -888,7 +918,7 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
888918{
889919 int ret , num_providers = 0 ;
890920 struct fi_info * provider = NULL ;
891- uint32_t package_rank = 0 ;
921+ uint32_t package_rank = process_info -> my_local_rank ;
892922
893923 num_providers = count_providers (provider_list );
894924 if (!process_info -> proc_is_bound || 2 > num_providers ) {
@@ -914,7 +944,12 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
914944#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
915945
916946round_robin :
917- provider = select_provider_round_robin (provider_list , package_rank , num_providers );
947+ if (!process_info -> proc_is_bound && 1 < num_providers
948+ && opal_output_get_verbosity (opal_common_ofi .output ) >= 1 ) {
949+ opal_show_help ("help-common-ofi.txt" , "unbound_process" , true, 1 );
950+ }
951+
952+ provider = select_provider_round_robin (provider_list , package_rank );
918953out :
919954#if OPAL_ENABLE_DEBUG
920955 opal_output_verbose (1 , opal_common_ofi .output , "package rank: %d device: %s" , package_rank ,
@@ -988,5 +1023,3 @@ OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *add
9881023 }
9891024 return ret ;
9901025}
991-
992-
0 commit comments