@@ -78,7 +78,7 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s
7878static int ompi_osc_rdma_component_select (struct ompi_win_t * win , void * * base , size_t size , int disp_unit ,
7979 struct ompi_communicator_t * comm , struct opal_info_t * info ,
8080 int flavor , int * model );
81- static int ompi_osc_rdma_query_btls (ompi_communicator_t * comm , ompi_osc_rdma_module_t * module );
81+ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t * comm , ompi_osc_rdma_module_t * module );
8282static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t * comm , ompi_osc_rdma_module_t * module );
8383
8484static const char * ompi_osc_rdma_set_no_lock_info (opal_infosubscriber_t * obj , const char * key , const char * value );
@@ -395,7 +395,7 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s
395395 }
396396#endif /* OPAL_CUDA_SUPPORT */
397397
398- if (OMPI_SUCCESS == ompi_osc_rdma_query_btls (comm , NULL )) {
398+ if (OMPI_SUCCESS == ompi_osc_rdma_query_accelerated_btls (comm , NULL )) {
399399 return mca_osc_rdma_component .priority ;
400400 }
401401
@@ -882,7 +882,7 @@ static void ompi_osc_rdma_ensure_local_add_procs (void)
882882 * @return OMPI_ERR_UNREACH if no BTLs can be found that match
883883 *
884884 * In this case an "alternate" BTL is a BTL that does not meet the
885- * requirements of a BTL outlined in ompi_osc_rdma_query_btls ().
885+ * requirements of a BTL outlined in ompi_osc_rdma_query_accelerated_btls ().
886886 * Either it does not provide connectivity to all peers, provide
887887 * remote completion, or natively support put/get/atomic. Since more
888888 * than one BTL may be needed for this support the OSC component will
@@ -937,6 +937,20 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o
937937 return btls_found > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH ;
938938}
939939
940+ /* Check for BTL requirements:
941+ * 1) RDMA (put/get) and ATOMIC operations. We only require cswap
942+ * and fetch and add and will emulate other operations with those
943+ * two as necessary. NOTE(review): only ATOMIC_FOPS and SUPPORTS_ADD are tested below; no cswap-specific flag is checked -- confirm.
944+ * 2) Remote Completion
945+ * Returns true only if every flag test below is non-zero for the given btl. */
946+ static bool ompi_osc_rdma_check_accelerated_btl (struct mca_btl_base_module_t * btl )
947+ {
948+ return ((btl -> btl_flags & MCA_BTL_FLAGS_RDMA ) &&
949+ (btl -> btl_flags & MCA_BTL_FLAGS_ATOMIC_FOPS ) &&
950+ (btl -> btl_flags & MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION ) &&
951+ (btl -> btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_ADD ));
952+ }
953+
940954/*
941955 * Attempt to find a BTL that can be used for native RDMA
942956 *
@@ -957,18 +971,12 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o
957971 * If module is NULL, the code acts as a query mechanism to find any
958972 * potential BTLs, and is used to implement osc_rdma_query().
959973 */
960- static int ompi_osc_rdma_query_btls (ompi_communicator_t * comm , ompi_osc_rdma_module_t * module )
974+ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t * comm , ompi_osc_rdma_module_t * module )
961975{
962- struct mca_btl_base_module_t * * possible_btls = NULL ;
963976 int comm_size = ompi_comm_size (comm );
964- int comm_rank = ompi_comm_rank (comm );
965- int rc = OMPI_SUCCESS , max_btls = 0 ;
966- unsigned int selected_latency = INT_MAX ;
967- struct mca_btl_base_module_t * selected_btl = NULL ;
968- mca_btl_base_selected_module_t * item ;
969- int * btl_counts = NULL ;
977+ struct mca_btl_base_module_t * selected_btl ;
978+ mca_bml_base_endpoint_t * base_endpoint ;
970979 char * * btls_to_use ;
971- void * tmp ;
972980
973981 if (module ) {
974982 ompi_osc_rdma_selected_btl_insert (module , NULL , 0 );
@@ -980,37 +988,30 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
980988 in general usage. */
981989 btls_to_use = opal_argv_split (ompi_osc_rdma_full_connectivity_btls , ',' );
982990 if (btls_to_use ) {
983- /* rdma and atomics are only supported with BTLs at the moment
984- * If a btl does not support remote completion, it cannot be used as the primary btl.
985- * It can still be selected as an alternate btl */
991+ mca_btl_base_selected_module_t * item ;
992+
993+ selected_btl = NULL ;
994+
995+ /* rdma and atomics are only supported with BTLs at the moment */
986996 OPAL_LIST_FOREACH (item , & mca_btl_base_modules_initialized , mca_btl_base_selected_module_t ) {
987997 for (int i = 0 ; btls_to_use [i ] ; ++ i ) {
988998 if (0 != strcmp (btls_to_use [i ], item -> btl_module -> btl_component -> btl_version .mca_component_name )) {
989999 continue ;
9901000 }
9911001
992- if ((item -> btl_module -> btl_flags & (MCA_BTL_FLAGS_RDMA )) == MCA_BTL_FLAGS_RDMA &&
993- (item -> btl_module -> btl_flags & (MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS | MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION ))) {
994- if (!selected_btl || item -> btl_module -> btl_latency < selected_btl -> btl_latency ) {
1002+ if (ompi_osc_rdma_check_accelerated_btl (item -> btl_module )) {
1003+ if (NULL == selected_btl || item -> btl_module -> btl_latency < selected_btl -> btl_latency ) {
9951004 selected_btl = item -> btl_module ;
9961005 }
9971006 }
9981007 }
9991008 }
10001009
10011010 opal_argv_free (btls_to_use );
1002- }
10031011
1004- if (NULL != selected_btl ) {
1005- if (module ) {
1006- ompi_osc_rdma_selected_btl_insert (module , selected_btl , 0 );
1007- module -> btls_in_use = 1 ;
1008- module -> use_memory_registration = selected_btl -> btl_register_mem != NULL ;
1012+ if (NULL != selected_btl ) {
1013+ goto btl_selection_complete ;
10091014 }
1010-
1011- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_INFO , "selected btl: %s" ,
1012- selected_btl -> btl_component -> btl_version .mca_component_name );
1013- return OMPI_SUCCESS ;
10141015 }
10151016
10161017 /* if osc/rdma gets selected we need to ensure that all local procs have been added */
@@ -1021,123 +1022,78 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
10211022 * other requirements was not found. Look for BTLs that may be
10221023 * able to talk to all peers. This is obviously more expensive
10231024 * than the check above.
1025+ *
1026+ * This algorithm skips a potential use case: it requires
1027+ * reachability to self, which is not strictly needed if BTL and
1028+ * CPU atomics are atomic with each other. However, the set of
1029+ * BTLs which can not send to self, which have RDMA semantics, and
1030+ * which have the required atomicity is currently the null set and
1031+ * almost certain to remain the null set, so we keep it simple.
1032+ *
1033+ * We only want BTLs that can reach all peers, so use rank 0's BTL
1034+ * list as the list of all available BTLs. Any BTL that cannot
1035+ * be used to communicate with rank 0 necessarily is not in the
1036+ * list of all available BTLs for this algorithm.
10241037 */
1038+ base_endpoint = mca_bml_base_get_endpoint (ompi_comm_peer_lookup (comm , 0 ));
1039+ if (NULL == base_endpoint ) {
1040+ return OMPI_ERR_UNREACH ;
1041+ }
10251042
1026- for (int rank = 0 ; rank < comm_size ; ++ rank ) {
1027- ompi_proc_t * proc = ompi_comm_peer_lookup (comm , rank );
1028- mca_bml_base_endpoint_t * endpoint ;
1029- int num_btls , prev_max ;
1030- bool found_btl = false;
1031-
1032- endpoint = mca_bml_base_get_endpoint (proc );
1033- if (NULL == endpoint ) {
1034- /* can't continue if some peer is unreachable */
1035- rc = OMPI_ERR_UNREACH ;
1036- break ;
1037- }
1043+ selected_btl = NULL ;
1044+ for (size_t i_btl = 0 ;
1045+ i_btl < mca_bml_base_btl_array_get_size (& base_endpoint -> btl_rdma );
1046+ ++ i_btl ) {
1047+ bool have_connectivity = true;
1048+ struct mca_bml_base_btl_t * examine_bml_btl ;
1049+ struct mca_btl_base_module_t * examine_btl ;
10381050
1039- num_btls = mca_bml_base_btl_array_get_size (& endpoint -> btl_rdma );
1040- if (0 == num_btls ) {
1041- rc = OMPI_ERR_NOT_AVAILABLE ;
1042- /* at least one rank doesn't have an RDMA capable btl */
1043- break ;
1051+ examine_bml_btl = mca_bml_base_btl_array_get_index (& base_endpoint -> btl_rdma , i_btl );
1052+ if (NULL == examine_bml_btl ) {
1053+ return OMPI_ERR_NOT_FOUND ;
10441054 }
1055+ examine_btl = examine_bml_btl -> btl ;
10451056
1046- prev_max = max_btls ;
1047-
1048- max_btls = (max_btls > num_btls ) ? max_btls : num_btls ;
1049-
1050- tmp = realloc (possible_btls , sizeof (void * ) * max_btls );
1051- if (NULL == tmp ) {
1052- rc = OMPI_ERR_OUT_OF_RESOURCE ;
1053- break ;
1057+ /* skip any BTL which doesn't meet our requirements */
1058+ if (!ompi_osc_rdma_check_accelerated_btl (examine_btl )) {
1059+ continue ;
10541060 }
1055- possible_btls = tmp ;
10561061
1057- for (int j = prev_max ; j < max_btls ; ++ j ) {
1058- possible_btls [j ] = NULL ;
1059- }
1062+ /* check connectivity across all ranks */
1063+ for (int rank = 0 ; rank < comm_size ; ++ rank ) {
1064+ ompi_proc_t * proc = ompi_comm_peer_lookup (comm , rank );
1065+ mca_bml_base_endpoint_t * endpoint ;
10601066
1061- tmp = realloc (btl_counts , sizeof (int ) * max_btls );
1062- if (NULL == tmp ) {
1063- rc = OMPI_ERR_OUT_OF_RESOURCE ;
1064- break ;
1065- }
1066- btl_counts = tmp ;
1067-
1068- for (int i_btl = 0 ; i_btl < num_btls ; ++ i_btl ) {
1069- /* Check for BTL requirements:
1070- * 1) RDMA (put/get) and ATOMIC operations. We only
1071- * require cswap and fetch and add and will emulate
1072- * other opterations with those two as necessary.
1073- * 2) Remote Completion
1074- *
1075- * If the BTL meets all those requirements, increment the
1076- * btl_counts to indicate that this btl can talk to the
1077- * current peer proc.
1078- */
1079- if (((endpoint -> btl_rdma .bml_btls [i_btl ].btl -> btl_flags & (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS )) ==
1080- (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS )) &&
1081- (endpoint -> btl_rdma .bml_btls [i_btl ].btl -> btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_ADD ) &&
1082- (endpoint -> btl_rdma .bml_btls [i_btl ].btl -> btl_flags & MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION )) {
1083- for (int j = 0 ; j < max_btls ; ++ j ) {
1084- if (endpoint -> btl_rdma .bml_btls [i_btl ].btl == possible_btls [j ]) {
1085- ++ btl_counts [j ];
1086- found_btl = true;
1087- break ;
1088- } else if (NULL == possible_btls [j ]) {
1089- possible_btls [j ] = endpoint -> btl_rdma .bml_btls [i_btl ].btl ;
1090- btl_counts [j ] = 1 ;
1091- found_btl = true;
1092- break ;
1093- }
1094- }
1067+ endpoint = mca_bml_base_get_endpoint (proc );
1068+ if (NULL == endpoint ) {
1069+ have_connectivity = false;
1070+ break ;
10951071 }
1096- }
1097-
1098- /* any non-local rank must have a usable btl */
1099- if (!found_btl && comm_rank != rank ) {
1100- /* no btl = no rdma/atomics */
1101- rc = OMPI_ERR_UNREACH ;
1102- break ;
1103- }
1104- }
11051072
1106- if (OMPI_SUCCESS != rc ) {
1107- free (possible_btls );
1108- free (btl_counts );
1109- return rc ;
1110- }
1111-
1112- for (int i = 0 ; i < max_btls ; ++ i ) {
1113- int btl_count = btl_counts [i ];
1114-
1115- if (NULL == possible_btls [i ]) {
1116- break ;
1117- }
1118-
1119- if (possible_btls [i ]-> btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB ) {
1120- /* The onesided component can, if BTL atomics are atomic
1121- relative to CPU atomics, handle atomics to self, so
1122- increment the counter once to cover that case. */
1123- btl_count ++ ;
1073+ if (NULL == mca_bml_base_btl_array_find (& endpoint -> btl_rdma ,
1074+ examine_btl )) {
1075+ have_connectivity = false;
1076+ break ;
1077+ }
11241078 }
11251079
1126- if (btl_count >= comm_size && possible_btls [i ]-> btl_latency < selected_latency ) {
1127- selected_btl = possible_btls [i ];
1128- selected_latency = possible_btls [i ]-> btl_latency ;
1080+ /* if we have connectivity, displace currently selected btl if
1081+ * this one has lower latency; we prioritize latency over all
1082+ * other parameters
1083+ */
1084+ if (have_connectivity ) {
1085+ if (NULL == selected_btl || examine_btl -> btl_latency < selected_btl -> btl_latency ) {
1086+ selected_btl = examine_btl ;
1087+ }
11291088 }
11301089 }
11311090
1132- free (possible_btls );
1133- free (btl_counts );
1134-
11351091 if (NULL == selected_btl ) {
11361092 OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_INFO , "no suitable btls found" );
1137- /* no btl = no rdma/atomics */
11381093 return OMPI_ERR_NOT_AVAILABLE ;
11391094 }
11401095
1096+ btl_selection_complete :
11411097 if (module ) {
11421098 ompi_osc_rdma_selected_btl_insert (module , selected_btl , 0 );
11431099 module -> btls_in_use = 1 ;
@@ -1414,7 +1370,7 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
14141370 }
14151371
14161372 /* find rdma capable endpoints */
1417- ret = ompi_osc_rdma_query_btls (module -> comm , module );
1373+ ret = ompi_osc_rdma_query_accelerated_btls (module -> comm , module );
14181374 if (OMPI_SUCCESS != ret ) {
14191375 OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_WARN , "could not find a suitable btl. falling back on "
14201376 "active-message BTLs" );
0 commit comments