@@ -881,12 +881,14 @@ static void ompi_osc_rdma_ensure_local_add_procs (void)
881881 * @return OMPI_SUCCESS if BTLs can be found
882882 * @return OMPI_ERR_UNREACH if no BTLs can be found that match
883883 *
884- * In this case an "alternate" BTL is a BTL that does not provide true RDMA but
885- * can use active messages using the BTL base AM RDMA/atomics. Since more than
886- * one BTL may be needed for this support the OSC component will disable the
887- * use of registration-based RDMA (these BTLs will not be used) and will use
888- * any remaining BTL. By default the BTLs used will be tcp and sm but any single
889- * (or pair) of BTLs may be used.
884+ * In this case an "alternate" BTL is a BTL that does not meet the
885+ * requirements of a BTL outlined in ompi_osc_rdma_query_btls().
886+ * Either it does not provide connectivity to all peers, provide
887+ * remote completion, or natively support put/get/atomic. Since more
888+ * than one BTL may be needed for this support the OSC component will
889+ * disable the use of registration-based RDMA (these BTLs will not be
890+ * used) and will use any remaining BTL. By default the BTLs used will
891+ * be tcp and sm but any single (or pair) of BTLs may be used.
890892 */
891893static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t * comm , ompi_osc_rdma_module_t * module )
892894{
@@ -935,6 +937,26 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o
935937 return btls_found > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH ;
936938}
937939
940+ /*
941+ * Attempt to find a BTL that can be used for native RDMA
942+ *
943+ * Attempt to find an "accelerated" BTL that can be used directly, as
944+ * opposed to emulated rdma semantics with the alternate BTLs. To be
945+ * an accelerated BTL, four conditions must be true:
946+ *
947+ * 1) The BTL must be able to communicate with all peers in the
948+ * Window
949+ * 2) The BTL must provide remote completion
950+ * 3) The BTL must be able to register the entire target window
951+ * 4) The BTL must natively support put/get/atomic operations
952+ *
953+ * Testing (1) is expensive, so as an optimization, the
954+ * ompi_osc_rdma_full_connectivity_btls list contains the list of BTL
955+ * components we know can achieve (1) in almost all usage scenarios.
956+ *
957+ * If module is NULL, the code acts as a query mechanism to find any
958+ * potential BTLs, and is used to implement osc_rdma_query().
959+ */
938960static int ompi_osc_rdma_query_btls (ompi_communicator_t * comm , ompi_osc_rdma_module_t * module )
939961{
940962 struct mca_btl_base_module_t * * possible_btls = NULL ;
@@ -948,14 +970,15 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
948970 char * * btls_to_use ;
949971 void * tmp ;
950972
951- btls_to_use = opal_argv_split (ompi_osc_rdma_full_connectivity_btls , ',' );
952-
953973 if (module ) {
954974 ompi_osc_rdma_selected_btl_insert (module , NULL , 0 );
955975 module -> btls_in_use = 0 ;
956976 module -> use_memory_registration = false;
957977 }
958978
979+ /* Check for BTLs in the list of BTLs we know can reach all peers
980+ in general usage. */
981+ btls_to_use = opal_argv_split (ompi_osc_rdma_full_connectivity_btls , ',' );
959982 if (btls_to_use ) {
960983 /* rdma and atomics are only supported with BTLs at the moment
961984 * If a btl does not support remote completion, it cannot be used as the primary btl.
@@ -992,7 +1015,14 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
9921015
9931016 /* if osc/rdma gets selected we need to ensure that all local procs have been added */
9941017 ompi_osc_rdma_ensure_local_add_procs ();
995-
1018+
1019+ /*
1020+ * No BTL from the list of BTLs known to reach all peers also met
1021+ * our other requirements. Look for BTLs that may be
1022+ * able to talk to all peers. This is obviously more expensive
1023+ * than the check above.
1024+ */
1025+
9961026 for (int rank = 0 ; rank < comm_size ; ++ rank ) {
9971027 ompi_proc_t * proc = ompi_comm_peer_lookup (comm , rank );
9981028 mca_bml_base_endpoint_t * endpoint ;
@@ -1036,10 +1066,16 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
10361066 btl_counts = tmp ;
10371067
10381068 for (int i_btl = 0 ; i_btl < num_btls ; ++ i_btl ) {
1039- /* for this implementation we need only compare-and-swap and fetch-and-add
1069+ /* Check for BTL requirements:
1070+ * 1) RDMA (put/get) and ATOMIC operations. We only
1071+ * require cswap and fetch and add and will emulate
1072+ * other operations with those two as necessary.
1073+ * 2) Remote Completion
10401074 *
1041- * If a btl does not support remote completion, it cannot be used as the primary btl.
1042- * It can still be selected as an alternate btl */
1075+ * If the BTL meets all those requirements, increment the
1076+ * btl_counts to indicate that this btl can talk to the
1077+ * current peer proc.
1078+ */
10431079 if (((endpoint -> btl_rdma .bml_btls [i_btl ].btl -> btl_flags & (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS )) ==
10441080 (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS )) &&
10451081 (endpoint -> btl_rdma .bml_btls [i_btl ].btl -> btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_ADD ) &&
@@ -1081,7 +1117,9 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
10811117 }
10821118
10831119 if (possible_btls [i ]-> btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB ) {
1084- /* do not need to use the btl for self communication */
1120+ /* The onesided component can, if BTL atomics are atomic
1121+ relative to CPU atomics, handle atomics to self, so
1122+ increment the counter once to cover that case. */
10851123 btl_count ++ ;
10861124 }
10871125
0 commit comments