@@ -410,16 +410,17 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void
410410 region -> len = size ;
411411
412412 if (module -> use_memory_registration && size ) {
413+ assert (module -> use_accelerated_btl );
413414 if (MPI_WIN_FLAVOR_ALLOCATE != module -> flavor || NULL == module -> state_handle ) {
414415 ret = ompi_osc_rdma_register (module , MCA_BTL_ENDPOINT_ANY , * base , size , MCA_BTL_REG_FLAG_ACCESS_ANY ,
415416 & module -> base_handle );
416417 if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
417418 return OMPI_ERR_OUT_OF_RESOURCE ;
418419 }
419420
420- memcpy (region -> btl_handle_data , module -> base_handle , module -> selected_btls [ 0 ] -> btl_registration_handle_size );
421+ memcpy (region -> btl_handle_data , module -> base_handle , module -> accelerated_btl -> btl_registration_handle_size );
421422 } else {
422- memcpy (region -> btl_handle_data , module -> state_handle , module -> selected_btls [ 0 ] -> btl_registration_handle_size );
423+ memcpy (region -> btl_handle_data , module -> state_handle , module -> accelerated_btl -> btl_registration_handle_size );
423424 }
424425 }
425426
@@ -580,8 +581,12 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
580581 module -> use_cpu_atomics = module -> single_node ;
581582
582583 if (!module -> single_node ) {
583- for (int i = 0 ; i < module -> btls_in_use ; ++ i ) {
584- module -> use_cpu_atomics = module -> use_cpu_atomics && !!(module -> selected_btls [i ]-> btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB );
584+ if (module -> use_accelerated_btl ) {
585+ module -> use_cpu_atomics = !!(module -> accelerated_btl -> btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB );
586+ } else {
587+ for (int i = 0 ; i < module -> alternate_btl_count ; ++ i ) {
588+ module -> use_cpu_atomics &= !!(module -> alternate_btls [i ]-> btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB );
589+ }
585590 }
586591 }
587592
@@ -703,14 +708,16 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
703708 if (0 == local_rank ) {
704709 /* unlink the shared memory backing file */
705710 opal_shmem_unlink (& module -> seg_ds );
706- /* just go ahead and register the whole segment */
707- ret = ompi_osc_rdma_register (module , MCA_BTL_ENDPOINT_ANY , module -> segment_base , total_size ,
708- MCA_BTL_REG_FLAG_ACCESS_ANY , & module -> state_handle );
709- if (OPAL_LIKELY (OMPI_SUCCESS == ret )) {
710- state_region -> base = (intptr_t ) module -> segment_base ;
711- if (module -> state_handle ) {
712- memcpy (state_region -> btl_handle_data , module -> state_handle ,
713- module -> selected_btls [0 ]-> btl_registration_handle_size );
711+ if (module -> use_accelerated_btl ) {
712+ /* just go ahead and register the whole segment */
713+ ret = ompi_osc_rdma_register (module , MCA_BTL_ENDPOINT_ANY , module -> segment_base , total_size ,
714+ MCA_BTL_REG_FLAG_ACCESS_ANY , & module -> state_handle );
715+ if (OPAL_LIKELY (OMPI_SUCCESS == ret )) {
716+ state_region -> base = (intptr_t ) module -> segment_base ;
717+ if (module -> state_handle ) {
718+ memcpy (state_region -> btl_handle_data , module -> state_handle ,
719+ module -> accelerated_btl -> btl_registration_handle_size );
720+ }
714721 }
715722 }
716723 }
@@ -730,8 +737,9 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
730737 region -> base = state_region -> base + my_base_offset ;
731738 region -> len = size ;
732739 if (module -> use_memory_registration ) {
733- memcpy (region -> btl_handle_data , state_region -> btl_handle_data ,
734- module -> selected_btls [0 ]-> btl_registration_handle_size );
740+ assert (module -> use_accelerated_btl );
741+ memcpy (region -> btl_handle_data , state_region -> btl_handle_data ,
742+ module -> accelerated_btl -> btl_registration_handle_size );
735743 }
736744 }
737745
@@ -910,12 +918,23 @@ static int btl_latency_sort_fn(const void *a, const void *b)
910918 */
911919static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t * comm , ompi_osc_rdma_module_t * module )
912920{
921+ size_t btl_count ;
922+ size_t index = 0 ;
913923 mca_btl_base_selected_module_t * item ;
914924 int ret ;
915925
916926 assert (NULL != module );
917927
918- module -> btls_in_use = 0 ;
928+ btl_count = opal_list_get_size (& mca_btl_base_modules_initialized );
929+ if (btl_count > UINT8_MAX ) {
930+ return OMPI_ERROR ;
931+ }
932+
933+ module -> alternate_btl_count = btl_count ;
934+ module -> alternate_btls = malloc (sizeof (struct mca_btl_base_module_t * ) * btl_count );
935+ if (NULL == module -> alternate_btls ) {
936+ return OMPI_ERR_TEMP_OUT_OF_RESOURCE ;
937+ }
919938
920939 /* add all alternate btls to the alternate_btls array, not worrying
921940 about ordering yet. We have to add all btls unless we want to
@@ -937,17 +956,17 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o
937956 if (OMPI_SUCCESS != ret ) {
938957 return ret ;
939958 }
940- ompi_osc_rdma_selected_btl_insert ( module , item -> btl_module , module -> btls_in_use ++ ) ;
959+ module -> alternate_btls [ index ++ ] = item -> btl_module ;
941960 }
961+ assert (index == btl_count );
942962
943963 /* sort based on latency, lowest first */
944- qsort (module -> selected_btls , module -> btls_in_use ,
964+ qsort (module -> alternate_btls , module -> alternate_btl_count ,
945965 sizeof (struct mca_btl_base_module_t * ), btl_latency_sort_fn );
946966
947- /* osc/rdma always use active message RDMA/atomics on alternate btls, whic does not require explicit memory registration */
948967 module -> use_memory_registration = false;
949968
950- return module -> btls_in_use > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH ;
969+ return OMPI_SUCCESS ;
951970}
952971
953972
@@ -991,8 +1010,7 @@ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi
9911010
9921011 assert (NULL != module );
9931012
994- ompi_osc_rdma_selected_btl_insert (module , NULL , 0 );
995- module -> btls_in_use = 0 ;
1013+ module -> use_accelerated_btl = false;
9961014 module -> use_memory_registration = false;
9971015
9981016 /* Check for BTLs in the list of BTLs we know can reach all peers
@@ -1106,8 +1124,8 @@ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi
11061124 }
11071125
11081126btl_selection_complete :
1109- ompi_osc_rdma_selected_btl_insert ( module , selected_btl , 0 ) ;
1110- module -> btls_in_use = 1 ;
1127+ module -> use_accelerated_btl = true ;
1128+ module -> accelerated_btl = selected_btl ;
11111129 module -> use_memory_registration = selected_btl -> btl_register_mem != NULL ;
11121130
11131131 opal_output_verbose (MCA_BASE_VERBOSE_INFO , ompi_osc_base_framework .framework_output ,
@@ -1152,7 +1170,8 @@ static int ompi_osc_rdma_share_data (ompi_osc_rdma_module_t *module)
11521170 my_data -> len = (osc_rdma_size_t ) my_rank ;
11531171
11541172 if (module -> use_memory_registration && module -> state_handle ) {
1155- memcpy (my_data -> btl_handle_data , module -> state_handle , module -> selected_btls [0 ]-> btl_registration_handle_size );
1173+ assert (module -> use_accelerated_btl );
1174+ memcpy (my_data -> btl_handle_data , module -> state_handle , module -> accelerated_btl -> btl_registration_handle_size );
11561175 }
11571176
11581177 /* gather state data at each node leader */
@@ -1326,9 +1345,6 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
13261345 module -> acc_use_amo = mca_osc_rdma_component .acc_use_amo ;
13271346 module -> network_amo_max_count = mca_osc_rdma_component .network_amo_max_count ;
13281347
1329- module -> selected_btls_size = MCA_OSC_RDMA_BTLS_SIZE_INIT ;
1330- module -> selected_btls = calloc (module -> selected_btls_size , sizeof (struct mca_btl_base_module_t * ));
1331-
13321348 module -> all_sync .module = module ;
13331349
13341350 module -> flavor = flavor ;
@@ -1386,6 +1402,7 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
13861402 }
13871403
13881404 /* find rdma capable endpoints */
1405+ module -> use_accelerated_btl = false;
13891406 ret = ompi_osc_rdma_query_accelerated_btls (module -> comm , module );
13901407 if (OMPI_SUCCESS != ret ) {
13911408 opal_output_verbose (MCA_BASE_VERBOSE_WARN , ompi_osc_base_framework .framework_output ,
@@ -1404,7 +1421,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
14041421
14051422 module -> region_size = sizeof (ompi_osc_rdma_region_t );
14061423 if (module -> use_memory_registration ) {
1407- module -> region_size += module -> selected_btls [0 ]-> btl_registration_handle_size ;
1424+ assert (module -> use_accelerated_btl );
1425+ module -> region_size += module -> accelerated_btl -> btl_registration_handle_size ;
14081426 }
14091427
14101428 module -> state_size = sizeof (ompi_osc_rdma_state_t );
0 commit comments