@@ -427,7 +427,7 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void
427427 return OMPI_SUCCESS ;
428428}
429429
430- static int allocate_state_single (ompi_osc_rdma_module_t * module , void * * base , size_t size )
430+ static int allocate_state_single (ompi_osc_rdma_module_t * module , void * * base , size_t size , bool use_cpu_atomics )
431431{
432432 size_t total_size , local_rank_array_size , leader_peer_data_size , base_data_size ;
433433 ompi_osc_rdma_peer_t * my_peer ;
@@ -507,7 +507,7 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s
507507 my_peer -> flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE ;
508508 my_peer -> state = (uint64_t ) (uintptr_t ) module -> state ;
509509
510- if (module -> use_cpu_atomics ) {
510+ if (use_cpu_atomics ) {
511511 /* all peers are local or it is safe to mix cpu and nic atomics */
512512 my_peer -> flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE ;
513513 } else {
@@ -526,7 +526,7 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s
526526 ex_peer -> size = size ;
527527 }
528528
529- if (!module -> use_cpu_atomics ) {
529+ if (!use_cpu_atomics ) {
530530 if (MPI_WIN_FLAVOR_ALLOCATE == module -> flavor ) {
531531 /* base is local and cpu atomics are available */
532532 ex_peer -> super .base_handle = module -> state_handle ;
@@ -570,6 +570,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
570570 struct _local_data * temp ;
571571 char * data_file ;
572572 size_t memory_alignment = module -> memory_alignment ;
573+ bool use_cpu_atomics ;
573574
574575 shared_comm = module -> shared_comm ;
575576
@@ -578,21 +579,26 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
578579
579580 /* CPU atomics can be used if every process is on the same node or the NIC allows mixing CPU and NIC atomics */
580581 module -> single_node = local_size == global_size ;
581- module -> use_cpu_atomics = module -> single_node ;
582582
583- if (!module -> single_node ) {
584- if (module -> use_accelerated_btl ) {
585- module -> use_cpu_atomics = !!(module -> accelerated_btl -> btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB );
586- } else {
587- for (int i = 0 ; i < module -> alternate_btl_count ; ++ i ) {
588- module -> use_cpu_atomics &= !!(module -> alternate_btls [i ]-> btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB );
589- }
590- }
583+ if (module -> single_node ) {
584+ use_cpu_atomics = true;
585+ } else if (module -> use_accelerated_btl ) {
586+ use_cpu_atomics = !!(module -> accelerated_btl -> btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB );
587+ } else {
588+ /* The shared-state optimization that CPU atomics make
589+ * possible was never actually enabled for alternate btls,
590+ * due to a previous bug in the enablement logic when
591+ * alternate btls were first supported. It is likely that
592+ * this optimization could work with sufficient testing,
593+ * but for now, always disable it so as not to introduce
594+ * new correctness risks.
595+ */
596+ use_cpu_atomics = false;
591597 }
592598
593599 if (1 == local_size ) {
594600 /* no point using a shared segment if there are no other processes on this node */
595- return allocate_state_single (module , base , size );
601+ return allocate_state_single (module , base , size , use_cpu_atomics );
596602 }
597603
598604 opal_output_verbose (MCA_BASE_VERBOSE_TRACE , ompi_osc_base_framework .framework_output ,
@@ -771,7 +777,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
771777 ex_peer = (ompi_osc_rdma_peer_extended_t * ) peer ;
772778
773779 /* set up peer state */
774- if (module -> use_cpu_atomics ) {
780+ if (use_cpu_atomics ) {
775781 /* all peers are local or it is safe to mix cpu and nic atomics */
776782 peer -> flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE ;
777783 peer -> state = (osc_rdma_counter_t ) peer_state ;
@@ -796,7 +802,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
796802 }
797803
798804 if (MPI_WIN_FLAVOR_DYNAMIC != module -> flavor && MPI_WIN_FLAVOR_CREATE != module -> flavor &&
799- !module -> use_cpu_atomics && temp [i ].size && i > 0 ) {
805+ !use_cpu_atomics && temp [i ].size && i > 0 ) {
800806 /* use the local leader's endpoint */
801807 peer -> data_endpoint = local_leader -> data_endpoint ;
802808 peer -> data_btl_index = local_leader -> data_btl_index ;
@@ -805,7 +811,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
805811 ompi_osc_module_add_peer (module , peer );
806812
807813 if (MPI_WIN_FLAVOR_DYNAMIC == module -> flavor ) {
808- if (module -> use_cpu_atomics && peer_rank == my_rank ) {
814+ if (use_cpu_atomics && peer_rank == my_rank ) {
809815 peer -> flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE ;
810816 }
811817 /* nothing more to do */
@@ -821,7 +827,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
821827 ex_peer -> size = temp [i ].size ;
822828 }
823829
824- if (module -> use_cpu_atomics && (MPI_WIN_FLAVOR_ALLOCATE == module -> flavor || peer_rank == my_rank )) {
830+ if (use_cpu_atomics && (MPI_WIN_FLAVOR_ALLOCATE == module -> flavor || peer_rank == my_rank )) {
825831 /* base is local and cpu atomics are available */
826832 if (MPI_WIN_FLAVOR_ALLOCATE == module -> flavor ) {
827833 ex_peer -> super .base = (uintptr_t ) module -> segment_base + offset ;
0 commit comments