  * University of Stuttgart. All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  * All rights reserved.
- * Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2007-2017 Los Alamos National Security, LLC. All rights
  * reserved.
  * Copyright (c) 2006-2008 University of Houston. All rights reserved.
  * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
@@ -446,7 +446,7 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s
     my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE;
     my_peer->state = (uint64_t) (uintptr_t) module->state;
 
-    if (module->selected_btl->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB) {
+    if (module->use_cpu_atomics) {
         /* all peers are local or it is safe to mix cpu and nic atomics */
         my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE;
     } else {
@@ -496,6 +496,9 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
     local_rank = ompi_comm_rank (shared_comm);
     local_size = ompi_comm_size (shared_comm);
 
+    /* CPU atomics can be used if every process is on the same node or the NIC allows mixing CPU and NIC atomics */
+    module->use_cpu_atomics = local_size == global_size || (module->selected_btl->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB);
+
     if (1 == local_size) {
         /* no point using a shared segment if there are no other processes on this node */
         return allocate_state_single (module, base, size);
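
For reference, the new module flag boils down to a single predicate over values this hunk already has in hand. A minimal standalone sketch, assuming the BTL interface header that defines MCA_BTL_ATOMIC_SUPPORTS_GLOB is on the include path (the helper name is hypothetical, not part of the patch):

    #include <stdbool.h>
    #include <stdint.h>

    /* CPU atomics on the window state are only safe when no NIC-side atomic can
     * race against them: either every rank of the window lives on this node, or
     * the BTL explicitly advertises that CPU and NIC atomics may be mixed. */
    static bool can_use_cpu_atomics (int local_size, int global_size, uint32_t btl_flags)
    {
        return (local_size == global_size) || (0 != (btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB));
    }
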
@@ -625,13 +628,15 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
             }
         }
 
-        /* barrier to make sure all ranks have attached */
+        /* barrier to make sure all ranks have set up their region data */
         shared_comm->c_coll->coll_barrier (shared_comm, shared_comm->c_coll->coll_barrier_module);
 
         offset = data_base;
         for (int i = 0 ; i < local_size ; ++i) {
+            /* local pointer to peer's state */
+            ompi_osc_rdma_state_t *peer_state = (ompi_osc_rdma_state_t *) ((uintptr_t) module->segment_base + state_base + module->state_size * i);
+            ompi_osc_rdma_region_t *peer_region = (ompi_osc_rdma_region_t *) peer_state->regions;
             ompi_osc_rdma_peer_extended_t *ex_peer;
-            ompi_osc_rdma_state_t *peer_state;
             ompi_osc_rdma_peer_t *peer;
             int peer_rank = temp[i].rank;
 
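
Hoisting the state pointer to the top of the loop body makes the shared-segment layout it relies on easier to follow. A rough sketch of that layout and the resulting arithmetic (the concrete sizes in the comment are made up for illustration):

    /* Layout assumed by the hoisted pointer arithmetic:
     *   segment_base + state_base                 -> state block of local rank 0
     *   segment_base + state_base + 1*state_size  -> state block of local rank 1
     *   segment_base + state_base + i*state_size  -> state block of local rank i
     * e.g. with state_base == 4096 and state_size == 2048, rank 3's state block
     * starts 4096 + 3*2048 = 10240 bytes past segment_base; its region array
     * (peer_state->regions) is embedded inside that block. */
    ompi_osc_rdma_state_t *peer_state = (ompi_osc_rdma_state_t *)
        ((uintptr_t) module->segment_base + state_base + module->state_size * i);
    ompi_osc_rdma_region_t *peer_region = (ompi_osc_rdma_region_t *) peer_state->regions;
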
@@ -642,13 +647,12 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
 
             ex_peer = (ompi_osc_rdma_peer_extended_t *) peer;
 
-            /* peer state local pointer */
-            peer_state = (ompi_osc_rdma_state_t *) ((uintptr_t) module->segment_base + state_base + module->state_size * i);
-
-            if (local_size == global_size || (module->selected_btl->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB)) {
+            /* set up peer state */
+            if (module->use_cpu_atomics) {
                 /* all peers are local or it is safe to mix cpu and nic atomics */
                 peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE;
                 peer->state = (osc_rdma_counter_t) peer_state;
+                peer->state_endpoint = NULL;
             } else {
                 /* use my endpoint handle to modify the peer's state */
                 if (module->selected_btl->btl_register_mem) {
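
The explicit NULL for state_endpoint in the CPU-atomics branch matters because later code picks the access path from these fields. A hypothetical consumer-side sketch, assuming the component's internal peer type and flag constants (the function itself is illustrative, not from the patch):

    /* Sketch: how later synchronization code can dispatch on what this hunk sets up. */
    static int64_t read_peer_state_counter (ompi_osc_rdma_peer_t *peer)
    {
        if (peer->flags & OMPI_OSC_RDMA_PEER_LOCAL_STATE) {
            /* peer->state holds a local virtual address into the shared segment;
             * a CPU-atomic load suffices and state_endpoint stays NULL */
            return *(volatile int64_t *) (intptr_t) peer->state;
        }
        /* otherwise peer->state is a remote address that must be read with a
         * BTL atomic through peer->state_endpoint (call omitted in this sketch) */
        return -1;
    }
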
@@ -658,38 +662,39 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
                 peer->state_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, temp[0].rank);
             }
 
-            /* finish setting up the local peer structure */
-            if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor) {
-                if (!module->same_disp_unit) {
-                    ex_peer->disp_unit = peer_state->disp_unit;
-                }
-
-                if (!module->same_size) {
-                    ex_peer->size = temp[i].size;
-                }
+            if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor || MPI_WIN_FLAVOR_CREATE == module->flavor) {
+                /* use the peer's BTL endpoint directly */
+                peer->data_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, peer_rank);
+            } else if (!module->use_cpu_atomics && temp[i].size) {
+                /* use the local leader's endpoint */
+                peer->data_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, temp[0].rank);
+            }
 
-            if (my_rank == peer_rank) {
-                peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE;
-            }
+            ompi_osc_module_add_peer (module, peer);
 
-            if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
-                if (temp[i].size) {
-                    ex_peer->super.base = state_region->base + offset;
-                    offset += temp[i].size;
-                } else {
-                    ex_peer->super.base = 0;
-                }
-            }
+            if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor || 0 == temp[i].size) {
+                /* nothing more to do */
+                continue;
+            }
 
-            ompi_osc_rdma_region_t *peer_region = (ompi_osc_rdma_region_t *) peer_state->regions;
+            /* finish setting up the local peer structure for win allocate/create */
+            if (!(module->same_disp_unit && module->same_size)) {
+                ex_peer->disp_unit = peer_state->disp_unit;
+                ex_peer->size = temp[i].size;
+            }
 
+            if (module->use_cpu_atomics && MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
+                /* base is local and cpu atomics are available */
+                ex_peer->super.base = (uintptr_t) module->segment_base + offset;
+                peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE;
+                offset += temp[i].size;
+            } else {
                 ex_peer->super.base = peer_region->base;
+
                 if (module->selected_btl->btl_register_mem) {
                     ex_peer->super.base_handle = (mca_btl_base_registration_handle_t *) peer_region->btl_handle_data;
                 }
             }
-
-            ompi_osc_module_add_peer (module, peer);
         }
     } while (0);
 
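
Taken together, the rewritten loop body picks each peer's data endpoint and base as follows: DYNAMIC and CREATE windows target the peer's own BTL endpoint, other flavors without CPU atomics fall back to the local leader's endpoint, and the base either comes out of the local shared segment (ALLOCATE with CPU atomics) or out of the peer's published region data. A condensed, hypothetical helper mirroring just the base selection (illustrative only, not a drop-in replacement):

    /* Base selection after this change: under win_allocate with CPU atomics the
     * peer's window memory is a slice of the node-local shared segment, carved
     * out by the running offset; otherwise the address (and, when the BTL
     * registers memory, the registration handle) comes from the peer's region. */
    static uintptr_t peer_base_for (ompi_osc_rdma_module_t *module,
                                    ompi_osc_rdma_region_t *peer_region,
                                    size_t peer_size, size_t *offset)
    {
        if (module->use_cpu_atomics && MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
            uintptr_t base = (uintptr_t) module->segment_base + *offset;
            *offset += peer_size;
            return base;
        }
        return (uintptr_t) peer_region->base;
    }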