diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c index 06315df89d3..c9424242225 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c @@ -990,7 +990,7 @@ int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare * OR if we have an exclusive lock * OR if other processes won't try to use the network either */ bool use_shared_mem = module->single_node || - (ompi_osc_rdma_peer_local_base (peer) && + (ompi_osc_rdma_peer_cpu_atomics (peer) && (ompi_osc_rdma_peer_is_exclusive (peer) || !module->acc_single_intrinsic)); @@ -1013,7 +1013,7 @@ int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare lock_acquired = true; } - if (ompi_osc_rdma_peer_local_base (peer)) { + if (ompi_osc_rdma_peer_cpu_atomics (peer)) { ret = ompi_osc_rdma_cas_local (origin_addr, compare_addr, result_addr, dt, peer, target_address, target_handle, module, lock_acquired); @@ -1095,7 +1095,7 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_win_t *win, const void *origin_ (void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); } - if (ompi_osc_rdma_peer_local_base (peer)) { + if (ompi_osc_rdma_peer_cpu_atomics (peer)) { /* local/self optimization */ ret = ompi_osc_rdma_gacc_local (origin_addr, origin_count, origin_datatype, result_addr, result_count, result_datatype, peer, target_address, target_handle, target_count, diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index a5d06cb7916..98b758eb08c 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -69,6 +69,8 @@ #include "ompi/mca/bml/base/base.h" #include "ompi/mca/mtl/base/base.h" +static int ompi_osc_rdma_shared_query(struct ompi_win_t *win, int rank, size_t *size, + ptrdiff_t *disp_unit, void *baseptr); static int ompi_osc_rdma_component_register (void); static int 
ompi_osc_rdma_component_init (bool enable_progress_threads, bool enable_mpi_threads); static int ompi_osc_rdma_component_finalize (void); @@ -113,6 +115,7 @@ ompi_osc_rdma_component_t mca_osc_rdma_component = { MCA_BASE_COMPONENT_INIT(ompi, osc, rdma) ompi_osc_base_module_t ompi_osc_rdma_module_rdma_template = { + .osc_win_shared_query = ompi_osc_rdma_shared_query, .osc_win_attach = ompi_osc_rdma_attach, .osc_win_detach = ompi_osc_rdma_detach, .osc_free = ompi_osc_rdma_free, @@ -527,6 +530,7 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s module->my_peer = my_peer; module->free_after = module->rank_array; my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE; + my_peer->flags |= OMPI_OSC_RDMA_PEER_CPU_ATOMICS; my_peer->state = (uint64_t) (uintptr_t) module->state; if (use_cpu_atomics) { @@ -636,7 +640,6 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s /* ensure proper alignment */ if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { data_base += OPAL_ALIGN_PAD_AMOUNT(data_base, memory_alignment); - size += OPAL_ALIGN_PAD_AMOUNT(size, memory_alignment); } do { @@ -836,6 +839,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) { if (use_cpu_atomics && peer_rank == my_rank) { peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE; + peer->flags |= OMPI_OSC_RDMA_PEER_CPU_ATOMICS; } /* nothing more to do */ continue; @@ -850,7 +854,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s ex_peer->size = temp[i].size; } - if (use_cpu_atomics && (MPI_WIN_FLAVOR_ALLOCATE == module->flavor || peer_rank == my_rank)) { + if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor || peer_rank == my_rank) { /* base is local and cpu atomics are available */ if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { ex_peer->super.base = (uintptr_t) module->segment_base + offset; @@ -859,7 +863,11 @@ static int allocate_state_shared 
(ompi_osc_rdma_module_t *module, void **base, s } peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE; + if (use_cpu_atomics) { + peer->flags |= OMPI_OSC_RDMA_PEER_CPU_ATOMICS; + } offset += temp[i].size; + offset += OPAL_ALIGN_PAD_AMOUNT(offset, memory_alignment); } else { ex_peer->super.base = peer_region->base; @@ -898,7 +906,7 @@ static void ompi_osc_rdma_ensure_local_add_procs (void) /* this will cause add_proc to get called if it has not already been called */ (void) mca_bml_base_get_endpoint (proc); } - } + } free(procs); } @@ -1632,3 +1640,60 @@ ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, const char *key, cons */ return module->no_locks ? "true" : "false"; } + +int ompi_osc_rdma_shared_query( + struct ompi_win_t *win, int rank, size_t *size, + ptrdiff_t *disp_unit, void *baseptr) +{ + int rc = OMPI_ERR_NOT_SUPPORTED; + ompi_osc_rdma_peer_t *peer; + int actual_rank = rank; + ompi_osc_rdma_module_t *module = GET_MODULE(win); + + peer = ompi_osc_module_get_peer (module, actual_rank); + if (NULL == peer) { + return OMPI_ERR_NOT_SUPPORTED; + } + + /* currently only supported for allocated windows */ + if (MPI_WIN_FLAVOR_ALLOCATE != module->flavor) { + return OMPI_ERR_NOT_SUPPORTED; + } + + if (!ompi_osc_rdma_peer_local_base(peer)) { + return OMPI_ERR_NOT_SUPPORTED; + } + + if (MPI_PROC_NULL == rank) { + /* iterate until we find a rank that has a non-zero size */ + for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) { + peer = ompi_osc_module_get_peer (module, i); + ompi_osc_rdma_peer_extended_t *ex_peer = (ompi_osc_rdma_peer_extended_t *) peer; + if (!ompi_osc_rdma_peer_local_base(peer)) { + continue; + } else if (module->same_size && ex_peer->super.base) { + break; + } else if (ex_peer->size > 0) { + break; + } + } + } + + if (module->same_size && module->same_disp_unit) { + *size = module->size; + *disp_unit = module->disp_unit; + ompi_osc_rdma_peer_basic_t *ex_peer = (ompi_osc_rdma_peer_basic_t *) peer; + *((void**) baseptr) = (void *) 
(intptr_t)ex_peer->base; + rc = OMPI_SUCCESS; + } else { + ompi_osc_rdma_peer_extended_t *ex_peer = (ompi_osc_rdma_peer_extended_t *) peer; + if (ex_peer->super.base != 0) { + /* we know the base of the peer */ + *((void**) baseptr) = (void *) (intptr_t)ex_peer->super.base; + *size = ex_peer->size; + *disp_unit = ex_peer->disp_unit; + rc = OMPI_SUCCESS; + } + } + return rc; +} diff --git a/ompi/mca/osc/rdma/osc_rdma_peer.h b/ompi/mca/osc/rdma/osc_rdma_peer.h index cea680a44d3..0f3cd0a774c 100644 --- a/ompi/mca/osc/rdma/osc_rdma_peer.h +++ b/ompi/mca/osc/rdma/osc_rdma_peer.h @@ -142,6 +142,8 @@ enum { OMPI_OSC_RDMA_PEER_BASE_FREE = 0x40, /** peer was demand locked as part of lock-all (when in demand locking mode) */ OMPI_OSC_RDMA_PEER_DEMAND_LOCKED = 0x80, + /** we can use CPU atomics on that peer */ + OMPI_OSC_RDMA_PEER_CPU_ATOMICS = 0x100, }; /** @@ -224,6 +226,11 @@ static inline bool ompi_osc_rdma_peer_local_base (ompi_osc_rdma_peer_t *peer) return !!(peer->flags & OMPI_OSC_RDMA_PEER_LOCAL_BASE); } +static inline bool ompi_osc_rdma_peer_cpu_atomics (ompi_osc_rdma_peer_t *peer) +{ + return ompi_osc_rdma_peer_local_base(peer) && !!(peer->flags & OMPI_OSC_RDMA_PEER_CPU_ATOMICS); +} + /** * @brief check if the peer's state pointer is local to this process * diff --git a/ompi/mca/osc/sm/osc_sm_component.c b/ompi/mca/osc/sm/osc_sm_component.c index 87ed6a1431b..60f8445f289 100644 --- a/ompi/mca/osc/sm/osc_sm_component.c +++ b/ompi/mca/osc/sm/osc_sm_component.c @@ -283,12 +283,12 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis if (module->noncontig) { opal_output_verbose(MCA_BASE_VERBOSE_DEBUG, ompi_osc_base_framework.framework_output, "allocating window using non-contiguous strategy"); - total = ((size - 1) / pagesize + 1) * pagesize; } else { opal_output_verbose(MCA_BASE_VERBOSE_DEBUG, ompi_osc_base_framework.framework_output, "allocating window using contiguous strategy"); - total = size; } + + total = size; ret = 
module->comm->c_coll->coll_allgather(&total, 1, MPI_UNSIGNED_LONG, rbuf, 1, MPI_UNSIGNED_LONG, module->comm, @@ -301,6 +301,9 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis total = 0; for (i = 0 ; i < comm_size ; ++i) { total += rbuf[i]; + if (module->noncontig) { + total += OPAL_ALIGN_PAD_AMOUNT(total, pagesize); + } } /* user opal/shmem directly to create a shared memory segment */ @@ -378,6 +381,9 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis if (module->sizes[i] || !module->noncontig) { module->bases[i] = ((char *) module->segment_base) + total; total += rbuf[i]; + if (module->noncontig) { + total += OPAL_ALIGN_PAD_AMOUNT(total, pagesize); + } } else { module->bases[i] = NULL; } @@ -481,10 +487,6 @@ ompi_osc_sm_shared_query(struct ompi_win_t *win, int rank, size_t *size, ptrdiff ompi_osc_sm_module_t *module = (ompi_osc_sm_module_t*) win->w_osc_module; - if (module->flavor != MPI_WIN_FLAVOR_SHARED) { - return MPI_ERR_WIN; - } - if (MPI_PROC_NULL != rank) { *size = module->sizes[rank]; *((void**) baseptr) = module->bases[rank]; diff --git a/ompi/mca/osc/ucx/osc_ucx.h b/ompi/mca/osc/ucx/osc_ucx.h index 1c349f30592..09b232be29c 100644 --- a/ompi/mca/osc/ucx/osc_ucx.h +++ b/ompi/mca/osc/ucx/osc_ucx.h @@ -120,7 +120,8 @@ typedef struct ompi_osc_ucx_module { ompi_osc_base_module_t super; struct ompi_communicator_t *comm; int flavor; - size_t size; + size_t size; + size_t *sizes; /* used if not every process has the same size */ uint64_t *addrs; uint64_t *state_addrs; uint64_t *comm_world_ranks; @@ -143,13 +144,13 @@ typedef struct ompi_osc_ucx_module { bool lock_all_is_nocheck; bool no_locks; bool acc_single_intrinsic; + bool same_size; opal_common_ucx_ctx_t *ctx; opal_common_ucx_wpmem_t *mem; opal_common_ucx_wpmem_t *state_mem; ompi_osc_ucx_mem_ranges_t *epoc_outstanding_ops_mems; bool skip_sync_check; bool noncontig_shared_win; - size_t *sizes; /* in shared windows, shmem_addrs can be used 
for direct load store to * remote windows */ uint64_t *shmem_addrs; @@ -171,7 +172,7 @@ typedef struct ompi_osc_ucx_lock { } ompi_osc_ucx_lock_t; #define OSC_UCX_GET_EP(_module, rank_) (mca_osc_ucx_component.endpoints[_module->comm_world_ranks[rank_]]) -#define OSC_UCX_GET_DISP(module_, rank_) ((module_->disp_unit < 0) ? module_->disp_units[rank_] : module_->disp_unit) +#define OSC_UCX_GET_DISP(module_, rank_) ompi_osc_ucx_get_disp_unit((module_), (rank_)) #define OSC_UCX_GET_DEFAULT_EP(_ep_ptr, _module, _target) \ if (opal_common_ucx_thread_enabled) { \ @@ -275,4 +276,24 @@ int ompi_osc_find_attached_region_position(ompi_osc_dynamic_win_info_t *dynamic_ int ompi_osc_ucx_dynamic_lock(ompi_osc_ucx_module_t *module, int target); int ompi_osc_ucx_dynamic_unlock(ompi_osc_ucx_module_t *module, int target); +/* returns the size at the peer */ +static inline size_t ompi_osc_ucx_get_size(ompi_osc_ucx_module_t *module, int rank) +{ + if (module->sizes) { + return module->sizes[rank]; + } else { + return module->size; + } +} + +/* returns the displacement unit for the given peer */ +static inline ptrdiff_t ompi_osc_ucx_get_disp_unit(ompi_osc_ucx_module_t *module, int rank) +{ + if (module->disp_units) { + return module->disp_units[rank]; + } else { + return module->disp_unit; + } +} + #endif /* OMPI_OSC_UCX_H */ diff --git a/ompi/mca/osc/ucx/osc_ucx_component.c b/ompi/mca/osc/ucx/osc_ucx_component.c index 27201eae8ff..80b3f6618de 100644 --- a/ompi/mca/osc/ucx/osc_ucx_component.c +++ b/ompi/mca/osc/ucx/osc_ucx_component.c @@ -290,7 +290,7 @@ static int component_init(bool enable_progress_threads, bool enable_mpi_threads) return OMPI_SUCCESS; } -static int component_set_priority() { +static int component_set_priority(void) { int param, ret; opal_common_ucx_support_level_t support_level = OPAL_COMMON_UCX_SUPPORT_NONE; mca_base_var_source_t param_source = MCA_BASE_VAR_SOURCE_DEFAULT; @@ -468,39 +468,68 @@ static const char* ompi_osc_ucx_set_no_lock_info(opal_infosubscriber_t 
*obj, con return module->no_locks ? "true" : "false"; } +static int ompi_osc_ucx_shared_query_peer(ompi_osc_ucx_module_t *module, int peer, size_t *size, + ptrdiff_t *disp_unit, void *baseptr) { + + int rc; + ucp_ep_h *dflt_ep; + ucp_ep_h ep; // ignored + ucp_rkey_h rkey; + OSC_UCX_GET_DEFAULT_EP(dflt_ep, module, peer); + opal_common_ucx_winfo_t *winfo; // ignored + rc = opal_common_ucx_tlocal_fetch(module->mem, peer, &ep, &rkey, &winfo, dflt_ep); + if (OMPI_SUCCESS != rc) { + return OMPI_ERR_NOT_SUPPORTED; + } + void *addr_p; + if (UCS_OK != ucp_rkey_ptr(rkey, module->addrs[peer], &addr_p)) { + return OMPI_ERR_NOT_SUPPORTED; + } + *size = ompi_osc_ucx_get_size(module, peer); + *((void**) baseptr) = addr_p; + *disp_unit = (module->disp_unit < 0) ? module->disp_units[peer] : module->disp_unit; + + return OMPI_SUCCESS; +} + int ompi_osc_ucx_shared_query(struct ompi_win_t *win, int rank, size_t *size, ptrdiff_t *disp_unit, void *baseptr) { ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; + *size = 0; + *((void**) baseptr) = NULL; + *disp_unit = 0; + if (module->flavor != MPI_WIN_FLAVOR_SHARED) { - return MPI_ERR_WIN; - } - if (MPI_PROC_NULL != rank) { - *size = module->sizes[rank]; - *((void**) baseptr) = (void *)module->shmem_addrs[rank]; - if (module->disp_unit == -1) { - *disp_unit = module->disp_units[rank]; + if (MPI_PROC_NULL == rank) { + for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) { + if (0 != ompi_osc_ucx_get_size(module, i)) { + if (OMPI_SUCCESS == ompi_osc_ucx_shared_query_peer(module, i, size, disp_unit, baseptr)) { + return OMPI_SUCCESS; + } + } + } } else { - *disp_unit = module->disp_unit; + if (0 != ompi_osc_ucx_get_size(module, rank)) { + return ompi_osc_ucx_shared_query_peer(module, rank, size, disp_unit, baseptr); + } } - } else { - int i = 0; - - *size = 0; - *((void**) baseptr) = NULL; - *disp_unit = 0; - for (i = 0 ; i < ompi_comm_size(module->comm) ; ++i) { - if (0 != module->sizes[i]) { - *size = 
module->sizes[i]; + return OMPI_ERR_NOT_SUPPORTED; + + } else if (MPI_PROC_NULL != rank) { // shared memory window with given rank + *size = ompi_osc_ucx_get_size(module, rank); + *((void**) baseptr) = (void *)module->shmem_addrs[rank]; + *disp_unit = ompi_osc_ucx_get_disp_unit(module, rank); + } else { // shared memory window with MPI_PROC_NULL + for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) { + size_t peer_size = ompi_osc_ucx_get_size(module, i); + if (0 != peer_size) { + *size = peer_size; *((void**) baseptr) = (void *)module->shmem_addrs[i]; - if (module->disp_unit == -1) { - *disp_unit = module->disp_units[rank]; - } else { - *disp_unit = module->disp_unit; - } + *disp_unit = ompi_osc_ucx_get_disp_unit(module, i); break; } } @@ -514,8 +543,9 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, pt int flavor, int *model) { ompi_osc_ucx_module_t *module = NULL; char *name = NULL; - long values[2]; + long values[4]; int ret = OMPI_SUCCESS; + int val_count = 0; int i, comm_size = ompi_comm_size(comm); bool env_initialized = false; void *state_base = NULL; @@ -525,7 +555,7 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, pt uint64_t my_info[3] = {0}; char *recv_buf = NULL; void *dynamic_base = NULL; - unsigned long total, *rbuf; + unsigned long adjusted_size = size; int flag; size_t pagesize; bool unlink_needed = false; @@ -639,35 +669,102 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, pt module->acc_single_intrinsic = check_config_value_bool ("acc_single_intrinsic", info); module->skip_sync_check = false; + if (flavor == MPI_WIN_FLAVOR_SHARED) { + opal_output_verbose(MCA_BASE_VERBOSE_DEBUG, ompi_osc_base_framework.framework_output, + "allocating shared memory region of size %ld\n", (long) size); + /* get the pagesize */ + pagesize = opal_getpagesize(); + + /* Note that the alloc_shared_noncontig info key only has + * meaning during window creation. 
Once the window is + * created, we can't move memory around without making + * everything miserable. So we intentionally do not subscribe + * to updates on the info key, because there's no useful + * update to occur. */ + module->noncontig_shared_win = false; + if (OMPI_SUCCESS != opal_info_get_bool(info, "alloc_shared_noncontig", + &module->noncontig_shared_win, &flag)) { + ret = OMPI_ERR_BAD_PARAM; + goto error; + } + + if (module->noncontig_shared_win) { + opal_output_verbose(MCA_BASE_VERBOSE_DEBUG, ompi_osc_base_framework.framework_output, + "allocating window using non-contiguous strategy"); + adjusted_size = ((size - 1) / pagesize + 1) * pagesize; + } else { + opal_output_verbose(MCA_BASE_VERBOSE_DEBUG, ompi_osc_base_framework.framework_output, + "allocating window using contiguous strategy"); + adjusted_size = size; + } + } + /* share everyone's displacement units. Only do an allgather if strictly necessary, since it requires O(p) state. */ values[0] = disp_unit; values[1] = -disp_unit; + values[2] = adjusted_size; + values[3] = -(long)adjusted_size; - ret = module->comm->c_coll->coll_allreduce(MPI_IN_PLACE, values, 2, MPI_LONG, + ret = module->comm->c_coll->coll_allreduce(MPI_IN_PLACE, values, 4, MPI_LONG, MPI_MIN, module->comm, module->comm->c_coll->coll_allreduce_module); if (OMPI_SUCCESS != ret) { goto error; } - if (values[0] == -values[1]) { /* everyone has the same disp_unit, we do not need O(p) space */ + bool same_disp_unit = (values[0] == -values[1]); + bool same_size = (values[2] == -values[3]); + + if (same_disp_unit) { /* everyone has the same disp_unit, we do not need O(p) space */ module->disp_unit = disp_unit; - } else { /* different disp_unit sizes, allocate O(p) space to store them */ - module->disp_unit = -1; - module->disp_units = calloc(comm_size, sizeof(ptrdiff_t)); - if (module->disp_units == NULL) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto error; - } + module->disp_units = NULL; + } else { + values[val_count++] = disp_unit; + } + 
+ + if (same_size) { + module->same_size = true; + module->sizes = NULL; + } else { + values[val_count++] = size; + } - ret = module->comm->c_coll->coll_allgather(&disp_unit, sizeof(ptrdiff_t), MPI_BYTE, - module->disp_units, sizeof(ptrdiff_t), MPI_BYTE, - module->comm, - module->comm->c_coll->coll_allgather_module); + if (!same_disp_unit || !same_size) { + long* peer_values = malloc(comm_size * val_count * sizeof(long)); + ret = module->comm->c_coll->coll_allgather(values, val_count * sizeof(long), MPI_BYTE, + peer_values, sizeof(long) * val_count, MPI_BYTE, + module->comm, + module->comm->c_coll->coll_allgather_module); if (OMPI_SUCCESS != ret) { goto error; } + + if (!same_disp_unit) { /* everyone has a different disp_unit */ + module->disp_unit = -1; + module->disp_units = calloc(comm_size, sizeof(ptrdiff_t)); + if (module->disp_units == NULL) { + ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; + goto error; + } + for (i = 0; i < comm_size; i++) { + module->disp_units[i] = (ptrdiff_t)peer_values[i*val_count]; + } + } + + if (!same_size) { /* different sizes, allocate O(p) space to store them */ + module->same_size = false; + module->sizes = calloc(comm_size, sizeof(size_t)); + if (module->sizes == NULL) { + ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; + goto error; + } + + for (i = 0; i < comm_size; i++) { + module->sizes[i] = (size_t)peer_values[(i+1)*val_count - 1]; + } + } + free(peer_values); } ret = opal_common_ucx_wpctx_create(mca_osc_ucx_component.wpool, comm_size, @@ -679,50 +776,14 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, pt if (flavor == MPI_WIN_FLAVOR_SHARED) { /* create the segment */ - opal_output_verbose(MCA_BASE_VERBOSE_DEBUG, ompi_osc_base_framework.framework_output, - "allocating shared memory region of size %ld\n", (long) size); - /* get the pagesize */ - pagesize = opal_getpagesize(); - - rbuf = malloc(sizeof(unsigned long) * comm_size); - if (NULL == rbuf) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - /* Note that the 
alloc_shared_noncontig info key only has - * meaning during window creation. Once the window is - * created, we can't move memory around without making - * everything miserable. So we intentionally do not subscribe - * to updates on the info key, because there's no useful - * update to occur. */ - module->noncontig_shared_win = false; - if (OMPI_SUCCESS != opal_info_get_bool(info, "alloc_shared_noncontig", - &module->noncontig_shared_win, &flag)) { - free(rbuf); - goto error; - } - - if (module->noncontig_shared_win) { - opal_output_verbose(MCA_BASE_VERBOSE_DEBUG, ompi_osc_base_framework.framework_output, - "allocating window using non-contiguous strategy"); - total = ((size - 1) / pagesize + 1) * pagesize; - } else { - opal_output_verbose(MCA_BASE_VERBOSE_DEBUG, ompi_osc_base_framework.framework_output, - "allocating window using contiguous strategy"); - total = size; - } - ret = module->comm->c_coll->coll_allgather(&total, 1, MPI_UNSIGNED_LONG, - rbuf, 1, MPI_UNSIGNED_LONG, - module->comm, - module->comm->c_coll->coll_allgather_module); - if (OMPI_SUCCESS != ret) return ret; - - total = 0; + size_t total = 0; for (i = 0 ; i < comm_size ; ++i) { - total += rbuf[i]; + total += ompi_osc_ucx_get_size(module, i); } module->segment_base = NULL; module->shmem_addrs = NULL; - module->sizes = NULL; if (total != 0) { /* user opal/shmem directly to create a shared memory segment */ @@ -733,14 +794,12 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, pt OMPI_PROC_MY_NAME->jobid, (int) OMPI_PROC_MY_NAME->vpid, ompi_comm_print_cid(module->comm)); if (ret < 0) { - free(rbuf); return OMPI_ERR_OUT_OF_RESOURCE; } ret = opal_shmem_segment_create (&module->seg_ds, data_file, total); free(data_file); if (OPAL_SUCCESS != ret) { - free(rbuf); goto error; } @@ -750,20 +809,18 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, pt ret = module->comm->c_coll->coll_bcast (&module->seg_ds, sizeof (module->seg_ds), MPI_BYTE, 0, 
module->comm, module->comm->c_coll->coll_bcast_module); if (OMPI_SUCCESS != ret) { - free(rbuf); goto error; } module->segment_base = opal_shmem_segment_attach (&module->seg_ds); if (NULL == module->segment_base) { - free(rbuf); + ret = OMPI_ERR_OUT_OF_RESOURCE; goto error; } /* wait for all processes to attach */ ret = module->comm->c_coll->coll_barrier (module->comm, module->comm->c_coll->coll_barrier_module); if (OMPI_SUCCESS != ret) { - free(rbuf); goto error; } @@ -778,34 +835,25 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, pt * different between different processes. To use direct load/store, * shmem_addrs can be used, however, for RDMA, virtual address of * remote process that will be stored in module->addrs should be used */ - module->sizes = malloc(sizeof(size_t) * comm_size); - if (NULL == module->sizes) { - free(rbuf); - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto error; - } module->shmem_addrs = malloc(sizeof(uint64_t) * comm_size); if (NULL == module->shmem_addrs) { free(module->sizes); - free(rbuf); ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; goto error; } for (i = 0, total = 0; i < comm_size ; ++i) { - module->sizes[i] = rbuf[i]; - if (module->sizes[i] || !module->noncontig_shared_win) { + size_t peer_size = ompi_osc_ucx_get_size(module, i); + if (peer_size || !module->noncontig_shared_win) { module->shmem_addrs[i] = ((uint64_t) module->segment_base) + total; - total += rbuf[i]; + total += peer_size; } else { module->shmem_addrs[i] = (uint64_t)NULL; } } - free(rbuf); - - module->size = module->sizes[ompi_comm_rank(module->comm)]; + module->size = ompi_osc_ucx_get_size(module, ompi_comm_rank(module->comm)); *base = (void *)module->shmem_addrs[ompi_comm_rank(module->comm)]; } @@ -942,6 +990,7 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, pt error: if (module->disp_units) free(module->disp_units); if (module->comm) ompi_comm_free(&module->comm); + if (module->sizes) free(module->sizes); 
free(module); module = NULL; @@ -1168,8 +1217,6 @@ int ompi_osc_ucx_free(struct ompi_win_t *win) { opal_shmem_segment_detach(&module->seg_ds); if (module->shmem_addrs != NULL) free(module->shmem_addrs); - if (module->sizes != NULL) - free(module->sizes); } if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) { @@ -1202,6 +1249,9 @@ int ompi_osc_ucx_free(struct ompi_win_t *win) { if (module->disp_units) { free(module->disp_units); } + if (module->sizes) { + free(module->sizes); + } ompi_comm_free(&module->comm); free(module); diff --git a/ompi/mpi/c/win_shared_query.c.in b/ompi/mpi/c/win_shared_query.c.in index ad88189428f..0616a9366aa 100644 --- a/ompi/mpi/c/win_shared_query.c.in +++ b/ompi/mpi/c/win_shared_query.c.in @@ -26,9 +26,9 @@ PROTOTYPE ERROR_CLASS win_shared_query(WIN win, INT rank, AINT_OUT size, INT_AINT_OUT disp_unit, BUFFER_OUT baseptr) { - int rc; size_t tsize; ptrdiff_t du; + int rc = OMPI_SUCCESS; if (MPI_PARAM_CHECK) { OMPI_ERR_INIT_FINALIZE(FUNC_NAME); @@ -40,12 +40,23 @@ PROTOTYPE ERROR_CLASS win_shared_query(WIN win, INT rank, AINT_OUT size, INT_AIN } } + rc = OMPI_ERR_NOT_SUPPORTED; + if (NULL != win->w_osc_module->osc_win_shared_query) { rc = win->w_osc_module->osc_win_shared_query(win, rank, &tsize, &du, baseptr); - *size = tsize; - *disp_unit = du; - } else { - rc = MPI_ERR_RMA_FLAVOR; + if (OMPI_SUCCESS == rc) { + *size = tsize; + *disp_unit = du; + } + } + + if (OMPI_ERR_NOT_SUPPORTED == rc) { + /* gracefully bail out */ + *size = 0; + *disp_unit = 0; + *(void**) baseptr = NULL; + rc = MPI_SUCCESS; // don't raise an error if the function is not supported } + OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME); }