Skip to content

MPI 4.0: Allow MPI_WIN_SHARED_QUERY on regular windows #13330

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions ompi/mca/osc/rdma/osc_rdma_accumulate.c
Original file line number Diff line number Diff line change
Expand Up @@ -990,7 +990,7 @@ int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare
* OR if we have an exclusive lock
* OR if other processes won't try to use the network either */
bool use_shared_mem = module->single_node ||
(ompi_osc_rdma_peer_local_base (peer) &&
(ompi_osc_rdma_peer_cpu_atomics (peer) &&
(ompi_osc_rdma_peer_is_exclusive (peer) ||
!module->acc_single_intrinsic));

Expand All @@ -1013,7 +1013,7 @@ int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare
lock_acquired = true;
}

if (ompi_osc_rdma_peer_local_base (peer)) {
if (ompi_osc_rdma_peer_cpu_atomics (peer)) {
ret = ompi_osc_rdma_cas_local (origin_addr, compare_addr, result_addr, dt,
peer, target_address, target_handle, module,
lock_acquired);
Expand Down Expand Up @@ -1095,7 +1095,7 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_win_t *win, const void *origin_
(void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));
}

if (ompi_osc_rdma_peer_local_base (peer)) {
if (ompi_osc_rdma_peer_cpu_atomics (peer)) {
/* local/self optimization */
ret = ompi_osc_rdma_gacc_local (origin_addr, origin_count, origin_datatype, result_addr, result_count,
result_datatype, peer, target_address, target_handle, target_count,
Expand Down
71 changes: 68 additions & 3 deletions ompi/mca/osc/rdma/osc_rdma_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@
#include "ompi/mca/bml/base/base.h"
#include "ompi/mca/mtl/base/base.h"

static int ompi_osc_rdma_shared_query(struct ompi_win_t *win, int rank, size_t *size,
ptrdiff_t *disp_unit, void *baseptr);
static int ompi_osc_rdma_component_register (void);
static int ompi_osc_rdma_component_init (bool enable_progress_threads, bool enable_mpi_threads);
static int ompi_osc_rdma_component_finalize (void);
Expand Down Expand Up @@ -113,6 +115,7 @@ ompi_osc_rdma_component_t mca_osc_rdma_component = {
MCA_BASE_COMPONENT_INIT(ompi, osc, rdma)

ompi_osc_base_module_t ompi_osc_rdma_module_rdma_template = {
.osc_win_shared_query = ompi_osc_rdma_shared_query,
.osc_win_attach = ompi_osc_rdma_attach,
.osc_win_detach = ompi_osc_rdma_detach,
.osc_free = ompi_osc_rdma_free,
Expand Down Expand Up @@ -527,6 +530,7 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s
module->my_peer = my_peer;
module->free_after = module->rank_array;
my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE;
my_peer->flags |= OMPI_OSC_RDMA_PEER_CPU_ATOMICS;
my_peer->state = (uint64_t) (uintptr_t) module->state;

if (use_cpu_atomics) {
Expand Down Expand Up @@ -636,7 +640,6 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
/* ensure proper alignment */
if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
data_base += OPAL_ALIGN_PAD_AMOUNT(data_base, memory_alignment);
size += OPAL_ALIGN_PAD_AMOUNT(size, memory_alignment);
}

do {
Expand Down Expand Up @@ -836,6 +839,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) {
if (use_cpu_atomics && peer_rank == my_rank) {
peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE;
peer->flags |= OMPI_OSC_RDMA_PEER_CPU_ATOMICS;
}
/* nothing more to do */
continue;
Expand All @@ -850,7 +854,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
ex_peer->size = temp[i].size;
}

if (use_cpu_atomics && (MPI_WIN_FLAVOR_ALLOCATE == module->flavor || peer_rank == my_rank)) {
if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor || peer_rank == my_rank) {
/* base is local; whether cpu atomics are usable is recorded separately below */
if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
ex_peer->super.base = (uintptr_t) module->segment_base + offset;
Expand All @@ -859,7 +863,11 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
}

peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE;
if (use_cpu_atomics) {
peer->flags |= OMPI_OSC_RDMA_PEER_CPU_ATOMICS;
}
offset += temp[i].size;
offset += OPAL_ALIGN_PAD_AMOUNT(offset, memory_alignment);
} else {
ex_peer->super.base = peer_region->base;

Expand Down Expand Up @@ -898,7 +906,7 @@ static void ompi_osc_rdma_ensure_local_add_procs (void)
/* this will cause add_proc to get called if it has not already been called */
(void) mca_bml_base_get_endpoint (proc);
}
}
}

free(procs);
}
Expand Down Expand Up @@ -1632,3 +1640,60 @@ ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, const char *key, cons
*/
return module->no_locks ? "true" : "false";
}

/**
 * @brief implement MPI_Win_shared_query for osc/rdma windows (MPI 4.0
 *        allows this call on any window flavor)
 *
 * @param[in]  win        window to query
 * @param[in]  rank       target rank, or MPI_PROC_NULL to find any rank
 *                        whose window segment is locally accessible
 * @param[out] size       size of the target's window segment
 * @param[out] disp_unit  displacement unit of the target
 * @param[out] baseptr    local address of the target's window segment
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_NOT_SUPPORTED if the window flavor is not supported or
 *          the target's base is not mapped into this process
 */
static int ompi_osc_rdma_shared_query(
    struct ompi_win_t *win, int rank, size_t *size,
    ptrdiff_t *disp_unit, void *baseptr)
{
    int rc = OMPI_ERR_NOT_SUPPORTED;
    ompi_osc_rdma_peer_t *peer = NULL;
    ompi_osc_rdma_module_t *module = GET_MODULE(win);

    /* currently only supported for allocated windows */
    if (MPI_WIN_FLAVOR_ALLOCATE != module->flavor) {
        return OMPI_ERR_NOT_SUPPORTED;
    }

    if (MPI_PROC_NULL == rank) {
        /* MPI_PROC_NULL is a negative sentinel, not a valid peer index, so
         * do not pass it to ompi_osc_module_get_peer(). instead search for
         * the first rank with a locally accessible, non-zero sized segment */
        for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
            ompi_osc_rdma_peer_t *candidate = ompi_osc_module_get_peer (module, i);
            ompi_osc_rdma_peer_extended_t *ex_peer = (ompi_osc_rdma_peer_extended_t *) candidate;
            if (NULL == candidate || !ompi_osc_rdma_peer_local_base(candidate)) {
                continue;
            }
            /* remember the last local candidate even if it has zero size */
            peer = candidate;
            if (module->same_size && ex_peer->super.base) {
                break;
            } else if (ex_peer->size > 0) {
                break;
            }
        }
    } else {
        peer = ompi_osc_module_get_peer (module, rank);
    }

    /* only answer the query if the target's base is mapped into this
     * process. this also catches a MPI_PROC_NULL search that found no
     * locally accessible peer at all */
    if (NULL == peer || !ompi_osc_rdma_peer_local_base(peer)) {
        return OMPI_ERR_NOT_SUPPORTED;
    }

    if (module->same_size && module->same_disp_unit) {
        /* fast path: every peer shares the module-wide size/disp_unit and
         * only the basic peer structure is allocated */
        ompi_osc_rdma_peer_basic_t *ex_peer = (ompi_osc_rdma_peer_basic_t *) peer;
        *size = module->size;
        *disp_unit = module->disp_unit;
        *((void**) baseptr) = (void *) (intptr_t) ex_peer->base;
        rc = OMPI_SUCCESS;
    } else {
        ompi_osc_rdma_peer_extended_t *ex_peer = (ompi_osc_rdma_peer_extended_t *) peer;
        if (ex_peer->super.base != 0) {
            /* we know the base of the peer */
            *((void**) baseptr) = (void *) (intptr_t) ex_peer->super.base;
            *size = ex_peer->size;
            *disp_unit = ex_peer->disp_unit;
            rc = OMPI_SUCCESS;
        }
    }
    return rc;
}
7 changes: 7 additions & 0 deletions ompi/mca/osc/rdma/osc_rdma_peer.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,8 @@ enum {
OMPI_OSC_RDMA_PEER_BASE_FREE = 0x40,
/** peer was demand locked as part of lock-all (when in demand locking mode) */
OMPI_OSC_RDMA_PEER_DEMAND_LOCKED = 0x80,
/** we can use CPU atomics on that peer */
OMPI_OSC_RDMA_PEER_CPU_ATOMICS = 0x100,
};

/**
Expand Down Expand Up @@ -224,6 +226,11 @@ static inline bool ompi_osc_rdma_peer_local_base (ompi_osc_rdma_peer_t *peer)
return !!(peer->flags & OMPI_OSC_RDMA_PEER_LOCAL_BASE);
}

/**
 * @brief check if CPU atomics can be used to operate on this peer's window
 *
 * True only when the peer's base is mapped into this process AND the
 * CPU-atomics flag was set for the peer at window creation time.
 */
static inline bool ompi_osc_rdma_peer_cpu_atomics (ompi_osc_rdma_peer_t *peer)
{
    if (!ompi_osc_rdma_peer_local_base (peer)) {
        return false;
    }
    return 0 != (peer->flags & OMPI_OSC_RDMA_PEER_CPU_ATOMICS);
}

/**
* @brief check if the peer's state pointer is local to this process
*
Expand Down
14 changes: 8 additions & 6 deletions ompi/mca/osc/sm/osc_sm_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -283,12 +283,12 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis
if (module->noncontig) {
opal_output_verbose(MCA_BASE_VERBOSE_DEBUG, ompi_osc_base_framework.framework_output,
"allocating window using non-contiguous strategy");
total = ((size - 1) / pagesize + 1) * pagesize;
} else {
opal_output_verbose(MCA_BASE_VERBOSE_DEBUG, ompi_osc_base_framework.framework_output,
"allocating window using contiguous strategy");
total = size;
}

total = size;
ret = module->comm->c_coll->coll_allgather(&total, 1, MPI_UNSIGNED_LONG,
rbuf, 1, MPI_UNSIGNED_LONG,
module->comm,
Expand All @@ -301,6 +301,9 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis
total = 0;
for (i = 0 ; i < comm_size ; ++i) {
total += rbuf[i];
if (module->noncontig) {
total += OPAL_ALIGN_PAD_AMOUNT(total, pagesize);
}
}

/* use opal/shmem directly to create a shared memory segment */
Expand Down Expand Up @@ -378,6 +381,9 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis
if (module->sizes[i] || !module->noncontig) {
module->bases[i] = ((char *) module->segment_base) + total;
total += rbuf[i];
if (module->noncontig) {
total += OPAL_ALIGN_PAD_AMOUNT(total, pagesize);
}
} else {
module->bases[i] = NULL;
}
Expand Down Expand Up @@ -481,10 +487,6 @@ ompi_osc_sm_shared_query(struct ompi_win_t *win, int rank, size_t *size, ptrdiff
ompi_osc_sm_module_t *module =
(ompi_osc_sm_module_t*) win->w_osc_module;

if (module->flavor != MPI_WIN_FLAVOR_SHARED) {
return MPI_ERR_WIN;
}

if (MPI_PROC_NULL != rank) {
*size = module->sizes[rank];
*((void**) baseptr) = module->bases[rank];
Expand Down
27 changes: 24 additions & 3 deletions ompi/mca/osc/ucx/osc_ucx.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,8 @@ typedef struct ompi_osc_ucx_module {
ompi_osc_base_module_t super;
struct ompi_communicator_t *comm;
int flavor;
size_t size;
size_t size;
size_t *sizes; /* used if not every process has the same size */
uint64_t *addrs;
uint64_t *state_addrs;
uint64_t *comm_world_ranks;
Expand All @@ -143,13 +144,13 @@ typedef struct ompi_osc_ucx_module {
bool lock_all_is_nocheck;
bool no_locks;
bool acc_single_intrinsic;
bool same_size;
opal_common_ucx_ctx_t *ctx;
opal_common_ucx_wpmem_t *mem;
opal_common_ucx_wpmem_t *state_mem;
ompi_osc_ucx_mem_ranges_t *epoc_outstanding_ops_mems;
bool skip_sync_check;
bool noncontig_shared_win;
size_t *sizes;
/* in shared windows, shmem_addrs can be used for direct load store to
* remote windows */
uint64_t *shmem_addrs;
Expand All @@ -171,7 +172,7 @@ typedef struct ompi_osc_ucx_lock {
} ompi_osc_ucx_lock_t;

#define OSC_UCX_GET_EP(_module, rank_) (mca_osc_ucx_component.endpoints[_module->comm_world_ranks[rank_]])
#define OSC_UCX_GET_DISP(module_, rank_) ((module_->disp_unit < 0) ? module_->disp_units[rank_] : module_->disp_unit)
#define OSC_UCX_GET_DISP(module_, rank_) ompi_osc_ucx_get_disp_unit((module_), (rank_))

#define OSC_UCX_GET_DEFAULT_EP(_ep_ptr, _module, _target) \
if (opal_common_ucx_thread_enabled) { \
Expand Down Expand Up @@ -275,4 +276,24 @@ int ompi_osc_find_attached_region_position(ompi_osc_dynamic_win_info_t *dynamic_
int ompi_osc_ucx_dynamic_lock(ompi_osc_ucx_module_t *module, int target);
int ompi_osc_ucx_dynamic_unlock(ompi_osc_ucx_module_t *module, int target);

/* returns the window segment size at the given peer rank; falls back to the
 * module-wide size when all processes share the same size (sizes == NULL) */
static inline size_t ompi_osc_ucx_get_size(ompi_osc_ucx_module_t *module, int rank)
{
    return (NULL != module->sizes) ? module->sizes[rank] : module->size;
}

/* returns the displacement unit for the given peer rank; falls back to the
 * module-wide value when every process uses the same unit (disp_units == NULL) */
static inline ptrdiff_t ompi_osc_ucx_get_disp_unit(ompi_osc_ucx_module_t *module, int rank)
{
    return (NULL != module->disp_units) ? module->disp_units[rank] : module->disp_unit;
}

#endif /* OMPI_OSC_UCX_H */
Loading
Loading