Skip to content

Commit 0c5b0ca

Browse files
committed
Allow MPI_WIN_SHARED_QUERY on regular windows
MPI 4.0 allows applications to query regular windows for shared memory. This patch enables it for osc/rdma and osc/ucx and otherwise makes sure we fail gracefully if the component does not provide the query callback. For osc/rdma, this is currently supported only for allocated windows but could later be extended to windows with application-provided memory through xpmem. Signed-off-by: Joseph Schuchart <[email protected]>
1 parent 9977df7 commit 0c5b0ca

File tree

7 files changed

+272
-116
lines changed

7 files changed

+272
-116
lines changed

ompi/mca/osc/rdma/osc_rdma_accumulate.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -990,7 +990,7 @@ int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare
990990
* OR if we have an exclusive lock
991991
* OR if other processes won't try to use the network either */
992992
bool use_shared_mem = module->single_node ||
993-
(ompi_osc_rdma_peer_local_base (peer) &&
993+
(ompi_osc_rdma_peer_cpu_atomics (peer) &&
994994
(ompi_osc_rdma_peer_is_exclusive (peer) ||
995995
!module->acc_single_intrinsic));
996996

@@ -1013,7 +1013,7 @@ int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare
10131013
lock_acquired = true;
10141014
}
10151015

1016-
if (ompi_osc_rdma_peer_local_base (peer)) {
1016+
if (ompi_osc_rdma_peer_cpu_atomics (peer)) {
10171017
ret = ompi_osc_rdma_cas_local (origin_addr, compare_addr, result_addr, dt,
10181018
peer, target_address, target_handle, module,
10191019
lock_acquired);
@@ -1095,7 +1095,7 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_win_t *win, const void *origin_
10951095
(void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));
10961096
}
10971097

1098-
if (ompi_osc_rdma_peer_local_base (peer)) {
1098+
if (ompi_osc_rdma_peer_cpu_atomics (peer)) {
10991099
/* local/self optimization */
11001100
ret = ompi_osc_rdma_gacc_local (origin_addr, origin_count, origin_datatype, result_addr, result_count,
11011101
result_datatype, peer, target_address, target_handle, target_count,

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 68 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,8 @@
6969
#include "ompi/mca/bml/base/base.h"
7070
#include "ompi/mca/mtl/base/base.h"
7171

72+
static int ompi_osc_rdma_shared_query(struct ompi_win_t *win, int rank, size_t *size,
73+
ptrdiff_t *disp_unit, void *baseptr);
7274
static int ompi_osc_rdma_component_register (void);
7375
static int ompi_osc_rdma_component_init (bool enable_progress_threads, bool enable_mpi_threads);
7476
static int ompi_osc_rdma_component_finalize (void);
@@ -113,6 +115,7 @@ ompi_osc_rdma_component_t mca_osc_rdma_component = {
113115
MCA_BASE_COMPONENT_INIT(ompi, osc, rdma)
114116

115117
ompi_osc_base_module_t ompi_osc_rdma_module_rdma_template = {
118+
.osc_win_shared_query = ompi_osc_rdma_shared_query,
116119
.osc_win_attach = ompi_osc_rdma_attach,
117120
.osc_win_detach = ompi_osc_rdma_detach,
118121
.osc_free = ompi_osc_rdma_free,
@@ -527,6 +530,7 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s
527530
module->my_peer = my_peer;
528531
module->free_after = module->rank_array;
529532
my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE;
533+
my_peer->flags |= OMPI_OSC_RDMA_PEER_CPU_ATOMICS;
530534
my_peer->state = (uint64_t) (uintptr_t) module->state;
531535

532536
if (use_cpu_atomics) {
@@ -636,7 +640,6 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
636640
/* ensure proper alignment */
637641
if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
638642
data_base += OPAL_ALIGN_PAD_AMOUNT(data_base, memory_alignment);
639-
size += OPAL_ALIGN_PAD_AMOUNT(size, memory_alignment);
640643
}
641644

642645
do {
@@ -836,6 +839,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
836839
if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) {
837840
if (use_cpu_atomics && peer_rank == my_rank) {
838841
peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE;
842+
peer->flags |= OMPI_OSC_RDMA_PEER_CPU_ATOMICS;
839843
}
840844
/* nothing more to do */
841845
continue;
@@ -850,7 +854,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
850854
ex_peer->size = temp[i].size;
851855
}
852856

853-
if (use_cpu_atomics && (MPI_WIN_FLAVOR_ALLOCATE == module->flavor || peer_rank == my_rank)) {
857+
if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor || peer_rank == my_rank) {
854858
/* base is local and cpu atomics are available */
855859
if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
856860
ex_peer->super.base = (uintptr_t) module->segment_base + offset;
@@ -859,7 +863,11 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
859863
}
860864

861865
peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE;
866+
if (use_cpu_atomics) {
867+
peer->flags |= OMPI_OSC_RDMA_PEER_CPU_ATOMICS;
868+
}
862869
offset += temp[i].size;
870+
offset += OPAL_ALIGN_PAD_AMOUNT(offset, memory_alignment);
863871
} else {
864872
ex_peer->super.base = peer_region->base;
865873

@@ -898,7 +906,7 @@ static void ompi_osc_rdma_ensure_local_add_procs (void)
898906
/* this will cause add_proc to get called if it has not already been called */
899907
(void) mca_bml_base_get_endpoint (proc);
900908
}
901-
}
909+
}
902910

903911
free(procs);
904912
}
@@ -1632,3 +1640,60 @@ ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, const char *key, cons
16321640
*/
16331641
return module->no_locks ? "true" : "false";
16341642
}
1643+
1644+
int ompi_osc_rdma_shared_query(
1645+
struct ompi_win_t *win, int rank, size_t *size,
1646+
ptrdiff_t *disp_unit, void *baseptr)
1647+
{
1648+
int rc = OMPI_ERR_NOT_SUPPORTED;
1649+
ompi_osc_rdma_peer_t *peer;
1650+
int actual_rank = rank;
1651+
ompi_osc_rdma_module_t *module = GET_MODULE(win);
1652+
1653+
peer = ompi_osc_module_get_peer (module, actual_rank);
1654+
if (NULL == peer) {
1655+
return OMPI_ERR_NOT_SUPPORTED;
1656+
}
1657+
1658+
/* currently only supported for allocated windows */
1659+
if (MPI_WIN_FLAVOR_ALLOCATE != module->flavor) {
1660+
return OMPI_ERR_NOT_SUPPORTED;
1661+
}
1662+
1663+
if (!ompi_osc_rdma_peer_local_base(peer)) {
1664+
return OMPI_ERR_NOT_SUPPORTED;
1665+
}
1666+
1667+
if (MPI_PROC_NULL == rank) {
1668+
/* iterate until we find a rank that has a non-zero size */
1669+
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
1670+
peer = ompi_osc_module_get_peer (module, i);
1671+
ompi_osc_rdma_peer_extended_t *ex_peer = (ompi_osc_rdma_peer_extended_t *) peer;
1672+
if (!ompi_osc_rdma_peer_local_base(peer)) {
1673+
continue;
1674+
} else if (module->same_size && ex_peer->super.base) {
1675+
break;
1676+
} else if (ex_peer->size > 0) {
1677+
break;
1678+
}
1679+
}
1680+
}
1681+
1682+
if (module->same_size && module->same_disp_unit) {
1683+
*size = module->size;
1684+
*disp_unit = module->disp_unit;
1685+
ompi_osc_rdma_peer_basic_t *ex_peer = (ompi_osc_rdma_peer_basic_t *) peer;
1686+
*((void**) baseptr) = (void *) (intptr_t)ex_peer->base;
1687+
rc = OMPI_SUCCESS;
1688+
} else {
1689+
ompi_osc_rdma_peer_extended_t *ex_peer = (ompi_osc_rdma_peer_extended_t *) peer;
1690+
if (ex_peer->super.base != 0) {
1691+
/* we know the base of the peer */
1692+
*((void**) baseptr) = (void *) (intptr_t)ex_peer->super.base;
1693+
*size = ex_peer->size;
1694+
*disp_unit = ex_peer->disp_unit;
1695+
rc = OMPI_SUCCESS;
1696+
}
1697+
}
1698+
return rc;
1699+
}

ompi/mca/osc/rdma/osc_rdma_peer.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,8 @@ enum {
142142
OMPI_OSC_RDMA_PEER_BASE_FREE = 0x40,
143143
/** peer was demand locked as part of lock-all (when in demand locking mode) */
144144
OMPI_OSC_RDMA_PEER_DEMAND_LOCKED = 0x80,
145+
/** we can use CPU atomics on that peer */
146+
OMPI_OSC_RDMA_PEER_CPU_ATOMICS = 0x100,
145147
};
146148

147149
/**
@@ -224,6 +226,11 @@ static inline bool ompi_osc_rdma_peer_local_base (ompi_osc_rdma_peer_t *peer)
224226
return !!(peer->flags & OMPI_OSC_RDMA_PEER_LOCAL_BASE);
225227
}
226228

229+
static inline bool ompi_osc_rdma_peer_cpu_atomics (ompi_osc_rdma_peer_t *peer)
230+
{
231+
return ompi_osc_rdma_peer_local_base(peer) && !!(peer->flags & OMPI_OSC_RDMA_PEER_CPU_ATOMICS);
232+
}
233+
227234
/**
228235
* @brief check if the peer's state pointer is local to this process
229236
*

ompi/mca/osc/sm/osc_sm_component.c

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -283,12 +283,12 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis
283283
if (module->noncontig) {
284284
opal_output_verbose(MCA_BASE_VERBOSE_DEBUG, ompi_osc_base_framework.framework_output,
285285
"allocating window using non-contiguous strategy");
286-
total = ((size - 1) / pagesize + 1) * pagesize;
287286
} else {
288287
opal_output_verbose(MCA_BASE_VERBOSE_DEBUG, ompi_osc_base_framework.framework_output,
289288
"allocating window using contiguous strategy");
290-
total = size;
291289
}
290+
291+
total = size;
292292
ret = module->comm->c_coll->coll_allgather(&total, 1, MPI_UNSIGNED_LONG,
293293
rbuf, 1, MPI_UNSIGNED_LONG,
294294
module->comm,
@@ -301,6 +301,9 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis
301301
total = 0;
302302
for (i = 0 ; i < comm_size ; ++i) {
303303
total += rbuf[i];
304+
if (module->noncontig) {
305+
total += OPAL_ALIGN_PAD_AMOUNT(total, pagesize);
306+
}
304307
}
305308

306309
/* user opal/shmem directly to create a shared memory segment */
@@ -378,6 +381,9 @@ component_select(struct ompi_win_t *win, void **base, size_t size, ptrdiff_t dis
378381
if (module->sizes[i] || !module->noncontig) {
379382
module->bases[i] = ((char *) module->segment_base) + total;
380383
total += rbuf[i];
384+
if (module->noncontig) {
385+
total += OPAL_ALIGN_PAD_AMOUNT(total, pagesize);
386+
}
381387
} else {
382388
module->bases[i] = NULL;
383389
}
@@ -481,10 +487,6 @@ ompi_osc_sm_shared_query(struct ompi_win_t *win, int rank, size_t *size, ptrdiff
481487
ompi_osc_sm_module_t *module =
482488
(ompi_osc_sm_module_t*) win->w_osc_module;
483489

484-
if (module->flavor != MPI_WIN_FLAVOR_SHARED) {
485-
return MPI_ERR_WIN;
486-
}
487-
488490
if (MPI_PROC_NULL != rank) {
489491
*size = module->sizes[rank];
490492
*((void**) baseptr) = module->bases[rank];

ompi/mca/osc/ucx/osc_ucx.h

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,8 @@ typedef struct ompi_osc_ucx_module {
120120
ompi_osc_base_module_t super;
121121
struct ompi_communicator_t *comm;
122122
int flavor;
123-
size_t size;
123+
size_t size;
124+
size_t *sizes; /* used if not every process has the same size */
124125
uint64_t *addrs;
125126
uint64_t *state_addrs;
126127
uint64_t *comm_world_ranks;
@@ -143,13 +144,13 @@ typedef struct ompi_osc_ucx_module {
143144
bool lock_all_is_nocheck;
144145
bool no_locks;
145146
bool acc_single_intrinsic;
147+
bool same_size;
146148
opal_common_ucx_ctx_t *ctx;
147149
opal_common_ucx_wpmem_t *mem;
148150
opal_common_ucx_wpmem_t *state_mem;
149151
ompi_osc_ucx_mem_ranges_t *epoc_outstanding_ops_mems;
150152
bool skip_sync_check;
151153
bool noncontig_shared_win;
152-
size_t *sizes;
153154
/* in shared windows, shmem_addrs can be used for direct load store to
154155
* remote windows */
155156
uint64_t *shmem_addrs;
@@ -171,7 +172,7 @@ typedef struct ompi_osc_ucx_lock {
171172
} ompi_osc_ucx_lock_t;
172173

173174
#define OSC_UCX_GET_EP(_module, rank_) (mca_osc_ucx_component.endpoints[_module->comm_world_ranks[rank_]])
174-
#define OSC_UCX_GET_DISP(module_, rank_) ((module_->disp_unit < 0) ? module_->disp_units[rank_] : module_->disp_unit)
175+
#define OSC_UCX_GET_DISP(module_, rank_) ompi_osc_ucx_get_disp_unit((module_), (rank_))
175176

176177
#define OSC_UCX_GET_DEFAULT_EP(_ep_ptr, _module, _target) \
177178
if (opal_common_ucx_thread_enabled) { \
@@ -275,4 +276,24 @@ int ompi_osc_find_attached_region_position(ompi_osc_dynamic_win_info_t *dynamic_
275276
int ompi_osc_ucx_dynamic_lock(ompi_osc_ucx_module_t *module, int target);
276277
int ompi_osc_ucx_dynamic_unlock(ompi_osc_ucx_module_t *module, int target);
277278

279+
/* returns the size at the peer */
280+
static inline size_t ompi_osc_ucx_get_size(ompi_osc_ucx_module_t *module, int rank)
281+
{
282+
if (module->sizes) {
283+
return module->sizes[rank];
284+
} else {
285+
return module->size;
286+
}
287+
}
288+
289+
/* returns the displacement unit for the given peer */
290+
static inline ptrdiff_t ompi_osc_ucx_get_disp_unit(ompi_osc_ucx_module_t *module, int rank)
291+
{
292+
if (module->disp_units) {
293+
return module->disp_units[rank];
294+
} else {
295+
return module->disp_unit;
296+
}
297+
}
298+
278299
#endif /* OMPI_OSC_UCX_H */

0 commit comments

Comments
 (0)