Skip to content

Commit 9444df1

Browse files
committed
osc/pt2pt: make lock_all locking on-demand
The original lock_all algorithm in osc/pt2pt sent a lock message to each peer in the communicator even if the peer is never the target of an operation. Since this scales very poorly the implementation has been replaced by one that locks the remote peer on first communication after a call to MPI_Win_lock_all. Signed-off-by: Nathan Hjelm <[email protected]>
1 parent 7589a25 commit 9444df1

File tree

5 files changed

+105
-36
lines changed

5 files changed

+105
-36
lines changed

ompi/mca/osc/pt2pt/osc_pt2pt.h

Lines changed: 63 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,15 @@ struct ompi_osc_pt2pt_component_t {
9191
};
9292
typedef struct ompi_osc_pt2pt_component_t ompi_osc_pt2pt_component_t;
9393

94+
enum {
95+
/** peer has sent an unexpected post message (no matching start) */
96+
OMPI_OSC_PT2PT_PEER_FLAG_UNEX = 1,
97+
/** eager sends are active on this peer */
98+
OMPI_OSC_PT2PT_PEER_FLAG_EAGER = 2,
99+
/** peer has been locked (on-demand locking for lock_all) */
100+
OMPI_OSC_PT2PT_PEER_FLAG_LOCK = 4,
101+
};
102+
94103

95104
struct ompi_osc_pt2pt_peer_t {
96105
/** make this an opal object */
@@ -111,13 +120,54 @@ struct ompi_osc_pt2pt_peer_t {
111120
/** number of fragments incomming (negative - expected, positive - unsynchronized) */
112121
int32_t passive_incoming_frag_count;
113122

114-
/** unexpected post message arrived */
115-
bool unexpected_post;
123+
/** peer flags */
124+
int32_t flags;
116125
};
117126
typedef struct ompi_osc_pt2pt_peer_t ompi_osc_pt2pt_peer_t;
118127

119128
OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_peer_t);
120129

130+
static inline bool ompi_osc_pt2pt_peer_locked (ompi_osc_pt2pt_peer_t *peer)
131+
{
132+
return !!(peer->flags & OMPI_OSC_PT2PT_PEER_FLAG_LOCK);
133+
}
134+
135+
static inline bool ompi_osc_pt2pt_peer_unex (ompi_osc_pt2pt_peer_t *peer)
136+
{
137+
return !!(peer->flags & OMPI_OSC_PT2PT_PEER_FLAG_UNEX);
138+
}
139+
140+
static inline bool ompi_osc_pt2pt_peer_eager_active (ompi_osc_pt2pt_peer_t *peer)
141+
{
142+
return !!(peer->flags & OMPI_OSC_PT2PT_PEER_FLAG_EAGER);
143+
}
144+
145+
static inline void ompi_osc_pt2pt_peer_set_flag (ompi_osc_pt2pt_peer_t *peer, int32_t flag, bool value)
146+
{
147+
if (value) {
148+
peer->flags |= flag;
149+
} else {
150+
peer->flags &= ~flag;
151+
}
152+
}
153+
154+
static inline bool ompi_osc_pt2pt_peer_set_locked (ompi_osc_pt2pt_peer_t *peer, bool value)
155+
{
156+
ompi_osc_pt2pt_peer_set_flag (peer, OMPI_OSC_PT2PT_PEER_FLAG_LOCK, value);
157+
}
158+
159+
static inline bool ompi_osc_pt2pt_peer_set_unex (ompi_osc_pt2pt_peer_t *peer, bool value)
160+
{
161+
ompi_osc_pt2pt_peer_set_flag (peer, OMPI_OSC_PT2PT_PEER_FLAG_UNEX, value);
162+
}
163+
164+
static inline bool ompi_osc_pt2pt_peer_set_eager_active (ompi_osc_pt2pt_peer_t *peer, bool value)
165+
{
166+
ompi_osc_pt2pt_peer_set_flag (peer, OMPI_OSC_PT2PT_PEER_FLAG_EAGER, value);
167+
}
168+
169+
OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_peer_t);
170+
121171
/** Module structure. Exactly one of these is associated with each
122172
PT2PT window */
123173
struct ompi_osc_pt2pt_module_t {
@@ -431,6 +481,8 @@ int ompi_osc_pt2pt_component_irecv(ompi_osc_pt2pt_module_t *module,
431481
int tag,
432482
struct ompi_communicator_t *comm);
433483

484+
int ompi_osc_pt2pt_lock_remote (ompi_osc_pt2pt_module_t *module, int target, ompi_osc_pt2pt_sync_t *lock);
485+
434486
/**
435487
* ompi_osc_pt2pt_progress_pending_acc:
436488
*
@@ -845,6 +897,12 @@ static inline void ompi_osc_pt2pt_module_lock_remove (struct ompi_osc_pt2pt_modu
845897
static inline ompi_osc_pt2pt_sync_t *ompi_osc_pt2pt_module_sync_lookup (ompi_osc_pt2pt_module_t *module, int target,
846898
struct ompi_osc_pt2pt_peer_t **peer)
847899
{
900+
ompi_osc_pt2pt_peer_t *tmp;
901+
902+
if (NULL == peer) {
903+
peer = &tmp;
904+
}
905+
848906
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
849907
"osc/pt2pt: looking for synchronization object for target %d", target));
850908

@@ -862,8 +920,9 @@ static inline ompi_osc_pt2pt_sync_t *ompi_osc_pt2pt_module_sync_lookup (ompi_osc
862920

863921
/* fence epoch is now active */
864922
module->all_sync.epoch_active = true;
865-
if (peer) {
866-
*peer = ompi_osc_pt2pt_peer_lookup (module, target);
923+
*peer = ompi_osc_pt2pt_peer_lookup (module, target);
924+
if (OMPI_OSC_PT2PT_SYNC_TYPE_LOCK == module->all_sync.type && !ompi_osc_pt2pt_peer_locked (*peer)) {
925+
(void) ompi_osc_pt2pt_lock_remote (module, target, &module->all_sync);
867926
}
868927

869928
return &module->all_sync;

ompi/mca/osc/pt2pt/osc_pt2pt_active_target.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -261,13 +261,13 @@ int ompi_osc_pt2pt_start (ompi_group_t *group, int assert, ompi_win_t *win)
261261
for (int i = 0 ; i < sync->num_peers ; ++i) {
262262
ompi_osc_pt2pt_peer_t *peer = sync->peer_list.peers[i];
263263

264-
if (peer->unexpected_post) {
264+
if (ompi_osc_pt2pt_peer_unex (peer)) {
265265
/* the peer already sent a post message for this pscw access epoch */
266266
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
267267
"found unexpected post from %d",
268268
peer->rank));
269269
OPAL_THREAD_ADD32 (&sync->sync_expected, -1);
270-
peer->unexpected_post = false;
270+
ompi_osc_pt2pt_peer_set_unex (peer, false);
271271
}
272272
}
273273
OPAL_THREAD_UNLOCK(&sync->lock);
@@ -600,7 +600,7 @@ void osc_pt2pt_incoming_post (ompi_osc_pt2pt_module_t *module, int source)
600600
"received unexpected post message from %d for future PSCW synchronization",
601601
source));
602602

603-
peer->unexpected_post = true;
603+
ompi_osc_pt2pt_peer_set_unex (peer, true);
604604
OPAL_THREAD_UNLOCK(&sync->lock);
605605
} else {
606606
OPAL_THREAD_UNLOCK(&sync->lock);

ompi/mca/osc/pt2pt/osc_pt2pt_component.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -492,7 +492,7 @@ static void ompi_osc_pt2pt_peer_construct (ompi_osc_pt2pt_peer_t *peer)
492492
OBJ_CONSTRUCT(&peer->lock, opal_mutex_t);
493493
peer->active_frag = NULL;
494494
peer->passive_incoming_frag_count = 0;
495-
peer->unexpected_post = false;
495+
peer->flags = 0;
496496
}
497497

498498
static void ompi_osc_pt2pt_peer_destruct (ompi_osc_pt2pt_peer_t *peer)

ompi/mca/osc/pt2pt/osc_pt2pt_passive_target.c

Lines changed: 35 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -58,11 +58,14 @@ static int ompi_osc_pt2pt_flush_lock (ompi_osc_pt2pt_module_t *module, ompi_osc_
5858
static inline int ompi_osc_pt2pt_lock_self (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_sync_t *lock)
5959
{
6060
const int my_rank = ompi_comm_rank (module->comm);
61+
ompi_osc_pt2pt_peer_t *peer = ompi_osc_pt2pt_peer_lookup (module, my_rank);
6162
int lock_type = lock->sync.lock.type;
6263
bool acquired = false;
6364

6465
assert (lock->type == OMPI_OSC_PT2PT_SYNC_TYPE_LOCK);
6566

67+
(void) OPAL_THREAD_ADD32(&lock->sync_expected, 1);
68+
6669
acquired = ompi_osc_pt2pt_lock_try_acquire (module, my_rank, lock_type, (uint64_t) (uintptr_t) lock);
6770
if (!acquired) {
6871
/* queue the lock */
@@ -73,6 +76,9 @@ static inline int ompi_osc_pt2pt_lock_self (ompi_osc_pt2pt_module_t *module, omp
7376
ompi_osc_pt2pt_sync_wait_expected (lock);
7477
}
7578

79+
ompi_osc_pt2pt_peer_set_locked (peer, true);
80+
ompi_osc_pt2pt_peer_set_eager_active (peer, true);
81+
7682
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
7783
"local lock aquired"));
7884

@@ -81,8 +87,12 @@ static inline int ompi_osc_pt2pt_lock_self (ompi_osc_pt2pt_module_t *module, omp
8187

8288
static inline void ompi_osc_pt2pt_unlock_self (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_sync_t *lock)
8389
{
90+
const int my_rank = ompi_comm_rank (module->comm);
91+
ompi_osc_pt2pt_peer_t *peer = ompi_osc_pt2pt_peer_lookup (module, my_rank);
8492
int lock_type = lock->sync.lock.type;
8593

94+
(void) OPAL_THREAD_ADD32(&lock->sync_expected, 1);
95+
8696
assert (lock->type == OMPI_OSC_PT2PT_SYNC_TYPE_LOCK);
8797

8898
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
@@ -98,16 +108,22 @@ static inline void ompi_osc_pt2pt_unlock_self (ompi_osc_pt2pt_module_t *module,
98108
/* need to ensure we make progress */
99109
opal_progress();
100110

111+
ompi_osc_pt2pt_peer_set_locked (peer, false);
112+
ompi_osc_pt2pt_peer_set_eager_active (peer, false);
113+
101114
ompi_osc_pt2pt_sync_expected (lock);
102115
}
103116

104-
static inline int ompi_osc_pt2pt_lock_remote (ompi_osc_pt2pt_module_t *module, int target, ompi_osc_pt2pt_sync_t *lock)
117+
int ompi_osc_pt2pt_lock_remote (ompi_osc_pt2pt_module_t *module, int target, ompi_osc_pt2pt_sync_t *lock)
105118
{
119+
ompi_osc_pt2pt_peer_t *peer = ompi_osc_pt2pt_peer_lookup (module, target);
106120
int lock_type = lock->sync.lock.type;
107121
ompi_osc_pt2pt_header_lock_t lock_req;
108122

109123
int ret;
110124

125+
(void) OPAL_THREAD_ADD32(&lock->sync_expected, 1);
126+
111127
assert (lock->type == OMPI_OSC_PT2PT_SYNC_TYPE_LOCK);
112128

113129
/* generate a lock request */
@@ -128,6 +144,9 @@ static inline int ompi_osc_pt2pt_lock_remote (ompi_osc_pt2pt_module_t *module, i
128144

129145
/* make sure the request gets sent, so we can start eager sending... */
130146
ret = ompi_osc_pt2pt_frag_flush_target (module, target);
147+
if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
148+
ompi_osc_pt2pt_peer_set_locked (peer, true);
149+
}
131150

132151
return ret;
133152
}
@@ -140,6 +159,8 @@ static inline int ompi_osc_pt2pt_unlock_remote (ompi_osc_pt2pt_module_t *module,
140159
ompi_osc_pt2pt_header_unlock_t unlock_req;
141160
int ret;
142161

162+
(void) OPAL_THREAD_ADD32(&lock->sync_expected, 1);
163+
143164
assert (lock->type == OMPI_OSC_PT2PT_SYNC_TYPE_LOCK);
144165

145166
unlock_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_REQ;
@@ -169,6 +190,9 @@ static inline int ompi_osc_pt2pt_unlock_remote (ompi_osc_pt2pt_module_t *module,
169190
return ret;
170191
}
171192

193+
ompi_osc_pt2pt_peer_set_locked (peer, false);
194+
ompi_osc_pt2pt_peer_set_eager_active (peer, false);
195+
172196
return ompi_osc_pt2pt_frag_flush_target(module, target);
173197
}
174198

@@ -179,6 +203,8 @@ static inline int ompi_osc_pt2pt_flush_remote (ompi_osc_pt2pt_module_t *module,
179203
int32_t frag_count = opal_atomic_swap_32 ((int32_t *) module->epoch_outgoing_frag_count + target, -1);
180204
int ret;
181205

206+
(void) OPAL_THREAD_ADD32(&lock->sync_expected, 1);
207+
182208
assert (lock->type == OMPI_OSC_PT2PT_SYNC_TYPE_LOCK);
183209

184210
flush_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_REQ;
@@ -218,8 +244,6 @@ static int ompi_osc_pt2pt_lock_internal_execute (ompi_osc_pt2pt_module_t *module
218244
assert (lock->type == OMPI_OSC_PT2PT_SYNC_TYPE_LOCK);
219245

220246
if (0 == (assert & MPI_MODE_NOCHECK)) {
221-
lock->sync_expected = (-1 == target) ? ompi_comm_size (module->comm) : 1;
222-
223247
if (my_rank != target && target != -1) {
224248
ret = ompi_osc_pt2pt_lock_remote (module, target, lock);
225249
} else {
@@ -231,19 +255,7 @@ static int ompi_osc_pt2pt_lock_internal_execute (ompi_osc_pt2pt_module_t *module
231255
return ret;
232256
}
233257

234-
if (-1 == target) {
235-
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
236-
if (my_rank == i) {
237-
continue;
238-
}
239-
240-
ret = ompi_osc_pt2pt_lock_remote (module, i, lock);
241-
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
242-
return ret;
243-
}
244-
}
245-
246-
}
258+
/* for lock_all there is nothing more to do. we will lock peer's on demand */
247259
} else {
248260
lock->eager_send_active = true;
249261
}
@@ -312,7 +324,7 @@ static int ompi_osc_pt2pt_lock_internal (int lock_type, int target, int assert,
312324
lock->sync.lock.target = target;
313325
lock->sync.lock.type = lock_type;
314326
lock->sync.lock.assert = assert;
315-
327+
lock->num_peers = (-1 == target) ? ompi_comm_size (&module->comm) : 1;
316328
lock->sync_expected = 0;
317329

318330
/* delay all eager sends until we've heard back.. */
@@ -376,13 +388,13 @@ static int ompi_osc_pt2pt_unlock_internal (int target, ompi_win_t *win)
376388
"ompi_osc_pt2pt_unlock_internal: all lock acks received"));
377389

378390
if (!(lock->sync.lock.assert & MPI_MODE_NOCHECK)) {
379-
lock->sync_expected = (-1 == target) ? ompi_comm_size (module->comm) : 1;
380-
381391
if (my_rank != target) {
382392
if (-1 == target) {
383393
/* send unlock messages to all of my peers */
384394
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
385-
if (my_rank == i) {
395+
ompi_osc_pt2pt_peer_t *peer = ompi_osc_pt2pt_peer_lookup (module, i);
396+
397+
if (my_rank == i || !ompi_osc_pt2pt_peer_locked (peer)) {
386398
continue;
387399
}
388400

@@ -469,12 +481,6 @@ static int ompi_osc_pt2pt_flush_lock (ompi_osc_pt2pt_module_t *module, ompi_osc_
469481
able to eager send before we can transfer all the data... */
470482
ompi_osc_pt2pt_sync_wait_expected (lock);
471483

472-
if (-1 == target) {
473-
lock->sync_expected = ompi_comm_size(module->comm) - 1;
474-
} else {
475-
lock->sync_expected = 1;
476-
}
477-
478484
if (-1 == target) {
479485
/* NTH: no local flush */
480486
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
@@ -488,7 +494,6 @@ static int ompi_osc_pt2pt_flush_lock (ompi_osc_pt2pt_module_t *module, ompi_osc_
488494
}
489495
}
490496
} else {
491-
492497
/* send control message with flush request and count */
493498
ret = ompi_osc_pt2pt_flush_remote (module, target, lock);
494499
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
@@ -787,6 +792,7 @@ int ompi_osc_pt2pt_process_lock (ompi_osc_pt2pt_module_t* module, int source,
787792
void ompi_osc_pt2pt_process_lock_ack (ompi_osc_pt2pt_module_t *module,
788793
ompi_osc_pt2pt_header_lock_ack_t *lock_ack_header)
789794
{
795+
ompi_osc_pt2pt_peer_t *peer = ompi_osc_pt2pt_peer_lookup (module, lock_ack_header->source);
790796
ompi_osc_pt2pt_sync_t *lock;
791797

792798
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
@@ -796,6 +802,8 @@ void ompi_osc_pt2pt_process_lock_ack (ompi_osc_pt2pt_module_t *module,
796802
lock = (ompi_osc_pt2pt_sync_t *) (uintptr_t) lock_ack_header->lock_ptr;
797803
assert (NULL != lock);
798804

805+
ompi_osc_pt2pt_peer_set_eager_active (peer, true);
806+
799807
ompi_osc_pt2pt_sync_expected (lock);
800808
}
801809

ompi/mca/osc/pt2pt/osc_pt2pt_sync.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,9 @@ static inline void ompi_osc_pt2pt_sync_expected (ompi_osc_pt2pt_sync_t *sync)
164164
int32_t new_value = OPAL_THREAD_ADD32 (&sync->sync_expected, -1);
165165
if (0 == new_value) {
166166
OPAL_THREAD_LOCK(&sync->lock);
167-
sync->eager_send_active = true;
167+
if (!(sync->type == OMPI_OSC_PT2PT_SYNC_TYPE_LOCK && sync->num_peers > 1)) {
168+
sync->eager_send_active = true;
169+
}
168170
opal_condition_broadcast (&sync->cond);
169171
OPAL_THREAD_UNLOCK(&sync->lock);
170172
}

0 commit comments

Comments
 (0)