Commit 022c658

osc/rdma: rework locking code to improve behavior of unlock
This commit changes the locking code to allow the lock release to be non-blocking. This helps with releasing the accumulate lock, which may occur in a BTL callback.

Fixes #3616

Signed-off-by: Nathan Hjelm <[email protected]>
1 parent 055ce80 commit 022c658

3 files changed: +136, -130 lines changed

ompi/mca/osc/rdma/osc_rdma_active_target.c

Lines changed: 46 additions & 79 deletions
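The hunks below build on a new ompi_osc_rdma_pending_op_t object whose definition is not in this file; it presumably lives in one of the other two files touched by this commit. Inferring only from the fields used here (op_frag, op_buffer, op_result, op_size, op_complete), the type is roughly the sketch below; the field types are assumptions, not copied from the commit.

/* Sketch of the pending-op object used as the BTL callback context below.
 * Reconstructed from usage in this file only; the committed definition may differ. */
struct ompi_osc_rdma_pending_op_t {
    opal_list_item_t super;                /* the class below is built on opal_list_item_t */
    struct ompi_osc_rdma_frag_t *op_frag;  /* registered fragment backing op_buffer, may be NULL */
    void *op_buffer;                       /* where the BTL deposits a fetched value */
    void *op_result;                       /* caller-visible destination for that value, may be NULL */
    size_t op_size;                        /* bytes copied from op_buffer to op_result on completion */
    volatile bool op_complete;             /* set by ompi_osc_rdma_atomic_complete() */
};
typedef struct ompi_osc_rdma_pending_op_t ompi_osc_rdma_pending_op_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_pending_op_t);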
@@ -48,18 +48,47 @@ typedef struct ompi_osc_rdma_pending_post_t ompi_osc_rdma_pending_post_t;
 
 static OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_post_t, opal_list_item_t, NULL, NULL);
 
+static void ompi_osc_rdma_pending_op_construct (ompi_osc_rdma_pending_op_t *pending_op)
+{
+    pending_op->op_frag = NULL;
+    pending_op->op_buffer = NULL;
+    pending_op->op_result = NULL;
+    pending_op->op_complete = false;
+}
+
+static void ompi_osc_rdma_pending_op_destruct (ompi_osc_rdma_pending_op_t *pending_op)
+{
+    if (NULL != pending_op->op_frag) {
+        ompi_osc_rdma_frag_complete (pending_op->op_frag);
+    }
+
+    ompi_osc_rdma_pending_op_construct (pending_op);
+}
+
+OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_op_t, opal_list_item_t,
+                   ompi_osc_rdma_pending_op_construct,
+                   ompi_osc_rdma_pending_op_destruct);
+
 /**
  * Dummy completion function for atomic operations
  */
 void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                     void *local_address, mca_btl_base_registration_handle_t *local_handle,
                                     void *context, void *data, int status)
 {
-    volatile bool *atomic_complete = (volatile bool *) context;
+    ompi_osc_rdma_pending_op_t *pending_op = (ompi_osc_rdma_pending_op_t *) context;
 
-    if (atomic_complete) {
-        *atomic_complete = true;
+    if (pending_op->op_result) {
+        memmove (pending_op->op_result, pending_op->op_buffer, pending_op->op_size);
     }
+
+    if (NULL != pending_op->op_frag) {
+        ompi_osc_rdma_frag_complete (pending_op->op_frag);
+        pending_op->op_frag = NULL;
+    }
+
+    pending_op->op_complete = true;
+    OBJ_RELEASE(pending_op);
 }
 
 /**
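Note the ownership rule this reworked callback establishes: the pending op is created with one reference, and ompi_osc_rdma_atomic_complete() consumes it once the BTL reports completion. A caller that still wants to block must therefore hold a second reference before spinning. A sketch of the expected caller side, assuming OPAL's usual OBJ_RETAIN/OBJ_RELEASE reference counting (not code from this commit):

/* Illustrative caller-side pattern for the reworked callback. */
ompi_osc_rdma_pending_op_t *pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t);

OBJ_RETAIN(pending_op);            /* second reference: keeps op_complete readable after
                                    * the callback drops the reference it was handed */

/* ... issue the BTL atomic with ompi_osc_rdma_atomic_complete as the callback
 * and (void *) pending_op as the callback context ... */

while (!pending_op->op_complete) { /* blocking callers drive progress until done */
    ompi_osc_rdma_progress (module);
}

OBJ_RELEASE(pending_op);           /* drop the caller's reference */

A non-blocking caller skips the retain and the spin entirely and returns right after issuing the operation; the callback then performs all of the cleanup, which is what makes a lock release safe to start from inside a BTL callback.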
@@ -182,9 +211,6 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
     ompi_osc_rdma_peer_t **peers;
     int my_rank = ompi_comm_rank (module->comm);
     ompi_osc_rdma_state_t *state = module->state;
-    volatile bool atomic_complete;
-    ompi_osc_rdma_frag_t *frag = NULL;
-    osc_rdma_counter_t *temp = NULL;
     int ret;
 
     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "post: %p, %d, %s", (void*) group, assert, win->w_name);
@@ -212,9 +238,6 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
     state->num_complete_msgs = 0;
     OPAL_THREAD_UNLOCK(&module->lock);
 
-    /* allocate a temporary buffer for atomic response */
-    ret = ompi_osc_rdma_frag_alloc (module, 8, &frag, (char **) &temp);
-
     if ((assert & MPI_MODE_NOCHECK) || 0 == ompi_group_size (group)) {
         return OMPI_SUCCESS;
     }
@@ -226,7 +249,6 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
     /* translate group ranks into the communicator */
     peers = ompi_osc_rdma_get_peers (module, module->pw_group);
     if (OPAL_UNLIKELY(NULL == peers)) {
-        ompi_osc_rdma_frag_complete (frag);
         return OMPI_ERR_OUT_OF_RESOURCE;
     }
 
@@ -236,65 +258,40 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
     for (int i = 0 ; i < ompi_group_size(module->pw_group) ; ++i) {
         ompi_osc_rdma_peer_t *peer = peers[i];
         uint64_t target = (uint64_t) (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, post_index);
-        int post_index;
+        ompi_osc_rdma_lock_t post_index;
 
         if (peer->rank == my_rank) {
             ompi_osc_rdma_handle_post (module, my_rank, NULL, 0);
             continue;
         }
 
         /* get a post index */
-        atomic_complete = false;
         if (!ompi_osc_rdma_peer_local_state (peer)) {
-            do {
-                ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, temp, target, frag->handle,
-                                                            peer->state_handle, MCA_BTL_ATOMIC_ADD, 1, 0, MCA_BTL_NO_ORDER,
-                                                            ompi_osc_rdma_atomic_complete, (void *) &atomic_complete, NULL);
-                assert (OPAL_SUCCESS >= ret);
-
-                if (OMPI_SUCCESS == ret) {
-                    while (!atomic_complete) {
-                        ompi_osc_rdma_progress (module);
-                    }
-
-                    break;
-                }
-
-                ompi_osc_rdma_progress (module);
-            } while (1);
+            ret = ompi_osc_rdma_lock_btl_fop (module, peer, target, MCA_BTL_ATOMIC_ADD, 1, &post_index, true);
+            assert (OMPI_SUCCESS == ret);
         } else {
-            *temp = ompi_osc_rdma_counter_add ((osc_rdma_counter_t *) (intptr_t) target, 1) - 1;
+            post_index = ompi_osc_rdma_counter_add ((osc_rdma_counter_t *) (intptr_t) target, 1) - 1;
         }
-        post_index = (*temp) & (OMPI_OSC_RDMA_POST_PEER_MAX - 1);
+
+        post_index &= OMPI_OSC_RDMA_POST_PEER_MAX - 1;
 
         target = (uint64_t) (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, post_peers) +
             sizeof (osc_rdma_counter_t) * post_index;
 
         do {
+            ompi_osc_rdma_lock_t result;
+
             OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attempting to post to index %d @ rank %d", post_index, peer->rank);
 
             /* try to post. if the value isn't 0 then another rank is occupying this index */
             if (!ompi_osc_rdma_peer_local_state (peer)) {
-                atomic_complete = false;
-                ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->state_endpoint, temp, target, frag->handle, peer->state_handle,
-                                                              0, 1 + (int64_t) my_rank, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete,
-                                                              (void *) &atomic_complete, NULL);
-                assert (OPAL_SUCCESS >= ret);
-
-                if (OMPI_SUCCESS == ret) {
-                    while (!atomic_complete) {
-                        ompi_osc_rdma_progress (module);
-                    }
-                } else {
-                    ompi_osc_rdma_progress (module);
-                    continue;
-                }
-
+                ret = ompi_osc_rdma_lock_btl_cswap (module, peer, target, 0, 1 + (int64_t) my_rank, &result);
+                assert (OMPI_SUCCESS == ret);
             } else {
-                *temp = !ompi_osc_rdma_lock_cmpset ((osc_rdma_counter_t *) target, 0, 1 + (osc_rdma_counter_t) my_rank);
+                result = !ompi_osc_rdma_lock_cmpset ((osc_rdma_counter_t *) target, 0, 1 + (osc_rdma_counter_t) my_rank);
             }
 
-            if (OPAL_LIKELY(0 == *temp)) {
+            if (OPAL_LIKELY(0 == result)) {
                 break;
             }
 
@@ -313,8 +310,6 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
         } while (1);
     }
 
-    ompi_osc_rdma_frag_complete (frag);
-
     ompi_osc_rdma_release_peers (peers, ompi_group_size(module->pw_group));
 
     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "post complete");
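The new ompi_osc_rdma_lock_btl_fop(), ompi_osc_rdma_lock_btl_cswap() and ompi_osc_rdma_lock_btl_op() helpers are defined in another file of this commit. Judging from the open-coded loop they replace above and the reworked completion callback, the fetching variant plausibly looks like the sketch below; the local names and the exact signature are assumptions, and the retry and "completed in place" handling a real BTL wrapper needs is omitted.

/* Rough sketch of the fetching-atomic helper, reconstructed from the loop it
 * replaces above; not the implementation committed in the other files. */
static int lock_btl_fop_sketch (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
                                uint64_t target, int op, ompi_osc_rdma_lock_t operand,
                                ompi_osc_rdma_lock_t *result, bool wait_for_completion)
{
    ompi_osc_rdma_pending_op_t *pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t);
    int ret;

    /* stage the fetched value in a registered fragment; ompi_osc_rdma_atomic_complete()
     * copies it into *result and returns the fragment */
    ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
        OBJ_RELEASE(pending_op);
        return ret;
    }

    pending_op->op_result = (void *) result;
    pending_op->op_size = sizeof (*result);

    if (wait_for_completion) {
        OBJ_RETAIN(pending_op);            /* the callback releases its own reference */
    }

    ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint,
                                                pending_op->op_buffer, target, pending_op->op_frag->handle,
                                                peer->state_handle, op, operand, 0, MCA_BTL_NO_ORDER,
                                                ompi_osc_rdma_atomic_complete, (void *) pending_op, NULL);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        /* nothing was started: drop every reference; the destructor returns the fragment */
        if (wait_for_completion) {
            OBJ_RELEASE(pending_op);
        }
        OBJ_RELEASE(pending_op);
        return ret;
    }

    if (wait_for_completion) {
        while (!pending_op->op_complete) {
            ompi_osc_rdma_progress (module);
        }
        OBJ_RELEASE(pending_op);
    }

    return OMPI_SUCCESS;
}

The non-fetching and compare-and-swap variants would follow the same pattern over btl_atomic_op() and btl_atomic_cswap().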
@@ -422,9 +417,7 @@ int ompi_osc_rdma_complete_atomic (ompi_win_t *win)
 {
     ompi_osc_rdma_module_t *module = GET_MODULE(win);
     ompi_osc_rdma_sync_t *sync = &module->all_sync;
-    ompi_osc_rdma_frag_t *frag = NULL;
     ompi_osc_rdma_peer_t **peers;
-    void *scratch_lock = NULL;
     ompi_group_t *group;
     int group_size, ret;
 
@@ -459,45 +452,19 @@ int ompi_osc_rdma_complete_atomic (ompi_win_t *win)
 
     ompi_osc_rdma_sync_rdma_complete (sync);
 
-    if (!(MCA_BTL_FLAGS_ATOMIC_OPS & module->selected_btl->btl_flags)) {
-        /* need a temporary buffer for performing fetching atomics */
-        ret = ompi_osc_rdma_frag_alloc (module, 8, &frag, (char **) &scratch_lock);
-        if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
-            return ret;
-        }
-    }
-
     /* for each process in the group increment their number of complete messages */
     for (int i = 0 ; i < group_size ; ++i) {
         ompi_osc_rdma_peer_t *peer = peers[i];
         intptr_t target = (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, num_complete_msgs);
 
         if (!ompi_osc_rdma_peer_local_state (peer)) {
-            do {
-                if (MCA_BTL_FLAGS_ATOMIC_OPS & module->selected_btl->btl_flags) {
-                    ret = module->selected_btl->btl_atomic_op (module->selected_btl, peer->state_endpoint, target, peer->state_handle,
-                                                               MCA_BTL_ATOMIC_ADD, 1, 0, MCA_BTL_NO_ORDER,
-                                                               ompi_osc_rdma_atomic_complete, NULL, NULL);
-                } else {
-                    /* don't care about the read value so use the scratch lock */
-                    ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, scratch_lock,
-                                                                target, frag->handle, peer->state_handle, MCA_BTL_ATOMIC_ADD, 1,
-                                                                0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, NULL, NULL);
-                }
-
-                if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
-                    break;
-                }
-            } while (1);
+            ret = ompi_osc_rdma_lock_btl_op (module, peer, target, MCA_BTL_ATOMIC_ADD, 1, true);
+            assert (OMPI_SUCCESS == ret);
         } else {
             (void) ompi_osc_rdma_counter_add ((osc_rdma_counter_t *) target, 1);
         }
     }
 
-    if (frag) {
-        ompi_osc_rdma_frag_complete (frag);
-    }
-
     /* release our reference to peers in this group */
     ompi_osc_rdma_release_peers (peers, group_size);
 
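The payoff described in the commit message is on the release side, which is changed in the other files of this commit rather than here: because ompi_osc_rdma_atomic_complete() now owns all of the cleanup, a release can pass false for what appears to be the trailing wait-for-completion flag and return immediately, even when it is issued from inside a BTL callback. A hypothetical unlock-path call, with lock_target and the decrement value as placeholders:

/* Drop the accumulate lock without blocking; the pending op is cleaned up by
 * ompi_osc_rdma_atomic_complete() whenever the BTL finishes the atomic.
 * lock_target and -1 are placeholders for this illustration. */
ret = ompi_osc_rdma_lock_btl_op (module, peer, lock_target, MCA_BTL_ATOMIC_ADD,
                                 -1, false /* do not wait for completion */);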