Skip to content

Commit c8c0437

Browse files
committed
Hotfix for lost callbacks
Both completing a continuation request and re-activating it (marking it pending again when a new continuation is registered) must be protected by the request's lock, to ensure that we don't miss any updates.

Signed-off-by: Joseph Schuchart <[email protected]>
1 parent d5765ab commit c8c0437

File tree

1 file changed

+28
-6
lines changed

1 file changed

+28
-6
lines changed

ompi/mpiext/continue/c/continuation.c

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -267,12 +267,19 @@ void ompi_continue_cont_release(ompi_continuation_t *cont, int rc)
267267
cont_req->cont_errorinfo.type = OMPI_REQUEST_CONT;
268268
}
269269

270+
opal_atomic_lock(&cont_req->cont_lock);
270271
int num_active = OPAL_THREAD_ADD_FETCH32(&cont_req->cont_num_active, -1);
271272
if (num_active == 0) {
272273
opal_atomic_wmb();
273-
/* signal that all continuations were found complete */
274-
ompi_request_complete(&cont_req->super, true);
274+
//opal_atomic_lock(&cont_req->cont_lock);
275+
if (!REQUEST_COMPLETE(&cont_req->super)) {
276+
/* signal that all continuations were found complete */
277+
//printf("COMPLETE cont_req %p cont %p\n", cont_req, cont);
278+
ompi_request_complete(&cont_req->super, true);
279+
}
280+
//opal_atomic_unlock(&cont_req->cont_lock);
275281
}
282+
opal_atomic_unlock(&cont_req->cont_lock);
276283

277284
OBJ_RELEASE(cont_req);
278285

@@ -618,17 +625,31 @@ ompi_continuation_t *ompi_continue_cont_create(
618625
/* signal that the continuation request has a new continuation */
619626
OBJ_RETAIN(cont_req);
620627

621-
OPAL_THREAD_ADD_FETCH32(&cont_req->cont_num_active, 1);
628+
opal_atomic_lock(&cont_req->cont_lock);
629+
int prev_num_active = OPAL_THREAD_ADD_FETCH32(&cont_req->cont_num_active, 1);
622630

623631
/* if the continuation request was completed we mark it pending here */
624-
if (REQUEST_COMPLETE(&cont_req->super)) {
632+
//if (REQUEST_COMPLETE(&cont_req->super)) {
633+
if (prev_num_active == 1) {
634+
//printf("PENDING cont_req %p cont %p\n", cont_req, cont);
625635
if (using_threads) {
626-
intptr_t tmp = (intptr_t)REQUEST_COMPLETED;
627-
opal_atomic_compare_exchange_strong_ptr((intptr_t*)&cont_req->super.req_complete, &tmp, (intptr_t)REQUEST_PENDING);
636+
//opal_atomic_lock(&cont_req->cont_lock);
637+
//if (REQUEST_COMPLETE(&cont_req->super)) {
638+
cont_req->super.req_complete = REQUEST_PENDING;
639+
cont_req->super.req_complete_cb = NULL;
640+
//}
641+
/* NOTE: atomic operations not required here, we're protected by the lock */
642+
//intptr_t tmp = (intptr_t)REQUEST_COMPLETED;
643+
//opal_atomic_compare_exchange_strong_ptr((intptr_t*)&cont_req->super.req_complete, &tmp, (intptr_t)REQUEST_PENDING);
644+
//tmp = (intptr_t)REQUEST_CB_COMPLETED;
645+
//opal_atomic_compare_exchange_strong_ptr((intptr_t*)&cont_req->super.req_complete_cb, &tmp, (intptr_t)NULL);
646+
//opal_atomic_unlock(&cont_req->cont_lock);
628647
} else {
629648
cont_req->super.req_complete = REQUEST_PENDING;
649+
cont_req->super.req_complete_cb = NULL;
630650
}
631651
}
652+
opal_atomic_unlock(&cont_req->cont_lock);
632653

633654
/* if we don't have the requests we cannot handle oob errors,
634655
* so don't bother keeping the continuation around */
@@ -1005,6 +1026,7 @@ static int ompi_continue_request_start(size_t count, ompi_request_t** cont_req_p
10051026
if (using_threads) {
10061027
opal_atomic_lock(&cont_req->cont_lock);
10071028
}
1029+
//OMPI_REQUEST_INIT(&cont_req->super, true);
10081030
if (NULL == cont_req->cont_complete_list) {
10091031
opal_list_join(&continuation_list,
10101032
opal_list_get_begin(&continuation_list),

0 commit comments

Comments
 (0)