Skip to content

Commit 7cc5841

Browse files
authored
Merge pull request open-mpi#7180 from devreal/btl-ugni-deadlock-v4.0.x
uGNI: Fix potential deadlock when processing outstanding transfers (v4.0.x)
2 parents (d57bea0 + a346756), commit 7cc5841

File tree

1 file changed

+21
-9
lines changed

1 file changed

+21
-9
lines changed

opal/mca/btl/ugni/btl_ugni_component.c

Lines changed: 21 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -664,34 +664,46 @@ static inline int
664664
mca_btl_ugni_progress_wait_list (mca_btl_ugni_module_t *ugni_module)
665665
{
666666
int rc = OPAL_SUCCESS;
667+
opal_list_t tmplist;
668+
opal_list_t *waitlist = &ugni_module->ep_wait_list;
667669
mca_btl_base_endpoint_t *endpoint = NULL;
668670
int count;
669671

670-
if (0 == opal_list_get_size(&ugni_module->ep_wait_list)) {
671-
return 0;
672-
}
673-
674672
/* check the count before taking the lock to avoid unnecessary locking */
675-
count = opal_list_get_size(&ugni_module->ep_wait_list);
673+
count = opal_list_get_size(waitlist);
676674
if (0 == count) {
677675
return 0;
678676
}
679677

678+
/* Don't hold the wait-list lock while processing the list as that may lead
679+
* to a deadlock.
680+
* Instead, move the wait_list elements into a temporary list and work on that.*/
681+
OBJ_CONSTRUCT(&tmplist, opal_list_t);
680682
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
681-
count = opal_list_get_size(&ugni_module->ep_wait_list);
683+
opal_list_join(&tmplist, opal_list_get_end(&tmplist), waitlist);
684+
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
685+
count = opal_list_get_size(&tmplist);
682686
do {
683-
endpoint = (mca_btl_base_endpoint_t *) opal_list_remove_first (&ugni_module->ep_wait_list);
687+
endpoint = (mca_btl_base_endpoint_t *) opal_list_remove_first (&tmplist);
684688
if (endpoint != NULL) {
685689
rc = mca_btl_ugni_progress_send_wait_list (endpoint);
686690

687691
if (OPAL_SUCCESS != rc) {
688-
opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
692+
opal_list_append (&tmplist, &endpoint->super);
689693
} else {
690694
endpoint->wait_listed = false;
691695
}
692696
}
693697
} while (endpoint != NULL && --count > 0) ;
694-
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
698+
699+
/* reinsert unfinished elements into the wait-list */
700+
count = opal_list_get_size(&tmplist);
701+
if (0 < count) {
702+
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
703+
opal_list_join(waitlist, opal_list_get_end(waitlist), &tmplist);
704+
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
705+
}
706+
OBJ_DESTRUCT(&tmplist);
695707

696708
return rc;
697709
}

0 commit comments

Comments
 (0)