@@ -666,34 +666,46 @@ static inline int
666666mca_btl_ugni_progress_wait_list (mca_btl_ugni_module_t * ugni_module )
667667{
668668 int rc = OPAL_SUCCESS ;
669+ opal_list_t tmplist ; /* private working list; processed without holding ep_wait_list_lock */
670+ opal_list_t * waitlist = & ugni_module -> ep_wait_list ;
669671 mca_btl_base_endpoint_t * endpoint = NULL ;
670672 int count ;
671673
672- if (0 == opal_list_get_size (& ugni_module -> ep_wait_list )) {
673- return 0 ;
674- }
675-
676674 /* check the count before taking the lock to avoid unnecessary locking */
677- count = opal_list_get_size (& ugni_module -> ep_wait_list );
675+ count = opal_list_get_size (waitlist );
678676 if (0 == count ) {
679677 return 0 ;
680678 }
681679
680+ /* Do not hold the wait-list lock while processing the list, as that may
681+ * lead to a deadlock. Instead, move the wait-list elements into a temporary
682+ * list under the lock and process that list with the lock released. */
683+ OBJ_CONSTRUCT (& tmplist , opal_list_t );
682684 OPAL_THREAD_LOCK (& ugni_module -> ep_wait_list_lock );
683- count = opal_list_get_size (& ugni_module -> ep_wait_list );
685+ opal_list_join (& tmplist , opal_list_get_end (& tmplist ), waitlist );
686+ OPAL_THREAD_UNLOCK (& ugni_module -> ep_wait_list_lock );
687+ count = opal_list_get_size (& tmplist ); /* loop bound: endpoints re-appended on failure are not retried in this call */
684688 do {
685- endpoint = (mca_btl_base_endpoint_t * ) opal_list_remove_first (& ugni_module -> ep_wait_list );
689+ endpoint = (mca_btl_base_endpoint_t * ) opal_list_remove_first (& tmplist );
686690 if (endpoint != NULL ) {
687691 rc = mca_btl_ugni_progress_send_wait_list (endpoint );
688692
689693 if (OPAL_SUCCESS != rc ) {
690- opal_list_append (& ugni_module -> ep_wait_list , & endpoint -> super );
694+ opal_list_append (& tmplist , & endpoint -> super ); /* keep it at the tail for re-insertion below */
691695 } else {
692696 endpoint -> wait_listed = false;
693697 }
694698 }
695699 } while (endpoint != NULL && -- count > 0 ) ;
696- OPAL_THREAD_UNLOCK (& ugni_module -> ep_wait_list_lock );
700+
701+ /* re-insert the endpoints that did not complete at the end of the wait-list, under the lock */
702+ count = opal_list_get_size (& tmplist );
703+ if (0 < count ) {
704+ OPAL_THREAD_LOCK (& ugni_module -> ep_wait_list_lock );
705+ opal_list_join (waitlist , opal_list_get_end (waitlist ), & tmplist );
706+ OPAL_THREAD_UNLOCK (& ugni_module -> ep_wait_list_lock );
707+ }
708+ OBJ_DESTRUCT (& tmplist );
697709
698710 return rc ;
699711}
0 commit comments