@@ -664,34 +664,46 @@ static inline int
664664mca_btl_ugni_progress_wait_list (mca_btl_ugni_module_t * ugni_module )
665665{
666666 int rc = OPAL_SUCCESS ;
667+ opal_list_t tmplist ;
668+ opal_list_t * waitlist = & ugni_module -> ep_wait_list ;
667669 mca_btl_base_endpoint_t * endpoint = NULL ;
668670 int count ;
669671
670- if (0 == opal_list_get_size (& ugni_module -> ep_wait_list )) {
671- return 0 ;
672- }
673-
674672 /* Check the count before taking the lock so that an empty wait list returns without locking. */
675- count = opal_list_get_size (& ugni_module -> ep_wait_list );
673+ count = opal_list_get_size (waitlist );
676674 if (0 == count ) {
677675 return 0 ;
678676 }
679677
678+ /* Don't hold ep_wait_list_lock while progressing the endpoints, as that may
679+ * lead to a deadlock. Instead, move the wait-list elements into a temporary
680+ * list under the lock, release the lock, and work on the temporary list. */
681+ OBJ_CONSTRUCT (& tmplist , opal_list_t );
680682 OPAL_THREAD_LOCK (& ugni_module -> ep_wait_list_lock );
681- count = opal_list_get_size (& ugni_module -> ep_wait_list );
683+ opal_list_join (& tmplist , opal_list_get_end (& tmplist ), waitlist );
684+ OPAL_THREAD_UNLOCK (& ugni_module -> ep_wait_list_lock );
685+ count = opal_list_get_size (& tmplist );
682686 do {
683- endpoint = (mca_btl_base_endpoint_t * ) opal_list_remove_first (& ugni_module -> ep_wait_list );
687+ endpoint = (mca_btl_base_endpoint_t * ) opal_list_remove_first (& tmplist );
684688 if (endpoint != NULL ) {
685689 rc = mca_btl_ugni_progress_send_wait_list (endpoint );
686690
687691 if (OPAL_SUCCESS != rc ) {
688- opal_list_append (& ugni_module -> ep_wait_list , & endpoint -> super );
692+ opal_list_append (& tmplist , & endpoint -> super );
689693 } else {
690694 endpoint -> wait_listed = false;
691695 }
692696 }
693697 } while (endpoint != NULL && -- count > 0 ) ; /* count caps the loop at the number of endpoints initially in the list, so re-appended endpoints are not retried in this call */
694- OPAL_THREAD_UNLOCK (& ugni_module -> ep_wait_list_lock );
698+
699+ /* Reinsert endpoints that were re-queued on failure (rc != OPAL_SUCCESS) into the wait-list, under the lock. */
700+ count = opal_list_get_size (& tmplist );
701+ if (0 < count ) {
702+ OPAL_THREAD_LOCK (& ugni_module -> ep_wait_list_lock );
703+ opal_list_join (waitlist , opal_list_get_end (waitlist ), & tmplist );
704+ OPAL_THREAD_UNLOCK (& ugni_module -> ep_wait_list_lock );
705+ }
706+ OBJ_DESTRUCT (& tmplist );
695707
696708 return rc ;
697709}
0 commit comments