@@ -970,7 +970,10 @@ void mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata)
970970 ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
971971 ORTE_NAME_PRINT (& pop -> peer ));
972972
973- MCA_OOB_TCP_CHECK_SHUTDOWN (pop );
973+ /* if we are terminating, or recovery isn't enabled, then don't attempt to reconnect */
974+ if (!orte_enable_recovery || orte_orteds_term_ordered || orte_finalizing || orte_abnormal_term_ordered ) {
975+ goto cleanup ;
976+ }
974977
975978 /* Mark that we no longer support this peer */
976979 memcpy (& ui64 , (char * )& pop -> peer , sizeof (uint64_t ));
@@ -984,6 +987,7 @@ void mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata)
984987 ORTE_ERROR_LOG (rc );
985988 }
986989
990+ cleanup :
987991 /* activate the proc state */
988992 if (ORTE_SUCCESS != orte_routed .route_lost (& pop -> peer )) {
989993 ORTE_ACTIVATE_PROC_STATE (& pop -> peer , ORTE_PROC_STATE_LIFELINE_LOST );
@@ -1006,8 +1010,6 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
10061010 ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
10071011 ORTE_NAME_PRINT (& mop -> hop ));
10081012
1009- MCA_OOB_TCP_CHECK_SHUTDOWN (mop );
1010-
10111013 /* mark that we cannot reach this hop */
10121014 memcpy (& ui64 , (char * )& (mop -> hop ), sizeof (uint64_t ));
10131015 if (OPAL_SUCCESS != opal_hash_table_get_value_uint64 (& orte_oob_base .peers ,
@@ -1020,11 +1022,16 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
10201022 ORTE_ERROR_LOG (rc );
10211023 }
10221024
1023- /* if this was a lifeline, then alert */
1024- if (ORTE_SUCCESS != orte_routed .route_lost (& mop -> hop )) {
1025- ORTE_ACTIVATE_PROC_STATE (& mop -> hop , ORTE_PROC_STATE_LIFELINE_LOST );
1026- } else {
1027- ORTE_ACTIVATE_PROC_STATE (& mop -> hop , ORTE_PROC_STATE_COMM_FAILED );
1025+ /* report the error back to the OOB and let it try other components
1026+ * or declare a problem
1027+ */
1028+ if (!orte_finalizing && !orte_abnormal_term_ordered ) {
1029+ /* if this was a lifeline, then alert */
1030+ if (ORTE_SUCCESS != orte_routed .route_lost (& mop -> hop )) {
1031+ ORTE_ACTIVATE_PROC_STATE (& mop -> hop , ORTE_PROC_STATE_LIFELINE_LOST );
1032+ } else {
1033+ ORTE_ACTIVATE_PROC_STATE (& mop -> hop , ORTE_PROC_STATE_COMM_FAILED );
1034+ }
10281035 }
10291036
10301037 OBJ_RELEASE (mop );
@@ -1042,7 +1049,11 @@ void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata)
10421049 ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
10431050 ORTE_NAME_PRINT (& mop -> hop ));
10441051
1045- MCA_OOB_TCP_CHECK_SHUTDOWN (mop );
1052+ if (orte_finalizing || orte_abnormal_term_ordered ) {
1053+ /* just ignore the problem */
1054+ OBJ_RELEASE (mop );
1055+ return ;
1056+ }
10461057
10471058 /* mark that this component cannot reach this hop */
10481059 memcpy (& ui64 , (char * )& (mop -> hop ), sizeof (uint64_t ));
@@ -1110,7 +1121,11 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata)
11101121 ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
11111122 ORTE_NAME_PRINT (& pop -> peer ));
11121123
1113- MCA_OOB_TCP_CHECK_SHUTDOWN (pop );
1124+ /* if we are terminating, then don't attempt to reconnect */
1125+ if (orte_orteds_term_ordered || orte_finalizing || orte_abnormal_term_ordered ) {
1126+ OBJ_RELEASE (pop );
1127+ return ;
1128+ }
11141129
11151130 /* activate the proc state */
11161131 opal_output_verbose (OOB_TCP_DEBUG_CONNECT , orte_oob_base_framework .framework_output ,
0 commit comments