Skip to content

Commit 1cdc1c1

Browse files
author
Ralph Castain
committed
Revert "Standardize the handling of shutdown in the OOB TCP component"
This reverts commit 12dccaa.
1 parent a04f1cd commit 1cdc1c1

File tree

2 files changed

25 additions and 22 deletions (+25 / -22 lines changed)

2 files changed

25 additions and 22 deletions (+25 / -22 lines changed)

orte/mca/oob/tcp/oob_tcp_component.c

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -970,7 +970,10 @@ void mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata)
970970
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
971971
ORTE_NAME_PRINT(&pop->peer));
972972

973-
MCA_OOB_TCP_CHECK_SHUTDOWN(pop);
973+
/* if we are terminating, or recovery isn't enabled, then don't attempt to reconnect */
974+
if (!orte_enable_recovery || orte_orteds_term_ordered || orte_finalizing || orte_abnormal_term_ordered) {
975+
goto cleanup;
976+
}
974977

975978
/* Mark that we no longer support this peer */
976979
memcpy(&ui64, (char*)&pop->peer, sizeof(uint64_t));
@@ -984,6 +987,7 @@ void mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata)
984987
ORTE_ERROR_LOG(rc);
985988
}
986989

990+
cleanup:
987991
/* activate the proc state */
988992
if (ORTE_SUCCESS != orte_routed.route_lost(&pop->peer)) {
989993
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_LIFELINE_LOST);
@@ -1006,8 +1010,6 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
10061010
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
10071011
ORTE_NAME_PRINT(&mop->hop));
10081012

1009-
MCA_OOB_TCP_CHECK_SHUTDOWN(mop);
1010-
10111013
/* mark that we cannot reach this hop */
10121014
memcpy(&ui64, (char*)&(mop->hop), sizeof(uint64_t));
10131015
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
@@ -1020,11 +1022,16 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
10201022
ORTE_ERROR_LOG(rc);
10211023
}
10221024

1023-
/* if this was a lifeline, then alert */
1024-
if (ORTE_SUCCESS != orte_routed.route_lost(&mop->hop)) {
1025-
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_LIFELINE_LOST);
1026-
} else {
1027-
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED);
1025+
/* report the error back to the OOB and let it try other components
1026+
* or declare a problem
1027+
*/
1028+
if (!orte_finalizing && !orte_abnormal_term_ordered) {
1029+
/* if this was a lifeline, then alert */
1030+
if (ORTE_SUCCESS != orte_routed.route_lost(&mop->hop)) {
1031+
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_LIFELINE_LOST);
1032+
} else {
1033+
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED);
1034+
}
10281035
}
10291036

10301037
OBJ_RELEASE(mop);
@@ -1042,7 +1049,11 @@ void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata)
10421049
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
10431050
ORTE_NAME_PRINT(&mop->hop));
10441051

1045-
MCA_OOB_TCP_CHECK_SHUTDOWN(mop);
1052+
if (orte_finalizing || orte_abnormal_term_ordered) {
1053+
/* just ignore the problem */
1054+
OBJ_RELEASE(mop);
1055+
return;
1056+
}
10461057

10471058
/* mark that this component cannot reach this hop */
10481059
memcpy(&ui64, (char*)&(mop->hop), sizeof(uint64_t));
@@ -1110,7 +1121,11 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata)
11101121
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
11111122
ORTE_NAME_PRINT(&pop->peer));
11121123

1113-
MCA_OOB_TCP_CHECK_SHUTDOWN(pop);
1124+
/* if we are terminating, then don't attempt to reconnect */
1125+
if (orte_orteds_term_ordered || orte_finalizing || orte_abnormal_term_ordered) {
1126+
OBJ_RELEASE(pop);
1127+
return;
1128+
}
11141129

11151130
/* activate the proc state */
11161131
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,

orte/mca/oob/tcp/oob_tcp_component.h

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -92,16 +92,4 @@ ORTE_MODULE_DECLSPEC void mca_oob_tcp_component_failed_to_connect(int fd, short
9292
ORTE_MODULE_DECLSPEC void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata);
9393
ORTE_MODULE_DECLSPEC void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata);
9494

95-
/* provide a macro for handling errors reported during shutdown */
96-
#define MCA_OOB_TCP_CHECK_SHUTDOWN(a) \
97-
do { \
98-
if (!orte_enable_recovery || \
99-
orte_orteds_term_ordered || \
100-
orte_finalizing || \
101-
orte_abnormal_term_ordered) { \
102-
OBJ_RELEASE(a); \
103-
return; \
104-
} \
105-
} while(0);
106-
10795
#endif /* _MCA_OOB_TCP_COMPONENT_H_ */

0 commit comments

Comments
 (0)