Skip to content

Commit e45a358

Browse files
author
Ralph Castain
authored
Merge pull request #3647 from rhc54/topic/forced
Provide better help when forced_terminate is invoked
2 parents cde80bb + 2ab4f93 commit e45a358

File tree

5 files changed

+223
-37
lines changed

5 files changed

+223
-37
lines changed

orte/mca/errmgr/base/help-errmgr-base.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,3 +98,10 @@ then it could be an internal programming error that should be
9898
reported to the developers. In the meantime, a workaround may
9999
be to set the MCA param routed=direct on the command line or
100100
in your environment.
101+
#
102+
[simple-message]
103+
An internal error has occurred in ORTE:
104+
105+
%s
106+
107+
This is something that should be reported to the developers.

orte/mca/errmgr/default_hnp/errmgr_default_hnp.c

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464

6565
static int init(void);
6666
static int finalize(void);
67+
static void hnp_abort(int error_code, char *fmt, ...);
6768

6869
static int predicted_fault(opal_list_t *proc_list,
6970
opal_list_t *node_list,
@@ -83,7 +84,7 @@ orte_errmgr_base_module_t orte_errmgr_default_hnp_module = {
8384
init,
8485
finalize,
8586
orte_errmgr_base_log,
86-
orte_errmgr_base_abort,
87+
hnp_abort,
8788
orte_errmgr_base_abort_peers,
8889
predicted_fault,
8990
suggest_map_targets,
@@ -125,6 +126,71 @@ static int finalize(void)
125126
return ORTE_SUCCESS;
126127
}
127128

129+
/* Timer callback armed by hnp_abort(): the grace period has expired,
 * so stop waiting and quit. None of the event arguments are used. */
static void wakeup(int sd, short args, void *cbdata)
{
    (void)sd;
    (void)args;
    (void)cbdata;

    /* nothing more we can do */
    orte_quit(0, 0, NULL);
}
134+
135+
/* this function only gets called when FORCED_TERMINATE
136+
* has been invoked, which means that there is some
137+
* internal failure (e.g., to pack/unpack a correct value).
138+
* We could just exit, but that doesn't result in any
139+
* meaningful error message to the user. Likewise, just
140+
* printing something to stdout/stderr won't necessarily
141+
* get back to the user. Instead, we will send an error
142+
* report to mpirun and give it a chance to order our
143+
* termination. In order to ensure we _do_ terminate,
144+
* we set a timer - if it fires before we receive the
145+
* termination command, then we will exit on our own. This
146+
* protects us in the case that the failure is in the
147+
* messaging system itself */
148+
static void hnp_abort(int error_code, char *fmt, ...)
149+
{
150+
va_list arglist;
151+
char *outmsg = NULL;
152+
orte_timer_t *timer;
153+
154+
/* ensure we exit with non-zero status */
155+
ORTE_UPDATE_EXIT_STATUS(error_code);
156+
157+
/* If there was a message, construct it */
158+
va_start(arglist, fmt);
159+
if (NULL != fmt) {
160+
vasprintf(&outmsg, fmt, arglist);
161+
}
162+
va_end(arglist);
163+
164+
/* use the show-help system to get the message out */
165+
orte_show_help("help-errmgr-base.txt", "simple-message", true, outmsg);
166+
167+
/* this could have happened very early, so see if it happened
168+
* before we started anything - if so, we can just finalize */
169+
if (orte_never_launched) {
170+
orte_quit(0, 0, NULL);
171+
return;
172+
}
173+
174+
/* tell the daemons to terminate */
175+
if (ORTE_SUCCESS != orte_plm.terminate_orteds()) {
176+
orte_quit(0, 0, NULL);
177+
return;
178+
}
179+
180+
/* set a timer for exiting - this also gives the message a chance
181+
* to get out! */
182+
if (NULL == (timer = OBJ_NEW(orte_timer_t))) {
183+
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
184+
return;
185+
}
186+
timer->tv.tv_sec = 5;
187+
timer->tv.tv_usec = 0;
188+
opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
189+
opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
190+
opal_event_evtimer_add(timer->ev, &timer->tv);
191+
}
192+
193+
128194
static void job_errors(int fd, short args, void *cbdata)
129195
{
130196
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;

orte/mca/errmgr/default_orted/errmgr_default_orted.c

Lines changed: 116 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@
5959
*/
6060
static int init(void);
6161
static int finalize(void);
62-
62+
static void orted_abort(int error_code, char *fmt, ...);
6363
static int predicted_fault(opal_list_t *proc_list,
6464
opal_list_t *node_list,
6565
opal_list_t *suggested_map);
@@ -78,7 +78,7 @@ orte_errmgr_base_module_t orte_errmgr_default_orted_module = {
7878
init,
7979
finalize,
8080
orte_errmgr_base_log,
81-
orte_errmgr_base_abort,
81+
orted_abort,
8282
orte_errmgr_base_abort_peers,
8383
predicted_fault,
8484
suggest_map_targets,
@@ -122,6 +122,119 @@ static int finalize(void)
122122
return ORTE_SUCCESS;
123123
}
124124

125+
/* Timer callback armed by orted_abort(): the wait for a termination
 * order has timed out, so quit on our own. The event arguments are
 * not used. */
static void wakeup(int sd, short args, void *cbdata)
{
    (void)sd;
    (void)args;
    (void)cbdata;

    /* nothing more we can do */
    orte_quit(0, 0, NULL);
}
130+
131+
/* this function only gets called when FORCED_TERMINATE
132+
* has been invoked, which means that there is some
133+
* internal failure (e.g., to pack/unpack a correct value).
134+
* We could just exit, but that doesn't result in any
135+
* meaningful error message to the user. Likewise, just
136+
* printing something to stdout/stderr won't necessarily
137+
* get back to the user. Instead, we will send an error
138+
* report to mpirun and give it a chance to order our
139+
* termination. In order to ensure we _do_ terminate,
140+
* we set a timer - if it fires before we receive the
141+
* termination command, then we will exit on our own. This
142+
* protects us in the case that the failure is in the
143+
* messaging system itself */
144+
static void orted_abort(int error_code, char *fmt, ...)
145+
{
146+
va_list arglist;
147+
char *outmsg = NULL;
148+
orte_plm_cmd_flag_t cmd;
149+
opal_buffer_t *alert;
150+
orte_vpid_t null=ORTE_VPID_INVALID;
151+
orte_proc_state_t state = ORTE_PROC_STATE_CALLED_ABORT;
152+
orte_timer_t *timer;
153+
int rc;
154+
155+
/* If there was a message, construct it */
156+
va_start(arglist, fmt);
157+
if (NULL != fmt) {
158+
vasprintf(&outmsg, fmt, arglist);
159+
}
160+
va_end(arglist);
161+
162+
/* use the show-help system to get the message out */
163+
orte_show_help("help-errmgr-base.txt", "simple-message", true, outmsg);
164+
165+
/* tell the HNP we are in distress */
166+
alert = OBJ_NEW(opal_buffer_t);
167+
/* pack update state command */
168+
cmd = ORTE_PLM_UPDATE_PROC_STATE;
169+
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
170+
ORTE_ERROR_LOG(rc);
171+
OBJ_RELEASE(alert);
172+
goto cleanup;
173+
}
174+
/* pack the jobid */
175+
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &ORTE_PROC_MY_NAME->jobid, 1, ORTE_JOBID))) {
176+
ORTE_ERROR_LOG(rc);
177+
OBJ_RELEASE(alert);
178+
goto cleanup;
179+
}
180+
/* pack our vpid */
181+
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &ORTE_PROC_MY_NAME->vpid, 1, ORTE_VPID))) {
182+
ORTE_ERROR_LOG(rc);
183+
OBJ_RELEASE(alert);
184+
goto cleanup;
185+
}
186+
/* pack our pid */
187+
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &orte_process_info.pid, 1, OPAL_PID))) {
188+
ORTE_ERROR_LOG(rc);
189+
OBJ_RELEASE(alert);
190+
goto cleanup;
191+
}
192+
/* pack our state */
193+
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &state, 1, ORTE_PROC_STATE))) {
194+
ORTE_ERROR_LOG(rc);
195+
OBJ_RELEASE(alert);
196+
goto cleanup;
197+
}
198+
/* pack our exit code */
199+
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &error_code, 1, ORTE_EXIT_CODE))) {
200+
ORTE_ERROR_LOG(rc);
201+
OBJ_RELEASE(alert);
202+
goto cleanup;
203+
}
204+
/* flag that this job is complete so the receiver can know */
205+
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
206+
ORTE_ERROR_LOG(rc);
207+
OBJ_RELEASE(alert);
208+
goto cleanup;
209+
}
210+
211+
/* send it */
212+
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
213+
ORTE_PROC_MY_HNP, alert,
214+
ORTE_RML_TAG_PLM,
215+
orte_rml_send_callback, NULL))) {
216+
ORTE_ERROR_LOG(rc);
217+
OBJ_RELEASE(alert);
218+
/* we can't communicate, so give up */
219+
orte_quit(0, 0, NULL);
220+
return;
221+
}
222+
223+
cleanup:
224+
/* set a timer for exiting - this also gives the message a chance
225+
* to get out! */
226+
if (NULL == (timer = OBJ_NEW(orte_timer_t))) {
227+
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
228+
return;
229+
}
230+
timer->tv.tv_sec = 5;
231+
timer->tv.tv_usec = 0;
232+
opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
233+
opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
234+
opal_event_evtimer_add(timer->ev, &timer->tv);
235+
236+
}
237+
125238
static void job_errors(int fd, short args, void *cbdata)
126239
{
127240
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
@@ -259,7 +372,7 @@ static void proc_errors(int fd, short args, void *cbdata)
259372
/* terminate - our routed children will see
260373
* us leave and automatically die
261374
*/
262-
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
375+
orte_quit(0, 0, NULL);
263376
goto cleanup;
264377
}
265378

orte/mca/grpcomm/direct/grpcomm_direct.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -528,7 +528,8 @@ static void xcast_recv(int status, orte_process_name_t* sender,
528528
OBJ_RELEASE(item);
529529
continue;
530530
}
531-
if (ORTE_PROC_STATE_RUNNING < rec->state ||
531+
if ((ORTE_PROC_STATE_RUNNING < rec->state &&
532+
ORTE_PROC_STATE_CALLED_ABORT != rec->state) ||
532533
!ORTE_FLAG_TEST(rec, ORTE_PROC_FLAG_ALIVE)) {
533534
opal_output(0, "%s grpcomm:direct:send_relay proc %s not running - cannot relay",
534535
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));

orte/mca/state/state.h

Lines changed: 31 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
#include "opal/class/opal_list.h"
4949
#include "opal/mca/event/event.h"
5050

51+
#include "orte/mca/errmgr/errmgr.h"
5152
#include "orte/mca/plm/plm_types.h"
5253
#include "orte/runtime/orte_globals.h"
5354

@@ -64,42 +65,40 @@ ORTE_DECLSPEC extern mca_base_framework_t orte_state_base_framework;
6465
/* For ease in debugging the state machine, it is STRONGLY recommended
6566
* that the functions be accessed using the following macros
6667
*/
67-
#define ORTE_FORCED_TERMINATE(x) \
68-
do { \
69-
if (!orte_abnormal_term_ordered) { \
70-
opal_output_verbose(1, orte_state_base_framework.framework_output, \
71-
"%s FORCE-TERMINATE AT %s:%d", \
72-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
73-
__FILE__, __LINE__); \
74-
ORTE_UPDATE_EXIT_STATUS(x); \
75-
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT); \
76-
} \
68+
/* Report an internal failure through orte_errmgr.abort(), unless an
 * abnormal termination has already been ordered. The message names
 * this process, the file and line of the call site, and the error
 * name and numeric code given by x. The arguments are passed in the
 * same order as the conversions in the format string, and x is
 * evaluated exactly once.
 *
 * The trailing ';' after while(0) is kept from the original definition
 * so that existing call sites expand exactly as before.
 */
#define ORTE_FORCED_TERMINATE(x)                                                    \
    do {                                                                            \
        if (!orte_abnormal_term_ordered) {                                          \
            int orte_ft_code_ = (x);                                                \
            orte_errmgr.abort(orte_ft_code_,                                        \
                              "%s FORCE-TERMINATE AT %s:%d - error %s(%d)",         \
                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),                   \
                              __FILE__, __LINE__,                                   \
                              ORTE_ERROR_NAME(orte_ft_code_), orte_ft_code_);       \
        }                                                                           \
    } while(0);
7877

79-
#define ORTE_ACTIVATE_JOB_STATE(j, s) \
80-
do { \
81-
orte_job_t *shadow=(j); \
82-
opal_output_verbose(1, orte_state_base_framework.framework_output, \
83-
"%s ACTIVATE JOB %s STATE %s AT %s:%d", \
84-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
85-
(NULL == shadow) ? "NULL" : \
86-
ORTE_JOBID_PRINT(shadow->jobid), \
87-
orte_job_state_to_str((s)), \
88-
__FILE__, __LINE__); \
89-
orte_state.activate_job_state(shadow, (s)); \
78+
/* Activate state s for job j via orte_state.activate_job_state(),
 * after emitting a verbosity-1 trace on the state framework's output
 * that records the job (or "NULL"), the state, and the call site.
 * j is captured once in a local so it is not re-evaluated.
 */
#define ORTE_ACTIVATE_JOB_STATE(j, s)                                       \
    do {                                                                    \
        orte_job_t *shadow=(j);                                             \
        opal_output_verbose(1, orte_state_base_framework.framework_output,  \
                            "%s ACTIVATE JOB %s STATE %s AT %s:%d",         \
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),             \
                            (NULL != shadow) ?                              \
                            ORTE_JOBID_PRINT(shadow->jobid) : "NULL",       \
                            orte_job_state_to_str((s)),                     \
                            __FILE__, __LINE__);                            \
        orte_state.activate_job_state(shadow, (s));                         \
    } while(0);
9190

92-
#define ORTE_ACTIVATE_PROC_STATE(p, s) \
93-
do { \
94-
orte_process_name_t *shadow=(p); \
95-
opal_output_verbose(1, orte_state_base_framework.framework_output, \
96-
"%s ACTIVATE PROC %s STATE %s AT %s:%d", \
97-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
98-
(NULL == shadow) ? "NULL" : \
99-
ORTE_NAME_PRINT(shadow), \
100-
orte_proc_state_to_str((s)), \
101-
__FILE__, __LINE__); \
102-
orte_state.activate_proc_state(shadow, (s)); \
91+
/* Activate state s for process p via orte_state.activate_proc_state(),
 * after emitting a verbosity-1 trace on the state framework's output
 * that records the process name (or "NULL"), the state, and the call
 * site. p is captured once in a local so it is not re-evaluated.
 */
#define ORTE_ACTIVATE_PROC_STATE(p, s)                                      \
    do {                                                                    \
        orte_process_name_t *shadow=(p);                                    \
        opal_output_verbose(1, orte_state_base_framework.framework_output,  \
                            "%s ACTIVATE PROC %s STATE %s AT %s:%d",        \
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),             \
                            (NULL != shadow) ?                              \
                            ORTE_NAME_PRINT(shadow) : "NULL",               \
                            orte_proc_state_to_str((s)),                    \
                            __FILE__, __LINE__);                            \
        orte_state.activate_proc_state(shadow, (s));                        \
    } while(0);
104103

105104
/**

0 commit comments

Comments
 (0)