Skip to content

Commit 0db9188

Browse files
author
Ralph Castain
authored
Merge pull request #3018 from naughtont3/tjn-dvmerrmgr-issue2987
debug fix for DVM early quit
2 parents f827b6b + beb5b25 commit 0db9188

File tree

1 file changed

+54
-99
lines changed

1 file changed

+54
-99
lines changed

orte/mca/errmgr/dvm/errmgr_dvm.c

Lines changed: 54 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,14 @@
22
* Copyright (c) 2009-2011 The Trustees of Indiana University.
33
* All rights reserved.
44
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
5-
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
5+
* Copyright (c) 2010-2017 Oak Ridge National Labs. All rights reserved.
66
* Copyright (c) 2004-2011 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
1010
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
1111
* All rights reserved.
12-
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
12+
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
1313
* Copyright (c) 2017 IBM Corporation. All rights reserved.
1414
* $COPYRIGHT$
1515
*
@@ -142,8 +142,6 @@ static void job_errors(int fd, short args, void *cbdata)
142142
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
143143
orte_job_t *jdata;
144144
orte_job_state_t jobstate;
145-
orte_exit_code_t sts;
146-
orte_proc_t *aborted_proc;
147145
opal_buffer_t *answer;
148146
int32_t rc, ret;
149147
int room, *rmptr;
@@ -175,110 +173,67 @@ static void job_errors(int fd, short args, void *cbdata)
175173
ORTE_JOBID_PRINT(jdata->jobid),
176174
orte_job_state_to_str(jobstate)));
177175

178-
if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate ||
179-
ORTE_JOB_STATE_ALLOC_FAILED == jobstate ||
180-
ORTE_JOB_STATE_MAP_FAILED == jobstate ||
181-
ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) {
182-
/* disable routing as we may not have performed the daemon
183-
* wireup - e.g., in a managed environment, all the daemons
184-
* "phone home", but don't actually wireup into the routed
185-
* network until they receive the launch message
186-
*/
187-
orte_routing_is_enabled = false;
188-
jdata->num_terminated = jdata->num_procs;
189-
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED);
190-
/* if it was a dynamic spawn, then we better tell them this didn't work */
191-
if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
192-
rc = jobstate;
193-
answer = OBJ_NEW(opal_buffer_t);
194-
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
195-
ORTE_ERROR_LOG(ret);
196-
OBJ_RELEASE(caddy);
197-
return;
198-
}
199-
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
200-
ORTE_ERROR_LOG(ret);
201-
OBJ_RELEASE(caddy);
202-
return;
203-
}
204-
/* pack the room number */
205-
rmptr = &room;
206-
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
207-
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
208-
ORTE_ERROR_LOG(ret);
209-
OBJ_RELEASE(caddy);
210-
return;
211-
}
212-
}
213-
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
214-
"%s errmgr:dvm sending dyn error release of job %s to %s",
215-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
216-
ORTE_JOBID_PRINT(jdata->jobid),
217-
ORTE_NAME_PRINT(&jdata->originator)));
218-
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
219-
&jdata->originator, answer,
220-
ORTE_RML_TAG_LAUNCH_RESP,
221-
orte_rml_send_callback, NULL))) {
222-
ORTE_ERROR_LOG(ret);
223-
OBJ_RELEASE(answer);
224-
}
176+
if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
177+
/* if the daemon job aborted and we haven't heard from everyone yet,
178+
* then this could well have been caused by a daemon not finding
179+
* a way back to us. In this case, output a message indicating a daemon
180+
* died without reporting. Otherwise, say nothing as we
181+
* likely already output an error message */
182+
if (ORTE_JOB_STATE_ABORTED == jobstate &&
183+
jdata->num_procs != jdata->num_reported) {
184+
orte_routing_is_enabled = false;
185+
orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
225186
}
187+
/* there really isn't much else we can do since the problem
188+
* is in the DVM itself, so best just to terminate */
189+
jdata->num_terminated = jdata->num_procs;
190+
/* activate the terminated state so we can exit */
191+
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
226192
OBJ_RELEASE(caddy);
227193
return;
228194
}
229195

230-
if (ORTE_JOB_STATE_FAILED_TO_START == jobstate ||
231-
ORTE_JOB_STATE_FAILED_TO_LAUNCH == jobstate) {
232-
/* the job object for this job will have been NULL'd
233-
* in the array if the job was solely local. If it isn't
234-
* NULL, then we need to tell everyone else to die
235-
*/
236-
aborted_proc = NULL;
237-
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&aborted_proc, OPAL_PTR)) {
238-
sts = aborted_proc->exit_code;
239-
if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
240-
if (WIFSIGNALED(sts)) { /* died on signal */
241-
#ifdef WCOREDUMP
242-
if (WCOREDUMP(sts)) {
243-
orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true,
244-
WTERMSIG(sts));
245-
sts = WTERMSIG(sts);
246-
} else {
247-
orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
248-
WTERMSIG(sts));
249-
sts = WTERMSIG(sts);
250-
}
251-
#else
252-
orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
253-
WTERMSIG(sts));
254-
sts = WTERMSIG(sts);
255-
#endif /* WCOREDUMP */
256-
} else {
257-
orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
258-
WEXITSTATUS(sts));
259-
sts = WEXITSTATUS(sts);
260-
}
261-
}
262-
}
263-
/* if this is the daemon job, then we need to ensure we
264-
* output an error message indicating we couldn't launch the
265-
* daemons */
266-
if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
267-
orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
196+
/* all other cases involve jobs submitted to the DVM - therefore,
197+
* we only inform the submitter of the problem, but do NOT terminate
198+
* the DVM itself */
199+
200+
rc = jobstate;
201+
answer = OBJ_NEW(opal_buffer_t);
202+
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
203+
ORTE_ERROR_LOG(ret);
204+
OBJ_RELEASE(caddy);
205+
return;
206+
}
207+
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
208+
ORTE_ERROR_LOG(ret);
209+
OBJ_RELEASE(caddy);
210+
return;
211+
}
212+
/* pack the room number */
213+
rmptr = &room;
214+
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
215+
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
216+
ORTE_ERROR_LOG(ret);
217+
OBJ_RELEASE(caddy);
218+
return;
268219
}
269220
}
270-
271-
/* if the daemon job aborted and we haven't heard from everyone yet,
272-
* then this could well have been caused by a daemon not finding
273-
* a way back to us. In this case, output a message indicating a daemon
274-
* died without reporting. Otherwise, say nothing as we
275-
* likely already output an error message */
276-
if (ORTE_JOB_STATE_ABORTED == jobstate &&
277-
jdata->jobid == ORTE_PROC_MY_NAME->jobid &&
278-
jdata->num_procs != jdata->num_reported) {
279-
orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
221+
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
222+
"%s errmgr:dvm sending notification of job %s failure to %s",
223+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
224+
ORTE_JOBID_PRINT(jdata->jobid),
225+
ORTE_NAME_PRINT(&jdata->originator)));
226+
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
227+
&jdata->originator, answer,
228+
ORTE_RML_TAG_LAUNCH_RESP,
229+
orte_rml_send_callback, NULL))) {
230+
ORTE_ERROR_LOG(ret);
231+
OBJ_RELEASE(answer);
280232
}
233+
/* ensure we terminate any processes left running in the DVM */
234+
_terminate_job(jdata->jobid);
281235

236+
/* cleanup */
282237
OBJ_RELEASE(caddy);
283238
}
284239

0 commit comments

Comments (0)