Skip to content

Commit 6178dd7

Browse files
author
Ralph Castain
committed
Stop hanging when intercomm operations fail to launch due to lack of resources
1 parent 92cfb04 commit 6178dd7

File tree

1 file changed

+27
-19
lines changed

1 file changed

+27
-19
lines changed

orte/mca/errmgr/default_hnp/errmgr_default_hnp.c

Lines changed: 27 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
11
/*
22
* Copyright (c) 2009-2011 The Trustees of Indiana University.
33
* All rights reserved.
4-
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
4+
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
55
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
66
* Copyright (c) 2004-2011 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
9-
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
9+
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
1010
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
1111
* All rights reserved.
1212
* Copyright (c) 2014 Intel, Inc. All rights reserved.
1313
* $COPYRIGHT$
14-
*
14+
*
1515
* Additional copyrights may follow
16-
*
16+
*
1717
* $HEADER$
1818
*/
1919

@@ -161,18 +161,26 @@ static void job_errors(int fd, short args, void *cbdata)
161161
orte_job_state_to_str(jobstate)));
162162

163163
if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate ||
164-
ORTE_JOB_STATE_ALLOC_FAILED == jobstate ||
165-
ORTE_JOB_STATE_MAP_FAILED == jobstate ||
164+
ORTE_JOB_STATE_ALLOC_FAILED == jobstate ||
165+
ORTE_JOB_STATE_MAP_FAILED == jobstate ||
166166
ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) {
167-
orte_never_launched = true;
168-
/* disable routing as we may not have performed the daemon
169-
* wireup - e.g., in a managed environment, all the daemons
170-
* "phone home", but don't actually wireup into the routed
171-
* network until they receive the launch message
172-
*/
173-
orte_routing_is_enabled = false;
167+
/* mark this job as terminated */
174168
jdata->num_terminated = jdata->num_procs;
175-
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED);
169+
/* if this is a dynamic spawn, then abort the primary job */
170+
if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
171+
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_FORCED_EXIT);
172+
/* set the global abnormal exit flag */
173+
orte_abnormal_term_ordered = true;
174+
} else {
175+
orte_never_launched = true;
176+
/* disable routing as we may not have performed the daemon
177+
* wireup - e.g., in a managed environment, all the daemons
178+
* "phone home", but don't actually wireup into the routed
179+
* network until they receive the launch message
180+
*/
181+
orte_routing_is_enabled = false;
182+
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED);
183+
}
176184
OBJ_RELEASE(caddy);
177185
return;
178186
}
@@ -227,7 +235,7 @@ static void job_errors(int fd, short args, void *cbdata)
227235
jdata->num_procs != jdata->num_reported) {
228236
orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
229237
}
230-
238+
231239
/* abort the job */
232240
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_FORCED_EXIT);
233241
/* set the global abnormal exit flag */
@@ -269,7 +277,7 @@ static void proc_errors(int fd, short args, void *cbdata)
269277
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
270278
ORTE_NAME_PRINT(proc),
271279
orte_proc_state_to_str(state)));
272-
280+
273281
/*
274282
* if orte is trying to shut down, just let it
275283
*/
@@ -642,10 +650,10 @@ static void default_hnp_abort(orte_job_t *jdata)
642650
"%s errmgr:default_hnp: abort called on job %s",
643651
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
644652
ORTE_JOBID_PRINT(jdata->jobid)));
645-
653+
646654
/* the job aborted - turn off any sensors on this job */
647655
orte_sensor.stop(jdata->jobid);
648-
656+
649657
/* set control params to indicate we are terminating */
650658
orte_job_term_ordered = true;
651659
orte_enable_recovery = false;
@@ -674,7 +682,7 @@ static void default_hnp_abort(orte_job_t *jdata)
674682
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
675683
"%s errmgr:default_hnp: ordering orted termination",
676684
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
677-
685+
678686
/* tell the plm to terminate the orteds - they will automatically
679687
* kill their local procs
680688
*/

0 commit comments

Comments
 (0)