|
1 | 1 | /* |
2 | 2 | * Copyright (c) 2009-2011 The Trustees of Indiana University. |
3 | 3 | * All rights reserved. |
4 | | - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. |
| 4 | + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. |
5 | 5 | * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. |
6 | 6 | * Copyright (c) 2004-2011 The University of Tennessee and The University |
7 | 7 | * of Tennessee Research Foundation. All rights |
8 | 8 | * reserved. |
9 | | - * Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved. |
| 9 | + * Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved. |
10 | 10 | * Copyright (c) 2011-2013 Los Alamos National Security, LLC. |
11 | 11 | * All rights reserved. |
12 | 12 | * Copyright (c) 2014 Intel, Inc. All rights reserved. |
13 | 13 | * $COPYRIGHT$ |
14 | | - * |
| 14 | + * |
15 | 15 | * Additional copyrights may follow |
16 | | - * |
| 16 | + * |
17 | 17 | * $HEADER$ |
18 | 18 | */ |
19 | 19 |
|
@@ -161,18 +161,26 @@ static void job_errors(int fd, short args, void *cbdata) |
161 | 161 | orte_job_state_to_str(jobstate))); |
162 | 162 |
|
163 | 163 | if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate || |
164 | | - ORTE_JOB_STATE_ALLOC_FAILED == jobstate || |
165 | | - ORTE_JOB_STATE_MAP_FAILED == jobstate || |
| 164 | + ORTE_JOB_STATE_ALLOC_FAILED == jobstate || |
| 165 | + ORTE_JOB_STATE_MAP_FAILED == jobstate || |
166 | 166 | ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) { |
167 | | - orte_never_launched = true; |
168 | | - /* disable routing as we may not have performed the daemon |
169 | | - * wireup - e.g., in a managed environment, all the daemons |
170 | | - * "phone home", but don't actually wireup into the routed |
171 | | - * network until they receive the launch message |
172 | | - */ |
173 | | - orte_routing_is_enabled = false; |
| 167 | + /* mark this job as terminated */ |
174 | 168 | jdata->num_terminated = jdata->num_procs; |
175 | | - ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED); |
| 169 | + /* if this is a dynamic spawn, then abort the primary job */ |
| 170 | + if (ORTE_JOBID_INVALID != jdata->originator.jobid) { |
| 171 | + ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_FORCED_EXIT); |
| 172 | + /* set the global abnormal exit flag */ |
| 173 | + orte_abnormal_term_ordered = true; |
| 174 | + } else { |
| 175 | + orte_never_launched = true; |
| 176 | + /* disable routing as we may not have performed the daemon |
| 177 | + * wireup - e.g., in a managed environment, all the daemons |
| 178 | + * "phone home", but don't actually wireup into the routed |
| 179 | + * network until they receive the launch message |
| 180 | + */ |
| 181 | + orte_routing_is_enabled = false; |
| 182 | + ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED); |
| 183 | + } |
176 | 184 | OBJ_RELEASE(caddy); |
177 | 185 | return; |
178 | 186 | } |
@@ -227,7 +235,7 @@ static void job_errors(int fd, short args, void *cbdata) |
227 | 235 | jdata->num_procs != jdata->num_reported) { |
228 | 236 | orte_show_help("help-errmgr-base.txt", "failed-daemon", true); |
229 | 237 | } |
230 | | - |
| 238 | + |
231 | 239 | /* abort the job */ |
232 | 240 | ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_FORCED_EXIT); |
233 | 241 | /* set the global abnormal exit flag */ |
@@ -269,7 +277,7 @@ static void proc_errors(int fd, short args, void *cbdata) |
269 | 277 | ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), |
270 | 278 | ORTE_NAME_PRINT(proc), |
271 | 279 | orte_proc_state_to_str(state))); |
272 | | - |
| 280 | + |
273 | 281 | /* |
274 | 282 | * if orte is trying to shutdown, just let it |
275 | 283 | */ |
@@ -642,10 +650,10 @@ static void default_hnp_abort(orte_job_t *jdata) |
642 | 650 | "%s errmgr:default_hnp: abort called on job %s", |
643 | 651 | ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), |
644 | 652 | ORTE_JOBID_PRINT(jdata->jobid))); |
645 | | - |
| 653 | + |
646 | 654 | /* the job aborted - turn off any sensors on this job */ |
647 | 655 | orte_sensor.stop(jdata->jobid); |
648 | | - |
| 656 | + |
649 | 657 | /* set control params to indicate we are terminating */ |
650 | 658 | orte_job_term_ordered = true; |
651 | 659 | orte_enable_recovery = false; |
@@ -674,7 +682,7 @@ static void default_hnp_abort(orte_job_t *jdata) |
674 | 682 | OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output, |
675 | 683 | "%s errmgr:default_hnp: ordering orted termination", |
676 | 684 | ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); |
677 | | - |
| 685 | + |
678 | 686 | /* tell the plm to terminate the orteds - they will automatically |
679 | 687 | * kill their local procs |
680 | 688 | */ |
|
0 commit comments