@@ -2,14 +2,14 @@
  * Copyright (c) 2009-2011 The Trustees of Indiana University.
  *                         All rights reserved.
  * Copyright (c) 2010      Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
+ * Copyright (c) 2010-2017 Oak Ridge National Labs. All rights reserved.
  * Copyright (c) 2004-2011 The University of Tennessee and The University
  *                         of Tennessee Research Foundation. All rights
  *                         reserved.
  * Copyright (c) 2011      Oracle and/or all its affiliates. All rights reserved.
  * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
  *                         All rights reserved.
- * Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
+ * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
  * Copyright (c) 2017      IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
@@ -142,8 +142,6 @@ static void job_errors(int fd, short args, void *cbdata)
     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
     orte_job_t *jdata;
     orte_job_state_t jobstate;
-    orte_exit_code_t sts;
-    orte_proc_t *aborted_proc;
     opal_buffer_t *answer;
     int32_t rc, ret;
     int room, *rmptr;
@@ -175,110 +173,67 @@ static void job_errors(int fd, short args, void *cbdata)
                          ORTE_JOBID_PRINT(jdata->jobid),
                          orte_job_state_to_str(jobstate)));
 
-    if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate ||
-        ORTE_JOB_STATE_ALLOC_FAILED == jobstate ||
-        ORTE_JOB_STATE_MAP_FAILED == jobstate ||
-        ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) {
-        /* disable routing as we may not have performed the daemon
-         * wireup - e.g., in a managed environment, all the daemons
-         * "phone home", but don't actually wireup into the routed
-         * network until they receive the launch message
-         */
-        orte_routing_is_enabled = false;
-        jdata->num_terminated = jdata->num_procs;
-        ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED);
-        /* if it was a dynamic spawn, then we better tell them this didn't work */
-        if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
-            rc = jobstate;
-            answer = OBJ_NEW(opal_buffer_t);
-            if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
-                ORTE_ERROR_LOG(ret);
-                OBJ_RELEASE(caddy);
-                return;
-            }
-            if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
-                ORTE_ERROR_LOG(ret);
-                OBJ_RELEASE(caddy);
-                return;
-            }
-            /* pack the room number */
-            rmptr = &room;
-            if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
-                if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
-                    ORTE_ERROR_LOG(ret);
-                    OBJ_RELEASE(caddy);
-                    return;
-                }
-            }
-            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
-                                 "%s errmgr:dvm sending dyn error release of job %s to %s",
-                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                                 ORTE_JOBID_PRINT(jdata->jobid),
-                                 ORTE_NAME_PRINT(&jdata->originator)));
-            if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
-                                                   &jdata->originator, answer,
-                                                   ORTE_RML_TAG_LAUNCH_RESP,
-                                                   orte_rml_send_callback, NULL))) {
-                ORTE_ERROR_LOG(ret);
-                OBJ_RELEASE(answer);
-            }
+    if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
+        /* if the daemon job aborted and we haven't heard from everyone yet,
+         * then this could well have been caused by a daemon not finding
+         * a way back to us. In this case, output a message indicating a daemon
+         * died without reporting. Otherwise, say nothing as we
+         * likely already output an error message */
+        if (ORTE_JOB_STATE_ABORTED == jobstate &&
+            jdata->num_procs != jdata->num_reported) {
+            orte_routing_is_enabled = false;
+            orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
         }
+        /* there really isn't much else we can do since the problem
+         * is in the DVM itself, so best just to terminate */
+        jdata->num_terminated = jdata->num_procs;
+        /* activate the terminated state so we can exit */
+        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
         OBJ_RELEASE(caddy);
         return;
     }
 
-    if (ORTE_JOB_STATE_FAILED_TO_START == jobstate ||
-        ORTE_JOB_STATE_FAILED_TO_LAUNCH == jobstate) {
-        /* the job object for this job will have been NULL'd
-         * in the array if the job was solely local. If it isn't
-         * NULL, then we need to tell everyone else to die
-         */
-        aborted_proc = NULL;
-        if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&aborted_proc, OPAL_PTR)) {
-            sts = aborted_proc->exit_code;
-            if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
-                if (WIFSIGNALED(sts)) { /* died on signal */
-#ifdef WCOREDUMP
-                    if (WCOREDUMP(sts)) {
-                        orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true,
-                                       WTERMSIG(sts));
-                        sts = WTERMSIG(sts);
-                    } else {
-                        orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
-                                       WTERMSIG(sts));
-                        sts = WTERMSIG(sts);
-                    }
-#else
-                    orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
-                                   WTERMSIG(sts));
-                    sts = WTERMSIG(sts);
-#endif /* WCOREDUMP */
-                } else {
-                    orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
-                                   WEXITSTATUS(sts));
-                    sts = WEXITSTATUS(sts);
-                }
-            }
-        }
-        /* if this is the daemon job, then we need to ensure we
-         * output an error message indicating we couldn't launch the
-         * daemons */
-        if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
-            orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
+    /* all other cases involve jobs submitted to the DVM - therefore,
+     * we only inform the submitter of the problem, but do NOT terminate
+     * the DVM itself */
+
+    rc = jobstate;
+    answer = OBJ_NEW(opal_buffer_t);
+    if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
+        ORTE_ERROR_LOG(ret);
+        OBJ_RELEASE(caddy);
+        return;
+    }
+    if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
+        ORTE_ERROR_LOG(ret);
+        OBJ_RELEASE(caddy);
+        return;
+    }
+    /* pack the room number */
+    rmptr = &room;
+    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
+        if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
+            ORTE_ERROR_LOG(ret);
+            OBJ_RELEASE(caddy);
+            return;
         }
     }
-
-    /* if the daemon job aborted and we haven't heard from everyone yet,
-     * then this could well have been caused by a daemon not finding
-     * a way back to us. In this case, output a message indicating a daemon
-     * died without reporting. Otherwise, say nothing as we
-     * likely already output an error message */
-    if (ORTE_JOB_STATE_ABORTED == jobstate &&
-        jdata->jobid == ORTE_PROC_MY_NAME->jobid &&
-        jdata->num_procs != jdata->num_reported) {
-        orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
+    OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
+                         "%s errmgr:dvm sending notification of job %s failure to %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_JOBID_PRINT(jdata->jobid),
+                         ORTE_NAME_PRINT(&jdata->originator)));
+    if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
+                                           &jdata->originator, answer,
+                                           ORTE_RML_TAG_LAUNCH_RESP,
+                                           orte_rml_send_callback, NULL))) {
+        ORTE_ERROR_LOG(ret);
+        OBJ_RELEASE(answer);
     }
+    /* ensure we terminate any processes left running in the DVM */
+    _terminate_job(jdata->jobid);
 
+    /* cleanup */
     OBJ_RELEASE(caddy);
 }
 
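In plain terms, the rework leaves job_errors() with two paths: an error in the DVM's own daemon job (jdata->jobid == ORTE_PROC_MY_NAME->jobid) is fatal, so the DVM itself is driven to ORTE_JOB_STATE_TERMINATED, while an error in any job submitted to the DVM only packs a notification (int32 status, the jobid, and an optional room number) back to the originator over RML and then reaps that job's remaining processes, leaving the DVM alive for further submissions. The sketch below is a minimal, self-contained model of that control flow under stated assumptions, not the ORTE implementation: job_t, notify_submitter(), terminate_job(), and DVM_DAEMON_JOBID are hypothetical stand-ins for orte_job_t, the opal_dss.pack()/orte_rml.send_buffer_nb() sequence, _terminate_job(), and the DVM's own jobid.

/* dvm_model.c - hypothetical model of the reworked job_errors() paths.
 * Build: cc -o dvm_model dvm_model.c */
#include <stdint.h>
#include <stdio.h>

typedef uint32_t jobid_t;

enum { JOB_STATE_ABORTED = 1, JOB_STATE_TERMINATED = 2 };

typedef struct {
    jobid_t  jobid;
    jobid_t  originator;    /* who submitted this job to the DVM */
    int      state;         /* stand-in for orte_job_state_t */
    uint32_t num_procs;
    uint32_t num_reported;  /* daemons that phoned home */
} job_t;

static const jobid_t DVM_DAEMON_JOBID = 0;  /* the DVM's own daemon job */

/* stand-in for packing the answer buffer (status, jobid, room number)
 * and shipping it with orte_rml.send_buffer_nb() */
static void notify_submitter(const job_t *job) {
    printf("notify originator %u: job %u failed with state %d\n",
           (unsigned)job->originator, (unsigned)job->jobid, job->state);
}

/* stand-in for _terminate_job(): reap whatever the job left running */
static void terminate_job(jobid_t jobid) {
    printf("terminating remaining procs of job %u\n", (unsigned)jobid);
}

static void job_errors_model(job_t *job) {
    if (job->jobid == DVM_DAEMON_JOBID) {
        /* the problem is in the DVM itself: warn if a daemon died
         * without ever reporting back, then shut the whole DVM down */
        if (JOB_STATE_ABORTED == job->state &&
            job->num_procs != job->num_reported) {
            fprintf(stderr, "a daemon died without reporting back\n");
        }
        job->state = JOB_STATE_TERMINATED;  /* models ORTE_ACTIVATE_JOB_STATE */
        return;
    }
    /* any other job was submitted to the DVM: tell the submitter and
     * clean up that job's procs, but keep the DVM alive */
    notify_submitter(job);
    terminate_job(job->jobid);
}

int main(void) {
    job_t user_job = { .jobid = 7, .originator = 3,
                       .state = JOB_STATE_ABORTED,
                       .num_procs = 4, .num_reported = 4 };
    job_errors_model(&user_job);  /* DVM survives; submitter is told */
    return 0;
}

The asymmetry is the point of the commit: only a daemon-job failure tears down the DVM, so a failed user job no longer takes the persistent VM down with it.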