|
2 | 2 | * Copyright (c) 2009-2011 The Trustees of Indiana University. |
3 | 3 | * All rights reserved. |
4 | 4 | * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. |
5 | | - * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. |
| 5 | + * Copyright (c) 2010-2017 Oak Ridge National Labs. All rights reserved. |
6 | 6 | * Copyright (c) 2004-2011 The University of Tennessee and The University |
7 | 7 | * of Tennessee Research Foundation. All rights |
8 | 8 | * reserved. |
9 | 9 | * Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved. |
10 | 10 | * Copyright (c) 2011-2013 Los Alamos National Security, LLC. |
11 | 11 | * All rights reserved. |
12 | | - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. |
| 12 | + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. |
13 | 13 | * Copyright (c) 2017 IBM Corporation. All rights reserved. |
14 | 14 | * $COPYRIGHT$ |
15 | 15 | * |
@@ -142,8 +142,6 @@ static void job_errors(int fd, short args, void *cbdata) |
142 | 142 | orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; |
143 | 143 | orte_job_t *jdata; |
144 | 144 | orte_job_state_t jobstate; |
145 | | - orte_exit_code_t sts; |
146 | | - orte_proc_t *aborted_proc; |
147 | 145 | opal_buffer_t *answer; |
148 | 146 | int32_t rc, ret; |
149 | 147 | int room, *rmptr; |
@@ -175,110 +173,67 @@ static void job_errors(int fd, short args, void *cbdata) |
175 | 173 | ORTE_JOBID_PRINT(jdata->jobid), |
176 | 174 | orte_job_state_to_str(jobstate))); |
177 | 175 |
|
178 | | - if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate || |
179 | | - ORTE_JOB_STATE_ALLOC_FAILED == jobstate || |
180 | | - ORTE_JOB_STATE_MAP_FAILED == jobstate || |
181 | | - ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) { |
182 | | - /* disable routing as we may not have performed the daemon |
183 | | - * wireup - e.g., in a managed environment, all the daemons |
184 | | - * "phone home", but don't actually wireup into the routed |
185 | | - * network until they receive the launch message |
186 | | - */ |
187 | | - orte_routing_is_enabled = false; |
188 | | - jdata->num_terminated = jdata->num_procs; |
189 | | - ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED); |
190 | | - /* if it was a dynamic spawn, then we better tell them this didn't work */ |
191 | | - if (ORTE_JOBID_INVALID != jdata->originator.jobid) { |
192 | | - rc = jobstate; |
193 | | - answer = OBJ_NEW(opal_buffer_t); |
194 | | - if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) { |
195 | | - ORTE_ERROR_LOG(ret); |
196 | | - OBJ_RELEASE(caddy); |
197 | | - return; |
198 | | - } |
199 | | - if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) { |
200 | | - ORTE_ERROR_LOG(ret); |
201 | | - OBJ_RELEASE(caddy); |
202 | | - return; |
203 | | - } |
204 | | - /* pack the room number */ |
205 | | - rmptr = &room; |
206 | | - if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) { |
207 | | - if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) { |
208 | | - ORTE_ERROR_LOG(ret); |
209 | | - OBJ_RELEASE(caddy); |
210 | | - return; |
211 | | - } |
212 | | - } |
213 | | - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, |
214 | | - "%s errmgr:dvm sending dyn error release of job %s to %s", |
215 | | - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), |
216 | | - ORTE_JOBID_PRINT(jdata->jobid), |
217 | | - ORTE_NAME_PRINT(&jdata->originator))); |
218 | | - if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit, |
219 | | - &jdata->originator, answer, |
220 | | - ORTE_RML_TAG_LAUNCH_RESP, |
221 | | - orte_rml_send_callback, NULL))) { |
222 | | - ORTE_ERROR_LOG(ret); |
223 | | - OBJ_RELEASE(answer); |
224 | | - } |
| 176 | + if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) { |
| 177 | + /* if the daemon job aborted and we haven't heard from everyone yet, |
| 178 | + * then this could well have been caused by a daemon not finding |
| 179 | + * a way back to us. In this case, output a message indicating a daemon |
| 180 | + * died without reporting. Otherwise, say nothing as we |
| 181 | + * likely already output an error message */ |
| 182 | + if (ORTE_JOB_STATE_ABORTED == jobstate && |
| 183 | + jdata->num_procs != jdata->num_reported) { |
| 184 | + orte_routing_is_enabled = false; |
| 185 | + orte_show_help("help-errmgr-base.txt", "failed-daemon", true); |
225 | 186 | } |
| 187 | + /* there really isn't much else we can do since the problem |
| 188 | + * is in the DVM itself, so best just to terminate */ |
| 189 | + jdata->num_terminated = jdata->num_procs; |
| 190 | + /* activate the terminated state so we can exit */ |
| 191 | + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); |
226 | 192 | OBJ_RELEASE(caddy); |
227 | 193 | return; |
228 | 194 | } |
229 | 195 |
|
230 | | - if (ORTE_JOB_STATE_FAILED_TO_START == jobstate || |
231 | | - ORTE_JOB_STATE_FAILED_TO_LAUNCH == jobstate) { |
232 | | - /* the job object for this job will have been NULL'd |
233 | | - * in the array if the job was solely local. If it isn't |
234 | | - * NULL, then we need to tell everyone else to die |
235 | | - */ |
236 | | - aborted_proc = NULL; |
237 | | - if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&aborted_proc, OPAL_PTR)) { |
238 | | - sts = aborted_proc->exit_code; |
239 | | - if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) { |
240 | | - if (WIFSIGNALED(sts)) { /* died on signal */ |
241 | | -#ifdef WCOREDUMP |
242 | | - if (WCOREDUMP(sts)) { |
243 | | - orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true, |
244 | | - WTERMSIG(sts)); |
245 | | - sts = WTERMSIG(sts); |
246 | | - } else { |
247 | | - orte_show_help("help-plm-base.txt", "daemon-died-signal", true, |
248 | | - WTERMSIG(sts)); |
249 | | - sts = WTERMSIG(sts); |
250 | | - } |
251 | | -#else |
252 | | - orte_show_help("help-plm-base.txt", "daemon-died-signal", true, |
253 | | - WTERMSIG(sts)); |
254 | | - sts = WTERMSIG(sts); |
255 | | -#endif /* WCOREDUMP */ |
256 | | - } else { |
257 | | - orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true, |
258 | | - WEXITSTATUS(sts)); |
259 | | - sts = WEXITSTATUS(sts); |
260 | | - } |
261 | | - } |
262 | | - } |
263 | | - /* if this is the daemon job, then we need to ensure we |
264 | | - * output an error message indicating we couldn't launch the |
265 | | - * daemons */ |
266 | | - if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) { |
267 | | - orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true); |
| 196 | + /* all other cases involve jobs submitted to the DVM - therefore, |
| 197 | + * we only inform the submitter of the problem, but do NOT terminate |
| 198 | + * the DVM itself */ |
| 199 | + |
| 200 | + rc = jobstate; |
| 201 | + answer = OBJ_NEW(opal_buffer_t); |
| 202 | + if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) { |
| 203 | + ORTE_ERROR_LOG(ret); |
| 204 | + OBJ_RELEASE(caddy); |
| 205 | + return; |
| 206 | + } |
| 207 | + if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) { |
| 208 | + ORTE_ERROR_LOG(ret); |
| 209 | + OBJ_RELEASE(caddy); |
| 210 | + return; |
| 211 | + } |
| 212 | + /* pack the room number */ |
| 213 | + rmptr = &room; |
| 214 | + if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) { |
| 215 | + if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) { |
| 216 | + ORTE_ERROR_LOG(ret); |
| 217 | + OBJ_RELEASE(caddy); |
| 218 | + return; |
268 | 219 | } |
269 | 220 | } |
270 | | - |
271 | | - /* if the daemon job aborted and we haven't heard from everyone yet, |
272 | | - * then this could well have been caused by a daemon not finding |
273 | | - * a way back to us. In this case, output a message indicating a daemon |
274 | | - * died without reporting. Otherwise, say nothing as we |
275 | | - * likely already output an error message */ |
276 | | - if (ORTE_JOB_STATE_ABORTED == jobstate && |
277 | | - jdata->jobid == ORTE_PROC_MY_NAME->jobid && |
278 | | - jdata->num_procs != jdata->num_reported) { |
279 | | - orte_show_help("help-errmgr-base.txt", "failed-daemon", true); |
| 221 | + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, |
| 222 | + "%s errmgr:dvm sending notification of job %s failure to %s", |
| 223 | + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), |
| 224 | + ORTE_JOBID_PRINT(jdata->jobid), |
| 225 | + ORTE_NAME_PRINT(&jdata->originator))); |
| 226 | + if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit, |
| 227 | + &jdata->originator, answer, |
| 228 | + ORTE_RML_TAG_LAUNCH_RESP, |
| 229 | + orte_rml_send_callback, NULL))) { |
| 230 | + ORTE_ERROR_LOG(ret); |
| 231 | + OBJ_RELEASE(answer); |
280 | 232 | } |
| 233 | + /* ensure we terminate any processes left running in the DVM */ |
| 234 | + _terminate_job(jdata->jobid); |
281 | 235 |
|
| 236 | + /* cleanup */ |
282 | 237 | OBJ_RELEASE(caddy); |
283 | 238 | } |
284 | 239 |
|
|
0 commit comments