Skip to content

Commit afcc338

Browse files
author
Ralph Castain
authored
Merge pull request #3197 from rhc54/topic/errors
Provide a little more help on the error messages when an executable i…
2 parents 45b46dc + dc85e7f commit afcc338

File tree

7 files changed: 32 additions (+32), 32 deletions (-32)

orte/mca/odls/base/help-orte-odls-base.txt

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
77
# Copyright (c) 2014 Research Organization for Information Science
88
# and Technology (RIST). All rights reserved.
9+
# Copyright (c) 2017 Intel, Inc. All rights reserved.
910
# $COPYRIGHT$
1011
#
1112
# Additional copyrights may follow
@@ -46,6 +47,7 @@ Will continue attempting to launch the process.
4647
The xterm option was asked to display a rank that is larger
4748
than the number of procs in the job:
4849

50+
Node: %s
4951
Rank: %d
5052
Num procs: %d
5153

orte/mca/odls/base/odls_base_default_fns.c

Lines changed: 17 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -634,21 +634,24 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
634634
char **env = NULL, **argv = NULL, *cmd = NULL;
635635
int rc, i;
636636
bool found;
637+
orte_proc_state_t state;
637638

638639
/* thread-protect common values */
639640
env = opal_argv_copy(app->env);
640641

642+
/* ensure we clear any prior info regarding state or exit status in
643+
* case this is a restart
644+
*/
645+
child->exit_code = 0;
646+
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_WAITPID);
647+
641648
/* setup the pmix environment */
642649
if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &env))) {
643650
ORTE_ERROR_LOG(rc);
651+
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
644652
goto errorout;
645653
}
646654

647-
/* ensure we clear any prior info regarding state or exit status in
648-
* case this is a restart
649-
*/
650-
child->exit_code = 0;
651-
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_WAITPID);
652655
/* if we are not forwarding output for this job, then
653656
* flag iof as complete
654657
*/
@@ -693,8 +696,9 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
693696
/* can't be done! */
694697
orte_show_help("help-orte-odls-base.txt",
695698
"orte-odls-base:xterm-rank-out-of-bounds",
696-
true, nm->name.vpid, jobdat->num_procs);
697-
child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
699+
true, orte_process_info.nodename,
700+
nm->name.vpid, jobdat->num_procs);
701+
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
698702
goto errorout;
699703
}
700704
}
@@ -717,7 +721,7 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
717721
orte_show_help("help-orte-odls-base.txt",
718722
"orte-odls-base:fork-agent-not-found",
719723
true, orte_process_info.nodename, orte_fork_agent[0]);
720-
child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
724+
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
721725
goto errorout;
722726
}
723727
} else {
@@ -730,7 +734,7 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
730734
*/
731735
if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &env))) {
732736
ORTE_ERROR_LOG(rc);
733-
child->exit_code = rc;
737+
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
734738
goto errorout;
735739
}
736740

@@ -754,17 +758,8 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
754758
}
755759

756760
if (ORTE_SUCCESS != (rc = cd->fork_local(child, cmd, argv, env, jobdat, cd->opts))) {
757-
child->exit_code = rc; /* error message already output */
758-
goto errorout;
759-
}
760-
if (ORTE_SUCCESS != rc) {
761-
/* do NOT ERROR_LOG this error - it generates
762-
* a message/node as most errors will be common
763-
* across the entire cluster. Instead, we let orterun
764-
* output a consolidated error message for us
765-
*/
766-
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
767-
child->exit_code = rc; /* error message already output */
761+
/* error message already output */
762+
state = ORTE_PROC_STATE_FAILED_TO_START;
768763
goto errorout;
769764
}
770765

@@ -782,7 +777,8 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
782777
return;
783778

784779
errorout:
785-
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);
780+
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
781+
ORTE_ACTIVATE_PROC_STATE(&child->name, state);
786782
if (NULL != env) {
787783
opal_argv_free(env);
788784
}

orte/mca/odls/default/help-orte-odls-default.txt

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
# All rights reserved.
1313
# Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
1414
# Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
15+
# Copyright (c) 2017 Intel, Inc. All rights reserved.
1516
# $COPYRIGHT$
1617
#
1718
# Additional copyrights may follow
@@ -29,6 +30,7 @@ having specified a directory for your application. Your job will now
2930
abort.
3031

3132
Local host: %s
33+
Working dir: %s
3234
Application name: %s
3335
Error: %s
3436
#

orte/mca/odls/default/odls_default_module.c

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -328,6 +328,7 @@ static int do_child(orte_proc_t *child,
328328
int i;
329329
sigset_t sigs;
330330
long fd, fdmax = sysconf(_SC_OPEN_MAX);
331+
char dir[MAXPATHLEN];
331332

332333
#if HAVE_SETPGID
333334
/* Set a new process group for this child, so that any
@@ -425,9 +426,10 @@ static int do_child(orte_proc_t *child,
425426
/* Exec the new executable */
426427

427428
execve(app, argv, environ_copy);
429+
getcwd(dir, sizeof(dir));
428430
send_error_show_help(write_fd, 1,
429431
"help-orte-odls-default.txt", "execve error",
430-
orte_process_info.nodename, app, strerror(errno));
432+
orte_process_info.nodename, dir, app, strerror(errno));
431433
/* Does not return */
432434
}
433435

orte/runtime/orte_quit.c

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@
1515
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
1616
* reserved.
1717
* Copyright (c) 2012 Oak Ridge National Labs. All rights reserved.
18-
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
18+
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
1919
* $COPYRIGHT$
2020
*
2121
* Additional copyrights may follow
@@ -258,8 +258,8 @@ int orte_print_aborted_job(orte_job_t *job,
258258
default:
259259
if (0 != proc->exit_code) {
260260
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true,
261-
orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
262-
(unsigned long)proc->name.vpid);
261+
orte_basename, proc->exit_code, ORTE_ERROR_NAME(proc->exit_code),
262+
node->name, (unsigned long)proc->name.vpid);
263263
} else {
264264
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true,
265265
orte_basename, node->name);

orte/tools/orterun/help-orterun.txt

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
# All rights reserved.
1313
# Copyright (c) 2007-2016 Cisco Systems, Inc. All rights reserved.
1414
# Copyright (c) 2012 Oak Ridge National Labs. All rights reserved.
15+
# Copyright (c) 2017 Intel, Inc. All rights reserved.
1516
# $COPYRIGHT$
1617
#
1718
# Additional copyrights may follow
@@ -296,6 +297,7 @@ while attempting to start process rank %lu.
296297
%s was unable to start the specified application as it encountered an
297298
error:
298299

300+
Error code: %d
299301
Error name: %s
300302
Node: %s
301303

orte/util/error_strings.c

Lines changed: 3 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -89,7 +89,7 @@ int orte_err2str(int errnum, const char **errmsg)
8989
if (orte_report_silent_errors) {
9090
retval = "Silent error";
9191
} else {
92-
retval = NULL;
92+
retval = "";
9393
}
9494
break;
9595
case ORTE_ERR_ADDRESSEE_UNKNOWN:
@@ -174,7 +174,7 @@ int orte_err2str(int errnum, const char **errmsg)
174174
if (orte_report_silent_errors) {
175175
retval = "Next option";
176176
} else {
177-
retval = NULL;
177+
retval = "";
178178
}
179179
break;
180180
case ORTE_ERR_SENSOR_LIMIT_EXCEEDED:
@@ -244,11 +244,7 @@ int orte_err2str(int errnum, const char **errmsg)
244244
retval = "Partial success";
245245
break;
246246
default:
247-
if (orte_report_silent_errors) {
248-
retval = "Unknown error";
249-
} else {
250-
retval = NULL;
251-
}
247+
retval = "Unknown error";
252248
}
253249

254250
*errmsg = retval;

0 commit comments

Comments
 (0)