Skip to content

Commit 055ce80

Browse files
author
Ralph Castain
authored
Merge pull request #3771 from rhc54/topic/recovery
Enable ORTE to continue running when a node fails
2 parents f8ffec9 + 0b9d8f8 commit 055ce80

File tree

4 files changed

+30
-17
lines changed

4 files changed

+30
-17
lines changed

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -621,6 +621,11 @@ test/event/event-test
621621
test/event/time-test
622622

623623
test/monitoring/monitoring_test
624+
test/monitoring/check_monitoring
625+
test/monitoring/example_reduce_count
626+
test/monitoring/test_overhead
627+
test/monitoring/test_pvar_access
628+
624629

625630
test/mpi/environment/chello
626631

orte/mca/errmgr/default_hnp/errmgr_default_hnp.c

Lines changed: 20 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -443,27 +443,32 @@ static void proc_errors(int fd, short args, void *cbdata)
443443
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
444444
/* record the first one to fail */
445445
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
446-
/* output an error message so the user knows what happened */
447-
orte_show_help("help-errmgr-base.txt", "node-died", true,
448-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
449-
orte_process_info.nodename,
450-
ORTE_NAME_PRINT(proc),
451-
pptr->node->name);
452446
/* mark the daemon job as failed */
453447
jdata->state = ORTE_JOB_STATE_COMM_FAILED;
454448
/* point to the lowest rank to cause the problem */
455449
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
456450
/* retain the object so it doesn't get free'd */
457451
OBJ_RETAIN(pptr);
458452
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
459-
/* update our exit code */
460-
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
461-
/* just in case the exit code hadn't been set, do it here - this
462-
* won't override any reported exit code */
463-
ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
453+
if (!orte_enable_recovery) {
454+
/* output an error message so the user knows what happened */
455+
orte_show_help("help-errmgr-base.txt", "node-died", true,
456+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
457+
orte_process_info.nodename,
458+
ORTE_NAME_PRINT(proc),
459+
pptr->node->name);
460+
/* update our exit code */
461+
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
462+
/* just in case the exit code hadn't been set, do it here - this
463+
* won't override any reported exit code */
464+
ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
465+
}
466+
}
467+
/* if recovery is enabled, then we are done - otherwise,
468+
* abort the system */
469+
if (!orte_enable_recovery) {
470+
default_hnp_abort(jdata);
464471
}
465-
/* abort the system */
466-
default_hnp_abort(jdata);
467472
goto cleanup;
468473
}
469474

@@ -498,7 +503,8 @@ static void proc_errors(int fd, short args, void *cbdata)
498503
keep_going:
499504
/* if this is a continuously operating job, then there is nothing more
500505
* to do - we let the job continue to run */
501-
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL)) {
506+
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL) ||
507+
ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RECOVERABLE)) {
502508
/* always mark the waitpid as having fired */
503509
ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
504510
/* if this is a remote proc, we won't hear anything more about it

orte/mca/plm/slurm/plm_slurm_module.c

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -267,8 +267,10 @@ static void launch_daemons(int fd, short args, void *cbdata)
267267
/* start one orted on each node */
268268
opal_argv_append(&argc, &argv, "--ntasks-per-node=1");
269269

270-
/* alert us if any orteds die during startup */
271-
opal_argv_append(&argc, &argv, "--kill-on-bad-exit");
270+
if (!orte_enable_recovery) {
271+
/* kill the job if any orteds die */
272+
opal_argv_append(&argc, &argv, "--kill-on-bad-exit");
273+
}
272274

273275
/* ensure the orteds are not bound to a single processor,
274276
* just in case the TaskAffinity option is set by default.

orte/tools/orte-clean/orte-clean.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -183,7 +183,7 @@ main(int argc, char *argv[])
183183
free(legacy);
184184

185185
/* and finally get rid of any lingering pmix-related artifacts */
186-
asprintf(&legacy, "rm -f %s/pmix*", orte_process_info.tmpdir_base);
186+
asprintf(&legacy, "rm -rf %s/pmix*", orte_process_info.tmpdir_base);
187187
system(legacy);
188188
free(legacy);
189189

0 commit comments

Comments
 (0)