@@ -443,27 +443,32 @@ static void proc_errors(int fd, short args, void *cbdata)
443
443
ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ), ORTE_NAME_PRINT (proc )));
444
444
/* record the first one to fail */
445
445
if (!ORTE_FLAG_TEST (jdata , ORTE_JOB_FLAG_ABORTED )) {
446
- /* output an error message so the user knows what happened */
447
- orte_show_help ("help-errmgr-base.txt" , "node-died" , true,
448
- ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
449
- orte_process_info .nodename ,
450
- ORTE_NAME_PRINT (proc ),
451
- pptr -> node -> name );
452
446
/* mark the daemon job as failed */
453
447
jdata -> state = ORTE_JOB_STATE_COMM_FAILED ;
454
448
/* point to the lowest rank to cause the problem */
455
449
orte_set_attribute (& jdata -> attributes , ORTE_JOB_ABORTED_PROC , ORTE_ATTR_LOCAL , pptr , OPAL_PTR );
456
450
/* retain the object so it doesn't get free'd */
457
451
OBJ_RETAIN (pptr );
458
452
ORTE_FLAG_SET (jdata , ORTE_JOB_FLAG_ABORTED );
459
- /* update our exit code */
460
- ORTE_UPDATE_EXIT_STATUS (pptr -> exit_code );
461
- /* just in case the exit code hadn't been set, do it here - this
462
- * won't override any reported exit code */
463
- ORTE_UPDATE_EXIT_STATUS (ORTE_ERR_COMM_FAILURE );
453
+ if (!orte_enable_recovery ) {
454
+ /* output an error message so the user knows what happened */
455
+ orte_show_help ("help-errmgr-base.txt" , "node-died" , true,
456
+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
457
+ orte_process_info .nodename ,
458
+ ORTE_NAME_PRINT (proc ),
459
+ pptr -> node -> name );
460
+ /* update our exit code */
461
+ ORTE_UPDATE_EXIT_STATUS (pptr -> exit_code );
462
+ /* just in case the exit code hadn't been set, do it here - this
463
+ * won't override any reported exit code */
464
+ ORTE_UPDATE_EXIT_STATUS (ORTE_ERR_COMM_FAILURE );
465
+ }
466
+ }
467
+ /* if recovery is enabled, then we are done - otherwise,
468
+ * abort the system */
469
+ if (!orte_enable_recovery ) {
470
+ default_hnp_abort (jdata );
464
471
}
465
- /* abort the system */
466
- default_hnp_abort (jdata );
467
472
goto cleanup ;
468
473
}
469
474
@@ -498,7 +503,8 @@ static void proc_errors(int fd, short args, void *cbdata)
498
503
keep_going :
499
504
/* if this is a continuously operating job, then there is nothing more
500
505
* to do - we let the job continue to run */
501
- if (orte_get_attribute (& jdata -> attributes , ORTE_JOB_CONTINUOUS_OP , NULL , OPAL_BOOL )) {
506
+ if (orte_get_attribute (& jdata -> attributes , ORTE_JOB_CONTINUOUS_OP , NULL , OPAL_BOOL ) ||
507
+ ORTE_FLAG_TEST (jdata , ORTE_JOB_FLAG_RECOVERABLE )) {
502
508
/* always mark the waitpid as having fired */
503
509
ORTE_ACTIVATE_PROC_STATE (& pptr -> name , ORTE_PROC_STATE_WAITPID_FIRED );
504
510
/* if this is a remote proc, we won't hear anything more about it
0 commit comments