@@ -443,27 +443,32 @@ static void proc_errors(int fd, short args, void *cbdata)
443443 ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ), ORTE_NAME_PRINT (proc )));
444444 /* record the first one to fail */
445445 if (!ORTE_FLAG_TEST (jdata , ORTE_JOB_FLAG_ABORTED )) {
446- /* output an error message so the user knows what happened */
447- orte_show_help ("help-errmgr-base.txt" , "node-died" , true,
448- ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
449- orte_process_info .nodename ,
450- ORTE_NAME_PRINT (proc ),
451- pptr -> node -> name );
452446 /* mark the daemon job as failed */
453447 jdata -> state = ORTE_JOB_STATE_COMM_FAILED ;
454448 /* point to the lowest rank to cause the problem */
455449 orte_set_attribute (& jdata -> attributes , ORTE_JOB_ABORTED_PROC , ORTE_ATTR_LOCAL , pptr , OPAL_PTR );
456450 /* retain the object so it doesn't get free'd */
457451 OBJ_RETAIN (pptr );
458452 ORTE_FLAG_SET (jdata , ORTE_JOB_FLAG_ABORTED );
459- /* update our exit code */
460- ORTE_UPDATE_EXIT_STATUS (pptr -> exit_code );
461- /* just in case the exit code hadn't been set, do it here - this
462- * won't override any reported exit code */
463- ORTE_UPDATE_EXIT_STATUS (ORTE_ERR_COMM_FAILURE );
453+ if (!orte_enable_recovery ) {
454+ /* output an error message so the user knows what happened */
455+ orte_show_help ("help-errmgr-base.txt" , "node-died" , true,
456+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
457+ orte_process_info .nodename ,
458+ ORTE_NAME_PRINT (proc ),
459+ pptr -> node -> name );
460+ /* update our exit code */
461+ ORTE_UPDATE_EXIT_STATUS (pptr -> exit_code );
462+ /* just in case the exit code hadn't been set, do it here - this
463+ * won't override any reported exit code */
464+ ORTE_UPDATE_EXIT_STATUS (ORTE_ERR_COMM_FAILURE );
465+ }
466+ }
467+ /* if recovery is enabled, then we are done - otherwise,
468+ * abort the system */
469+ if (!orte_enable_recovery ) {
470+ default_hnp_abort (jdata );
464471 }
465- /* abort the system */
466- default_hnp_abort (jdata );
467472 goto cleanup ;
468473 }
469474
@@ -498,7 +503,8 @@ static void proc_errors(int fd, short args, void *cbdata)
498503 keep_going :
499504 /* if this is a continuously operating job, then there is nothing more
500505 * to do - we let the job continue to run */
501- if (orte_get_attribute (& jdata -> attributes , ORTE_JOB_CONTINUOUS_OP , NULL , OPAL_BOOL )) {
506+ if (orte_get_attribute (& jdata -> attributes , ORTE_JOB_CONTINUOUS_OP , NULL , OPAL_BOOL ) ||
507+ ORTE_FLAG_TEST (jdata , ORTE_JOB_FLAG_RECOVERABLE )) {
502508 /* always mark the waitpid as having fired */
503509 ORTE_ACTIVATE_PROC_STATE (& pptr -> name , ORTE_PROC_STATE_WAITPID_FIRED );
504510 /* if this is a remote proc, we won't hear anything more about it
0 commit comments