@@ -339,8 +339,8 @@ static void proc_errors(int fd, short args, void *cbdata)
339339 ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ), ORTE_NAME_PRINT (proc )));
340340 /* remove from dependent routes, if it is one */
341341 orte_routed .route_lost (proc );
342- /* if all my routes and local children are gone, then terminate ourselves */
343- if (0 == orte_routed .num_routes ()) {
342+ /* if all my routes and local children are gone, then terminate ourselves */
343+ if (0 == orte_routed .num_routes ()) {
344344 for (i = 0 ; i < orte_local_children -> size ; i ++ ) {
345345 if (NULL != (proct = (orte_proc_t * )opal_pointer_array_get_item (orte_local_children , i )) &&
346346 ORTE_FLAG_TEST (pptr , ORTE_PROC_FLAG_ALIVE ) && proct -> state < ORTE_PROC_STATE_UNTERMINATED ) {
@@ -357,7 +357,7 @@ static void proc_errors(int fd, short args, void *cbdata)
357357 "%s errmgr_hnp: all routes and children gone - ordering exit" ,
358358 ORTE_NAME_PRINT (ORTE_PROC_MY_NAME )));
359359 ORTE_ACTIVATE_JOB_STATE (NULL , ORTE_JOB_STATE_DAEMONS_TERMINATED );
360- } else {
360+ } else {
361361 OPAL_OUTPUT_VERBOSE ((5 , orte_errmgr_base_framework .framework_output ,
362362 "%s Comm failure: %d routes remain alive" ,
363363 ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
@@ -398,7 +398,7 @@ static void proc_errors(int fd, short args, void *cbdata)
398398 }
399399
400400 /* if we were ordered to terminate, mark this proc as dead and see if
401- * any of our routes or local children remain alive - if not, then
401+ * any of our routes or local children remain alive - if not, then
402402 * terminate ourselves. */
403403 if (orte_orteds_term_ordered ) {
404404 for (i = 0 ; i < orte_local_children -> size ; i ++ ) {
@@ -419,6 +419,14 @@ static void proc_errors(int fd, short args, void *cbdata)
419419 }
420420
421421 keep_going :
422+ /* if this is a continuously operating job, then there is nothing more
423+ * to do - we let the job continue to run */
424+ if (orte_get_attribute (& jdata -> attributes , ORTE_JOB_CONTINUOUS_OP , NULL , OPAL_BOOL )) {
425+ /* always mark the waitpid as having fired */
426+ ORTE_ACTIVATE_PROC_STATE (& pptr -> name , ORTE_PROC_STATE_WAITPID_FIRED );
427+ goto cleanup ;
428+ }
429+
422430 /* ensure we record the failed proc properly so we can report
423431 * the error once we terminate
424432 */
@@ -490,7 +498,7 @@ static void proc_errors(int fd, short args, void *cbdata)
490498 /* this job has terminated */
491499 ORTE_ACTIVATE_JOB_STATE (jdata , ORTE_JOB_STATE_TERMINATED );
492500 }
493- }
501+ }
494502 break ;
495503
496504 case ORTE_PROC_STATE_TERM_WO_SYNC :
0 commit comments