@@ -634,21 +634,24 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
634634 char * * env = NULL , * * argv = NULL , * cmd = NULL ;
635635 int rc , i ;
636636 bool found ;
637+ orte_proc_state_t state ;
637638
638639 /* thread-protect common values */
639640 env = opal_argv_copy (app -> env );
640641
642+ /* ensure we clear any prior info regarding state or exit status in
643+ * case this is a restart
644+ */
645+ child -> exit_code = 0 ;
646+ ORTE_FLAG_UNSET (child , ORTE_PROC_FLAG_WAITPID );
647+
641648 /* setup the pmix environment */
642649 if (OPAL_SUCCESS != (rc = opal_pmix .server_setup_fork (& child -> name , & env ))) {
643650 ORTE_ERROR_LOG (rc );
651+ state = ORTE_PROC_STATE_FAILED_TO_LAUNCH ;
644652 goto errorout ;
645653 }
646654
647- /* ensure we clear any prior info regarding state or exit status in
648- * case this is a restart
649- */
650- child -> exit_code = 0 ;
651- ORTE_FLAG_UNSET (child , ORTE_PROC_FLAG_WAITPID );
652655 /* if we are not forwarding output for this job, then
653656 * flag iof as complete
654657 */
@@ -693,8 +696,9 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
693696 /* can't be done! */
694697 orte_show_help ("help-orte-odls-base.txt" ,
695698 "orte-odls-base:xterm-rank-out-of-bounds" ,
696- true, nm -> name .vpid , jobdat -> num_procs );
697- child -> exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH ;
699+ true, orte_process_info .nodename ,
700+ nm -> name .vpid , jobdat -> num_procs );
701+ state = ORTE_PROC_STATE_FAILED_TO_LAUNCH ;
698702 goto errorout ;
699703 }
700704 }
@@ -717,7 +721,7 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
717721 orte_show_help ("help-orte-odls-base.txt" ,
718722 "orte-odls-base:fork-agent-not-found" ,
719723 true, orte_process_info .nodename , orte_fork_agent [0 ]);
720- child -> exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH ;
724+ state = ORTE_PROC_STATE_FAILED_TO_LAUNCH ;
721725 goto errorout ;
722726 }
723727 } else {
@@ -730,7 +734,7 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
730734 */
731735 if (ORTE_SUCCESS != (rc = orte_schizo .setup_child (jobdat , child , app , & env ))) {
732736 ORTE_ERROR_LOG (rc );
733- child -> exit_code = rc ;
737+ state = ORTE_PROC_STATE_FAILED_TO_LAUNCH ;
734738 goto errorout ;
735739 }
736740
@@ -754,17 +758,8 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
754758 }
755759
756760 if (ORTE_SUCCESS != (rc = cd -> fork_local (child , cmd , argv , env , jobdat , cd -> opts ))) {
757- child -> exit_code = rc ; /* error message already output */
758- goto errorout ;
759- }
760- if (ORTE_SUCCESS != rc ) {
761- /* do NOT ERROR_LOG this error - it generates
762- * a message/node as most errors will be common
763- * across the entire cluster. Instead, we let orterun
764- * output a consolidated error message for us
765- */
766- ORTE_FLAG_UNSET (child , ORTE_PROC_FLAG_ALIVE );
767- child -> exit_code = rc ; /* error message already output */
761+ /* error message already output */
762+ state = ORTE_PROC_STATE_FAILED_TO_START ;
768763 goto errorout ;
769764 }
770765
@@ -782,7 +777,8 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
782777 return ;
783778
784779 errorout :
785- ORTE_ACTIVATE_PROC_STATE (& child -> name , ORTE_PROC_STATE_FAILED_TO_START );
780+ ORTE_FLAG_UNSET (child , ORTE_PROC_FLAG_ALIVE );
781+ ORTE_ACTIVATE_PROC_STATE (& child -> name , state );
786782 if (NULL != env ) {
787783 opal_argv_free (env );
788784 }
0 commit comments