@@ -352,63 +352,70 @@ static void perilog_proc_delete (struct perilog_proc *proc)
352352 }
353353}
354354
355+ static char * exception_errmsg (struct perilog_proc * proc , int status )
356+ {
357+ int rc ;
358+ flux_t * h = flux_jobtap_get_flux (proc -> p );
359+ int code = WIFEXITED (status ) ? WEXITSTATUS (status ) : -1 ;
360+ int sig ;
361+ const char * name = perilog_proc_name (proc );
362+ char * errmsg ;
363+ char * hosts = NULL ;
364+
365+ if (!(hosts = flux_hostmap_lookup (h , proc -> failed_ranks , NULL )))
366+ hosts = strdup ("unknown" );
367+
368+ if (proc -> cancel_timeout ) {
369+ rc = asprintf (& errmsg ,
370+ "%s canceled then timed out on %s (rank %s)" ,
371+ name ,
372+ hosts ,
373+ proc -> failed_ranks );
374+ }
375+ else if (proc -> timedout ) {
376+ rc = asprintf (& errmsg ,
377+ "%s timed out on %s (rank %s)" ,
378+ name ,
379+ hosts ,
380+ proc -> failed_ranks );
381+ }
382+ /* Report that proc was signaled if WIFSIGNALED() is true, or
383+ * exit code > 128 (where standard exit code is 127+signo from
384+ * most shells)
385+ */
386+ else if (WIFSIGNALED (status ) || code > 128 ) {
387+ sig = WIFSIGNALED (status ) ? WTERMSIG (status ) : code - 128 ;
388+ rc = asprintf (& errmsg ,
389+ "%s killed by signal %d on %s (rank %s)" ,
390+ name ,
391+ sig ,
392+ hosts ? hosts : "unknown" ,
393+ proc -> failed_ranks );
394+ }
395+ else
396+ rc = asprintf (& errmsg ,
397+ "%s exited with code=%d on %s (rank %s)" ,
398+ name ,
399+ code ,
400+ hosts ? hosts : "unknown" ,
401+ proc -> failed_ranks );
402+
403+ free (hosts );
404+ return rc < 0 ? NULL : errmsg ;
405+ }
406+
355407static void emit_finish_event (struct perilog_proc * proc ,
356408 struct bulk_exec * bulk_exec )
357409{
358410 int status = bulk_exec_rc (bulk_exec );
359411 if (proc -> prolog ) {
360- int rc ;
361-
362412 /*
363413 * If prolog failed, raise job exception before prolog-finish
364414 * event is emitted to ensure job isn't halfway started before
365415 * the exception is raised:
366416 */
367417 if ((status != 0 && !proc -> canceled ) || proc -> cancel_timeout ) {
368- flux_t * h = flux_jobtap_get_flux (proc -> p );
369- int code = WIFEXITED (status ) ? WEXITSTATUS (status ) : -1 ;
370- int sig ;
371- char * errmsg ;
372- char * hosts = NULL ;
373-
374- if (!(hosts = flux_hostmap_lookup (h , proc -> failed_ranks , NULL )))
375- hosts = strdup ("unknown" );
376-
377- if (proc -> cancel_timeout ) {
378- rc = asprintf (& errmsg ,
379- "prolog canceled then timed out on %s (rank %s)" ,
380- hosts ,
381- proc -> failed_ranks );
382- status = 1 ;
383- }
384- else if (proc -> timedout ) {
385- rc = asprintf (& errmsg ,
386- "prolog timed out on %s (rank %s)" ,
387- hosts ,
388- proc -> failed_ranks );
389- }
390- /* Report that prolog was signaled if WIFSIGNALED() is true, or
391- * exit code > 128 (where standard exit code is 127+signo from
392- * most shells)
393- */
394- else if (WIFSIGNALED (status ) || code > 128 ) {
395- sig = WIFSIGNALED (status ) ? WTERMSIG (status ) : code - 128 ;
396- rc = asprintf (& errmsg ,
397- "prolog killed by signal %d on %s (rank %s)" ,
398- sig ,
399- hosts ? hosts : "unknown" ,
400- proc -> failed_ranks );
401- }
402- else
403- rc = asprintf (& errmsg ,
404- "prolog exited with code=%d on %s (rank %s)" ,
405- code ,
406- hosts ? hosts : "unknown" ,
407- proc -> failed_ranks );
408-
409- free (hosts );
410- if (rc < 0 )
411- errmsg = NULL ;
418+ char * errmsg = exception_errmsg (proc , status );
412419 if (flux_jobtap_raise_exception (proc -> p ,
413420 proc -> id ,
414421 "prolog" ,
@@ -431,13 +438,28 @@ static void emit_finish_event (struct perilog_proc *proc,
431438 status );
432439 }
433440 else {
441+ /*
442+ * Raise a non-fatal job exception on failure so that the job
443+ * status reflects the status of the user job and not the status
444+ * of the epilog.
445+ */
446+ if ((status != 0 && !proc -> canceled ) || proc -> cancel_timeout ) {
447+ char * errmsg = exception_errmsg (proc , status );
448+ if (flux_jobtap_raise_exception (proc -> p ,
449+ proc -> id ,
450+ "epilog" ,
451+ 2 ,
452+ "%s" ,
453+ errmsg ?
454+ errmsg :
455+ "job epilog failed" ) < 0 )
456+ flux_log_error (flux_jobtap_get_flux (proc -> p ),
457+ "epilog-finish: jobtap_raise_exception" );
458+ free (errmsg );
459+ }
434460 /*
435461 * Epilog complete: unsubscribe this plugin from the
436462 * finished job and post an epilog-finish event.
437- *
438- * No job exception is raised since the job is already exiting,
439- * and it is expected that the actual epilog script will
440- * drain nodes or take other action on failure if necessary.
441463 */
442464 flux_jobtap_job_unsubscribe (proc -> p , proc -> id );
443465 if (flux_jobtap_epilog_finish (proc -> p ,
0 commit comments