Skip to content

Commit c57d3b1

Browse files
authored
Merge pull request #6669 from grondo/issue#6349
raise non-fatal exception on epilog failure
2 parents 26e39f2 + 513cc87 commit c57d3b1

File tree

2 files changed

+75
-50
lines changed

2 files changed

+75
-50
lines changed

src/modules/job-manager/plugins/perilog.c

Lines changed: 72 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -352,63 +352,70 @@ static void perilog_proc_delete (struct perilog_proc *proc)
352352
}
353353
}
354354

355+
static char *exception_errmsg (struct perilog_proc *proc, int status)
356+
{
357+
int rc;
358+
flux_t *h = flux_jobtap_get_flux (proc->p);
359+
int code = WIFEXITED (status) ? WEXITSTATUS (status) : -1;
360+
int sig;
361+
const char *name = perilog_proc_name (proc);
362+
char *errmsg;
363+
char *hosts = NULL;
364+
365+
if (!(hosts = flux_hostmap_lookup (h, proc->failed_ranks, NULL)))
366+
hosts = strdup ("unknown");
367+
368+
if (proc->cancel_timeout) {
369+
rc = asprintf (&errmsg,
370+
"%s canceled then timed out on %s (rank %s)",
371+
name,
372+
hosts,
373+
proc->failed_ranks);
374+
}
375+
else if (proc->timedout) {
376+
rc = asprintf (&errmsg,
377+
"%s timed out on %s (rank %s)",
378+
name,
379+
hosts,
380+
proc->failed_ranks);
381+
}
382+
/* Report that proc was signaled if WIFSIGNALED() is true, or
383+
* exit code > 128 (where standard exit code is 127+signo from
384+
* most shells)
385+
*/
386+
else if (WIFSIGNALED (status) || code > 128) {
387+
sig = WIFSIGNALED (status) ? WTERMSIG (status) : code - 128;
388+
rc = asprintf (&errmsg,
389+
"%s killed by signal %d on %s (rank %s)",
390+
name,
391+
sig,
392+
hosts ? hosts : "unknown",
393+
proc->failed_ranks);
394+
}
395+
else
396+
rc = asprintf (&errmsg,
397+
"%s exited with code=%d on %s (rank %s)",
398+
name,
399+
code,
400+
hosts ? hosts : "unknown",
401+
proc->failed_ranks);
402+
403+
free (hosts);
404+
return rc < 0 ? NULL : errmsg;
405+
}
406+
355407
static void emit_finish_event (struct perilog_proc *proc,
356408
struct bulk_exec *bulk_exec)
357409
{
358410
int status = bulk_exec_rc (bulk_exec);
359411
if (proc->prolog) {
360-
int rc;
361-
362412
/*
363413
* If prolog failed, raise job exception before prolog-finish
364414
* event is emitted to ensure job isn't halfway started before
365415
* the exception is raised:
366416
*/
367417
if ((status != 0 && !proc->canceled) || proc->cancel_timeout) {
368-
flux_t *h = flux_jobtap_get_flux (proc->p);
369-
int code = WIFEXITED (status) ? WEXITSTATUS (status) : -1;
370-
int sig;
371-
char *errmsg;
372-
char *hosts = NULL;
373-
374-
if (!(hosts = flux_hostmap_lookup (h, proc->failed_ranks, NULL)))
375-
hosts = strdup ("unknown");
376-
377-
if (proc->cancel_timeout) {
378-
rc = asprintf (&errmsg,
379-
"prolog canceled then timed out on %s (rank %s)",
380-
hosts,
381-
proc->failed_ranks);
382-
status = 1;
383-
}
384-
else if (proc->timedout) {
385-
rc = asprintf (&errmsg,
386-
"prolog timed out on %s (rank %s)",
387-
hosts,
388-
proc->failed_ranks);
389-
}
390-
/* Report that prolog was signaled if WIFSIGNALED() is true, or
391-
* exit code > 128 (where standard exit code is 127+signo from
392-
* most shells)
393-
*/
394-
else if (WIFSIGNALED (status) || code > 128) {
395-
sig = WIFSIGNALED (status) ? WTERMSIG (status) : code - 128;
396-
rc = asprintf (&errmsg,
397-
"prolog killed by signal %d on %s (rank %s)",
398-
sig,
399-
hosts ? hosts : "unknown",
400-
proc->failed_ranks);
401-
}
402-
else
403-
rc = asprintf (&errmsg,
404-
"prolog exited with code=%d on %s (rank %s)",
405-
code,
406-
hosts ? hosts : "unknown",
407-
proc->failed_ranks);
408-
409-
free (hosts);
410-
if (rc < 0)
411-
errmsg = NULL;
418+
char *errmsg = exception_errmsg (proc, status);
412419
if (flux_jobtap_raise_exception (proc->p,
413420
proc->id,
414421
"prolog",
@@ -431,13 +438,28 @@ static void emit_finish_event (struct perilog_proc *proc,
431438
status);
432439
}
433440
else {
441+
/*
442+
* Raise a non-fatal job exception on failure so that the job
443+
* status reflects the status of the user job and not the status
444+
* of the epilog.
445+
*/
446+
if ((status != 0 && !proc->canceled) || proc->cancel_timeout) {
447+
char *errmsg = exception_errmsg (proc, status);
448+
if (flux_jobtap_raise_exception (proc->p,
449+
proc->id,
450+
"epilog",
451+
2,
452+
"%s",
453+
errmsg ?
454+
errmsg :
455+
"job epilog failed") < 0)
456+
flux_log_error (flux_jobtap_get_flux (proc->p),
457+
"epilog-finish: jobtap_raise_exception");
458+
free (errmsg);
459+
}
434460
/*
435461
* Epilog complete: unsubscribe this plugin from the
436462
* finished job and post an epilog-finish event.
437-
*
438-
* No job exception is raised since the job is already exiting,
439-
* and it is expected that the actual epilog script will
440-
* drain nodes or take other action on failure if necessary.
441463
*/
442464
flux_jobtap_job_unsubscribe (proc->p, proc->id);
443465
if (flux_jobtap_epilog_finish (proc->p,

t/t2274-manager-perilog-per-rank.t

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,9 @@ test_expect_success 'perilog: epilog failure drains ranks' '
401401
test "$(flux resource drain -no {reason})" = "epilog failed for job $jobid" &&
402402
undrain_all
403403
'
404+
test_expect_success 'perilog: epilog failure raises non-fatal job exception' '
405+
flux job wait-event -vHt 30 $jobid exception
406+
'
404407
test_expect_success 'perilog: job does not start when prolog cancel times out' '
405408
undrain_all &&
406409
flux config load <<-EOF &&

0 commit comments

Comments
 (0)