Skip to content

Commit fce96cd

Browse files
committed
perilog: abstract exception message generation
Problem: The generation of the prolog exception message is embedded in the `finish` event callback and is specific to the job prolog. This makes it not reusable for the job epilog. Abstract the error message generation into an exception_errmsg() function which can be reused for the epilog.
1 parent 26e39f2 commit fce96cd

File tree

1 file changed

+53
-46
lines changed

1 file changed

+53
-46
lines changed

src/modules/job-manager/plugins/perilog.c

Lines changed: 53 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -352,63 +352,70 @@ static void perilog_proc_delete (struct perilog_proc *proc)
352352
}
353353
}
354354

355+
static char *exception_errmsg (struct perilog_proc *proc, int status)
356+
{
357+
int rc;
358+
flux_t *h = flux_jobtap_get_flux (proc->p);
359+
int code = WIFEXITED (status) ? WEXITSTATUS (status) : -1;
360+
int sig;
361+
const char *name = perilog_proc_name (proc);
362+
char *errmsg;
363+
char *hosts = NULL;
364+
365+
if (!(hosts = flux_hostmap_lookup (h, proc->failed_ranks, NULL)))
366+
hosts = strdup ("unknown");
367+
368+
if (proc->cancel_timeout) {
369+
rc = asprintf (&errmsg,
370+
"%s canceled then timed out on %s (rank %s)",
371+
name,
372+
hosts,
373+
proc->failed_ranks);
374+
}
375+
else if (proc->timedout) {
376+
rc = asprintf (&errmsg,
377+
"%s timed out on %s (rank %s)",
378+
name,
379+
hosts,
380+
proc->failed_ranks);
381+
}
382+
/* Report that proc was signaled if WIFSIGNALED() is true, or
383+
* exit code > 128 (where standard exit code is 127+signo from
384+
* most shells)
385+
*/
386+
else if (WIFSIGNALED (status) || code > 128) {
387+
sig = WIFSIGNALED (status) ? WTERMSIG (status) : code - 128;
388+
rc = asprintf (&errmsg,
389+
"%s killed by signal %d on %s (rank %s)",
390+
name,
391+
sig,
392+
hosts ? hosts : "unknown",
393+
proc->failed_ranks);
394+
}
395+
else
396+
rc = asprintf (&errmsg,
397+
"%s exited with code=%d on %s (rank %s)",
398+
name,
399+
code,
400+
hosts ? hosts : "unknown",
401+
proc->failed_ranks);
402+
403+
free (hosts);
404+
return rc < 0 ? NULL : errmsg;
405+
}
406+
355407
static void emit_finish_event (struct perilog_proc *proc,
356408
struct bulk_exec *bulk_exec)
357409
{
358410
int status = bulk_exec_rc (bulk_exec);
359411
if (proc->prolog) {
360-
int rc;
361-
362412
/*
363413
* If prolog failed, raise job exception before prolog-finish
364414
* event is emitted to ensure job isn't halfway started before
365415
* the exception is raised:
366416
*/
367417
if ((status != 0 && !proc->canceled) || proc->cancel_timeout) {
368-
flux_t *h = flux_jobtap_get_flux (proc->p);
369-
int code = WIFEXITED (status) ? WEXITSTATUS (status) : -1;
370-
int sig;
371-
char *errmsg;
372-
char *hosts = NULL;
373-
374-
if (!(hosts = flux_hostmap_lookup (h, proc->failed_ranks, NULL)))
375-
hosts = strdup ("unknown");
376-
377-
if (proc->cancel_timeout) {
378-
rc = asprintf (&errmsg,
379-
"prolog canceled then timed out on %s (rank %s)",
380-
hosts,
381-
proc->failed_ranks);
382-
status = 1;
383-
}
384-
else if (proc->timedout) {
385-
rc = asprintf (&errmsg,
386-
"prolog timed out on %s (rank %s)",
387-
hosts,
388-
proc->failed_ranks);
389-
}
390-
/* Report that prolog was signaled if WIFSIGNALED() is true, or
391-
* exit code > 128 (where standard exit code is 127+signo from
392-
* most shells)
393-
*/
394-
else if (WIFSIGNALED (status) || code > 128) {
395-
sig = WIFSIGNALED (status) ? WTERMSIG (status) : code - 128;
396-
rc = asprintf (&errmsg,
397-
"prolog killed by signal %d on %s (rank %s)",
398-
sig,
399-
hosts ? hosts : "unknown",
400-
proc->failed_ranks);
401-
}
402-
else
403-
rc = asprintf (&errmsg,
404-
"prolog exited with code=%d on %s (rank %s)",
405-
code,
406-
hosts ? hosts : "unknown",
407-
proc->failed_ranks);
408-
409-
free (hosts);
410-
if (rc < 0)
411-
errmsg = NULL;
418+
char *errmsg = exception_errmsg (proc, status);
412419
if (flux_jobtap_raise_exception (proc->p,
413420
proc->id,
414421
"prolog",

0 commit comments

Comments
 (0)