Skip to content

Commit c0e977c

Browse files
authored
Merge pull request #5055 from chu11/issue4946_flux_jobs_inactive_reason
flux-jobs: support new "inactive_reason" output field and "endreason" format
2 parents 9cc7782 + 8f704e2 commit c0e977c

File tree

11 files changed

+367
-18
lines changed

11 files changed

+367
-18
lines changed

doc/man1/flux-jobs.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,13 @@ the state of the job or other context:
453453
Returns the job runtime for jobs in RUN state or later, otherwise the
454454
job duration (if set) is returned.
455455

456+
**inactive_reason**
457+
If the job is inactive, returns the reason that the job is no
458+
longer active. Generally speaking, will output "Exit", "Timeout",
459+
"Canceled", or a signal description. If available, other contextual information
460+
will also be provided such as the exit ``returncode`` or
461+
cancellation message.
462+
456463
CONFIGURATION
457464
=============
458465

src/bindings/python/flux/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ nobase_fluxpy_PYTHON = \
66
message.py \
77
constants.py \
88
util.py \
9+
compat36.py \
910
future.py \
1011
memoized_property.py \
1112
debugged.py \
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
###############################################################
2+
# Copyright 2023 Lawrence Livermore National Security, LLC
3+
# (c.f. AUTHORS, NOTICE.LLNS, COPYING)
4+
#
5+
# This file is part of the Flux resource manager framework.
6+
# For details, see https://github.com/flux-framework.
7+
#
8+
# SPDX-License-Identifier: LGPL-3.0
9+
###############################################################
10+
11+
import signal
12+
13+
14+
# strsignal() is only available on Python 3.8 and up
15+
# (signal attribute name, description) pairs, in lookup priority order.
# Names are resolved with getattr() below because not every name exists on
# every platform or Python version (e.g. SIGSTKFLT is not defined until
# Python 3.11), and a missing attribute must not break lookups.
_SIGNAL_DESCRIPTIONS = (
    ("SIGHUP", "Hangup"),
    ("SIGINT", "Interrupt"),
    ("SIGQUIT", "Quit"),
    ("SIGILL", "Illegal instruction"),
    ("SIGTRAP", "Trace/breakpoint trap"),
    ("SIGABRT", "Aborted"),
    ("SIGIOT", "Aborted"),
    ("SIGBUS", "Bus error"),
    ("SIGFPE", "Floating point exception"),
    ("SIGKILL", "Killed"),
    ("SIGUSR1", "User defined signal 1"),
    ("SIGSEGV", "Segmentation Fault"),
    ("SIGUSR2", "User defined signal 2"),
    ("SIGPIPE", "Broken pipe"),
    ("SIGALRM", "Alarm clock"),
    ("SIGTERM", "Terminated"),
    ("SIGSTKFLT", "Stack fault"),
    ("SIGCHLD", "Child exited"),
    ("SIGCONT", "Continued"),
    ("SIGSTOP", "Stopped (signal)"),
    ("SIGTSTP", "Stopped"),
    ("SIGTTIN", "Stopped (tty input)"),
    ("SIGTTOU", "Stopped (tty output)"),
    ("SIGURG", "Urgent I/O condition"),
    ("SIGXCPU", "CPU time limit exceeded"),
    ("SIGXFSZ", "File size limit exceeded"),
    ("SIGVTALRM", "Virtual timer expired"),
    ("SIGPROF", "Profiling timer expired"),
    ("SIGWINCH", "Window changed"),
    ("SIGIO", "I/O possible"),
    ("SIGPOLL", "I/O possible"),
    ("SIGPWR", "Power failure"),
    ("SIGSYS", "Bad system call"),
)


def _build_signal_table():
    """Map each signal number defined by this interpreter to its description."""
    table = {}
    for name, description in _SIGNAL_DESCRIPTIONS:
        signo = getattr(signal, name, None)
        if signo is not None:
            # setdefault: when two names share a number, the first one listed
            # wins, matching a top-to-bottom if/elif comparison.
            table.setdefault(int(signo), description)
    return table


_SIGNAL_TABLE = _build_signal_table()


def strsignal(signum):
    """
    Return a short description of signal number ``signum``.

    Fallback for ``signal.strsignal()``, which is only available on
    Python 3.8 and up.

    Args:
        signum: signal number (an int or a ``signal.Signals`` member).

    Returns:
        The description string, e.g. "Hangup" for SIGHUP.

    Raises:
        ValueError: if ``signum`` is not one of the signals known here.
    """
    try:
        return _SIGNAL_TABLE[signum]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments; treat them as unknown too.
        raise ValueError(f"unknown signal number: {signum!r}") from None

src/bindings/python/flux/job/info.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@
2929
except ImportError:
3030
SchedResourceList = None
3131

32+
# strsignal() is only available in Python 3.8 and up.
33+
# flux-core's minimum is 3.6. Use compat library if not available.
34+
try:
35+
from signal import strsignal # novermin
36+
except ImportError:
37+
from flux.compat36 import strsignal
38+
3239

3340
def statetostr(stateid, fmt="L"):
3441
return raw.flux_job_statetostr(stateid, fmt).decode("utf-8")
@@ -541,6 +548,55 @@ def zero_remove(key):
541548

542549
return result
543550

551+
@memoized_property
def inactive_reason(self):
    """
    Generate contextual exit reason based on how the job ended.

    Returns an empty string unless the job state is INACTIVE. Otherwise
    the text depends on the job result:

    - CANCELED: "Canceled", followed by the exception note when a
      "cancel" exception with a note was recorded.
    - FAILED: the exception type (and note, if any) for a severity 0
      exception other than "exec"; otherwise a description derived
      from the returncode (signal description, shell-style 126/127/128
      text, or "Exit <returncode>").
    - TIMEOUT: "Timeout".
    - anything else: "Exit <returncode>".
    """
    if str(self.state) != "INACTIVE":
        return ""

    result = str(self.result)

    if result == "CANCELED":
        if (
            self.exception.occurred
            and self.exception.type == "cancel"
            and self.exception.note
        ):
            return f"Canceled: {self.exception.note}"
        return "Canceled"

    if result == "TIMEOUT":
        return "Timeout"

    if result != "FAILED":
        return f"Exit {self.returncode}"

    # exception.type == "exec" is special case, handled by returncode
    if (
        self.exception.occurred
        and self.exception.type != "exec"
        and self.exception.severity == 0
    ):
        note = f" note={self.exception.note}" if self.exception.note else ""
        return f"Exception: type={self.exception.type}{note}"

    if self.returncode > 128:
        signum = self.returncode - 128
        try:
            sigdesc = strsignal(signum)
        except ValueError:
            sigdesc = None
        # signal.strsignal() returns None (rather than raising) for a
        # signal number that has no description; never return None here.
        return sigdesc or f"Signaled {signum}"
    if self.returncode == 126:
        return "Command invoked cannot execute"
    if self.returncode == 127:
        return "command not found"
    if self.returncode == 128:
        return "Invalid argument to exit"
    return f"Exit {self.returncode}"
599+
544600

545601
def job_fields_to_attrs(fields):
546602
# Note there is no attr for "id", it is always returned
@@ -593,6 +649,15 @@ def job_fields_to_attrs(fields):
593649
"dependencies": ("dependencies",),
594650
"contextual_info": ("state", "dependencies", "annotations", "nodelist"),
595651
"contextual_time": ("state", "t_run", "t_cleanup", "duration"),
652+
"inactive_reason": (
653+
"state",
654+
"result",
655+
"waitstatus",
656+
"exception_occurred",
657+
"exception_severity",
658+
"exception_type",
659+
"exception_note",
660+
),
596661
# Special cases, pointers to sub-dicts in annotations
597662
"sched": ("annotations",),
598663
"user": ("annotations",),
@@ -676,6 +741,7 @@ class JobInfoFormat(flux.util.OutputFormat):
676741
"dependencies": "DEPENDENCIES",
677742
"contextual_info": "INFO",
678743
"contextual_time": "TIME",
744+
"inactive_reason": "INACTIVE-REASON",
679745
# The following are special pre-defined cases per RFC27
680746
"annotations.sched.t_estimate": "T_ESTIMATE",
681747
"annotations.sched.reason_pending": "REASON",

src/bindings/python/flux/util.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,12 @@ def fsd(secs):
360360
return strtmp
361361

362362

363+
def empty_outputs():
    """
    Return the tuple of formatted strings that count as "empty" output.

    The last entry is the Unix epoch rendered in local time, so it
    depends on the timezone of the running process.
    """
    return (
        "",
        "0s",
        "0.0",
        "0:00:00",
        "1970-01-01T00:00:00",
        datetime.fromtimestamp(0.0).strftime("%FT%T"),
    )
367+
368+
363369
class UtilFormatter(Formatter):
364370
# pylint: disable=too-many-branches
365371

@@ -439,8 +445,7 @@ def format_field(self, value, spec):
439445
# must be deferred to the UtilDatetetime format() method, since
440446
# that method will be called after this one:
441447
if spec.endswith("h") and not isinstance(value, UtilDatetime):
442-
localepoch = datetime.fromtimestamp(0.0).strftime("%FT%T")
443-
basecases = ("", "0s", "0.0", "0:00:00", "1970-01-01T00:00:00", localepoch)
448+
basecases = empty_outputs()
444449
value = "-" if str(value) in basecases else str(value)
445450
spec = spec[:-1] + "s"
446451
retval = super().format_field(value, spec)
@@ -639,7 +644,7 @@ def filter_empty(self, items):
639644

640645
# Iterate over all items, rebuilding lst each time to contain
641646
# only those fields that resulted in non-"empty" strings:
642-
empty = ("", "0", "0s", "0.0", "0:00:00", "1970-01-01T00:00:00")
647+
empty = empty_outputs()
643648
for item in items:
644649
lst = [x for x in lst if formatter.format(x["fmt"], item) in empty]
645650

src/cmd/flux-jobs.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,13 @@ class FluxJobsConfig(UtilConfig):
6060
"{priority:<12} {state:<8.8} {dependencies}"
6161
),
6262
},
63+
"endreason": {
64+
"description": "Show why each job ended",
65+
"format": (
66+
"{id.f58:>12} ?:{queue:<8.8} {username:<8.8} {name:<10.10+} "
67+
"{status_abbrev:>2.2} {t_inactive!d:%b%d %R::>12h} {inactive_reason}"
68+
),
69+
},
6370
}
6471

6572
def __init__(self):

t/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,7 @@ TESTSCRIPTS = \
266266
python/t0025-uri.py \
267267
python/t0026-tree.py \
268268
python/t0027-constraint-parser.py \
269+
python/t0028-compat36.py \
269270
python/t1000-service-add-remove.py
270271

271272
if HAVE_FLUX_SECURITY

t/python/t0028-compat36.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#!/usr/bin/env python3
2+
3+
###############################################################
4+
# Copyright 2021 Lawrence Livermore National Security, LLC
5+
# (c.f. AUTHORS, NOTICE.LLNS, COPYING)
6+
#
7+
# This file is part of the Flux resource manager framework.
8+
# For details, see https://github.com/flux-framework.
9+
#
10+
# SPDX-License-Identifier: LGPL-3.0
11+
###############################################################
12+
13+
import unittest
14+
import subflux # To set up PYTHONPATH
15+
from pycotap import TAPTestRunner
16+
import signal
17+
import flux.compat36
18+
19+
20+
class TestCompat36(unittest.TestCase):
    """Tests for the Python 3.6 compatibility helpers in flux.compat36."""

    def test_strsignal(self):
        # Cover getting all signal strings, from SIGHUP through SIGSYS
        # inclusive (SIGSYS is the last signal compat36.strsignal() checks).
        # N.B. SIGSTKFLT not defined until Python 3.11, will get ValueError
        for signum in range(signal.SIGHUP, signal.SIGSYS + 1):
            try:
                desc = flux.compat36.strsignal(signum)
            except ValueError:
                continue
            # Any signal that is recognized must yield a non-empty string.
            self.assertIsInstance(desc, str)
            self.assertTrue(desc)

        # Sanity check values of most common ones.
        self.assertEqual(flux.compat36.strsignal(signal.SIGHUP), "Hangup")
        self.assertEqual(flux.compat36.strsignal(signal.SIGINT), "Interrupt")
        self.assertEqual(flux.compat36.strsignal(signal.SIGKILL), "Killed")
        self.assertEqual(flux.compat36.strsignal(signal.SIGSEGV), "Segmentation Fault")
        self.assertEqual(flux.compat36.strsignal(signal.SIGTERM), "Terminated")

    def test_strsignal_invalid(self):
        # 0 is not a valid signal number, so the lookup must be rejected.
        with self.assertRaises(ValueError):
            flux.compat36.strsignal(0)
46+
47+
48+
if __name__ == "__main__":
49+
unittest.main(testRunner=TAPTestRunner())

0 commit comments

Comments
 (0)