Skip to content

Commit af619fb

Browse files
authored
Merge pull request #5212 from grondo/issue#5104
cli: support option to signal a job a configurable time before expiration
2 parents 6a138c8 + 3f0b43a commit af619fb

File tree

14 files changed

+697
-83
lines changed

14 files changed

+697
-83
lines changed

doc/man1/common/submit-other-options.rst

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,20 @@ OTHER OPTIONS
4444
is accepted, e.g. ``2021-06-21 8am``, ``in an hour``,
4545
``tomorrow morning``, etc.
4646

47+
**--signal=SIG@TIME**
48+
Send signal ``SIG`` to job ``TIME`` before the job time limit. ``SIG``
49+
can specify either an integer signal number or a full or abbreviated
50+
signal name, e.g. ``SIGUSR1`` or ``USR1`` or ``10``. ``TIME`` is
51+
specified in Flux Standard Duration, e.g. ``30`` for 30s or ``1h`` for
52+
1 hour. Either parameter may be omitted, with defaults of ``SIGUSR1``
53+
and 60s. For example, ``--signal=USR2`` will send ``SIGUSR2`` to
54+
the job 60 seconds before expiration, and ``--signal=@3m`` will send
55+
``SIGUSR1`` 3 minutes before expiration. Note that if ``TIME`` is
56+
greater than the remaining time of a job as it starts, the job will
57+
be signaled immediately.
58+
59+
The default behavior is to not send any warning signal to jobs.
60+
4761
**--taskmap=SCHEME[:VALUE]**
4862
Choose an alternate method for mapping job task IDs to nodes of the
4963
job. The job shell maps tasks using a "block" distribution scheme by

doc/man1/flux-shell.rst

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,19 @@ options are supported by the builtin plugins of ``flux-shell``:
363363
job's nodes if the scope is local, versus only the first node of the
364364
job if the scope is global.
365365

366+
**signal=OPTION**
367+
Deliver signal ``SIGUSR1`` to the job 60s before job expiration.
368+
To customize the signal number or amount of time before expiration to
369+
deliver the signal, the ``signal`` option may be an object with one
370+
or both of the keys ``signum`` and ``timeleft`` (see below).
371+
372+
**signal.signum**\ =\ *NUMBER*
373+
Send signal *NUMBER* to the job ``signal.timeleft`` seconds before
374+
the time limit.
375+
376+
**signal.timeleft**\ =\ *TIME*
377+
Send signal ``signal.signum`` *TIME* seconds before job expiration.
378+
366379
.. warning::
367380
The $FLUX_JOB_TMPDIR is cleaned up when the job ends, is guaranteed to
368381
be unique, and is generally on fast local storage such as a *tmpfs*.

doc/test/spell.en.pws

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -705,3 +705,4 @@ libc
705705
unbuffered
706706
statex
707707
pmix
708+
SIGUSR

src/bindings/python/flux/cli/base.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import os
2121
import re
2222
import resource
23+
import signal
2324
import sys
2425
from collections import ChainMap
2526
from itertools import chain
@@ -36,6 +37,71 @@
3637
LOGGER = logging.getLogger("flux")
3738

3839

40+
def decode_signal(val):
    """
    Decode a signal given as a number or a signal name.

    A name can be of the form 'SIGUSR1' or just 'USR1'. Names are looked
    up in the standard ``signal`` module and are case sensitive.

    Args:
        val: an int (returned unchanged), a string holding an integer,
            or a full or abbreviated signal name.

    Returns:
        The signal number. When decoded from a name, this is the
        corresponding ``signal.Signals`` member (an ``int`` subclass).

    Raises:
        ValueError: if val is neither an integer nor a known signal name.
    """
    if isinstance(val, int):
        return val
    try:
        return int(val)
    except (TypeError, ValueError):
        pass  # Not a number: fall back to signal name lookup
    if isinstance(val, str):
        # Try the name as given first, then with the "SIG" prefix added
        for name in (val, f"SIG{val}"):
            signum = getattr(signal, name, None)
            # The signal module also exports functions, classes and handler
            # constants (e.g. alarm, Signals, SIG_DFL). Only accept real
            # signal numbers.
            if isinstance(signum, signal.Signals):
                return signum
    raise ValueError(f"signal '{val}' is invalid")
60+
61+
62+
def decode_duration(val):
    """
    Decode a duration given as a number or a string in FSD.

    Args:
        val: an int or float, a string holding a plain number, or any
            other string, which is passed to ``util.parse_fsd()``.

    Returns:
        The duration as a number. Numeric input is returned unchanged and
        numeric strings are returned as float; the FSD result is whatever
        ``util.parse_fsd()`` returns (presumably seconds as a float).

    Raises:
        ValueError: if the duration is negative or not a number (NaN), or
            if the value cannot be converted at all.
    """
    if isinstance(val, (float, int)):
        duration = val
    else:
        try:
            duration = float(val)
        except ValueError:
            duration = None
        if duration is None:
            # Not a bare number: fall back to fsd (e.g. "3m" or "1h")
            duration = util.parse_fsd(val)
    # NaN compares false with everything, so this single test rejects both
    # negative values and NaN, which would otherwise slip through float().
    if not duration >= 0:
        raise ValueError(f"duration '{val}' must be a non-negative number")
    return duration
73+
74+
75+
def parse_signal_option(arg):
    """
    Parse the --signal= option argument of the form SIG@TIME, where
    both signal and time are optional.

    SIG defaults to SIGUSR1 and TIME defaults to 60 when they are omitted
    or when arg is None.

    Returns a dict with "signum" and "timeleft" members.

    Raises ValueError, with the message prefixed by the offending
    ``--signal=`` argument, if the signal or the time is invalid.
    """
    signo = signal.SIGUSR1
    tleft = 60
    if arg is not None:
        # Everything before the first "@" is the signal, the rest the time
        sig, _, when = arg.partition("@")
        if when:
            tleft = when
        if sig:
            signo = sig
    try:
        signum = decode_signal(signo)
        # Guard the comparison below against a non-numeric result so the
        # user always gets a ValueError naming the option, not a TypeError
        if not isinstance(signum, int):
            raise ValueError(f"signal '{signo}' is invalid")
        if signum <= 0:
            raise ValueError("signal must be > 0")
    except ValueError as exc:
        raise ValueError(f"--signal={arg}: {exc}") from None

    try:
        timeleft = decode_duration(tleft)
        # "not >= 0" also rejects NaN, which compares false with everything
        if not timeleft >= 0:
            raise ValueError("time must be >= 0")
    except ValueError as exc:
        raise ValueError(f"--signal={arg}: {exc}") from None

    return {"signum": signum, "timeleft": timeleft}
103+
104+
39105
class MiniConstraintParser(ConstraintParser):
40106
operator_map = {
41107
None: "properties",
@@ -714,6 +780,12 @@ def create_parser(
714780
+ "flags: debug, waitable, novalidate",
715781
metavar="FLAGS",
716782
)
783+
parser.add_argument(
784+
"--signal",
785+
help="Schedule delivery of signal SIG at a defined TIME before "
786+
+ "job expiration. Default SIG is SIGUSR1, default TIME is 60s.",
787+
metavar="[SIG][@TIME]",
788+
)
717789
parser.add_argument(
718790
"--dry-run",
719791
action="store_true",
@@ -742,6 +814,9 @@ def jobspec_create(self, args):
742814
rlimits = get_filtered_rlimits(args.rlimit)
743815
if rlimits:
744816
jobspec.setattr_shell_option("rlimit", rlimits)
817+
if args.signal:
818+
entry = parse_signal_option(args.signal)
819+
jobspec.setattr_shell_option("signal", entry)
745820

746821
# --taskmap is only defined for run/submit, but we check
747822
# for it in the base jobspec_create() for convenience

src/cmd/flux-job.c

Lines changed: 3 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
#include <sys/types.h>
1717
#include <sys/stat.h>
1818
#include <fcntl.h>
19-
#include <signal.h>
2019
#include <unistd.h>
2120
#include <time.h>
2221
#include <stdio.h>
@@ -48,6 +47,7 @@
4847
#include "src/common/libutil/fsd.h"
4948
#include "src/common/libutil/fdutils.h"
5049
#include "src/common/libutil/strstrip.h"
50+
#include "src/common/libutil/sigutil.h"
5151
#include "src/common/libidset/idset.h"
5252
#include "src/common/libeventlog/eventlog.h"
5353
#include "src/common/libioencode/ioencode.h"
@@ -1055,83 +1055,6 @@ int cmd_raiseall (optparse_t *p, int argc, char **argv)
10551055
return 0;
10561056
}
10571057

1058-
/*
1059-
* List generated by:
1060-
*
1061-
* $ kill -l | sed 's/[0-9]*)//g' | xargs -n1 printf ' "%s",\n'
1062-
*
1063-
* (ignoring SIGRT*)
1064-
*/
1065-
static const char *sigmap[] = {
1066-
NULL, /* 1 origin */
1067-
"SIGHUP",
1068-
"SIGINT",
1069-
"SIGQUIT",
1070-
"SIGILL",
1071-
"SIGTRAP",
1072-
"SIGABRT",
1073-
"SIGBUS",
1074-
"SIGFPE",
1075-
"SIGKILL",
1076-
"SIGUSR1",
1077-
"SIGSEGV",
1078-
"SIGUSR2",
1079-
"SIGPIPE",
1080-
"SIGALRM",
1081-
"SIGTERM",
1082-
"SIGSTKFLT",
1083-
"SIGCHLD",
1084-
"SIGCONT",
1085-
"SIGSTOP",
1086-
"SIGTSTP",
1087-
"SIGTTIN",
1088-
"SIGTTOU",
1089-
"SIGURG",
1090-
"SIGXCPU",
1091-
"SIGXFSZ",
1092-
"SIGVTALRM",
1093-
"SIGPROF",
1094-
"SIGWINCH",
1095-
"SIGIO",
1096-
"SIGPWR",
1097-
"SIGSYS",
1098-
NULL,
1099-
};
1100-
1101-
static bool isnumber (const char *s, int *result)
1102-
{
1103-
char *endptr;
1104-
long int l;
1105-
1106-
errno = 0;
1107-
l = strtol (s, &endptr, 10);
1108-
if (errno
1109-
|| *endptr != '\0'
1110-
|| l <= 0) {
1111-
return false;
1112-
}
1113-
*result = (int) l;
1114-
return true;
1115-
}
1116-
1117-
static int str2signum (const char *sigstr)
1118-
{
1119-
int i;
1120-
if (isnumber (sigstr, &i)) {
1121-
if (i <= 0)
1122-
return -1;
1123-
return i;
1124-
}
1125-
i = 1;
1126-
while (sigmap[i] != NULL) {
1127-
if (streq (sigstr, sigmap[i]) ||
1128-
streq (sigstr, sigmap[i]+3))
1129-
return i;
1130-
i++;
1131-
}
1132-
return -1;
1133-
}
1134-
11351058
int cmd_kill (optparse_t *p, int argc, char **argv)
11361059
{
11371060
flux_t *h;
@@ -1151,7 +1074,7 @@ int cmd_kill (optparse_t *p, int argc, char **argv)
11511074
parse_jobids_and_note (p, argv + optindex, &args, NULL);
11521075

11531076
s = optparse_get_str (p, "signal", "SIGTERM");
1154-
if ((signum = str2signum (s))< 0)
1077+
if ((signum = sigutil_signum (s)) < 0)
11551078
log_msg_exit ("kill: Invalid signal %s", s);
11561079

11571080
if (!(h = flux_open (NULL, 0)))
@@ -1192,7 +1115,7 @@ int cmd_killall (optparse_t *p, int argc, char **argv)
11921115
exit (1);
11931116
}
11941117
s = optparse_get_str (p, "signal", "SIGTERM");
1195-
if ((signum = str2signum (s))< 0)
1118+
if ((signum = sigutil_signum (s)) < 0)
11961119
log_msg_exit ("killall: Invalid signal %s", s);
11971120
if (optparse_hasopt (p, "user"))
11981121
userid = parse_arg_userid (p, "user");

src/common/libutil/Makefile.am

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,9 @@ libutil_la_SOURCES = \
100100
strstrip.c \
101101
strstrip.h \
102102
basemoji.h \
103-
basemoji.c
103+
basemoji.c \
104+
sigutil.h \
105+
sigutil.c
104106

105107
EXTRA_DIST = veb_mach.c
106108

@@ -135,7 +137,8 @@ TESTS = test_sha1.t \
135137
test_slice.t \
136138
test_timestamp.t \
137139
test_environment.t \
138-
test_basemoji.t
140+
test_basemoji.t \
141+
test_sigutil.t
139142

140143
test_ldadd = \
141144
$(top_builddir)/src/common/libutil/libutil.la \
@@ -290,3 +293,7 @@ test_environment_t_LDADD = $(test_ldadd)
290293
test_basemoji_t_SOURCES = test/basemoji.c
291294
test_basemoji_t_CPPFLAGS = $(test_cppflags)
292295
test_basemoji_t_LDADD = $(test_ldadd)
296+
297+
test_sigutil_t_SOURCES = test/sigutil.c
298+
test_sigutil_t_CPPFLAGS = $(test_cppflags)
299+
test_sigutil_t_LDADD = $(test_ldadd)

0 commit comments

Comments
 (0)