Skip to content

Commit 26e39f2

Browse files
authored
Merge pull request #6666 from garlick/sdexec_notify
sdexec: add stop timeout to handle unkillable processes
2 parents f0eb393 + a84adc0 commit 26e39f2

File tree

12 files changed

+315
-38
lines changed

12 files changed

+315
-38
lines changed

configure.ac

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -433,7 +433,7 @@ AS_IF([test x$enable_code_coverage = xyes], [
433433
AC_ARG_WITH([flux-security], AS_HELP_STRING([--with-flux-security],
434434
[Build with flux-security]))
435435
AS_IF([test "x$with_flux_security" = "xyes"], [
436-
PKG_CHECK_MODULES([FLUX_SECURITY], [flux-security >= 0.13.0],
436+
PKG_CHECK_MODULES([FLUX_SECURITY], [flux-security >= 0.14.0],
437437
[flux_sec_incdir=`$PKG_CONFIG --variable=includedir flux-security`])
438438
AS_IF([test "x$flux_sec_incdir" = x],
439439
[AC_MSG_ERROR([couldn't find flux-security include directory])])

debian/control

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ Maintainer: Mark A. Grondona <[email protected]>
55
Standards-Version: 4.1.2
66
Build-Depends:
77
debhelper (>= 10),
8-
flux-security (>= 0.13.0),
8+
flux-security (>= 0.14.0),
99
libsystemd-dev (>= 0.23),
1010
libarchive-dev,
1111
uuid-dev,

doc/man5/flux-config-exec.rst

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,21 @@ sdexec-properties
3838
(optional) A table of systemd properties to set for all jobs. All values
3939
must be strings. See :ref:`sdexec_properties` below.
4040

41+
sdexec-stop-timer-sec
42+
(optional) Configure the length of time in seconds after a unit enters
43+
deactivating state when it will be sent the sdexec-stop-timer-signal.
44+
Deactivating state is entered by ``imp-shell`` units when the
45+
:man1:`flux-shell` terminates. The unit may remain there as long as
46+
user processes remain in the unit's cgroup.
47+
48+
After the same length of time, if the unit hasn't terminated, for example
49+
due to unkillable processes, the unit is abandoned and the node is drained.
50+
Default 30.
51+
52+
sdexec-stop-timer-signal
53+
(optional) Configure the signal used by the stop timer. By default,
54+
10 (SIGUSR1, the IMP proxy for SIGKILL) is used.
55+
4156
kill-timeout
4257
(optional) The amount of time in FSD to wait after ``SIGTERM`` is
4358
sent to a job before sending ``SIGKILL``. The default is "5s". See

src/common/libsdexec/start.c

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,21 @@ static int prop_add (json_t *prop, const char *name, const char *val)
285285
if (prop_add_bool (prop, name, value) < 0)
286286
return -1;
287287
}
288+
else if (streq (name, "TimeoutStopUSec")) {
289+
if (streq (val, "infinity")) {
290+
if (prop_add_u64 (prop, name, UINT64_MAX) < 0)
291+
return -1;
292+
}
293+
else {
294+
errno = 0;
295+
char *endptr;
296+
uint64_t u = strtoull (val, &endptr, 10);
297+
if (errno != 0 || *endptr != '\0')
298+
return -1;
299+
if (prop_add_u64 (prop, name, u) < 0)
300+
return -1;
301+
}
302+
}
288303
else {
289304
if (prop_add_string (prop, name, val) < 0)
290305
return -1;
@@ -293,7 +308,6 @@ static int prop_add (json_t *prop, const char *name, const char *val)
293308
}
294309

295310
static json_t *prop_create (json_t *cmd,
296-
const char *type,
297311
int stdin_fd,
298312
int stdout_fd,
299313
int stderr_fd,
@@ -307,6 +321,7 @@ static json_t *prop_create (json_t *cmd,
307321
json_t *opts;
308322
const char *key;
309323
json_t *val;
324+
bool type_is_set = false;
310325

311326
// Not unpacked: channels
312327
if (json_unpack_ex (cmd,
@@ -328,7 +343,6 @@ static json_t *prop_create (json_t *cmd,
328343
return NULL;
329344
}
330345
if (prop_add_execstart (prop, "ExecStart", cmdline) < 0
331-
|| prop_add_string (prop, "Type", type) < 0
332346
|| prop_add_string (prop, "WorkingDirectory", cwd) < 0
333347
|| prop_add_bool (prop, "RemainAfterExit", true) < 0
334348
|| prop_add_env (prop, "Environment", env) < 0
@@ -345,6 +359,14 @@ static json_t *prop_create (json_t *cmd,
345359
errprintf (error, "%s: error setting property", key);
346360
goto error;
347361
}
362+
if (streq (key + 12, "Type"))
363+
type_is_set = true;
364+
}
365+
}
366+
if (!type_is_set) {
367+
if (prop_add_string (prop, "Type", "exec") < 0) {
368+
errprintf (error, "error setting property Type=simple");
369+
goto error;
348370
}
349371
}
350372
return prop;
@@ -357,7 +379,6 @@ static json_t *prop_create (json_t *cmd,
357379
flux_future_t *sdexec_start_transient_unit (flux_t *h,
358380
uint32_t rank,
359381
const char *mode,
360-
const char *type,
361382
json_t *cmd,
362383
int stdin_fd,
363384
int stdout_fd,
@@ -368,13 +389,12 @@ flux_future_t *sdexec_start_transient_unit (flux_t *h,
368389
json_t *prop = NULL;
369390
const char *name;
370391

371-
if (!h || !mode || !type || !cmd) {
392+
if (!h || !mode || !cmd) {
372393
errprintf (error, "invalid argument");
373394
errno = EINVAL;
374395
return NULL;
375396
}
376397
if (!(prop = prop_create (cmd,
377-
type,
378398
stdin_fd,
379399
stdout_fd,
380400
stderr_fd,

src/common/libsdexec/start.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,6 @@
2121
* and systemd.service(5) for more info on mode and type parameters.
2222
* mode may be one of:
2323
replace, fail, isolate, ignore-dependencies, ignore-requirements
24-
* type may be one of:
25-
* simple, exec, forking, oneshot, dbus, notify, notify-reload, idle
2624
*
2725
* stdin_fd, stdout_fd, and stderr_fd are file descriptors to be duped and
2826
* passed to the new unit. The dup should be complete on first fulfillment
@@ -50,7 +48,6 @@
5048
flux_future_t *sdexec_start_transient_unit (flux_t *h,
5149
uint32_t rank,
5250
const char *mode,
53-
const char *type,
5451
json_t *cmd,
5552
int stdin_fd,
5653
int stdout_fd,

src/common/libsdexec/test/start.c

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,6 @@ void test_inval (void)
5757
ok (sdexec_start_transient_unit (NULL, // h
5858
0, // rank
5959
"fail", // mode
60-
"simple", // type
6160
cmd_o, // cmd
6261
-1, -1, -1, // *_fd
6362
&error) == NULL
@@ -70,7 +69,6 @@ void test_inval (void)
7069
ok (sdexec_start_transient_unit (h, // h
7170
0, // rank
7271
NULL, // mode
73-
"simple", // type
7472
cmd_o, // cmd
7573
-1, -1, -1, // *_fd
7674
&error) == NULL
@@ -83,20 +81,6 @@ void test_inval (void)
8381
ok (sdexec_start_transient_unit (h, // h
8482
0, // rank
8583
"fail", // mode
86-
NULL, // type
87-
cmd_o, // cmd
88-
-1, -1, -1, // *_fd
89-
&error) == NULL
90-
&& errno == EINVAL,
91-
"sdexec_start_transient_unit type=NULL fails with EINVAL");
92-
diag ("%s", error.text);
93-
94-
errno = 0;
95-
error.text[0] = '\0';
96-
ok (sdexec_start_transient_unit (h, // h
97-
0, // rank
98-
"fail", // mode
99-
"simple", // type
10084
NULL, // cmd
10185
-1, -1, -1, // *_fd
10286
&error) == NULL
@@ -109,7 +93,6 @@ void test_inval (void)
10993
ok (sdexec_start_transient_unit (h, // h
11094
0, // rank
11195
"fail", // mode
112-
"simple", // type
11396
cmd_o_noname, // cmd
11497
-1, -1, -1, // *_fd
11598
&error) == NULL
@@ -122,7 +105,6 @@ void test_inval (void)
122105
ok (sdexec_start_transient_unit (h, // h
123106
0, // rank
124107
"fail", // mode
125-
"simple", // type
126108
cmd_o_badprop, // cmd
127109
-1, -1, -1, // *_fd
128110
&error) == NULL

src/modules/job-exec/exec.c

Lines changed: 54 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,42 @@ static void error_cb (struct bulk_exec *exec, flux_subprocess_t *p, void *arg)
347347
* create flux_cmd_t
348348
*/
349349
if (cmd) {
350-
if (errnum == EHOSTUNREACH) {
350+
if (errnum == EDEADLK) {
351+
/* EDEADLK from sdexec means that unkillable processes were left
352+
* on the node and it must be drained. A "finished" response
353+
* will not have been received, so after draining, treat this
354+
* like EHOSTUNREACH.
355+
*/
356+
char ranks[16];
357+
snprintf (ranks, sizeof (ranks), "%d", rank);
358+
(void) jobinfo_drain_ranks (job,
359+
ranks,
360+
"unkillable processes from job %s",
361+
idf58 (job->id));
362+
bool critical = is_critical_rank (job, shell_rank);
363+
364+
/* Always notify rank 0 shell of a lost shell.
365+
*/
366+
lost_shell (job,
367+
critical,
368+
shell_rank,
369+
"shell exited with unkillable processes"
370+
" on %s (shell rank %d)",
371+
hostname,
372+
shell_rank);
373+
374+
/* Raise a fatal error and terminate job immediately if
375+
* the lost shell was critical.
376+
*/
377+
if (critical)
378+
jobinfo_fatal_error (job,
379+
0,
380+
"shell exited with unkillable processes"
381+
" on %s (rank %d)",
382+
hostname,
383+
rank);
384+
}
385+
else if (errnum == EHOSTUNREACH) {
351386
bool critical = is_critical_rank (job, shell_rank);
352387

353388
/* Always notify rank 0 shell of a lost shell.
@@ -519,6 +554,7 @@ static int parse_service_option (json_t *jobspec,
519554
return 0;
520555
}
521556

557+
522558
static struct bulk_exec_ops exec_ops = {
523559
.on_start = start_cb,
524560
.on_exit = exit_cb,
@@ -604,14 +640,27 @@ static int exec_init (struct jobinfo *job)
604640
goto err;
605641
}
606642
/* The systemd user instance running as user flux is not privileged
607-
* to signal guest processes, therefore only signal the IMP and
608-
* never use SIGKILL. See flux-framework/flux-core#6399
643+
* to signal guest processes, therefore:
644+
* - Set the KillMode=process so only the IMP is signaled
645+
* - Use Type=notify in conjunction with IMP calling sd_notify(3) so
646+
* the unit transitions to deactivating when the shell exits.
647+
* - Set TimeoutStopUsec=infinity to disable systemd's stop timeout.
648+
* - Enable sdexec's stop timeout which is armed at deactivating,
649+
* delivers SIGUSR1 (proxy for SIGKILL) after 30s, then abandons
650+
* the unit and terminates the exec RPC after another 30s.
609651
*/
610652
if (streq (service, "sdexec")) {
611653
if (flux_cmd_setopt (cmd, "SDEXEC_PROP_KillMode", "process") < 0
654+
|| flux_cmd_setopt (cmd, "SDEXEC_PROP_Type", "notify") < 0
655+
|| flux_cmd_setopt (cmd,
656+
"SDEXEC_PROP_TimeoutStopUSec",
657+
"infinity") < 0
658+
|| flux_cmd_setopt (cmd,
659+
"SDEXEC_STOP_TIMER_SIGNAL",
660+
config_get_sdexec_stop_timer_signal ()) < 0
612661
|| flux_cmd_setopt (cmd,
613-
"SDEXEC_PROP_SendSIGKILL",
614-
"off") < 0) {
662+
"SDEXEC_STOP_TIMER_SEC",
663+
config_get_sdexec_stop_timer_sec ()) < 0) {
615664
flux_log_error (job->h,
616665
"Unable to set multiuser sdexec options");
617666
return -1;

src/modules/job-exec/exec_config.c

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ struct exec_config {
3636
const char *exec_service;
3737
int exec_service_override;
3838
json_t *sdexec_properties;
39+
int sdexec_stop_timer_sec;
40+
int sdexec_stop_timer_signal;
3941
double default_barrier_timeout;
4042
};
4143

@@ -107,6 +109,20 @@ json_t *config_get_sdexec_properties (void)
107109
return exec_conf.sdexec_properties;
108110
}
109111

112+
const char *config_get_sdexec_stop_timer_sec (void)
113+
{
114+
static char buf[32];
115+
snprintf (buf, sizeof (buf), "%d", exec_conf.sdexec_stop_timer_sec);
116+
return buf;
117+
}
118+
119+
const char *config_get_sdexec_stop_timer_signal (void)
120+
{
121+
static char buf[32];
122+
snprintf (buf, sizeof (buf), "%d", exec_conf.sdexec_stop_timer_signal);
123+
return buf;
124+
}
125+
110126
double config_get_default_barrier_timeout (void)
111127
{
112128
return exec_conf.default_barrier_timeout;
@@ -116,15 +132,19 @@ int config_get_stats (json_t **config_stats)
116132
{
117133
json_t *o = NULL;
118134

119-
if (!(o = json_pack ("{s:s? s:s? s:s? s:s? s:i s:f}",
135+
if (!(o = json_pack ("{s:s? s:s? s:s? s:s? s:i s:f s:i s:i}",
120136
"default_cwd", default_cwd,
121137
"default_job_shell", exec_conf.default_job_shell,
122138
"flux_imp_path", exec_conf.flux_imp_path,
123139
"exec_service", exec_conf.exec_service,
124140
"exec_service_override",
125141
exec_conf.exec_service_override,
126142
"default_barrier_timeout",
127-
exec_conf.default_barrier_timeout))) {
143+
exec_conf.default_barrier_timeout,
144+
"sdexec_stop_timer_sec",
145+
exec_conf.sdexec_stop_timer_sec,
146+
"sdexec_stop_timer_signal",
147+
exec_conf.sdexec_stop_timer_signal))) {
128148
errno = ENOMEM;
129149
return -1;
130150
}
@@ -153,6 +173,8 @@ static void exec_config_init (struct exec_config *ec)
153173
ec->exec_service = "rexec";
154174
ec->exec_service_override = 0;
155175
ec->sdexec_properties = NULL;
176+
ec->sdexec_stop_timer_sec = 30;
177+
ec->sdexec_stop_timer_signal = 10; // SIGUSR1
156178
ec->default_barrier_timeout = 1800.;
157179
}
158180

@@ -249,6 +271,22 @@ int config_setup (flux_t *h,
249271
}
250272
}
251273

274+
/* Check configuration for exec.stop-timer-* */
275+
if (flux_conf_unpack (conf,
276+
&err,
277+
"{s?{s?i s?i}}",
278+
"exec",
279+
"sdexec-stop-timer-sec",
280+
&tmpconf.sdexec_stop_timer_sec,
281+
"sdexec-stop-timer-signal",
282+
&tmpconf.sdexec_stop_timer_signal) < 0) {
283+
errprintf (errp,
284+
"error reading config values exec.sdexec-stop-timer-sec: %s"
285+
" or exec.sdexec-stop-timer-signal",
286+
err.text);
287+
return -1;
288+
}
289+
252290
/* Check configuration for exec.barrier-timeout */
253291
if (flux_conf_unpack (conf,
254292
&err,

src/modules/job-exec/exec_config.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ double config_get_default_barrier_timeout (void);
3838

3939
int config_get_stats (json_t **config_stats);
4040

41+
const char *config_get_sdexec_stop_timer_sec (void);
42+
43+
const char *config_get_sdexec_stop_timer_signal (void);
44+
4145
int config_setup (flux_t *h,
4246
const flux_conf_t *conf,
4347
int argc,

0 commit comments

Comments
 (0)