Skip to content

Commit 058387d

Browse files
committed
job-exec: use Type=notify and timer with sdexec
Problem: jobs remain in R state when the flux-shell exits with unkillable processes. Run imp-shell units with Type=notify and the new sdexec stop timer. Disable systemd's stop timer by setting TimeoutStopUSec=infinity. This assumes the IMP has been modified to call sd_notify(3) at appropriate transitions. The stop timer, which is enabled by default with a timeout of 30s and signal of SIGUSR1, may be configured or disabled via the TOML [exec] table. Fixes #6656
1 parent 4cea52f commit 058387d

File tree

3 files changed

+62
-6
lines changed

3 files changed

+62
-6
lines changed

src/modules/job-exec/exec.c

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -519,6 +519,7 @@ static int parse_service_option (json_t *jobspec,
519519
return 0;
520520
}
521521

522+
522523
static struct bulk_exec_ops exec_ops = {
523524
.on_start = start_cb,
524525
.on_exit = exit_cb,
@@ -604,14 +605,27 @@ static int exec_init (struct jobinfo *job)
604605
goto err;
605606
}
606607
/* The systemd user instance running as user flux is not privileged
607-
* to signal guest processes, therefore only signal the IMP and
608-
* never use SIGKILL. See flux-framework/flux-core#6399
608+
* to signal guest processes, therefore:
609+
* - Set the KillMode=process so only the IMP is signaled
610+
* - Use Type=notify in conjunction with IMP calling sd_notify(3) so
611+
* the unit transitions to deactivating when the shell exits.
612+
* - Set TimeoutStopUSec=infinity to disable systemd's stop timeout.
613+
* - Enable sdexec's stop timeout which is armed at deactivating,
614+
* delivers SIGUSR1 (proxy for SIGKILL) after 30s, then abandons
615+
* the unit and terminates the exec RPC after another 30s.
609616
*/
610617
if (streq (service, "sdexec")) {
611618
if (flux_cmd_setopt (cmd, "SDEXEC_PROP_KillMode", "process") < 0
619+
|| flux_cmd_setopt (cmd, "SDEXEC_PROP_Type", "notify") < 0
620+
|| flux_cmd_setopt (cmd,
621+
"SDEXEC_PROP_TimeoutStopUSec",
622+
"infinity") < 0
623+
|| flux_cmd_setopt (cmd,
624+
"SDEXEC_STOP_TIMER_SIGNAL",
625+
config_get_sdexec_stop_timer_signal ()) < 0
612626
|| flux_cmd_setopt (cmd,
613-
"SDEXEC_PROP_SendSIGKILL",
614-
"off") < 0) {
627+
"SDEXEC_STOP_TIMER_SEC",
628+
config_get_sdexec_stop_timer_sec ()) < 0) {
615629
flux_log_error (job->h,
616630
"Unable to set multiuser sdexec options");
617631
return -1;

src/modules/job-exec/exec_config.c

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ struct exec_config {
3636
const char *exec_service;
3737
int exec_service_override;
3838
json_t *sdexec_properties;
39+
int sdexec_stop_timer_sec;
40+
int sdexec_stop_timer_signal;
3941
double default_barrier_timeout;
4042
};
4143

@@ -107,6 +109,20 @@ json_t *config_get_sdexec_properties (void)
107109
return exec_conf.sdexec_properties;
108110
}
109111

112+
/* Return the configured sdexec stop timer timeout
 * (exec_conf.sdexec_stop_timer_sec) formatted as a decimal string.
 * The result is stored in a function-local static buffer: it is
 * overwritten by the next call to this function and must not be freed.
 */
const char *config_get_sdexec_stop_timer_sec (void)
113+
{
114+
static char buf[32];
115+
snprintf (buf, sizeof (buf), "%d", exec_conf.sdexec_stop_timer_sec);
116+
return buf;
117+
}
118+
119+
/* Return the configured sdexec stop timer signal number
 * (exec_conf.sdexec_stop_timer_signal) formatted as a decimal string.
 * The result is stored in a function-local static buffer: it is
 * overwritten by the next call to this function and must not be freed.
 */
const char *config_get_sdexec_stop_timer_signal (void)
120+
{
121+
static char buf[32];
122+
snprintf (buf, sizeof (buf), "%d", exec_conf.sdexec_stop_timer_signal);
123+
return buf;
124+
}
125+
110126
double config_get_default_barrier_timeout (void)
111127
{
112128
return exec_conf.default_barrier_timeout;
@@ -116,15 +132,19 @@ int config_get_stats (json_t **config_stats)
116132
{
117133
json_t *o = NULL;
118134

119-
if (!(o = json_pack ("{s:s? s:s? s:s? s:s? s:i s:f}",
135+
if (!(o = json_pack ("{s:s? s:s? s:s? s:s? s:i s:f s:i s:i}",
120136
"default_cwd", default_cwd,
121137
"default_job_shell", exec_conf.default_job_shell,
122138
"flux_imp_path", exec_conf.flux_imp_path,
123139
"exec_service", exec_conf.exec_service,
124140
"exec_service_override",
125141
exec_conf.exec_service_override,
126142
"default_barrier_timeout",
127-
exec_conf.default_barrier_timeout))) {
143+
exec_conf.default_barrier_timeout,
144+
"sdexec_stop_timer_sec",
145+
exec_conf.sdexec_stop_timer_sec,
146+
"sdexec_stop_timer_signal",
147+
exec_conf.sdexec_stop_timer_signal))) {
128148
errno = ENOMEM;
129149
return -1;
130150
}
@@ -153,6 +173,8 @@ static void exec_config_init (struct exec_config *ec)
153173
ec->exec_service = "rexec";
154174
ec->exec_service_override = 0;
155175
ec->sdexec_properties = NULL;
176+
ec->sdexec_stop_timer_sec = 30;
177+
ec->sdexec_stop_timer_signal = 10; // SIGUSR1
156178
ec->default_barrier_timeout = 1800.;
157179
}
158180

@@ -249,6 +271,22 @@ int config_setup (flux_t *h,
249271
}
250272
}
251273

274+
/* Check configuration for exec.sdexec-stop-timer-* */
275+
if (flux_conf_unpack (conf,
276+
&err,
277+
"{s?{s?i s?i}}",
278+
"exec",
279+
"sdexec-stop-timer-sec",
280+
&tmpconf.sdexec_stop_timer_sec,
281+
"sdexec-stop-timer-signal",
282+
&tmpconf.sdexec_stop_timer_signal) < 0) {
283+
errprintf (errp,
284+
"error reading config values exec.sdexec-stop-timer-sec: %s"
285+
" or exec.sdexec-stop-timer-signal",
286+
err.text);
287+
return -1;
288+
}
289+
252290
/* Check configuration for exec.barrier-timeout */
253291
if (flux_conf_unpack (conf,
254292
&err,

src/modules/job-exec/exec_config.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ double config_get_default_barrier_timeout (void);
3838

3939
int config_get_stats (json_t **config_stats);
4040

41+
const char *config_get_sdexec_stop_timer_sec (void);
42+
43+
const char *config_get_sdexec_stop_timer_signal (void);
44+
4145
int config_setup (flux_t *h,
4246
const flux_conf_t *conf,
4347
int argc,

0 commit comments

Comments
 (0)