Skip to content

Commit c21c959

Browse files
authored
Merge pull request #4070 from chu11/job_exec_sdexec
job-exec: support new sdexec job launch plugin
2 parents 8b0157f + 4b3834f commit c21c959

File tree

15 files changed

+1447
-90
lines changed

15 files changed

+1447
-90
lines changed

doc/man5/flux-config-exec.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,16 @@ imp
2222
the credentials of the guest user that submitted them. If unset, only
2323
jobs submitted by the instance owner may be executed.
2424

25+
method
26+
(optional) Run job shells using a specific mechanism other than the default
27+
forked subprocesses. Potential configurations:
28+
29+
systemd
30+
31+
Job shells are run under systemd. The job shell may be able to
32+
survive an unexpected broker shutdown and be recovered when the
33+
broker is restarted.
34+
2535
job-shell
2636
(optional) Override the compiled-in default job shell path.
2737

etc/rc1

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,14 @@ if test $RANK -eq 0 -a "${FLUX_SCHED_MODULE}" != "none" \
8989
flux module load ${FLUX_SCHED_MODULE:-sched-simple}
9090
fi
9191

92-
if test $RANK -eq 0 -a -z "${FLUX_DISABLE_JOB_CLEANUP}"; then
92+
if test $RANK -eq 0; then
93+
method=$(flux config get --default=notset exec.method)
94+
if test "${method}" != "systemd" \
95+
-a -z "${FLUX_DISABLE_JOB_CLEANUP}"; then
9396
flux admin cleanup-push <<-EOT
9497
flux queue stop --quiet
9598
flux job cancelall --user=all --quiet -f --states RUN
9699
flux queue idle --quiet
97100
EOT
101+
fi
98102
fi

src/modules/job-exec/Makefile.am

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,17 @@ job_exec_la_SOURCES = \
3030
job-exec.c \
3131
checkpoint.h \
3232
checkpoint.c \
33+
exec_config.h \
34+
exec_config.c \
3335
rset.c \
3436
rset.h \
3537
testexec.c \
3638
exec.c
3739

40+
if HAVE_LIBSYSTEMD
41+
job_exec_la_SOURCES += sdexec.c
42+
endif
43+
3844
job_exec_la_LDFLAGS = \
3945
$(fluxmod_ldflags) \
4046
-module
@@ -46,6 +52,10 @@ job_exec_la_LIBADD = \
4652
$(JANSSON_LIBS) \
4753
libbulk-exec.la
4854

55+
if HAVE_LIBSYSTEMD
56+
job_exec_la_LIBADD += $(top_builddir)/src/common/libsdprocess/libsdprocess.la
57+
endif
58+
4959
bulk_exec_SOURCES = \
5060
test/bulk-exec.c
5161

src/modules/job-exec/exec.c

Lines changed: 7 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,11 @@
3333
#include <unistd.h>
3434

3535
#include "job-exec.h"
36+
#include "exec_config.h"
3637
#include "bulk-exec.h"
3738
#include "rset.h"
3839

3940
extern char **environ;
40-
static const char *default_cwd = "/tmp";
41-
static const char *default_job_shell = NULL;
42-
static const char *flux_imp_path = NULL;
4341

4442
struct exec_ctx {
4543
const char * mock_exception; /* fake exception */
@@ -74,40 +72,6 @@ static const char * exec_mock_exception (struct bulk_exec *exec)
7472
return ctx->mock_exception;
7573
}
7674

77-
static const char *jobspec_get_job_shell (json_t *jobspec)
78-
{
79-
const char *path = NULL;
80-
(void) json_unpack (jobspec, "{s:{s:{s:{s:s}}}}",
81-
"attributes", "system", "exec",
82-
"job_shell", &path);
83-
return path;
84-
}
85-
86-
static const char *job_shell_path (struct jobinfo *job)
87-
{
88-
const char *path = jobspec_get_job_shell (job->jobspec);
89-
return path ? path : default_job_shell;
90-
}
91-
92-
static const char *jobspec_get_cwd (json_t *jobspec)
93-
{
94-
const char *cwd = NULL;
95-
(void) json_unpack (jobspec, "{s:{s:{s:s}}}",
96-
"attributes", "system",
97-
"cwd", &cwd);
98-
return cwd;
99-
}
100-
101-
static const char *job_get_cwd (struct jobinfo *job)
102-
{
103-
const char *cwd;
104-
if (job->multiuser)
105-
cwd = "/";
106-
else if (!(cwd = jobspec_get_cwd (job->jobspec)))
107-
cwd = default_cwd;
108-
return (cwd);
109-
}
110-
11175
static void start_cb (struct bulk_exec *exec, void *arg)
11276
{
11377
struct jobinfo *job = arg;
@@ -300,7 +264,7 @@ static int exec_init (struct jobinfo *job)
300264
struct bulk_exec *exec = NULL;
301265
const struct idset *ranks = NULL;
302266

303-
if (job->multiuser && !flux_imp_path) {
267+
if (job->multiuser && !config_get_imp_path ()) {
304268
flux_log (job->h,
305269
LOG_ERR,
306270
"unable run multiuser job with no IMP configured!");
@@ -333,18 +297,18 @@ static int exec_init (struct jobinfo *job)
333297
goto err;
334298
}
335299
if (job->multiuser) {
336-
if (flux_cmd_argv_append (cmd, flux_imp_path) < 0
300+
if (flux_cmd_argv_append (cmd, config_get_imp_path ()) < 0
337301
|| flux_cmd_argv_append (cmd, "exec") < 0) {
338302
flux_log_error (job->h, "exec_init: flux_cmd_argv_append");
339303
goto err;
340304
}
341305
}
342-
if (flux_cmd_argv_append (cmd, job_shell_path (job)) < 0
306+
if (flux_cmd_argv_append (cmd, config_get_job_shell (job)) < 0
343307
|| flux_cmd_argv_appendf (cmd, "%ju", (uintmax_t) job->id) < 0) {
344308
flux_log_error (job->h, "exec_init: flux_cmd_argv_append");
345309
goto err;
346310
}
347-
if (flux_cmd_setcwd (cmd, job_get_cwd (job)) < 0) {
311+
if (flux_cmd_setcwd (cmd, config_get_cwd (job)) < 0) {
348312
flux_log_error (job->h, "exec_init: flux_cmd_setcwd");
349313
goto err;
350314
}
@@ -431,7 +395,7 @@ static int exec_kill (struct jobinfo *job, int signum)
431395
flux_future_t *f;
432396

433397
if (job->multiuser)
434-
f = bulk_exec_imp_kill (exec, flux_imp_path, signum);
398+
f = bulk_exec_imp_kill (exec, config_get_imp_path (), signum);
435399
else
436400
f = bulk_exec_kill (exec, signum);
437401
if (!f) {
@@ -467,54 +431,9 @@ static void exec_exit (struct jobinfo *job)
467431
job->data = NULL;
468432
}
469433

470-
/* Configure the exec module.
471-
* Read the default job shell path from config. Allow override on cmdline
472-
*/
473434
static int exec_config (flux_t *h, int argc, char **argv)
474435
{
475-
flux_error_t err;
476-
477-
/* Set default job shell path from builtin configuration,
478-
* allow override via configuration, then cmdline.
479-
*/
480-
default_job_shell = flux_conf_builtin_get ("shell_path", FLUX_CONF_AUTO);
481-
482-
483-
/* Check configuration for exec.job-shell */
484-
if (flux_conf_unpack (flux_get_conf (h),
485-
&err,
486-
"{s?:{s?s}}",
487-
"exec",
488-
"job-shell", &default_job_shell) < 0) {
489-
flux_log (h, LOG_ERR,
490-
"error reading config value exec.job-shell: %s",
491-
err.text);
492-
return -1;
493-
}
494-
495-
/* Check configuration for exec.imp */
496-
if (flux_conf_unpack (flux_get_conf (h),
497-
&err,
498-
"{s?:{s?s}}",
499-
"exec",
500-
"imp", &flux_imp_path) < 0) {
501-
flux_log (h, LOG_ERR,
502-
"error reading config value exec.imp: %s",
503-
err.text);
504-
return -1;
505-
}
506-
507-
/* Finally, override values on cmdline */
508-
for (int i = 0; i < argc; i++) {
509-
if (strncmp (argv[i], "job-shell=", 10) == 0)
510-
default_job_shell = argv[i]+10;
511-
else if (strncmp (argv[i], "imp=", 4) == 0)
512-
flux_imp_path = argv[i]+4;
513-
}
514-
flux_log (h, LOG_DEBUG, "using default shell path %s", default_job_shell);
515-
if (flux_imp_path)
516-
flux_log (h, LOG_DEBUG, "using imp path %s", flux_imp_path);
517-
return 0;
436+
return config_init (h, argc, argv);
518437
}
519438

520439
struct exec_implementation bulkexec = {

src/modules/job-exec/exec_config.c

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
/************************************************************\
2+
* Copyright 2021 Lawrence Livermore National Security, LLC
3+
* (c.f. AUTHORS, NOTICE.LLNS, COPYING)
4+
*
5+
* This file is part of the Flux resource manager framework.
6+
* For details, see https://github.com/flux-framework.
7+
*
8+
* SPDX-License-Identifier: LGPL-3.0
9+
\************************************************************/
10+
11+
/* Flux job-exec configuration common code */
12+
13+
#if HAVE_CONFIG_H
14+
# include "config.h"
15+
#endif
16+
17+
#include <jansson.h>
18+
#include <unistd.h>
19+
20+
#include "exec_config.h"
21+
22+
static const char *default_cwd = "/tmp";
23+
static const char *default_job_shell = NULL;
24+
static const char *flux_imp_path = NULL;
25+
26+
static const char *jobspec_get_job_shell (json_t *jobspec)
27+
{
28+
const char *path = NULL;
29+
if (jobspec)
30+
(void) json_unpack (jobspec, "{s:{s:{s:{s:s}}}}",
31+
"attributes", "system", "exec",
32+
"job_shell", &path);
33+
return path;
34+
}
35+
36+
const char *config_get_job_shell (struct jobinfo *job)
37+
{
38+
const char *path = NULL;
39+
if (job)
40+
path = jobspec_get_job_shell (job->jobspec);
41+
return path ? path : default_job_shell;
42+
}
43+
44+
static const char *jobspec_get_cwd (json_t *jobspec)
45+
{
46+
const char *cwd = NULL;
47+
if (jobspec)
48+
(void) json_unpack (jobspec, "{s:{s:{s:s}}}",
49+
"attributes", "system",
50+
"cwd", &cwd);
51+
return cwd;
52+
}
53+
54+
const char *config_get_cwd (struct jobinfo *job)
55+
{
56+
const char *cwd = NULL;
57+
if (job) {
58+
if (job->multiuser)
59+
cwd = "/";
60+
else if (!(cwd = jobspec_get_cwd (job->jobspec)))
61+
cwd = default_cwd;
62+
}
63+
return cwd;
64+
}
65+
66+
const char *config_get_imp_path (void)
67+
{
68+
return flux_imp_path;
69+
}
70+
71+
/* Initialize common configurations for use by job-exec exec modules.
72+
*/
73+
int config_init (flux_t *h, int argc, char **argv)
74+
{
75+
flux_error_t err;
76+
77+
/* Set default job shell path from builtin configuration,
78+
* allow override via configuration, then cmdline.
79+
*/
80+
default_job_shell = flux_conf_builtin_get ("shell_path", FLUX_CONF_AUTO);
81+
82+
/* Check configuration for exec.job-shell */
83+
if (flux_conf_unpack (flux_get_conf (h),
84+
&err,
85+
"{s?:{s?s}}",
86+
"exec",
87+
"job-shell", &default_job_shell) < 0) {
88+
flux_log (h, LOG_ERR,
89+
"error reading config value exec.job-shell: %s",
90+
err.text);
91+
return -1;
92+
}
93+
94+
/* Check configuration for exec.imp */
95+
if (flux_conf_unpack (flux_get_conf (h),
96+
&err,
97+
"{s?:{s?s}}",
98+
"exec",
99+
"imp", &flux_imp_path) < 0) {
100+
flux_log (h, LOG_ERR,
101+
"error reading config value exec.imp: %s",
102+
err.text);
103+
return -1;
104+
}
105+
106+
if (argv && argc) {
107+
/* Finally, override values on cmdline */
108+
for (int i = 0; i < argc; i++) {
109+
if (strncmp (argv[i], "job-shell=", 10) == 0)
110+
default_job_shell = argv[i]+10;
111+
else if (strncmp (argv[i], "imp=", 4) == 0)
112+
flux_imp_path = argv[i]+4;
113+
}
114+
}
115+
116+
flux_log (h, LOG_DEBUG, "using default shell path %s", default_job_shell);
117+
if (flux_imp_path)
118+
flux_log (h, LOG_DEBUG, "using imp path %s", flux_imp_path);
119+
return 0;
120+
}
121+
122+
/* vi: ts=4 sw=4 expandtab
123+
*/

src/modules/job-exec/exec_config.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
/************************************************************\
2+
* Copyright 2021 Lawrence Livermore National Security, LLC
3+
* (c.f. AUTHORS, NOTICE.LLNS, COPYING)
4+
*
5+
* This file is part of the Flux resource manager framework.
6+
* For details, see https://github.com/flux-framework.
7+
*
8+
* SPDX-License-Identifier: LGPL-3.0
9+
\************************************************************/
10+
11+
#ifndef HAVE_JOB_EXEC_CONFIG_H
12+
#define HAVE_JOB_EXEC_CONFIG_H 1
13+
14+
#include <flux/core.h>
15+
16+
#include "job-exec.h"
17+
18+
const char *config_get_job_shell (struct jobinfo *job);
19+
20+
const char *config_get_cwd (struct jobinfo *job);
21+
22+
const char *config_get_imp_path (void);
23+
24+
int config_init (flux_t *h, int argc, char **argv);
25+
26+
#endif /* !HAVE_JOB_EXEC_CONFIG_H */
27+
28+
/* vi: ts=4 sw=4 expandtab
29+
*/

src/modules/job-exec/job-exec.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,10 +104,16 @@
104104
static double kill_timeout=5.0;
105105

106106
extern struct exec_implementation testexec;
107+
#ifdef HAVE_LIBSYSTEMD
108+
extern struct exec_implementation sdexec;
109+
#endif
107110
extern struct exec_implementation bulkexec;
108111

109112
static struct exec_implementation * implementations[] = {
110113
&testexec,
114+
#ifdef HAVE_LIBSYSTEMD
115+
&sdexec,
116+
#endif
111117
&bulkexec,
112118
NULL
113119
};

0 commit comments

Comments
 (0)