Skip to content

Commit aecfd32

Browse files
committed
broker: forward non-deadly signals to all jobs
Problem: The broker blocks all signals in main(), except those for which a core dump may be generated, then handles a specific smaller set of "fatal" signals in broker_handle_signals(). This leaves all other signals blocked, and any attempt to send one of these signals to a batch job will be ignored since the job shell delivers them directly to the broker. Add SIGUSR1 and SIGUSR2 to the list of signals handled in broker_handle_signals(), and instead of calling runat_abort() for these 2 signals, forward them to all jobs via the job-manager.killall RPC. If the RPC fails, send the signal locally so the signal goes somewhere. For now, keep the current behavior for the set of fatal signals that terminate rc2 to avoid breaking anything that depends on this behavior.
1 parent 331fb13 commit aecfd32

File tree

1 file changed

+71
-2
lines changed

1 file changed

+71
-2
lines changed

src/broker/broker.c

Lines changed: 71 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
#include "src/common/libfluxutil/method.h"
4646
#include "ccan/array_size/array_size.h"
4747
#include "ccan/str/str.h"
48+
#include "ccan/ptrint/ptrint.h"
4849

4950
#include "module.h"
5051
#include "brokercfg.h"
@@ -1308,7 +1309,8 @@ static void broker_destroy_sigwatcher (void *data)
13081309

13091310
static int broker_handle_signals (broker_ctx_t *ctx)
13101311
{
1311-
int i, sigs[] = { SIGHUP, SIGINT, SIGQUIT, SIGTERM, SIGALRM };
1312+
int i, sigs[] = { SIGHUP, SIGINT, SIGQUIT, SIGTERM,
1313+
SIGALRM, SIGUSR1, SIGUSR2 };
13121314
int blocked[] = { SIGPIPE };
13131315
flux_watcher_t *w;
13141316

@@ -1965,13 +1967,80 @@ static void module_status_cb (module_t *p, int prev_status, void *arg)
19651967
}
19661968
}
19671969

1970+
/* Return true if signum is one of the signals treated as "deadly" by the
 * broker's signal handler: SIGHUP, SIGINT, SIGQUIT, SIGTERM, or SIGALRM.
 * Every other signal number (including invalid ones) returns false.
 *
 * A switch is used rather than scanning a stack-allocated table so there
 * is no signed/unsigned index comparison and no per-call array setup.
 */
static bool signal_is_deadly (int signum)
{
    switch (signum) {
        case SIGHUP:
        case SIGINT:
        case SIGQUIT:
        case SIGTERM:
        case SIGALRM:
            return true;
        default:
            return false;
    }
}
1979+
1980+
/* Continuation for the job-manager.killall RPC started by killall_jobs().
 *
 * f   - the RPC future; it is destroyed here and must not be used after.
 * arg - the broker context that was registered with flux_future_then().
 *
 * If the response cannot be decoded, the error is logged and the count
 * stays zero.  When a non-zero count is returned, an informational message
 * records the signal number and how many jobs it was forwarded to.
 */
static void killall_cb (flux_future_t *f, void *arg)
{
    broker_ctx_t *ctx = arg;
    int count = 0;
    /* Fetch the signal number while the future is still alive; reading it
     * after flux_future_destroy() would touch a destroyed object.  The key
     * is "signum" because that is the key killall_jobs() stores it under.
     */
    int signum = (int) ptr2int (flux_future_aux_get (f, "signum"));

    if (flux_rpc_get_unpack (f, "{s:i}", "count", &count) < 0) {
        flux_log_error (ctx->h,
                        "job-manager.killall: %s",
                        future_strerror (f, errno));
    }
    flux_future_destroy (f);
    if (count) {
        flux_log (ctx->h,
                  LOG_INFO,
                  "forwarded signal %d to %d jobs",
                  signum,
                  count);
    }
}
1998+
1999+
/* Send a job-manager.killall request asking that signum be delivered to
 * jobs, and arrange for killall_cb() to handle the response.
 *
 * The request carries dry_run=false, userid=FLUX_USERID_UNKNOWN and the
 * signal number.  The signal number is also attached to the future under
 * the "signum" aux key; failure to attach it is logged but is not treated
 * as an error.
 *
 * Returns 0 when the request was sent and the continuation registered,
 * or -1 if either of those steps failed (the future is destroyed).
 */
static int killall_jobs (broker_ctx_t *ctx, int signum)
{
    flux_future_t *rpc;

    rpc = flux_rpc_pack (ctx->h,
                         "job-manager.killall",
                         FLUX_NODEID_ANY,
                         0,
                         "{s:b s:i s:i}",
                         "dry_run", 0,
                         "userid", FLUX_USERID_UNKNOWN,
                         "signum", signum);
    if (rpc == NULL)
        goto error;
    /* Negative timeout: wait for the response without a time limit. */
    if (flux_future_then (rpc, -1., killall_cb, ctx) < 0)
        goto error;

    if (flux_future_aux_set (rpc, "signum", int2ptr (signum), NULL) < 0)
        flux_log_error (ctx->h, "killall: future_aux_set");
    return 0;

error:
    flux_future_destroy (rpc);
    return -1;
}
2018+
19682019
/* Signal watcher callback.
 *
 * Logs the received signal.  On rank 0, a signal that is not "deadly"
 * (per signal_is_deadly()) is first offered to the job manager via
 * killall_jobs(); if that request is started successfully, nothing more
 * is done here.  In every other case the signal is handed to
 * state_machine_kill().
 */
static void signal_cb (flux_reactor_t *r, flux_watcher_t *w,
                       int revents, void *arg)
{
    broker_ctx_t *ctx = arg;
    const int signum = flux_signal_watcher_get_signum (w);
    bool try_forward;

    flux_log (ctx->h, LOG_INFO, "signal %d", signum);

    try_forward = (ctx->rank == 0 && !signal_is_deadly (signum));
    if (try_forward) {
        /* Attempt to forward the non-deadly signal to jobs.
         *
         * Note: flux_rpc(3) in the rank 0 broker to the job manager module
         * is expected to fail immediately if the job-manager module is not
         * loaded, due to the broker internal flux_t handle implementation.
         */
        if (killall_jobs (ctx, signum) == 0)
            return;

        /* Forwarding failed: continue below to state_machine_kill() so
         * the signal is still delivered somewhere.
         */
        flux_log (ctx->h,
                  LOG_INFO,
                  "killall failed, delivering signal %d locally instead",
                  signum);
    }
    state_machine_kill (ctx->state_machine, signum);
}
19772046

0 commit comments

Comments
 (0)