Skip to content

Commit 91c7cf3

Browse files
committed
flux-dump: call sd_notify()
Problem: if an instance that is configured to dump is shut down via 'systemctl stop flux', it could run out the shutdown timer and be killed, interrupting the dump. Have flux-dump check for the broker attribute 'broker.sd-notify'. If it is set to something other than zero, then use sd_notify(3) to send systemd EXTEND_TIMEOUT_USEC messages as the dump progresses. In addition, send progress updates that are displayed under "Status:" in 'systemctl status flux'.
1 parent 7b3516b commit 91c7cf3

File tree

2 files changed

+27
-0
lines changed

2 files changed

+27
-0
lines changed

src/cmd/Makefile.am

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ AM_CPPFLAGS = \
1414
$(FLUX_SECURITY_CFLAGS) \
1515
$(HWLOC_CFLAGS) \
1616
$(JANSSON_CFLAGS) \
17+
$(LIBSYSTEMD_CFLAGS) \
1718
$(LIBARCHIVE_CFLAGS)
1819

1920

@@ -87,6 +88,7 @@ flux_LDADD = \
8788
$(top_builddir)/src/common/libpmi/libpmi_common.la \
8889
$(top_builddir)/src/common/libfilemap/libfilemap.la \
8990
$(LIBARCHIVE_LIBS) \
91+
$(LIBSYSTEMD_LIBS) \
9092
$(fluxcmd_ldadd)
9193

9294
#

src/cmd/builtin/dump.c

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@
1111
#if HAVE_CONFIG_H
1212
# include <config.h>
1313
#endif
14+
#if HAVE_LIBSYSTEMD
15+
#include <systemd/sd-daemon.h>
16+
#endif
1417
#include <unistd.h>
1518
#include <stdarg.h>
1619
#include <jansson.h>
@@ -33,6 +36,7 @@ static void dump_treeobj (struct archive *ar,
3336
const char *path,
3437
json_t *treeobj);
3538

39+
static bool sd_notify_flag;
3640
static bool verbose;
3741
static bool quiet;
3842
static int content_flags;
@@ -49,12 +53,24 @@ static void progress (int delta_keys)
4953
&& !quiet
5054
&& (keycount % 100 == 0 || keycount < 10))
5155
fprintf (stderr, "\rflux-dump: archived %d keys", keycount);
56+
#if HAVE_LIBSYSTEMD
57+
if (sd_notify_flag
58+
&& (keycount % 100 == 0 || keycount < 10)) {
59+
sd_notifyf (0, "EXTEND_TIMEOUT_USEC=%d", 10000000); // 10s
60+
sd_notifyf (0, "STATUS=flux-dump(1) has archived %d keys", keycount);
61+
}
62+
#endif
5263
}
5364

5465
/* Finish the progress display.
 * Unless 'quiet' or 'verbose' is set, rewrite the progress line on stderr
 * with the current 'keycount' and terminate it with a newline.
 * When built with HAVE_LIBSYSTEMD and 'sd_notify_flag' is set, also send
 * the same count as a "STATUS=" message via sd_notifyf().
 */
static void progress_end (void)
5566
{
5667
if (!quiet && !verbose)
5768
fprintf (stderr, "\rflux-dump: archived %d keys\n", keycount);
69+
#if HAVE_LIBSYSTEMD
70+
if (sd_notify_flag) {
71+
sd_notifyf (0, "STATUS=flux-dump(1) has archived %d keys", keycount);
72+
}
73+
#endif
5874
}
5975

6076
static struct archive *dump_create (const char *outfile)
@@ -369,6 +385,15 @@ static int cmd_dump (optparse_t *p, int ac, char *av[])
369385
dump_gid = getgid ();
370386

371387
h = builtin_get_flux_handle (p);
388+
389+
/* If the broker is using sd_notify(3) to talk to systemd during
390+
* start/stop, we can use it to ensure systemd doesn't kill us
391+
* while dumping during shutdown. See flux-framework/flux-core#5778.
392+
*/
393+
const char *s;
394+
if ((s = flux_attr_get (h, "broker.sd-notify")) && !streq (s, "0"))
395+
sd_notify_flag = true;
396+
372397
ar = dump_create (outfile);
373398
if (optparse_hasopt (p, "checkpoint")) {
374399
flux_future_t *f;

0 commit comments

Comments
 (0)