Skip to content

Commit 6f5bfe1

Browse files
fuziontech and claude committed
fix(controlplane): eliminate systemd "not our child" warning and metrics port conflict
Two fixes for zero-downtime handover:

1. selfExec now double-forks via `setsid --fork` when running under systemd (NOTIFY_SOCKET set). The new CP is reparented to PID 1, allowing systemd to properly track it via waitpid() for Restart=always. Outside systemd (tests), the direct spawn path is preserved for fast crash recovery via cmd.Wait(). A 30s timeout provides crash recovery for the detached path.

2. initMetrics retries binding :9090 until available. During handover the old CP still holds the metrics port until it drains and exits; the new CP's metrics goroutine now retries instead of dying on first failure.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent ea5ca9a commit 6f5bfe1

File tree

3 files changed

+68
-5
lines changed

3 files changed

+68
-5
lines changed

controlplane/control.go

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -566,7 +566,22 @@ func (cp *ControlPlane) shutdown() {
566566
// selfExec spawns a new control plane process from the binary on disk.
567567
// The new process detects the existing handover socket and initiates the
568568
// handover protocol to receive listener FDs.
569+
//
570+
// Under systemd (Type=notify), the child must be reparented to PID 1 for
571+
// systemd to properly track it via waitpid(). We achieve this via
572+
// double-fork using setsid --fork: the intermediate exits immediately and
573+
// the grandchild (new CP) is reparented to PID 1. Without this, systemd
574+
// logs "Supervising process X which is not our child" and may not detect
575+
// crashes for Restart=always.
576+
//
577+
// Outside systemd (tests, manual runs), we use direct spawn so the old CP
578+
// can track the child's exit via cmd.Wait() for faster crash recovery.
569579
func (cp *ControlPlane) selfExec() {
580+
if os.Getenv("NOTIFY_SOCKET") != "" {
581+
cp.selfExecDetached()
582+
return
583+
}
584+
570585
cmd := exec.Command(os.Args[0], os.Args[1:]...)
571586
cmd.Stdout = os.Stdout
572587
cmd.Stderr = os.Stderr
@@ -606,6 +621,41 @@ func (cp *ControlPlane) selfExec() {
606621
}()
607622
}
608623

624+
// selfExecDetached spawns the new CP via setsid --fork so it is reparented
625+
// to PID 1 (systemd). Because we cannot track the detached grandchild's
626+
// exit directly, we use a timeout for crash recovery instead of cmd.Wait().
//
// selfExec calls this path when NOTIFY_SOCKET is set in the environment.
// The function returns as soon as the setsid command has finished and the
// recovery timer goroutine has been started; it does not wait for the
// handover itself.
627+
func (cp *ControlPlane) selfExecDetached() {
628+
// Build "setsid --fork <argv0> <args...>" so the new process is started
// with the same program path and arguments as the current one.
args := append([]string{"--fork", os.Args[0]}, os.Args[1:]...)
629+
cmd := exec.Command("setsid", args...)
630+
// Share this process's stdout/stderr with the spawned command.
cmd.Stdout = os.Stdout
631+
cmd.Stderr = os.Stderr
632+
// Pass the current environment through explicitly.
cmd.Env = os.Environ()
633+
634+
// cmd.Run() returns almost immediately: setsid --fork double-forks and
635+
// the intermediate exits right away. The grandchild (new CP) continues.
636+
if err := cmd.Run(); err != nil {
637+
slog.Error("Self-exec (detached) failed.", "error", err)
638+
// The result of recoverFromFailedReload is ignored here and, unlike the
// timeout path below, the handover listener is not restarted.
// NOTE(review): confirm this asymmetry is intended.
cp.recoverFromFailedReload()
639+
return
640+
}
641+
642+
slog.Info("New control plane spawned (detached).")
643+
644+
// Recovery timeout: if no handover connection is received within 30s,
645+
// the new CP likely crashed during startup. The handover protocol
646+
// already has a 30s per-connection deadline (handleHandoverRequest),
647+
// but that only fires after Accept — this timeout covers the case
648+
// where the new CP never connects at all.
//
// The sleep is unconditional: the goroutine always lives for the full 30s
// and has no cancellation path. The boolean returned by
// recoverFromFailedReload gates the warning and the listener restart.
// NOTE(review): presumably it returns false once the reloading flag has been
// cleared by a successful handover (handleHandoverRequest stores false into
// cp.reloading) — confirm against its definition.
649+
go func() {
650+
time.Sleep(30 * time.Second)
651+
if cp.recoverFromFailedReload() {
652+
slog.Warn("Handover timeout: new CP did not connect within 30s, recovering.")
653+
// Replace the current handover listener with a fresh one.
cp.cancelHandoverListener()
654+
cp.startHandoverListener()
655+
}
656+
}()
657+
}
658+
609659
// cancelHandoverListener closes the current handover listener, unblocking
610660
// any goroutine stuck in Accept().
611661
func (cp *ControlPlane) cancelHandoverListener() {

controlplane/handover.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,10 @@ func (cp *ControlPlane) handleHandoverRequest(conn net.Conn, handoverLn net.List
157157

158158
handoverOK = true
159159

160+
// Clear reloading flag so the timeout-based recovery in selfExecDetached
161+
// doesn't fire during a long drain.
162+
cp.reloading.Store(false)
163+
160164
// Stop accepting new connections immediately. The new CP has its own
161165
// listener FD copy (from SCM_RIGHTS), so closing our copy doesn't
162166
// affect the underlying socket — the new CP can still accept on it.

main.go

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"os/signal"
1111
"strconv"
1212
"syscall"
13+
"time"
1314

1415
"github.com/posthog/duckgres/controlplane"
1516
"github.com/posthog/duckgres/duckdbservice"
@@ -100,13 +101,21 @@ func env(key, defaultVal string) string {
100101
return defaultVal
101102
}
102103

103-
// initMetrics starts the Prometheus metrics HTTP server on :9090/metrics
104+
// initMetrics starts the Prometheus metrics HTTP server on :9090/metrics.
105+
// During zero-downtime handover the old process still holds :9090 until it
106+
// drains and exits, so we retry until the port becomes available.
104107
func initMetrics() {
108+
// Dedicated mux, built once before the retry loop starts, so the /metrics
// handler is registered a single time no matter how many bind attempts
// follow.
mux := http.NewServeMux()
109+
mux.Handle("/metrics", promhttp.Handler())
105110
// The server runs in a background goroutine; initMetrics itself returns
// immediately.
go func() {
106-
http.Handle("/metrics", promhttp.Handler())
107-
slog.Info("Starting metrics server", "addr", ":9090")
108-
if err := http.ListenAndServe(":9090", nil); err != nil {
109-
slog.Error("Metrics server error", "error", err)
111+
for {
112+
// Logged on every attempt, i.e. once per second while the port is busy.
slog.Info("Starting metrics server", "addr", ":9090")
113+
// ListenAndServe blocks while serving. Any error it returns (not only
// "address already in use") is logged and retried after a 1s sleep.
// NOTE(review): net/http documents that ListenAndServe always returns a
// non-nil error, so the return after this if appears unreachable.
if err := http.ListenAndServe(":9090", mux); err != nil {
114+
slog.Warn("Metrics server error, retrying in 1s.", "error", err)
115+
time.Sleep(1 * time.Second)
116+
continue
117+
}
118+
return
110119
}
111120
}()
112121
}

0 commit comments

Comments
 (0)