fix(controlplane): eliminate systemd "not our child" warning and metrics port conflict

fuziontech · claude · fuziontech · commit 6f5bfe1c55f2 · 2026-02-14T02:07:39.000Z
Two fixes for zero-downtime handover:

1. selfExec now double-forks via `setsid --fork` when running under systemd
   (NOTIFY_SOCKET set). The new CP is reparented to PID 1, allowing systemd
   to properly track it via waitpid() for Restart=always. Outside systemd
   (tests), the direct spawn path is preserved for fast crash recovery via
   cmd.Wait(). A 30s timeout provides crash recovery for the detached path.

2. initMetrics retries binding :9090 until available. During handover the old
   CP still holds the metrics port until it drains and exits; the new CP's
   metrics goroutine now retries instead of dying on first failure.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/controlplane/control.go b/controlplane/control.go
@@ -566,7 +566,22 @@ func (cp *ControlPlane) shutdown() {
 // selfExec spawns a new control plane process from the binary on disk.
 // The new process detects the existing handover socket and initiates the
 // handover protocol to receive listener FDs.
+//
+// Under systemd (Type=notify), the child must be reparented to PID 1 for
+// systemd to properly track it via waitpid(). We achieve this via
+// double-fork using setsid --fork: the intermediate exits immediately and
+// the grandchild (new CP) is reparented to PID 1. Without this, systemd
+// logs "Supervising process X which is not our child" and may not detect
+// crashes for Restart=always.
+//
+// Outside systemd (tests, manual runs), we use direct spawn so the old CP
+// can track the child's exit via cmd.Wait() for faster crash recovery.
 func (cp *ControlPlane) selfExec() {
+	if os.Getenv("NOTIFY_SOCKET") != "" {
+		cp.selfExecDetached()
+		return
+	}
+
 	cmd := exec.Command(os.Args[0], os.Args[1:]...)
 	cmd.Stdout = os.Stdout
 	cmd.Stderr = os.Stderr
@@ -606,6 +621,41 @@ func (cp *ControlPlane) selfExec() {
 	}()
 }
 
+// selfExecDetached re-runs os.Args under `setsid --fork` so the new CP is
+// reparented to PID 1 (systemd). The detached process's exit cannot be
+// observed from here, so a 30s timer replaces cmd.Wait() for crash recovery.
+func (cp *ControlPlane) selfExecDetached() {
+	args := append([]string{"--fork", os.Args[0]}, os.Args[1:]...)
+	cmd := exec.Command("setsid", args...)
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	cmd.Env = os.Environ()
+
+	// cmd.Run() returns almost immediately: it only waits for the setsid
+	// launcher, which exits right after forking. The new CP keeps running.
+	if err := cmd.Run(); err != nil {
+		slog.Error("Self-exec (detached) failed.", "error", err)
+		cp.recoverFromFailedReload()
+		return
+	}
+
+	slog.Info("New control plane spawned (detached).")
+
+	// Recovery timeout: if no handover connection is received within 30s,
+	// the new CP likely crashed during startup. The per-connection 30s
+	// deadline in handleHandoverRequest only fires after Accept, so this
+	// timer covers the case where the new CP never connects at all.
+	// NOTE(review): the sleep is not tied to this reload attempt — confirm a later reload cannot be hit by it.
+	go func() {
+		time.Sleep(30 * time.Second)
+		if cp.recoverFromFailedReload() {
+			slog.Warn("Handover timeout: new CP did not connect within 30s, recovering.")
+			cp.cancelHandoverListener()
+			cp.startHandoverListener()
+		}
+	}()
+}
+
 // cancelHandoverListener closes the current handover listener, unblocking
 // any goroutine stuck in Accept().
 func (cp *ControlPlane) cancelHandoverListener() {
diff --git a/controlplane/handover.go b/controlplane/handover.go
@@ -157,6 +157,10 @@ func (cp *ControlPlane) handleHandoverRequest(conn net.Conn, handoverLn net.List
 
 	handoverOK = true
 
+	// Clear reloading flag so the timeout-based recovery in selfExecDetached
+	// doesn't fire during a long drain.
+	cp.reloading.Store(false)
+
 	// Stop accepting new connections immediately. The new CP has its own
 	// listener FD copy (from SCM_RIGHTS), so closing our copy doesn't
 	// affect the underlying socket — the new CP can still accept on it.
diff --git a/main.go b/main.go
@@ -10,6 +10,7 @@ import (
 	"os/signal"
 	"strconv"
 	"syscall"
+	"time"
 
 	"github.com/posthog/duckgres/controlplane"
 	"github.com/posthog/duckgres/duckdbservice"
@@ -100,13 +101,21 @@ func env(key, defaultVal string) string {
 	return defaultVal
 }
 
-// initMetrics starts the Prometheus metrics HTTP server on :9090/metrics
+// initMetrics starts the Prometheus metrics HTTP server on :9090/metrics.
+// During zero-downtime handover the old process still holds :9090 until it
+// drains and exits, so every ListenAndServe error is retried after a 1s pause.
 func initMetrics() {
+	mux := http.NewServeMux()
+	mux.Handle("/metrics", promhttp.Handler())
 	go func() {
-		http.Handle("/metrics", promhttp.Handler())
-		slog.Info("Starting metrics server", "addr", ":9090")
-		if err := http.ListenAndServe(":9090", nil); err != nil {
-			slog.Error("Metrics server error", "error", err)
+		for {
+			slog.Info("Starting metrics server", "addr", ":9090")
+			if err := http.ListenAndServe(":9090", mux); err != nil {
+				slog.Warn("Metrics server error, retrying in 1s.", "error", err)
+				time.Sleep(1 * time.Second)
+				continue
+			}
+			return // unreachable in practice: ListenAndServe always returns a non-nil error
 		}
 	}()
 }