Skip to content

Commit 13cb823

Browse files
fuziontech and claude authored
Increase handover drain timeout from 5 minutes to 24 hours (#234)
* Increase handover drain timeout from 5 minutes to 24 hours

  The 5-minute drain timeout was too aggressive — long-running queries were being forcibly terminated during rolling deployments. Increase to 24 hours so existing connections can finish naturally.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Make handover drain timeout configurable via --handover-drain-timeout

  Adds 3-tier config support (CLI flag, env var, YAML) following the existing pattern for worker timeouts. Default remains 24h.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent cf3d7db commit 13cb823

File tree

4 files changed

+50
-16
lines changed

4 files changed

+50
-16
lines changed

config_resolution.go

Lines changed: 31 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -30,16 +30,18 @@ type configCLIInputs struct {
3030
MinWorkers int
3131
WorkerQueueTimeout string
3232
WorkerIdleTimeout string
33+
HandoverDrainTimeout string
3334
ACMEDomain string
3435
ACMEEmail string
3536
ACMECacheDir string
3637
MaxConnections int
3738
}
3839

3940
type resolvedConfig struct {
40-
Server server.Config
41-
WorkerQueueTimeout time.Duration
42-
WorkerIdleTimeout time.Duration
41+
Server server.Config
42+
WorkerQueueTimeout time.Duration
43+
WorkerIdleTimeout time.Duration
44+
HandoverDrainTimeout time.Duration
4345
}
4446

4547
func defaultServerConfig() server.Config {
@@ -75,6 +77,7 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun
7577
cfg := defaultServerConfig()
7678
var workerQueueTimeout time.Duration
7779
var workerIdleTimeout time.Duration
80+
var handoverDrainTimeout time.Duration
7881

7982
if fileCfg != nil {
8083
if fileCfg.Host != "" {
@@ -230,6 +233,13 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun
230233
warn("Invalid worker_idle_timeout duration: " + err.Error())
231234
}
232235
}
236+
if fileCfg.HandoverDrainTimeout != "" {
237+
if d, err := time.ParseDuration(fileCfg.HandoverDrainTimeout); err == nil {
238+
handoverDrainTimeout = d
239+
} else {
240+
warn("Invalid handover_drain_timeout duration: " + err.Error())
241+
}
242+
}
233243
if len(fileCfg.PassthroughUsers) > 0 {
234244
cfg.PassthroughUsers = make(map[string]bool, len(fileCfg.PassthroughUsers))
235245
for _, u := range fileCfg.PassthroughUsers {
@@ -399,6 +409,13 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun
399409
warn("Invalid DUCKGRES_WORKER_IDLE_TIMEOUT duration: " + err.Error())
400410
}
401411
}
412+
if v := getenv("DUCKGRES_HANDOVER_DRAIN_TIMEOUT"); v != "" {
413+
if d, err := time.ParseDuration(v); err == nil {
414+
handoverDrainTimeout = d
415+
} else {
416+
warn("Invalid DUCKGRES_HANDOVER_DRAIN_TIMEOUT duration: " + err.Error())
417+
}
418+
}
402419
if v := getenv("DUCKGRES_ACME_DOMAIN"); v != "" {
403420
cfg.ACMEDomain = v
404421
}
@@ -504,6 +521,13 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun
504521
warn("Invalid --worker-idle-timeout duration: " + err.Error())
505522
}
506523
}
524+
if cli.Set["handover-drain-timeout"] {
525+
if d, err := time.ParseDuration(cli.HandoverDrainTimeout); err == nil {
526+
handoverDrainTimeout = d
527+
} else {
528+
warn("Invalid --handover-drain-timeout duration: " + err.Error())
529+
}
530+
}
507531
if cli.Set["acme-domain"] {
508532
cfg.ACMEDomain = cli.ACMEDomain
509533
}
@@ -530,8 +554,9 @@ func resolveEffectiveConfig(fileCfg *FileConfig, cli configCLIInputs, getenv fun
530554
}
531555

532556
return resolvedConfig{
533-
Server: cfg,
534-
WorkerQueueTimeout: workerQueueTimeout,
535-
WorkerIdleTimeout: workerIdleTimeout,
557+
Server: cfg,
558+
WorkerQueueTimeout: workerQueueTimeout,
559+
WorkerIdleTimeout: workerIdleTimeout,
560+
HandoverDrainTimeout: handoverDrainTimeout,
536561
}
537562
}

controlplane/control.go

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -31,9 +31,10 @@ type ControlPlaneConfig struct {
3131
ConfigPath string // Path to config file, passed to workers
3232
HandoverSocket string
3333
HealthCheckInterval time.Duration
34-
WorkerQueueTimeout time.Duration // How long to wait for an available worker slot (default: 5m)
35-
WorkerIdleTimeout time.Duration // How long to keep an idle worker alive (default: 5m)
36-
MetricsServer *http.Server // Optional metrics server to shut down during handover
34+
WorkerQueueTimeout time.Duration // How long to wait for an available worker slot (default: 5m)
35+
WorkerIdleTimeout time.Duration // How long to keep an idle worker alive (default: 5m)
36+
HandoverDrainTimeout time.Duration // How long to wait for connections to drain during handover (default: 24h)
37+
MetricsServer *http.Server // Optional metrics server to shut down during handover
3738
}
3839

3940
// ControlPlane manages the TCP listener and routes connections to Flight SQL workers.
@@ -74,6 +75,9 @@ func RunControlPlane(cfg ControlPlaneConfig) {
7475
if cfg.WorkerIdleTimeout == 0 {
7576
cfg.WorkerIdleTimeout = 5 * time.Minute
7677
}
78+
if cfg.HandoverDrainTimeout == 0 {
79+
cfg.HandoverDrainTimeout = 24 * time.Hour
80+
}
7781

7882
// Enforce secure defaults for control-plane mode.
7983
if err := validateControlPlaneSecurity(cfg); err != nil {

controlplane/handover.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -190,8 +190,8 @@ func (cp *ControlPlane) handleHandoverRequest(conn net.Conn, handoverLn net.List
190190
select {
191191
case <-drainDone:
192192
slog.Info("All connections drained after handover.")
193-
case <-time.After(5 * time.Minute):
194-
slog.Warn("Handover drain timeout after 5 minutes, forcing exit.")
193+
case <-time.After(cp.cfg.HandoverDrainTimeout):
194+
slog.Warn("Handover drain timeout, forcing exit.", "timeout", cp.cfg.HandoverDrainTimeout)
195195
}
196196

197197
// Shut down workers

main.go

Lines changed: 10 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -43,8 +43,9 @@ type FileConfig struct {
4343
MemoryRebalance *bool `yaml:"memory_rebalance"` // Enable dynamic per-connection memory reallocation
4444
MaxWorkers int `yaml:"max_workers"` // Max worker processes (control-plane mode)
4545
MinWorkers int `yaml:"min_workers"` // Pre-warm worker count (control-plane mode)
46-
WorkerQueueTimeout string `yaml:"worker_queue_timeout"` // e.g., "5m"
47-
WorkerIdleTimeout string `yaml:"worker_idle_timeout"` // e.g., "5m"
46+
WorkerQueueTimeout string `yaml:"worker_queue_timeout"` // e.g., "5m"
47+
WorkerIdleTimeout string `yaml:"worker_idle_timeout"` // e.g., "5m"
48+
HandoverDrainTimeout string `yaml:"handover_drain_timeout"` // e.g., "24h"
4849
PassthroughUsers []string `yaml:"passthrough_users"` // Users that bypass transpiler + pg_catalog
4950
}
5051

@@ -178,6 +179,7 @@ func main() {
178179
maxWorkers := flag.Int("max-workers", 0, "Max worker processes, 0=unlimited (control-plane mode) (env: DUCKGRES_MAX_WORKERS)")
179180
workerQueueTimeout := flag.String("worker-queue-timeout", "", "How long to wait for an available worker slot (e.g., '5m') (env: DUCKGRES_WORKER_QUEUE_TIMEOUT)")
180181
workerIdleTimeout := flag.String("worker-idle-timeout", "", "How long to keep an idle worker alive (e.g., '5m') (env: DUCKGRES_WORKER_IDLE_TIMEOUT)")
182+
handoverDrainTimeout := flag.String("handover-drain-timeout", "", "How long to wait for connections to drain during handover (default: '24h') (env: DUCKGRES_HANDOVER_DRAIN_TIMEOUT)")
181183
socketDir := flag.String("socket-dir", "/var/run/duckgres", "Unix socket directory (control-plane mode)")
182184
handoverSocket := flag.String("handover-socket", "", "Handover socket for graceful deployment (control-plane mode)")
183185

@@ -217,6 +219,7 @@ func main() {
217219
fmt.Fprintf(os.Stderr, " DUCKGRES_MIN_WORKERS Pre-warm worker count (control-plane mode)\n")
218220
fmt.Fprintf(os.Stderr, " DUCKGRES_MAX_WORKERS Max worker processes (control-plane mode)\n")
219221
fmt.Fprintf(os.Stderr, " DUCKGRES_WORKER_QUEUE_TIMEOUT Worker queue timeout (default: 5m)\n")
222+
fmt.Fprintf(os.Stderr, " DUCKGRES_HANDOVER_DRAIN_TIMEOUT Handover drain timeout (default: 24h)\n")
220223
fmt.Fprintf(os.Stderr, " DUCKGRES_ACME_DOMAIN Domain for ACME/Let's Encrypt certificate\n")
221224
fmt.Fprintf(os.Stderr, " DUCKGRES_ACME_EMAIL Contact email for Let's Encrypt notifications\n")
222225
fmt.Fprintf(os.Stderr, " DUCKGRES_ACME_CACHE_DIR Directory for ACME certificate cache\n")
@@ -303,6 +306,7 @@ func main() {
303306
MaxWorkers: *maxWorkers,
304307
WorkerQueueTimeout: *workerQueueTimeout,
305308
WorkerIdleTimeout: *workerIdleTimeout,
309+
HandoverDrainTimeout: *handoverDrainTimeout,
306310
ACMEDomain: *acmeDomain,
307311
ACMEEmail: *acmeEmail,
308312
ACMECacheDir: *acmeCacheDir,
@@ -425,9 +429,10 @@ func main() {
425429
SocketDir: *socketDir,
426430
ConfigPath: *configFile,
427431
HandoverSocket: *handoverSocket,
428-
WorkerQueueTimeout: resolved.WorkerQueueTimeout,
429-
WorkerIdleTimeout: resolved.WorkerIdleTimeout,
430-
MetricsServer: metricsSrv,
432+
WorkerQueueTimeout: resolved.WorkerQueueTimeout,
433+
WorkerIdleTimeout: resolved.WorkerIdleTimeout,
434+
HandoverDrainTimeout: resolved.HandoverDrainTimeout,
435+
MetricsServer: metricsSrv,
431436
}
432437
controlplane.RunControlPlane(cpCfg)
433438
return

0 commit comments

Comments
 (0)