feat: set default tcp_user_timeout to 5 seconds for replicas (cloudnative-pg#9317)

armru · gbartolini · web-flow · commit e29c97dc54e8 · 2025-11-26T17:29:25.000+01:00
The default `tcp_user_timeout` for standby replication connections has been changed from the system default to `5000ms` (5 seconds) for all replicas.

This new default enhances the robustness of CloudNativePG clusters by enabling standby instances to detect and recover from network issues more quickly. Previously, silent network drops could cause standbys to wait up to ~127 seconds (due to TCP SYN retries) before detecting a failure. With the new 5-second timeout, standbys will close unresponsive connections sooner and promptly retry connecting to the primary.

If this default does not meet your requirements, you can override it for all standbys managed by the operator using the `STANDBY_TCP_USER_TIMEOUT` configuration option.

PRESERVATION GUIDE FOR EXISTING INSTALLATIONS:

If you have an existing CloudNativePG installation where `STANDBY_TCP_USER_TIMEOUT` was not explicitly set (thus defaulting to `0`), and you wish to preserve that behaviour after upgrading, you must now explicitly set it to `0`.

Example using a `ConfigMap`:

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: cnpg-controller-manager-config
  namespace: cnpg-system
data:
  STANDBY_TCP_USER_TIMEOUT: "0"
```

If the variable is not explicitly configured, the new default of 5 seconds will automatically apply after the next operator upgrade or pod restart.

For more information on `tcp_user_timeout`, see the PostgreSQL documentation: https://www.postgresql.org/docs/current/runtime-config-connection.html#GUC-TCP-USER-TIMEOUT

Closes cloudnative-pg#9229

Signed-off-by: Armando Ruocco <armando.ruocco@enterprisedb.com>
Signed-off-by: Gabriele Bartolini <gabriele.bartolini@enterprisedb.com>
Co-authored-by: Gabriele Bartolini <gabriele.bartolini@enterprisedb.com>
diff --git a/docs/src/operator_conf.md b/docs/src/operator_conf.md
@@ -59,7 +59,7 @@ Name | Description
 `PGBOUNCER_IMAGE_NAME` | The name of the PgBouncer image used by default for new poolers. Defaults to the version specified in the operator.
 `POSTGRES_IMAGE_NAME` | The name of the PostgreSQL image used by default for new clusters. Defaults to the version specified in the operator.
 `PULL_SECRET_NAME` | Name of an additional pull secret to be defined in the operator's namespace and to be used to download images
-`STANDBY_TCP_USER_TIMEOUT` | Defines the [`TCP_USER_TIMEOUT` socket option](https://www.postgresql.org/docs/current/runtime-config-connection.html#GUC-TCP-USER-TIMEOUT) for replication connections from standby instances to the primary. Default is 0 (system's default).
+`STANDBY_TCP_USER_TIMEOUT` | Defines the [`TCP_USER_TIMEOUT` socket option](https://www.postgresql.org/docs/current/runtime-config-connection.html#GUC-TCP-USER-TIMEOUT) in milliseconds for replication connections from standby instances to the primary. Default is 5000 (5 seconds). Set to `0` to use the system's default.
 `DRAIN_TAINTS` | Specifies the taint keys that should be interpreted as indicators of node drain. By default, it includes the taints commonly applied by [kubectl](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/), [Cluster Autoscaler](https://github.com/kubernetes/autoscaler), and [Karpenter](https://github.com/aws/karpenter-provider-aws): `node.kubernetes.io/unschedulable`, `ToBeDeletedByClusterAutoscaler`, `karpenter.sh/disrupted`, `karpenter.sh/disruption`.
 
 Values in `INHERITED_ANNOTATIONS` and `INHERITED_LABELS` support path-like wildcards. For example, the value `example.com/*` will match
diff --git a/docs/src/postgresql_conf.md b/docs/src/postgresql_conf.md
@@ -168,19 +168,20 @@ role within the cluster. These parameters are effectively applied only when the
 instance is operating as a replica.
 
 ```text
-primary_conninfo = 'host=<PRIMARY> user=postgres dbname=postgres'
+primary_conninfo = 'host=<PRIMARY> user=postgres dbname=postgres tcp_user_timeout=5000'
 recovery_target_timeline = 'latest'
 ```
 
-The [`STANDBY_TCP_USER_TIMEOUT` operator configuration setting](operator_conf.md#available-options),
-if specified, sets the `tcp_user_timeout` parameter on all standby instances
-managed by the operator.
-
-The `tcp_user_timeout` parameter determines how long transmitted data can
-remain unacknowledged before the TCP connection is forcibly closed. Adjusting
-this value allows you to fine-tune the responsiveness of standby instances to
-network disruptions. For more details, refer to the
-[PostgreSQL documentation](https://www.postgresql.org/docs/current/runtime-config-connection.html#GUC-TCP-USER-TIMEOUT).
+!!! Important
+    By default, every standby sets `tcp_user_timeout` to **5 seconds**, as shown
+    above. This parameter defines how long transmitted data may remain
+    unacknowledged before the TCP connection is forcibly closed. Adjusting it lets
+    you control how quickly a standby reacts to network issues.
+    If the default value does not meet your requirements, you can override it
+    for all standbys managed by the operator using the
+    [`STANDBY_TCP_USER_TIMEOUT` operator configuration option](operator_conf.md#available-options).
+    For additional details on `tcp_user_timeout`, refer to the
+    [PostgreSQL documentation](https://www.postgresql.org/docs/current/runtime-config-connection.html#GUC-TCP-USER-TIMEOUT).
 
 ### Log control settings
 
diff --git a/internal/configuration/configuration.go b/internal/configuration/configuration.go
@@ -165,7 +165,9 @@ type Data struct {
 	// added as a tcp_user_timeout option to the primary_conninfo
 	// string, which is used by the standby server to connect to the
 	// primary server in CloudNativePG.
-	StandbyTCPUserTimeout int `json:"standbyTcpUserTimeout" env:"STANDBY_TCP_USER_TIMEOUT"`
+	// When nil, the instance manager will use a default value of 5000ms.
+	// Set to 0 explicitly to use the system's default.
+	StandbyTCPUserTimeout *int `json:"standbyTcpUserTimeout" env:"STANDBY_TCP_USER_TIMEOUT"`
 
 	// KubernetesClusterDomain defines the domain suffix for service FQDNs
 	// within the Kubernetes cluster. If left unset, it defaults to `cluster.local`.
@@ -189,7 +191,7 @@ func newDefaultConfig() *Data {
 		CreateAnyService:        false,
 		CertificateDuration:     CertificateDuration,
 		ExpiringCheckThreshold:  ExpiringCheckThreshold,
-		StandbyTCPUserTimeout:   0,
+		StandbyTCPUserTimeout:   nil,
 		KubernetesClusterDomain: DefaultKubernetesClusterDomain,
 		DrainTaints:             DefaultDrainTaints,
 	}
diff --git a/pkg/configparser/configparser.go b/pkg/configparser/configparser.go
@@ -64,7 +64,7 @@ var configparserLog = log.WithName("configparser")
 
 // ReadConfigMap reads the configuration from the environment and the passed in data map.
 // Config and defaults are supposed to be pointers to structs of the same type
-func ReadConfigMap(target interface{}, defaults interface{}, data map[string]string) {
+func ReadConfigMap(target interface{}, defaults interface{}, data map[string]string) { //nolint: gocognit
 	ensurePointerToCompatibleStruct("target", target, "default", defaults)
 
 	count := reflect.TypeOf(defaults).Elem().NumField()
@@ -89,6 +89,21 @@ func ReadConfigMap(target interface{}, defaults interface{}, data map[string]str
 		case reflect.Int:
 			value = fmt.Sprintf("%v", valueField.Int())
 
+		case reflect.Ptr:
+			// Handle pointer types - if not nil, get the underlying value
+			if !valueField.IsNil() {
+				switch valueField.Elem().Kind() {
+				case reflect.Int:
+					value = fmt.Sprintf("%v", valueField.Elem().Int())
+				default:
+					configparserLog.Info(
+						"Skipping unsupported pointer type while parsing default configuration",
+						"field", field.Name, "kind", valueField.Elem().Kind())
+					continue
+				}
+			}
+			// If nil, value stays empty, which means we'll only set it if env/data provides a value
+
 		case reflect.Slice:
 			if valueField.Type().Elem().Kind() != reflect.String {
 				configparserLog.Info(
@@ -129,6 +144,29 @@ func ReadConfigMap(target interface{}, defaults interface{}, data map[string]str
 				continue
 			}
 			reflect.ValueOf(target).Elem().FieldByName(field.Name).SetInt(intValue)
+		case reflect.Ptr:
+			// Handle pointer types
+			if value == "" {
+				// If no value is provided, leave the pointer as nil
+				continue
+			}
+			switch t.Elem().Kind() {
+			case reflect.Int:
+				intValue, err := strconv.ParseInt(value, 10, 0)
+				if err != nil {
+					configparserLog.Info(
+						"Skipping invalid integer pointer value parsing configuration",
+						"field", field.Name, "value", value)
+					continue
+				}
+				intVal := int(intValue)
+				reflect.ValueOf(target).Elem().FieldByName(field.Name).Set(reflect.ValueOf(&intVal))
+			default:
+				configparserLog.Info(
+					"Skipping unsupported pointer type while parsing configuration",
+					"field", field.Name, "kind", t.Elem().Kind())
+				continue
+			}
 		case reflect.String:
 			reflect.ValueOf(target).Elem().FieldByName(field.Name).SetString(value)
 		case reflect.Slice:
diff --git a/pkg/configparser/configparser_test.go b/pkg/configparser/configparser_test.go
@@ -44,6 +44,9 @@ type FakeData struct {
 
 	//  Threshold to consider a certificate as expiring
 	ExpiringCheckThreshold int `json:"expiringCheckThreshold" env:"EXPIRING_CHECK_THRESHOLD"`
+
+	// OptionalTimeout is an optional pointer to int for testing
+	OptionalTimeout *int `json:"optionalTimeout" env:"OPTIONAL_TIMEOUT"`
 }
 
 var defaultInheritedAnnotations = []string{
@@ -117,4 +120,55 @@ var _ = Describe("Data test suite", func() {
 		Expect(config.InheritedAnnotations).To(Equal(defaultInheritedAnnotations))
 		Expect(config.InheritedLabels).To(BeNil())
 	})
+
+	Context("pointer types", func() {
+		It("leaves pointer nil when no value is provided", func() {
+			GinkgoT().Setenv("OPTIONAL_TIMEOUT", "")
+			config := &FakeData{}
+			ReadConfigMap(config, &FakeData{}, nil)
+			Expect(config.OptionalTimeout).To(BeNil())
+		})
+
+		It("sets pointer from environment value", func() {
+			GinkgoT().Setenv("OPTIONAL_TIMEOUT", "5000")
+			config := &FakeData{}
+			ReadConfigMap(config, &FakeData{}, nil)
+			Expect(config.OptionalTimeout).ToNot(BeNil())
+			Expect(*config.OptionalTimeout).To(Equal(5000))
+		})
+
+		It("sets pointer from map value", func() {
+			GinkgoT().Setenv("OPTIONAL_TIMEOUT", "")
+			config := &FakeData{}
+			ReadConfigMap(config, &FakeData{}, map[string]string{
+				"OPTIONAL_TIMEOUT": "3000",
+			})
+			Expect(config.OptionalTimeout).ToNot(BeNil())
+			Expect(*config.OptionalTimeout).To(Equal(3000))
+		})
+
+		It("allows setting pointer to zero value explicitly", func() {
+			GinkgoT().Setenv("OPTIONAL_TIMEOUT", "0")
+			config := &FakeData{}
+			ReadConfigMap(config, &FakeData{}, nil)
+			Expect(config.OptionalTimeout).ToNot(BeNil())
+			Expect(*config.OptionalTimeout).To(Equal(0))
+		})
+
+		It("uses default pointer value when set", func() {
+			GinkgoT().Setenv("OPTIONAL_TIMEOUT", "")
+			config := &FakeData{}
+			defaultValue := 1000
+			ReadConfigMap(config, &FakeData{OptionalTimeout: &defaultValue}, nil)
+			Expect(config.OptionalTimeout).ToNot(BeNil())
+			Expect(*config.OptionalTimeout).To(Equal(1000))
+		})
+
+		It("skips invalid pointer value", func() {
+			GinkgoT().Setenv("OPTIONAL_TIMEOUT", "invalid")
+			config := &FakeData{}
+			ReadConfigMap(config, &FakeData{}, nil)
+			Expect(config.OptionalTimeout).To(BeNil())
+		})
+	})
 })
diff --git a/pkg/management/postgres/instance.go b/pkg/management/postgres/instance.go
@@ -1400,11 +1400,14 @@ func (instance *Instance) GetPrimaryConnInfo() string {
 	result := buildPrimaryConnInfo(instance.GetClusterName()+"-rw", instance.GetPodName()) + " dbname=postgres"
 
 	standbyTCPUserTimeout := os.Getenv("CNPG_STANDBY_TCP_USER_TIMEOUT")
-	if len(standbyTCPUserTimeout) > 0 {
-		result = fmt.Sprintf("%s tcp_user_timeout='%s'", result,
-			strings.ReplaceAll(strings.ReplaceAll(standbyTCPUserTimeout, `\`, `\\`), `'`, `\'`))
+	if len(standbyTCPUserTimeout) == 0 {
+		// Default to 5000ms (5 seconds) if not explicitly set
+		standbyTCPUserTimeout = "5000"
 	}
 
+	result = fmt.Sprintf("%s tcp_user_timeout='%s'", result,
+		strings.ReplaceAll(strings.ReplaceAll(standbyTCPUserTimeout, `\`, `\\`), `'`, `\'`))
+
 	return result
 }
 
diff --git a/pkg/management/postgres/instance_test.go b/pkg/management/postgres/instance_test.go
@@ -361,3 +361,58 @@ func getLibraryPathFromEnv(envs []string) string {
 
 	return ldLibraryPath
 }
+
+var _ = Describe("GetPrimaryConnInfo", func() {
+	var instance *Instance
+
+	BeforeEach(func() {
+		instance = &Instance{
+			Cluster: &apiv1.Cluster{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "test-cluster",
+				},
+			},
+		}
+		instance.WithPodName("test-cluster-1").WithClusterName("test-cluster")
+	})
+
+	AfterEach(func() {
+		err := os.Unsetenv("CNPG_STANDBY_TCP_USER_TIMEOUT")
+		Expect(err).ToNot(HaveOccurred())
+	})
+
+	It("should use default 5000ms tcp_user_timeout when env var is not set", func() {
+		err := os.Unsetenv("CNPG_STANDBY_TCP_USER_TIMEOUT")
+		Expect(err).ToNot(HaveOccurred())
+		connInfo := instance.GetPrimaryConnInfo()
+		Expect(connInfo).To(ContainSubstring("tcp_user_timeout='5000'"))
+	})
+
+	It("should use custom tcp_user_timeout when env var is set", func() {
+		err := os.Setenv("CNPG_STANDBY_TCP_USER_TIMEOUT", "10000")
+		Expect(err).ToNot(HaveOccurred())
+		connInfo := instance.GetPrimaryConnInfo()
+		Expect(connInfo).To(ContainSubstring("tcp_user_timeout='10000'"))
+	})
+
+	It("should allow setting tcp_user_timeout to 0 explicitly", func() {
+		err := os.Setenv("CNPG_STANDBY_TCP_USER_TIMEOUT", "0")
+		Expect(err).ToNot(HaveOccurred())
+		connInfo := instance.GetPrimaryConnInfo()
+		Expect(connInfo).To(ContainSubstring("tcp_user_timeout='0'"))
+	})
+
+	It("should escape single quotes in tcp_user_timeout value", func() {
+		err := os.Setenv("CNPG_STANDBY_TCP_USER_TIMEOUT", "5000'injection")
+		Expect(err).ToNot(HaveOccurred())
+		connInfo := instance.GetPrimaryConnInfo()
+		Expect(connInfo).To(ContainSubstring("tcp_user_timeout='5000\\'injection'"))
+	})
+
+	It("should escape backslashes in tcp_user_timeout value", func() {
+		err := os.Setenv("CNPG_STANDBY_TCP_USER_TIMEOUT", "5000\\test")
+		Expect(err).ToNot(HaveOccurred())
+		connInfo := instance.GetPrimaryConnInfo()
+		Expect(connInfo).To(ContainSubstring("tcp_user_timeout='5000\\\\test'"))
+	})
+})
diff --git a/pkg/management/postgres/restore.go b/pkg/management/postgres/restore.go
@@ -964,7 +964,18 @@ func (info InitInfo) ConfigureInstanceAfterRestore(ctx context.Context, cluster
 
 // GetPrimaryConnInfo returns the DSN to reach the primary
 func (info InitInfo) GetPrimaryConnInfo() string {
-	return buildPrimaryConnInfo(info.ClusterName+"-rw", info.PodName)
+	result := buildPrimaryConnInfo(info.ClusterName+"-rw", info.PodName) + " dbname=postgres"
+
+	standbyTCPUserTimeout := os.Getenv("CNPG_STANDBY_TCP_USER_TIMEOUT")
+	if len(standbyTCPUserTimeout) == 0 {
+		// Default to 5000ms (5 seconds) if not explicitly set
+		standbyTCPUserTimeout = "5000"
+	}
+
+	result = fmt.Sprintf("%s tcp_user_timeout='%s'", result,
+		strings.ReplaceAll(strings.ReplaceAll(standbyTCPUserTimeout, `\`, `\\`), `'`, `\'`))
+
+	return result
 }
 
 func (info *InitInfo) checkBackupDestination(
diff --git a/pkg/specs/pods.go b/pkg/specs/pods.go
@@ -162,12 +162,12 @@ func CreatePodEnvConfig(cluster apiv1.Cluster, podName string) EnvConfig {
 	}
 	config.EnvVars = append(config.EnvVars, cluster.Spec.Env...)
 
-	if configuration.Current.StandbyTCPUserTimeout != 0 {
+	if configuration.Current.StandbyTCPUserTimeout != nil {
 		config.EnvVars = append(
 			config.EnvVars,
 			corev1.EnvVar{
 				Name:  "CNPG_STANDBY_TCP_USER_TIMEOUT",
-				Value: strconv.Itoa(configuration.Current.StandbyTCPUserTimeout),
+				Value: strconv.Itoa(*configuration.Current.StandbyTCPUserTimeout),
 			},
 		)
 	}

Original file line number	Diff line number	Diff line change
`@@ -162,12 +162,12 @@ func CreatePodEnvConfig(cluster apiv1.Cluster, podName string) EnvConfig {`
`162`	`162`	`}`
`163`	`163`	`config.EnvVars = append(config.EnvVars, cluster.Spec.Env...)`
`164`	`164`
`165`		`- if configuration.Current.StandbyTCPUserTimeout != 0 {`
	`165`	`+ if configuration.Current.StandbyTCPUserTimeout != nil {`
`166`	`166`	`config.EnvVars = append(`
`167`	`167`	`config.EnvVars,`
`168`	`168`	`corev1.EnvVar{`
`169`	`169`	`Name: "CNPG_STANDBY_TCP_USER_TIMEOUT",`
`170`		`- Value: strconv.Itoa(configuration.Current.StandbyTCPUserTimeout),`
	`170`	`+ Value: strconv.Itoa(*configuration.Current.StandbyTCPUserTimeout),`
`171`	`171`	`},`
`172`	`172`	`)`
`173`	`173`	`}`