Skip to content

Commit 0a18bcb

Browse files
leonardocearmruNiccoloFeigbartolinimnencia
authored
feat(instance): liveness probe isolation checker (cloudnative-pg#7466)
Enhances the liveness probe logic for primary instances in HA clusters to detect network isolation scenarios. The probe first checks if the Pod can reach the API server. If the API server is unreachable, it then attempts to contact peer PostgreSQL instances in the same cluster via a REST entrypoint. If neither the API server nor any replicas are reachable, the liveness probe fails, prompting Kubernetes to restart the Pod. Upon restart, the instance manager will refuse to start PostgreSQL, as it cannot download the Cluster definition—preventing unsafe behavior in isolated environments. This behavior applies only to primary instances in clusters with HA replicas and is disabled by default. Closes: cloudnative-pg#7465 Signed-off-by: Leonardo Cecchi <leonardo.cecchi@enterprisedb.com> Signed-off-by: Armando Ruocco <armando.ruocco@enterprisedb.com> Signed-off-by: Niccolò Fei <niccolo.fei@enterprisedb.com> Signed-off-by: Gabriele Bartolini <gabriele.bartolini@enterprisedb.com> Signed-off-by: Marco Nenciarini <marco.nenciarini@enterprisedb.com> Co-authored-by: Armando Ruocco <armando.ruocco@enterprisedb.com> Co-authored-by: Niccolò Fei <niccolo.fei@enterprisedb.com> Co-authored-by: Gabriele Bartolini <gabriele.bartolini@enterprisedb.com> Co-authored-by: Marco Nenciarini <marco.nenciarini@enterprisedb.com>
1 parent 9ad673a commit 0a18bcb

File tree

11 files changed

+683
-6
lines changed

11 files changed

+683
-6
lines changed

docs/src/instance_manager.md

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,48 @@ spec:
188188
failureThreshold: 10
189189
```
190190

191+
### Primary Isolation (alpha)
192+
193+
CloudNativePG 1.26 introduces an opt-in experimental behavior for the liveness
194+
probe of a PostgreSQL primary, which will report a failure if **both** of the
195+
following conditions are met:
196+
197+
1. The instance manager cannot reach the Kubernetes API server
198+
2. The instance manager cannot reach **any** other instance via the instance manager’s REST API
199+
200+
The effect of this behavior is to consider an isolated primary to be not alive and subsequently **shut it down** when the liveness probe fails.
201+
202+
It is **disabled by default** and can be enabled by adding the following
203+
annotation to the `Cluster` resource:
204+
205+
```yaml
206+
metadata:
207+
annotations:
208+
alpha.cnpg.io/livenessPinger: '{"enabled": true}'
209+
```
210+
211+
!!! Warning
212+
This feature is experimental and will be introduced in a future CloudNativePG
213+
release with a new API. If you decide to use it now, note that the API **will
214+
change**.
215+
216+
!!! Important
217+
If you plan to enable this experimental feature, be aware that the default
218+
liveness probe settings—automatically derived from `livenessProbeTimeout`—might
219+
be aggressive (30 seconds). As such, we recommend explicitly setting the
220+
liveness probe configuration to suit your environment.
221+
222+
The annotation also accepts two optional network settings: `requestTimeout`
223+
and `connectionTimeout`, both defaulting to `500` (in milliseconds).
224+
In cloud environments, you may need to increase these values.
225+
For example:
226+
227+
```yaml
228+
metadata:
229+
annotations:
230+
alpha.cnpg.io/livenessPinger: '{"enabled": true, "requestTimeout": 1000, "connectionTimeout": 1000}'
231+
```
232+
191233
## Readiness Probe
192234

193235
The readiness probe starts once the startup probe has successfully completed.

internal/webhook/v1/cluster_webhook.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ import (
4949
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
5050

5151
apiv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1"
52+
"github.com/cloudnative-pg/cloudnative-pg/pkg/management/postgres/webserver/probes"
5253
"github.com/cloudnative-pg/cloudnative-pg/pkg/postgres"
5354
"github.com/cloudnative-pg/cloudnative-pg/pkg/specs"
5455
"github.com/cloudnative-pg/cloudnative-pg/pkg/utils"
@@ -218,6 +219,7 @@ func (v *ClusterCustomValidator) validate(r *apiv1.Cluster) (allErrs field.Error
218219
v.validatePodPatchAnnotation,
219220
v.validatePromotionToken,
220221
v.validatePluginConfiguration,
222+
v.validateLivenessPingerProbe,
221223
}
222224

223225
for _, validate := range validations {
@@ -2453,3 +2455,23 @@ func (v *ClusterCustomValidator) validatePluginConfiguration(r *apiv1.Cluster) f
24532455

24542456
return errorList
24552457
}
2458+
2459+
func (v *ClusterCustomValidator) validateLivenessPingerProbe(r *apiv1.Cluster) field.ErrorList {
2460+
value, ok := r.Annotations[utils.LivenessPingerAnnotationName]
2461+
if !ok {
2462+
return nil
2463+
}
2464+
2465+
_, err := probes.NewLivenessPingerConfigFromAnnotations(context.Background(), r.Annotations)
2466+
if err != nil {
2467+
return field.ErrorList{
2468+
field.Invalid(
2469+
field.NewPath("metadata", "annotations", utils.LivenessPingerAnnotationName),
2470+
value,
2471+
fmt.Sprintf("error decoding liveness pinger config: %s", err.Error()),
2472+
),
2473+
}
2474+
}
2475+
2476+
return nil
2477+
}

internal/webhook/v1/cluster_webhook_test.go

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5089,3 +5089,43 @@ var _ = Describe("validatePluginConfiguration", func() {
50895089
Expect(v.validatePluginConfiguration(cluster)).To(BeNil())
50905090
})
50915091
})
5092+
5093+
var _ = Describe("", func() {
5094+
var v *ClusterCustomValidator
5095+
BeforeEach(func() {
5096+
v = &ClusterCustomValidator{}
5097+
})
5098+
5099+
It("returns no errors if the liveness pinger annotation is not present", func() {
5100+
cluster := &apiv1.Cluster{
5101+
ObjectMeta: metav1.ObjectMeta{
5102+
Annotations: map[string]string{},
5103+
},
5104+
}
5105+
Expect(v.validateLivenessPingerProbe(cluster)).To(BeNil())
5106+
})
5107+
5108+
It("returns no errors if the liveness pinger annotation is valid", func() {
5109+
cluster := &apiv1.Cluster{
5110+
ObjectMeta: metav1.ObjectMeta{
5111+
Annotations: map[string]string{
5112+
utils.LivenessPingerAnnotationName: `{"connectionTimeout": 1000, "requestTimeout": 5000, "enabled": true}`,
5113+
},
5114+
},
5115+
}
5116+
Expect(v.validateLivenessPingerProbe(cluster)).To(BeNil())
5117+
})
5118+
5119+
It("returns an error if the liveness pinger annotation is invalid", func() {
5120+
cluster := &apiv1.Cluster{
5121+
ObjectMeta: metav1.ObjectMeta{
5122+
Annotations: map[string]string{
5123+
utils.LivenessPingerAnnotationName: `{"requestTimeout": 5000}`,
5124+
},
5125+
},
5126+
}
5127+
errs := v.validateLivenessPingerProbe(cluster)
5128+
Expect(errs).To(HaveLen(1))
5129+
Expect(errs[0].Error()).To(ContainSubstring("error decoding liveness pinger config"))
5130+
})
5131+
})

pkg/certs/tls.go

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,18 @@ func newTLSConfigFromSecret(
5757
// for the <cluster>-rw service, which would cause a name verification error.
5858
caCertPool := x509.NewCertPool()
5959
caCertPool.AppendCertsFromPEM(caCertificate)
60+
61+
return NewTLSConfigFromCertPool(caCertPool), nil
62+
}
63+
64+
// NewTLSConfigFromCertPool creates a tls.Config object from X509 cert pool
65+
// containing the expected server CA
66+
func NewTLSConfigFromCertPool(
67+
certPool *x509.CertPool,
68+
) *tls.Config {
6069
tlsConfig := tls.Config{
6170
MinVersion: tls.VersionTLS13,
62-
RootCAs: caCertPool,
71+
RootCAs: certPool,
6372
InsecureSkipVerify: true, //#nosec G402 -- we are verifying the certificate ourselves
6473
VerifyPeerCertificate: func(rawCerts [][]byte, _ [][]*x509.Certificate) error {
6574
// Code adapted from https://go.dev/src/crypto/tls/handshake_client.go#L986
@@ -77,7 +86,7 @@ func newTLSConfigFromSecret(
7786
}
7887

7988
opts := x509.VerifyOptions{
80-
Roots: caCertPool,
89+
Roots: certPool,
8190
Intermediates: x509.NewCertPool(),
8291
}
8392

@@ -93,7 +102,7 @@ func newTLSConfigFromSecret(
93102
},
94103
}
95104

96-
return &tlsConfig, nil
105+
return &tlsConfig
97106
}
98107

99108
// NewTLSConfigForContext creates a tls.config with the provided data and returns an expanded context that contains
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
/*
2+
Copyright © contributors to CloudNativePG, established as
3+
CloudNativePG a Series of LF Projects, LLC.
4+
5+
Licensed under the Apache License, Version 2.0 (the "License");
6+
you may not use this file except in compliance with the License.
7+
You may obtain a copy of the License at
8+
9+
http://www.apache.org/licenses/LICENSE-2.0
10+
11+
Unless required by applicable law or agreed to in writing, software
12+
distributed under the License is distributed on an "AS IS" BASIS,
13+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
See the License for the specific language governing permissions and
15+
limitations under the License.
16+
17+
SPDX-License-Identifier: Apache-2.0
18+
*/
19+
20+
package probes
21+
22+
import (
23+
"context"
24+
"fmt"
25+
"net/http"
26+
27+
"github.com/cloudnative-pg/machinery/pkg/log"
28+
"sigs.k8s.io/controller-runtime/pkg/client"
29+
30+
apiv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1"
31+
"github.com/cloudnative-pg/cloudnative-pg/pkg/management/postgres"
32+
)
33+
34+
type livenessExecutor struct {
35+
cli client.Client
36+
instance *postgres.Instance
37+
38+
lastestKnownCluster *apiv1.Cluster
39+
}
40+
41+
// NewLivenessChecker creates a new instance of the liveness probe checker
42+
func NewLivenessChecker(
43+
cli client.Client,
44+
instance *postgres.Instance,
45+
) Checker {
46+
return &livenessExecutor{
47+
cli: cli,
48+
instance: instance,
49+
}
50+
}
51+
52+
// tryRefreshLatestCluster refreshes the latest cluster definition, returns a bool indicating if the operation was
53+
// successful
54+
func (e *livenessExecutor) tryRefreshLatestCluster(ctx context.Context) bool {
55+
var cluster apiv1.Cluster
56+
err := e.cli.Get(
57+
ctx,
58+
client.ObjectKey{Namespace: e.instance.GetNamespaceName(), Name: e.instance.GetClusterName()},
59+
&cluster,
60+
)
61+
if err != nil {
62+
return false
63+
}
64+
65+
e.lastestKnownCluster = cluster.DeepCopy()
66+
return true
67+
}
68+
69+
func (e *livenessExecutor) IsHealthy(
70+
ctx context.Context,
71+
w http.ResponseWriter,
72+
) {
73+
contextLogger := log.FromContext(ctx)
74+
75+
isPrimary, isPrimaryErr := e.instance.IsPrimary()
76+
if isPrimaryErr != nil {
77+
contextLogger.Error(
78+
isPrimaryErr,
79+
"Error while checking the instance role, skipping automatic shutdown.")
80+
_, _ = fmt.Fprint(w, "OK")
81+
return
82+
}
83+
84+
if !isPrimary {
85+
// There's no need to restart a replica if isolated
86+
_, _ = fmt.Fprint(w, "OK")
87+
return
88+
}
89+
90+
if clusterRefreshed := e.tryRefreshLatestCluster(ctx); clusterRefreshed {
91+
// We correctly reached the API server but, as a failsafe measure, we
92+
// exercise the reachability checker and leave a log message if something
93+
// is not right.
94+
// In this way a network configuration problem can be discovered as
95+
// quickly as possible.
96+
if err := evaluateLivenessPinger(ctx, e.lastestKnownCluster.DeepCopy()); err != nil {
97+
contextLogger.Warning(
98+
"Instance connectivity error - liveness probe failing but API server is reachable",
99+
"err",
100+
err.Error(),
101+
)
102+
}
103+
_, _ = fmt.Fprint(w, "OK")
104+
return
105+
}
106+
107+
contextLogger = contextLogger.WithValues("apiServerReachable", false)
108+
109+
if e.lastestKnownCluster == nil {
110+
// We were never able to download a cluster definition. This should not
111+
// happen because we check the API server connectivity as soon as the
112+
// instance manager starts, before starting the probe web server.
113+
//
114+
// To be safe, we classify this instance manager to be not isolated and
115+
// postpone any decision to a later liveness probe call.
116+
contextLogger.Warning(
117+
"No cluster definition has been received, skipping automatic shutdown.")
118+
119+
_, _ = fmt.Fprint(w, "OK")
120+
return
121+
}
122+
123+
err := evaluateLivenessPinger(ctx, e.lastestKnownCluster.DeepCopy())
124+
if err != nil {
125+
contextLogger.Error(err, "Instance connectivity error - liveness probe failing")
126+
http.Error(
127+
w,
128+
fmt.Sprintf("liveness check failed: %s", err.Error()),
129+
http.StatusInternalServerError,
130+
)
131+
return
132+
}
133+
134+
contextLogger.Debug(
135+
"Instance connectivity test succeeded - liveness probe succeeding",
136+
"latestKnownInstancesReportedState", e.lastestKnownCluster.Status.InstancesReportedState,
137+
)
138+
_, _ = fmt.Fprint(w, "OK")
139+
}
140+
141+
func evaluateLivenessPinger(
142+
ctx context.Context,
143+
cluster *apiv1.Cluster,
144+
) error {
145+
contextLogger := log.FromContext(ctx)
146+
147+
cfg, err := NewLivenessPingerConfigFromAnnotations(ctx, cluster.Annotations)
148+
if err != nil {
149+
return err
150+
}
151+
if !cfg.isEnabled() {
152+
contextLogger.Debug("pinger config not enabled, skipping")
153+
return nil
154+
}
155+
156+
if cluster.Spec.Instances == 1 {
157+
contextLogger.Debug("Only one instance present in the latest known cluster definition. Skipping automatic shutdown.")
158+
return nil
159+
}
160+
161+
checker, err := buildInstanceReachabilityChecker(*cfg)
162+
if err != nil {
163+
return fmt.Errorf("failed to build instance reachability checker: %w", err)
164+
}
165+
166+
if err := checker.ensureInstancesAreReachable(cluster); err != nil {
167+
return fmt.Errorf("liveness check failed: %w", err)
168+
}
169+
170+
return nil
171+
}

0 commit comments

Comments
 (0)