Skip to content

Commit 4335b7e

Browse files
committed
Don't exit the probe on connection issues
Do not exit the liveness probe process when it cannot connect to the CSI driver. The driver could be crashlooping, and we should not crashloop the liveness probe process too. The process should only fail all probes to the /healthz endpoint. Since the HTTP server is not yet running when the probe connects to the driver for the first time, the resulting "connection refused" error is a good enough way to fail the probe.
1 parent 9cad16c commit 4335b7e

File tree

2 files changed

+9
-37
lines changed

2 files changed

+9
-37
lines changed

cmd/livenessprobe/main.go

Lines changed: 8 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,8 @@ import (
2323
"net"
2424
"net/http"
2525
"os"
26-
"sync"
2726
"time"
2827

29-
"google.golang.org/grpc"
3028
"k8s.io/klog/v2"
3129

3230
"k8s.io/component-base/featuregate"
@@ -62,7 +60,7 @@ func (h *healthProbe) checkProbe(w http.ResponseWriter, req *http.Request) {
6260
ctx, cancel := context.WithTimeout(req.Context(), *probeTimeout)
6361
defer cancel()
6462

65-
conn, err := acquireConnection(ctx, h.metricsManager)
63+
conn, err := connlib.Connect(*csiAddress, h.metricsManager, connlib.WithTimeout(*probeTimeout))
6664
if err != nil {
6765
w.WriteHeader(http.StatusInternalServerError)
6866
w.Write([]byte(err.Error()))
@@ -92,37 +90,6 @@ func (h *healthProbe) checkProbe(w http.ResponseWriter, req *http.Request) {
9290
klog.V(5).InfoS("Health check succeeded")
9391
}
9492

95-
// acquireConnection wraps the connlib.Connect but adding support to context
96-
// cancelation.
97-
func acquireConnection(ctx context.Context, metricsManager metrics.CSIMetricsManager) (conn *grpc.ClientConn, err error) {
98-
99-
var m sync.Mutex
100-
var canceled bool
101-
ready := make(chan bool)
102-
go func() {
103-
conn, err = connlib.Connect(*csiAddress, metricsManager)
104-
105-
m.Lock()
106-
defer m.Unlock()
107-
if err != nil && canceled && conn != nil {
108-
conn.Close()
109-
}
110-
111-
close(ready)
112-
}()
113-
114-
select {
115-
case <-ctx.Done():
116-
m.Lock()
117-
defer m.Unlock()
118-
canceled = true
119-
return nil, ctx.Err()
120-
121-
case <-ready:
122-
return conn, err
123-
}
124-
}
125-
12693
func main() {
12794
fg := featuregate.NewFeatureGate()
12895
logsapi.AddFeatureGates(fg)
@@ -151,10 +118,14 @@ func main() {
151118
}
152119

153120
metricsManager := metrics.NewCSIMetricsManager("" /* driverName */)
154-
csiConn, err := acquireConnection(context.Background(), metricsManager)
121+
// Connect to the CSI driver without any timeout to avoid crashing the probe when the driver is not ready yet.
122+
// Goal: liveness probe never crashes, it only fails the probe when the driver is not available (yet).
123+
// Since a http server for the probe is not running at this point, Kubernetes liveness probe will fail immediately
124+
// with "connection refused", which is good enough to fail the probe.
125+
csiConn, err := connlib.Connect(*csiAddress, metricsManager, connlib.WithTimeout(0))
155126
if err != nil {
156127
// connlib should retry forever so a returned error should mean
157-
// the grpc client is misconfigured rather than an error on the network
128+
// the grpc client is misconfigured rather than an error on the network or CSI driver.
158129
klog.ErrorS(err, "Failed to establish connection to CSI driver")
159130
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
160131
}
@@ -163,6 +134,7 @@ func main() {
163134
csiDriverName, err := rpc.GetDriverName(context.Background(), csiConn)
164135
csiConn.Close()
165136
if err != nil {
137+
// The CSI driver does not support GetDriverName, which is serious enough to crash the probe.
166138
klog.ErrorS(err, "Failed to get CSI driver name")
167139
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
168140
}

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ require (
77
github.com/golang/mock v1.6.0
88
github.com/kubernetes-csi/csi-lib-utils v0.17.0
99
github.com/kubernetes-csi/csi-test/v5 v5.2.0
10-
google.golang.org/grpc v1.60.1
1110
k8s.io/component-base v0.29.0
1211
k8s.io/klog/v2 v2.110.1
1312
)
@@ -46,6 +45,7 @@ require (
4645
golang.org/x/sys v0.14.0 // indirect
4746
golang.org/x/text v0.14.0 // indirect
4847
google.golang.org/genproto/googleapis/rpc v0.0.0-20231106174013-bbf56f31fb17 // indirect
48+
google.golang.org/grpc v1.60.1 // indirect
4949
google.golang.org/protobuf v1.31.0 // indirect
5050
gopkg.in/inf.v0 v0.9.1 // indirect
5151
gopkg.in/yaml.v2 v2.4.0 // indirect

0 commit comments

Comments
 (0)