Skip to content

Commit 3214394

Browse files
committed
Fix health server timeout
We observed cases when the k8s client code had some delay somewhere, which exceeded the configured k8s client timeout, and even the health client timeout on the caller side, causing unneeded fencing. To fix this, the health server monitors the context itself now, and new unit tests cover this usecase. Signed-off-by: Marc Sluiter <[email protected]>
1 parent 6ca8ab5 commit 3214394

File tree

3 files changed

+113
-9
lines changed

3 files changed

+113
-9
lines changed

pkg/peerhealth/client_server_test.go

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package peerhealth
22

33
import (
44
"context"
5+
"fmt"
56
"time"
67

78
. "github.com/onsi/ginkgo/v2"
@@ -78,7 +79,7 @@ var _ = Describe("Checking health using grpc client and server", func() {
7879

7980
By("calling isHealthy")
8081
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
81-
defer (cancel)()
82+
defer cancel()
8283
resp, err := phClient.IsHealthy(ctx, &HealthRequest{
8384
NodeName: nodeName,
8485
})
@@ -110,7 +111,7 @@ var _ = Describe("Checking health using grpc client and server", func() {
110111

111112
By("calling isHealthy")
112113
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
113-
defer (cancel)()
114+
defer cancel()
114115
Eventually(func() bool {
115116
resp, err := phClient.IsHealthy(ctx, &HealthRequest{
116117
NodeName: nodeName,
@@ -122,4 +123,60 @@ var _ = Describe("Checking health using grpc client and server", func() {
122123

123124
})
124125

126+
Describe("with a peer running into API timeout", func() {
127+
128+
var (
129+
apiCallDelay = 7 * time.Second
130+
peerRequestTimeout = 5 * time.Second // must be lower than apiCallDelay for this test!
131+
)
132+
133+
BeforeEach(func() {
134+
reader.delay = &apiCallDelay
135+
})
136+
137+
AfterEach(func() {
138+
reader.delay = nil
139+
})
140+
141+
It("should return API error", func() {
142+
By("calling isHealthy")
143+
// The health server code has a hardcoded timeout of 3s for the API call!
144+
// When we add a delay of > 3s to the API call, that 3s timeout needs to be respected,
145+
// to not exceed the peerRequestTimeout (5s) of the client.
146+
ctx, cancel := context.WithTimeout(context.Background(), peerRequestTimeout)
147+
defer cancel()
148+
resp, err := phClient.IsHealthy(ctx, &HealthRequest{
149+
NodeName: nodeName,
150+
})
151+
152+
// wait for having more logs from async server code
153+
time.Sleep(10 * time.Second)
154+
155+
Expect(err).ToNot(HaveOccurred())
156+
Expect(api.HealthCheckResponseCode(resp.Status)).To(Equal(api.ApiError))
157+
})
158+
159+
})
160+
161+
Describe("with a peer running into API error", func() {
162+
163+
BeforeEach(func() {
164+
reader.err = fmt.Errorf("some API error")
165+
})
166+
167+
AfterEach(func() {
168+
reader.err = nil
169+
})
170+
171+
It("should return API error", func() {
172+
By("calling isHealthy")
173+
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
174+
defer cancel()
175+
resp, err := phClient.IsHealthy(ctx, &HealthRequest{
176+
NodeName: nodeName,
177+
})
178+
Expect(err).ToNot(HaveOccurred())
179+
Expect(api.HealthCheckResponseCode(resp.Status)).To(Equal(api.ApiError))
180+
})
181+
})
125182
})

pkg/peerhealth/server.go

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -112,11 +112,9 @@ func (s *Server) IsHealthy(ctx context.Context, request *HealthRequest) (*Health
112112
apiCtx, cancelFunc := context.WithTimeout(ctx, apiServerTimeout)
113113
defer cancelFunc()
114114

115-
// list snrs from all ns
116-
// don't use cache, because this also tests API server connectivity!
117115
snrs := &v1alpha1.SelfNodeRemediationList{}
118-
if err := s.reader.List(apiCtx, snrs); err != nil {
119-
s.log.Error(err, "api error, failed to list snrs")
116+
if err := s.listWithTimeoutHandling(apiCtx, snrs); err != nil {
117+
s.log.Info("failed to list SelfNodeRemediations, returning ApiError", "cause", err.Error())
120118
return toResponse(selfNodeRemediationApis.ApiError)
121119
}
122120

@@ -136,6 +134,28 @@ func (s *Server) IsHealthy(ctx context.Context, request *HealthRequest) (*Health
136134
return toResponse(selfNodeRemediationApis.Healthy)
137135
}
138136

137+
// listWithTimeoutHandling wraps a reader list method with additional context timeout handling.
138+
// This is needed because we observed cases when the client code had some delay somewhere, which exceeded the
139+
// configured timeout, and even the timeout on the caller side, causing unneeded fencing actions.
140+
func (s *Server) listWithTimeoutHandling(apiCtx context.Context, snrs *v1alpha1.SelfNodeRemediationList) error {
141+
// run the list call async
142+
var listErr error
143+
listDone := make(chan struct{})
144+
go func() {
145+
// don't use cached client but the reader, because this also tests API server connectivity!
146+
listErr = s.reader.List(apiCtx, snrs)
147+
close(listDone)
148+
}()
149+
150+
// wait until context expired or list call returned
151+
select {
152+
case <-apiCtx.Done():
153+
return fmt.Errorf("timed out")
154+
case <-listDone:
155+
return listErr
156+
}
157+
}
158+
139159
func (s *Server) getNode(ctx context.Context, nodeName string) (*corev1.Node, error) {
140160
apiCtx, cancelFunc := context.WithTimeout(ctx, apiServerTimeout)
141161
defer cancelFunc()

pkg/peerhealth/suite_test.go

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package peerhealth
33
import (
44
"context"
55
"path/filepath"
6+
"sync"
67
"testing"
78
"time"
89

@@ -32,7 +33,7 @@ const nodeName = "somenode"
3233

3334
var cfg *rest.Config
3435
var k8sClient client.Client
35-
var reader client.Reader
36+
var reader *ReaderWrapper
3637
var testEnv *envtest.Environment
3738
var snrReconciler *controllers.SelfNodeRemediationReconciler
3839
var cancelFunc context.CancelFunc
@@ -73,8 +74,12 @@ var _ = BeforeSuite(func() {
7374
k8sClient = k8sManager.GetClient()
7475
Expect(k8sClient).ToNot(BeNil())
7576

76-
reader = k8sManager.GetAPIReader()
77-
Expect(reader).ToNot(BeNil())
77+
originalReader := k8sManager.GetAPIReader()
78+
Expect(originalReader).ToNot(BeNil())
79+
80+
reader = &ReaderWrapper{
81+
Reader: originalReader,
82+
}
7883

7984
// we need a reconciler for getting last SNR namespace
8085
snrReconciler = &controllers.SelfNodeRemediationReconciler{
@@ -101,3 +106,25 @@ var _ = AfterSuite(func() {
101106
cancelFunc()
102107
Expect(testEnv.Stop()).To(Succeed())
103108
})
109+
110+
type ReaderWrapper struct {
111+
client.Reader
112+
mu sync.RWMutex
113+
delay *time.Duration
114+
err error
115+
}
116+
117+
func (r *ReaderWrapper) List(ctx context.Context, list client.ObjectList, opts ...client.ListOption) error {
118+
r.mu.RLock()
119+
d, e := r.delay, r.err
120+
r.mu.RUnlock()
121+
122+
if d != nil {
123+
time.Sleep(*d)
124+
}
125+
if e != nil {
126+
return e
127+
}
128+
129+
return r.Reader.List(ctx, list, opts...)
130+
}

0 commit comments

Comments
 (0)