Skip to content

Commit c960b00

Browse files
Merge pull request #262 from slintes/health-server-timeout
Fix health server timeout
2 parents 6ca8ab5 + 3214394 commit c960b00

File tree

3 files changed

+113
-9
lines changed

3 files changed

+113
-9
lines changed

pkg/peerhealth/client_server_test.go

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package peerhealth
22

33
import (
44
"context"
5+
"fmt"
56
"time"
67

78
. "github.com/onsi/ginkgo/v2"
@@ -78,7 +79,7 @@ var _ = Describe("Checking health using grpc client and server", func() {
7879

7980
By("calling isHealthy")
8081
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
81-
defer (cancel)()
82+
defer cancel()
8283
resp, err := phClient.IsHealthy(ctx, &HealthRequest{
8384
NodeName: nodeName,
8485
})
@@ -110,7 +111,7 @@ var _ = Describe("Checking health using grpc client and server", func() {
110111

111112
By("calling isHealthy")
112113
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
113-
defer (cancel)()
114+
defer cancel()
114115
Eventually(func() bool {
115116
resp, err := phClient.IsHealthy(ctx, &HealthRequest{
116117
NodeName: nodeName,
@@ -122,4 +123,60 @@ var _ = Describe("Checking health using grpc client and server", func() {
122123

123124
})
124125

126+
Describe("with a peer running into API timeout", func() {
127+
128+
var (
129+
apiCallDelay = 7 * time.Second
130+
peerRequestTimeout = 5 * time.Second // must be lower than apiCallDelay for this test!
131+
)
132+
133+
BeforeEach(func() {
134+
reader.delay = &apiCallDelay
135+
})
136+
137+
AfterEach(func() {
138+
reader.delay = nil
139+
})
140+
141+
It("should return API error", func() {
142+
By("calling isHealthy")
143+
// The health server code has a hardcoded timeout of 3s for the API call!
144+
// When we add a delay of > 3s to the API call, that 3s timeout needs to be respected,
145+
// to not exceed the peerRequestTimeout (5s) of the client.
146+
ctx, cancel := context.WithTimeout(context.Background(), peerRequestTimeout)
147+
defer cancel()
148+
resp, err := phClient.IsHealthy(ctx, &HealthRequest{
149+
NodeName: nodeName,
150+
})
151+
152+
// wait for having more logs from async server code
153+
time.Sleep(10 * time.Second)
154+
155+
Expect(err).ToNot(HaveOccurred())
156+
Expect(api.HealthCheckResponseCode(resp.Status)).To(Equal(api.ApiError))
157+
})
158+
159+
})
160+
161+
Describe("with a peer running into API error", func() {
162+
163+
BeforeEach(func() {
164+
reader.err = fmt.Errorf("some API error")
165+
})
166+
167+
AfterEach(func() {
168+
reader.err = nil
169+
})
170+
171+
It("should return API error", func() {
172+
By("calling isHealthy")
173+
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
174+
defer cancel()
175+
resp, err := phClient.IsHealthy(ctx, &HealthRequest{
176+
NodeName: nodeName,
177+
})
178+
Expect(err).ToNot(HaveOccurred())
179+
Expect(api.HealthCheckResponseCode(resp.Status)).To(Equal(api.ApiError))
180+
})
181+
})
125182
})

pkg/peerhealth/server.go

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -112,11 +112,9 @@ func (s *Server) IsHealthy(ctx context.Context, request *HealthRequest) (*Health
112112
apiCtx, cancelFunc := context.WithTimeout(ctx, apiServerTimeout)
113113
defer cancelFunc()
114114

115-
// list snrs from all ns
116-
// don't use cache, because this also tests API server connectivity!
117115
snrs := &v1alpha1.SelfNodeRemediationList{}
118-
if err := s.reader.List(apiCtx, snrs); err != nil {
119-
s.log.Error(err, "api error, failed to list snrs")
116+
if err := s.listWithTimeoutHandling(apiCtx, snrs); err != nil {
117+
s.log.Info("failed to list SelfNodeRemediations, returning ApiError", "cause", err.Error())
120118
return toResponse(selfNodeRemediationApis.ApiError)
121119
}
122120

@@ -136,6 +134,28 @@ func (s *Server) IsHealthy(ctx context.Context, request *HealthRequest) (*Health
136134
return toResponse(selfNodeRemediationApis.Healthy)
137135
}
138136

137+
// listWithTimeoutHandling wraps a reader list method with additional context timeout handling.
138+
// This is needed because we observed cases when the client code had some delay somewhere, which exceeded the
139+
// configured timeout, and even the timeout on the caller side, causing unneeded fencing actions.
140+
func (s *Server) listWithTimeoutHandling(apiCtx context.Context, snrs *v1alpha1.SelfNodeRemediationList) error {
141+
// run the list call async
142+
var listErr error
143+
listDone := make(chan struct{})
144+
go func() {
145+
// don't use cached client but the reader, because this also tests API server connectivity!
146+
listErr = s.reader.List(apiCtx, snrs)
147+
close(listDone)
148+
}()
149+
150+
// wait until context expired or list call returned
151+
select {
152+
case <-apiCtx.Done():
153+
return fmt.Errorf("timed out")
154+
case <-listDone:
155+
return listErr
156+
}
157+
}
158+
139159
func (s *Server) getNode(ctx context.Context, nodeName string) (*corev1.Node, error) {
140160
apiCtx, cancelFunc := context.WithTimeout(ctx, apiServerTimeout)
141161
defer cancelFunc()

pkg/peerhealth/suite_test.go

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package peerhealth
33
import (
44
"context"
55
"path/filepath"
6+
"sync"
67
"testing"
78
"time"
89

@@ -32,7 +33,7 @@ const nodeName = "somenode"
3233

3334
var cfg *rest.Config
3435
var k8sClient client.Client
35-
var reader client.Reader
36+
var reader *ReaderWrapper
3637
var testEnv *envtest.Environment
3738
var snrReconciler *controllers.SelfNodeRemediationReconciler
3839
var cancelFunc context.CancelFunc
@@ -73,8 +74,12 @@ var _ = BeforeSuite(func() {
7374
k8sClient = k8sManager.GetClient()
7475
Expect(k8sClient).ToNot(BeNil())
7576

76-
reader = k8sManager.GetAPIReader()
77-
Expect(reader).ToNot(BeNil())
77+
originalReader := k8sManager.GetAPIReader()
78+
Expect(originalReader).ToNot(BeNil())
79+
80+
reader = &ReaderWrapper{
81+
Reader: originalReader,
82+
}
7883

7984
// we need a reconciler for getting last SNR namespace
8085
snrReconciler = &controllers.SelfNodeRemediationReconciler{
@@ -101,3 +106,25 @@ var _ = AfterSuite(func() {
101106
cancelFunc()
102107
Expect(testEnv.Stop()).To(Succeed())
103108
})
109+
110+
type ReaderWrapper struct {
111+
client.Reader
112+
mu sync.RWMutex
113+
delay *time.Duration
114+
err error
115+
}
116+
117+
func (r *ReaderWrapper) List(ctx context.Context, list client.ObjectList, opts ...client.ListOption) error {
118+
r.mu.RLock()
119+
d, e := r.delay, r.err
120+
r.mu.RUnlock()
121+
122+
if d != nil {
123+
time.Sleep(*d)
124+
}
125+
if e != nil {
126+
return e
127+
}
128+
129+
return r.Reader.List(ctx, list, opts...)
130+
}

0 commit comments

Comments
 (0)