Skip to content

Commit 2513756

Browse files
committed
Add kubelet apiserver connection fail check in health checker
1 parent f42281e commit 2513756

File tree

4 files changed

+114
-41
lines changed

4 files changed

+114
-41
lines changed

cmd/healthchecker/health_checker.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,12 @@ func main() {
4949
fmt.Println(err)
5050
os.Exit(int(types.Unknown))
5151
}
52-
if !hc.CheckHealth() {
52+
healthy, err := hc.CheckHealth()
53+
if err != nil {
54+
fmt.Printf("error checking %v health: %v\n", hco.Component, err)
55+
os.Exit(int(types.Unknown))
56+
}
57+
if !healthy {
5358
fmt.Printf("%v:%v was found unhealthy; repair flag : %v\n", hco.Component, hco.SystemdService, hco.EnableRepair)
5459
os.Exit(int(types.NonOK))
5560
}

pkg/healthchecker/health_checker.go

Lines changed: 80 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"errors"
2222
"net/http"
2323
"os/exec"
24+
"strconv"
2425
"strings"
2526
"time"
2627

@@ -33,7 +34,7 @@ import (
3334
type healthChecker struct {
3435
component string
3536
enableRepair bool
36-
healthCheckFunc func() bool
37+
healthCheckFunc func() (bool, error)
3738
// The repair is "best-effort" and ignores the error from the underlying actions.
3839
// The bash commands to kill the process will fail if the service is down, and hence their errors are ignored.
3940
repairFunc func()
@@ -102,41 +103,37 @@ func getRepairFunc(hco *options.HealthCheckerOptions) func() {
102103
}
103104

104105
// getHealthCheckFunc returns the health check function based on the component.
105-
func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() bool {
106+
func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() (bool, error) {
106107
switch hco.Component {
107108
case types.KubeletComponent:
108-
return func() bool {
109-
httpClient := http.Client{Timeout: hco.HealthCheckTimeout}
110-
response, err := httpClient.Get(types.KubeletHealthCheckEndpoint)
111-
if err != nil || response.StatusCode != http.StatusOK {
112-
return false
113-
}
114-
return true
115-
}
109+
return getKubeletHealthCheckFunc(hco.HealthCheckTimeout)
116110
case types.DockerComponent:
117-
return func() bool {
111+
return func() (bool, error) {
118112
if _, err := execCommand(hco.HealthCheckTimeout, "docker", "ps"); err != nil {
119-
return false
113+
return false, nil
120114
}
121-
return true
115+
return true, nil
122116
}
123117
case types.CRIComponent:
124-
return func() bool {
118+
return func() (bool, error) {
125119
if _, err := execCommand(hco.HealthCheckTimeout, hco.CriCtlPath, "--runtime-endpoint="+hco.CriSocketPath, "--image-endpoint="+hco.CriSocketPath, "pods"); err != nil {
126-
return false
120+
return false, nil
127121
}
128-
return true
122+
return true, nil
129123
}
130124
}
131125
return nil
132126
}
133127

134128
// CheckHealth checks for the health of the component and tries to repair if enabled.
135129
// Returns true if healthy, false otherwise; a non-nil error is returned when the health check itself fails.
136-
func (hc *healthChecker) CheckHealth() bool {
137-
healthy := hc.healthCheckFunc()
130+
func (hc *healthChecker) CheckHealth() (bool, error) {
131+
healthy, err := hc.healthCheckFunc()
132+
if err != nil {
133+
return healthy, err
134+
}
138135
if healthy {
139-
return true
136+
return true, nil
140137
}
141138
// The service is unhealthy.
142139
// Attempt repair based on flag.
@@ -152,14 +149,13 @@ func (hc *healthChecker) CheckHealth() bool {
152149
hc.repairFunc()
153150
}
154151
}
155-
return false
152+
return false, nil
156153
}
157154

158155
// execCommand executes the given command and returns its (output, error); an error is also returned if the timeout occurs.
159156
func execCommand(timeout time.Duration, command string, args ...string) (string, error) {
160157
ctx, cancel := context.WithTimeout(context.Background(), timeout)
161158
defer cancel()
162-
163159
cmd := exec.CommandContext(ctx, command, args...)
164160
out, err := cmd.Output()
165161
if err != nil {
@@ -168,3 +164,66 @@ func execCommand(timeout time.Duration, command string, args ...string) (string,
168164
}
169165
return strings.TrimSuffix(string(out), "\n"), nil
170166
}
167+
168+
// kubeletHttpHealthCheck checks the health api response on kubelet.
169+
// Returns true for healthy, false otherwise.
170+
func kubeletHttpHealthCheck(healthCheckTimeout time.Duration) bool {
171+
httpClient := http.Client{Timeout: healthCheckTimeout}
172+
response, err := httpClient.Get(types.KubeletHealthCheckEndpoint)
173+
if err != nil || response.StatusCode != http.StatusOK {
174+
glog.Info("kubelet failed http health check")
175+
return false
176+
}
177+
return true
178+
}
179+
180+
// kubeletConnectionHealthCheck checks for the kubelet-apiserver connection issue
181+
// by checking repeated occurrences of log "use of closed network connection" in kubelet logs.
182+
// Returns true if the pattern does not exist 10 times since start of service or the last 10 min, false otherwise.
183+
func kubeletConnectionHealthCheck() (bool, error) {
184+
kubeletUptimeFunc := getUptimeFunc(types.KubeletComponent)
185+
uptime, err := kubeletUptimeFunc()
186+
if err != nil {
187+
return true, err
188+
}
189+
logStartTime := time.Now().Add(-uptime).Format(types.LogParsingTimeLayout)
190+
if err != nil {
191+
return true, err
192+
}
193+
out, err := execCommand(types.CmdTimeout, "/bin/sh", "-c",
194+
// Query kubelet logs since the logStartTime
195+
`journalctl --unit kubelet --since "`+logStartTime+
196+
// Grep the pattern for lost connection
197+
`" | grep -i "`+types.KubeletClosedConnectionLogPattern+
198+
// Get the count of occurrences
199+
`" | wc -l`)
200+
if err != nil {
201+
return true, err
202+
}
203+
occurrences, err := strconv.Atoi(out)
204+
if err != nil {
205+
return true, err
206+
}
207+
if occurrences >= types.KubeletClosedConnectionLogPatternThresholdCount {
208+
glog.Infof("kubelet failed apiserver connection check, log pattern occurrences: %v", occurrences)
209+
return false, nil
210+
}
211+
return true, nil
212+
}
213+
214+
// getKubeletHealthCheckFunc returns a function that checks for kubelet health and
215+
// returns false if identified as unhealthy, true otherwise.
216+
func getKubeletHealthCheckFunc(healthCheckTimeout time.Duration) func() (bool, error) {
217+
return func() (bool, error) {
218+
httpHealthy := kubeletHttpHealthCheck(healthCheckTimeout)
219+
connectionHealthy, err := kubeletConnectionHealthCheck()
220+
// The plugin will return Unknown status code in case there is any error in
221+
// checking kubelet health.
222+
if err != nil {
223+
glog.Infof("Error in determining apiserver connection health: %v", err)
224+
return false, err
225+
}
226+
healthy := httpHealthy && connectionHealthy
227+
return healthy, nil
228+
}
229+
}

pkg/healthchecker/health_checker_test.go

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ import (
2525

2626
var repairCalled bool
2727

28-
func NewTestHealthChecker(repairFunc func(), healthCheckFunc func() bool, uptimeFunc func() (time.Duration, error), enableRepair bool) types.HealthChecker {
28+
func NewTestHealthChecker(repairFunc func(), healthCheckFunc func() (bool, error), uptimeFunc func() (time.Duration, error), enableRepair bool) types.HealthChecker {
2929
repairCalled = false
3030
return &healthChecker{
3131
enableRepair: enableRepair,
@@ -37,12 +37,12 @@ func NewTestHealthChecker(repairFunc func(), healthCheckFunc func() bool, uptime
3737
}
3838
}
3939

40-
func healthyFunc() bool {
41-
return true
40+
func healthyFunc() (bool, error) {
41+
return true, nil
4242
}
4343

44-
func unhealthyFunc() bool {
45-
return false
44+
func unhealthyFunc() (bool, error) {
45+
return false, nil
4646
}
4747

4848
func repairFunc() {
@@ -62,7 +62,7 @@ func TestHealthCheck(t *testing.T) {
6262
description string
6363
enableRepair bool
6464
healthy bool
65-
healthCheckFunc func() bool
65+
healthCheckFunc func() (bool, error)
6666
uptimeFunc func() (time.Duration, error)
6767
repairFunc func()
6868
repairCalled bool
@@ -106,7 +106,10 @@ func TestHealthCheck(t *testing.T) {
106106
} {
107107
t.Run(tc.description, func(t *testing.T) {
108108
hc := NewTestHealthChecker(tc.repairFunc, tc.healthCheckFunc, tc.uptimeFunc, tc.enableRepair)
109-
healthy := hc.CheckHealth()
109+
healthy, err := hc.CheckHealth()
110+
if err != nil {
111+
t.Errorf("unexpected error occurred got %v; expected nil", err)
112+
}
110113
if healthy != tc.healthy {
111114
t.Errorf("incorrect health returned got %t; expected %t", healthy, tc.healthy)
112115
}

pkg/healthchecker/types/types.go

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -19,19 +19,25 @@ package types
1919
import "time"
2020

2121
const (
22-
DefaultCoolDownTime = 2 * time.Minute
23-
DefaultHealthCheckTimeout = 10 * time.Second
24-
CmdTimeout = 10 * time.Second
25-
DefaultCriCtl = "/usr/bin/crictl"
26-
DefaultCriSocketPath = "unix:///var/run/containerd/containerd.sock"
27-
KubeletComponent = "kubelet"
28-
CRIComponent = "cri"
29-
DockerComponent = "docker"
30-
ContainerdService = "containerd"
31-
KubeletHealthCheckEndpoint = "http://127.0.0.1:10248/healthz"
32-
UptimeTimeLayout = "Mon 2006-01-02 15:04:05 UTC"
22+
DefaultCoolDownTime = 2 * time.Minute
23+
DefaultHealthCheckTimeout = 10 * time.Second
24+
CmdTimeout = 10 * time.Second
25+
UptimeTimeLayout = "Mon 2006-01-02 15:04:05 UTC"
26+
LogParsingTimeLayout = "2006-01-02 15:04:05"
27+
28+
DefaultCriCtl = "/usr/bin/crictl"
29+
DefaultCriSocketPath = "unix:///var/run/containerd/containerd.sock"
30+
31+
KubeletComponent = "kubelet"
32+
CRIComponent = "cri"
33+
DockerComponent = "docker"
34+
ContainerdService = "containerd"
35+
36+
KubeletHealthCheckEndpoint = "http://127.0.0.1:10248/healthz"
37+
KubeletClosedConnectionLogPattern = "use of closed network connection"
38+
KubeletClosedConnectionLogPatternThresholdCount = 10
3339
)
3440

3541
type HealthChecker interface {
36-
CheckHealth() bool
42+
CheckHealth() (bool, error)
3743
}

0 commit comments

Comments
 (0)