@@ -21,6 +21,7 @@ import (
21
21
"errors"
22
22
"net/http"
23
23
"os/exec"
24
+ "strconv"
24
25
"strings"
25
26
"time"
26
27
@@ -33,7 +34,7 @@ import (
33
34
type healthChecker struct {
34
35
component string
35
36
enableRepair bool
36
- healthCheckFunc func () bool
37
+ healthCheckFunc func () ( bool , error )
37
38
// The repair is "best-effort" and ignores the error from the underlying actions.
38
39
// The bash commands to kill the process will fail if the service is down, and hence those errors are ignored.
39
40
repairFunc func ()
@@ -102,41 +103,37 @@ func getRepairFunc(hco *options.HealthCheckerOptions) func() {
102
103
}
103
104
104
105
// getHealthCheckFunc returns the health check function based on the component.
105
- func getHealthCheckFunc (hco * options.HealthCheckerOptions ) func () bool {
106
+ func getHealthCheckFunc (hco * options.HealthCheckerOptions ) func () ( bool , error ) {
106
107
switch hco .Component {
107
108
case types .KubeletComponent :
108
- return func () bool {
109
- httpClient := http.Client {Timeout : hco .HealthCheckTimeout }
110
- response , err := httpClient .Get (types .KubeletHealthCheckEndpoint )
111
- if err != nil || response .StatusCode != http .StatusOK {
112
- return false
113
- }
114
- return true
115
- }
109
+ return getKubeletHealthCheckFunc (hco .HealthCheckTimeout )
116
110
case types .DockerComponent :
117
- return func () bool {
111
+ return func () ( bool , error ) {
118
112
if _ , err := execCommand (hco .HealthCheckTimeout , "docker" , "ps" ); err != nil {
119
- return false
113
+ return false , nil
120
114
}
121
- return true
115
+ return true , nil
122
116
}
123
117
case types .CRIComponent :
124
- return func () bool {
118
+ return func () ( bool , error ) {
125
119
if _ , err := execCommand (hco .HealthCheckTimeout , hco .CriCtlPath , "--runtime-endpoint=" + hco .CriSocketPath , "--image-endpoint=" + hco .CriSocketPath , "pods" ); err != nil {
126
- return false
120
+ return false , nil
127
121
}
128
- return true
122
+ return true , nil
129
123
}
130
124
}
131
125
return nil
132
126
}
133
127
134
128
// CheckHealth checks for the health of the component and tries to repair if enabled.
135
129
// Returns true if healthy, false otherwise.
136
- func (hc * healthChecker ) CheckHealth () bool {
137
- healthy := hc .healthCheckFunc ()
130
+ func (hc * healthChecker ) CheckHealth () (bool , error ) {
131
+ healthy , err := hc .healthCheckFunc ()
132
+ if err != nil {
133
+ return healthy , err
134
+ }
138
135
if healthy {
139
- return true
136
+ return true , nil
140
137
}
141
138
// The service is unhealthy.
142
139
// Attempt repair based on flag.
@@ -152,14 +149,13 @@ func (hc *healthChecker) CheckHealth() bool {
152
149
hc .repairFunc ()
153
150
}
154
151
}
155
- return false
152
+ return false , nil
156
153
}
157
154
158
155
// execCommand executes the bash command and returns the (output, error) from command, error if timeout occurs.
159
156
func execCommand (timeout time.Duration , command string , args ... string ) (string , error ) {
160
157
ctx , cancel := context .WithTimeout (context .Background (), timeout )
161
158
defer cancel ()
162
-
163
159
cmd := exec .CommandContext (ctx , command , args ... )
164
160
out , err := cmd .Output ()
165
161
if err != nil {
@@ -168,3 +164,66 @@ func execCommand(timeout time.Duration, command string, args ...string) (string,
168
164
}
169
165
return strings .TrimSuffix (string (out ), "\n " ), nil
170
166
}
167
+
168
+ // kubeletHttpHealthCheck checks the health api response on kubelet.
169
+ // Returns true for healthy, false otherwise.
170
+ func kubeletHttpHealthCheck (healthCheckTimeout time.Duration ) bool {
171
+ httpClient := http.Client {Timeout : healthCheckTimeout }
172
+ response , err := httpClient .Get (types .KubeletHealthCheckEndpoint )
173
+ if err != nil || response .StatusCode != http .StatusOK {
174
+ glog .Info ("kubelet failed http health check" )
175
+ return false
176
+ }
177
+ return true
178
+ }
179
+
180
+ // kubeletConnectionHealthCheck checks for the kubelet-apiserver connection issue
181
+ // by checking repeated occurrences of log "use of closed network connection" in kubelet logs.
182
+ // Returns true if the pattern does not exist 10 times since start of service or the last 10 min, false otherwise.
183
+ func kubeletConnectionHealthCheck () (bool , error ) {
184
+ kubeletUptimeFunc := getUptimeFunc (types .KubeletComponent )
185
+ uptime , err := kubeletUptimeFunc ()
186
+ if err != nil {
187
+ return true , err
188
+ }
189
+ logStartTime := time .Now ().Add (- uptime ).Format (types .LogParsingTimeLayout )
190
+ if err != nil {
191
+ return true , err
192
+ }
193
+ out , err := execCommand (types .CmdTimeout , "/bin/sh" , "-c" ,
194
+ // Query kubelet logs since the logStartTime
195
+ `journalctl --unit kubelet --since "` + logStartTime +
196
+ // Grep the pattern for lost connection
197
+ `" | grep -i "` + types .KubeletClosedConnectionLogPattern +
198
+ // Get the count of occurrences
199
+ `" | wc -l` )
200
+ if err != nil {
201
+ return true , err
202
+ }
203
+ occurrences , err := strconv .Atoi (out )
204
+ if err != nil {
205
+ return true , err
206
+ }
207
+ if occurrences >= types .KubeletClosedConnectionLogPatternThresholdCount {
208
+ glog .Infof ("kubelet failed apiserver connection check, log pattern occurrences: %v" , occurrences )
209
+ return false , nil
210
+ }
211
+ return true , nil
212
+ }
213
+
214
+ // getKubeletHealthCheckFunc returns a function that checks for kubelet health and
215
+ // return false if identified as unhealthy, true otherwise.
216
+ func getKubeletHealthCheckFunc (healthCheckTimeout time.Duration ) func () (bool , error ) {
217
+ return func () (bool , error ) {
218
+ httpHealthy := kubeletHttpHealthCheck (healthCheckTimeout )
219
+ connectionHealthy , err := kubeletConnectionHealthCheck ()
220
+ // The plugin will return Unknown status code in case there is any error in
221
+ // checking kubelet health.
222
+ if err != nil {
223
+ glog .Infof ("Error in determining apiserver connection health: %v" , err )
224
+ return false , err
225
+ }
226
+ healthy := httpHealthy && connectionHealthy
227
+ return healthy , nil
228
+ }
229
+ }
0 commit comments