Skip to content

Commit 44dc4aa

Browse files
committed
Add health-check-monitor
1 parent 1d03b66 commit 44dc4aa

File tree

9 files changed

+630
-2
lines changed

9 files changed

+630
-2
lines changed

Makefile

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,14 @@ endif
113113
-tags "$(BUILD_TAGS)" \
114114
./test/e2e/problemmaker/problem_maker.go
115115

116+
# Build the health-checker binary into bin/health-checker from
# cmd/healthchecker/health_checker.go (linux target, vendored modules),
# stamping $(VERSION) into $(PKG)/pkg/version.version via -ldflags.
./bin/health-checker: $(PKG_SOURCES)
117+
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \
118+
-mod vendor \
119+
-o bin/health-checker \
120+
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
121+
-tags "$(BUILD_TAGS)" \
122+
cmd/healthchecker/health_checker.go
123+
116124
Dockerfile: Dockerfile.in
117125
sed -e 's|@BASEIMAGE@|$(BASEIMAGE)|g' $< >$@
118126
ifneq ($(ENABLE_JOURNALD), 1)
@@ -134,12 +142,12 @@ e2e-test: vet fmt build-tar
134142
-boskos-project-type=$(BOSKOS_PROJECT_TYPE) -job-name=$(JOB_NAME) \
135143
-artifacts-dir=$(ARTIFACTS)
136144

137-
build-binaries: ./bin/node-problem-detector ./bin/log-counter
145+
build-binaries: ./bin/node-problem-detector ./bin/log-counter ./bin/health-checker
138146

139147
build-container: build-binaries Dockerfile
140148
docker build -t $(IMAGE) .
141149

142-
build-tar: ./bin/node-problem-detector ./bin/log-counter ./test/bin/problem-maker
150+
build-tar: ./bin/node-problem-detector ./bin/log-counter ./bin/health-checker ./test/bin/problem-maker
143151
tar -zcvf $(TARBALL) bin/ config/ test/e2e-install.sh test/bin/problem-maker
144152
sha1sum $(TARBALL)
145153
md5sum $(TARBALL)
@@ -164,6 +172,7 @@ push-tar: build-tar
164172
push: push-container push-tar
165173

166174
clean:
175+
rm -f bin/health-checker
167176
rm -f bin/log-counter
168177
rm -f bin/node-problem-detector
169178
rm -f test/bin/problem-maker

cmd/healthchecker/health_checker.go

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
/*
2+
Copyright 2020 The Kubernetes Authors All rights reserved.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package main
18+
19+
import (
20+
"flag"
21+
"fmt"
22+
"os"
23+
24+
"github.com/spf13/pflag"
25+
26+
"k8s.io/node-problem-detector/cmd/healthchecker/options"
27+
"k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
28+
"k8s.io/node-problem-detector/pkg/healthchecker"
29+
)
30+
31+
func main() {
32+
// Set glog flag so that it does not log to files.
33+
if err := flag.Set("logtostderr", "true"); err != nil {
34+
fmt.Printf("Failed to set logtostderr=true: %v", err)
35+
os.Exit(int(types.Unknown))
36+
}
37+
38+
hco := options.NewHealthCheckerOptions()
39+
hco.AddFlags(pflag.CommandLine)
40+
pflag.Parse()
41+
hco.SetDefaults()
42+
if err := hco.IsValid(); err != nil {
43+
fmt.Println(err)
44+
os.Exit(int(types.Unknown))
45+
}
46+
47+
hc, err := healthchecker.NewHealthChecker(hco)
48+
if err != nil {
49+
fmt.Println(err)
50+
os.Exit(int(types.Unknown))
51+
}
52+
if !hc.CheckHealth() {
53+
fmt.Printf("%v:%v was found unhealthy; repair flag : %v\n", hco.Component, hco.SystemdService, hco.EnableRepair)
54+
os.Exit(int(types.NonOK))
55+
}
56+
os.Exit(int(types.OK))
57+
}

cmd/healthchecker/options/options.go

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
/*
2+
Copyright 2020 The Kubernetes Authors All rights reserved.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package options
18+
19+
import (
20+
"flag"
21+
"fmt"
22+
"time"
23+
24+
"github.com/spf13/pflag"
25+
26+
"k8s.io/node-problem-detector/pkg/healthchecker/types"
27+
)
28+
29+
// NewHealthCheckerOptions returns an empty health check options struct.
30+
func NewHealthCheckerOptions() *HealthCheckerOptions {
31+
return &HealthCheckerOptions{}
32+
}
33+
34+
// HealthCheckerOptions holds the command line configuration of the health
// checker. Fields are bound to flags by AddFlags.
type HealthCheckerOptions struct {
	// Component is the component to check health for (--component).
	Component string
	// SystemdService is the underlying systemd service responsible for the
	// component (--systemd-service).
	SystemdService string
	// EnableRepair enables or disables the repair attempt for the component
	// (--enable-repair).
	EnableRepair bool

	// CriCtlPath is the path to the crictl binary, used to check the health
	// of the cri component (--crictl-path).
	CriCtlPath string
	// CriSocketPath is the cri socket path handed to crictl
	// (--cri-socket-path).
	CriSocketPath string

	// CoolDownTime is the duration to wait for the service to be up before
	// attempting repair (--cooldown-time).
	CoolDownTime time.Duration
	// HealthCheckTimeout is the time to wait before marking the component as
	// unhealthy (--health-check-timeout).
	HealthCheckTimeout time.Duration
}
44+
45+
// AddFlags adds health checker command line options to pflag.
46+
func (hco *HealthCheckerOptions) AddFlags(fs *pflag.FlagSet) {
47+
fs.StringVar(&hco.Component, "component", types.KubeletComponent,
48+
"The component to check health for. Supports kubelet, docker and cri")
49+
fs.StringVar(&hco.SystemdService, "systemd-service", "",
50+
"The underlying systemd service responsible for the component. Set to the corresponding component for docker and kubelet, containerd for cri.")
51+
fs.BoolVar(&hco.EnableRepair, "enable-repair", true, "Flag to enable/disable repair attempt for the component.")
52+
fs.StringVar(&hco.CriCtlPath, "crictl-path", types.DefaultCriCtl,
53+
"The path to the crictl binary. This is used to check health of cri component.")
54+
fs.StringVar(&hco.CriSocketPath, "cri-socket-path", types.DefaultCriSocketPath,
55+
"The path to the cri socket. Used with crictl to specify the socket path.")
56+
fs.DurationVar(&hco.CoolDownTime, "cooldown-time", types.DefaultCoolDownTime,
57+
"The duration to wait for the service to be up before attempting repair.")
58+
fs.DurationVar(&hco.HealthCheckTimeout, "health-check-timeout", types.DefaultHealthCheckTimeout,
59+
"The time to wait before marking the component as unhealthy.")
60+
}
61+
62+
// IsValid validates health checker command line options.
63+
// Returns error if invalid, nil otherwise.
64+
func (hco *HealthCheckerOptions) IsValid() error {
65+
// Make sure the component specified is valid.
66+
if hco.Component != types.KubeletComponent && hco.Component != types.DockerComponent && hco.Component != types.CRIComponent {
67+
return fmt.Errorf("the component specified is not supported. Supported components are : <kubelet/docker/cri>")
68+
}
69+
// Make sure the systemd service is specified if repair is enabled.
70+
if hco.EnableRepair && hco.SystemdService == "" {
71+
return fmt.Errorf("systemd-service cannot be empty when repair is enabled")
72+
}
73+
// Skip checking further if the component is not cri.
74+
if hco.Component != types.CRIComponent {
75+
return nil
76+
}
77+
// Make sure the crictl path is not empty for cri component.
78+
if hco.Component == types.CRIComponent && hco.CriCtlPath == "" {
79+
return fmt.Errorf("the crictl-path cannot be empty for cri component")
80+
}
81+
// Make sure the cri socker path is not empty for cri component.
82+
if hco.Component == types.CRIComponent && hco.CriSocketPath == "" {
83+
return fmt.Errorf("the cri-socket-path cannot be empty for cri component")
84+
}
85+
return nil
86+
}
87+
88+
// SetDefaults sets the defaults values for the dependent flags.
89+
func (hco *HealthCheckerOptions) SetDefaults() {
90+
if hco.SystemdService != "" {
91+
return
92+
}
93+
if hco.Component != types.CRIComponent {
94+
hco.SystemdService = hco.Component
95+
return
96+
}
97+
hco.SystemdService = types.ContainerdService
98+
}
99+
100+
func init() {
101+
pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
102+
}
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
/*
2+
Copyright 2020 The Kubernetes Authors All rights reserved.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package options
18+
19+
import (
20+
"testing"
21+
22+
"github.com/stretchr/testify/assert"
23+
24+
"k8s.io/node-problem-detector/pkg/healthchecker/types"
25+
)
26+
27+
func TestIsValid(t *testing.T) {
28+
testCases := []struct {
29+
name string
30+
hco HealthCheckerOptions
31+
expectError bool
32+
}{
33+
{
34+
name: "valid component",
35+
hco: HealthCheckerOptions{
36+
Component: types.KubeletComponent,
37+
},
38+
expectError: false,
39+
},
40+
{
41+
name: "invalid component",
42+
hco: HealthCheckerOptions{
43+
Component: "wrongComponent",
44+
},
45+
expectError: true,
46+
},
47+
{
48+
name: "empty crictl-path with cri",
49+
hco: HealthCheckerOptions{
50+
Component: types.CRIComponent,
51+
CriCtlPath: "",
52+
EnableRepair: false,
53+
},
54+
expectError: true,
55+
},
56+
{
57+
name: "empty systemd-service and repair enabled",
58+
hco: HealthCheckerOptions{
59+
Component: types.KubeletComponent,
60+
EnableRepair: true,
61+
SystemdService: "",
62+
},
63+
expectError: true,
64+
},
65+
}
66+
67+
for _, test := range testCases {
68+
t.Run(test.name, func(t *testing.T) {
69+
if test.expectError {
70+
assert.Error(t, test.hco.IsValid(), "HealthChecker option %+v is invalid. Expected IsValid to return error.", test.hco)
71+
} else {
72+
assert.NoError(t, test.hco.IsValid(), "HealthChecker option %+v is valid. Expected IsValid to return nil.", test.hco)
73+
}
74+
})
75+
}
76+
}

config/health-checker-docker.json

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
{
2+
"plugin": "custom",
3+
"pluginConfig": {
4+
"invoke_interval": "10s",
5+
"timeout": "3m",
6+
"max_output_length": 80,
7+
"concurrency": 1
8+
},
9+
"source": "health-checker",
10+
"metricsReporting": true,
11+
"conditions": [
12+
{
13+
"type": "ContainerRuntimeUnhealthy",
14+
"reason": "ContainerRuntimeIsHealthy",
15+
"message": "Container runtime on the node is functioning properly"
16+
}
17+
],
18+
"rules": [
19+
{
20+
"type": "permanent",
21+
"condition": "ContainerRuntimeUnhealthy",
22+
"reason": "DockerUnhealthy",
23+
"path": "/home/kubernetes/bin/health-checker",
24+
"args": [
25+
"--component=docker",
26+
"--enable-repair=false",
27+
"--cooldown-time=2m",
28+
"--health-check-timeout=60s"
29+
],
30+
"timeout": "3m"
31+
}
32+
]
33+
}

config/health-checker-kubelet.json

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
{
2+
"plugin": "custom",
3+
"pluginConfig": {
4+
"invoke_interval": "10s",
5+
"timeout": "3m",
6+
"max_output_length": 80,
7+
"concurrency": 1
8+
},
9+
"source": "health-checker",
10+
"metricsReporting": true,
11+
"conditions": [
12+
{
13+
"type": "KubeletUnhealthy",
14+
"reason": "KubeletIsHealthy",
15+
"message": "kubelet on the node is functioning properly"
16+
}
17+
],
18+
"rules": [
19+
{
20+
"type": "permanent",
21+
"condition": "KubeletUnhealthy",
22+
"reason": "KubeletUnhealthy",
23+
"path": "/home/kubernetes/bin/health-checker",
24+
"args": [
25+
"--component=kubelet",
26+
"--enable-repair=false",
27+
"--cooldown-time=1m",
28+
"--health-check-timeout=10s"
29+
],
30+
"timeout": "3m"
31+
}
32+
]
33+
}

0 commit comments

Comments
 (0)