Skip to content

Commit 8b5b66e

Browse files
authored
Add health-monitor.sh script (#353)
1 parent a1b7b4e commit 8b5b66e

File tree

1 file changed

+112
-0
lines changed

1 file changed

+112
-0
lines changed
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
#!/usr/bin/env bash
2+
3+
# Copyright 2016 The Kubernetes Authors.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
# This script is for master and node instance health monitoring, which is
18+
# packed in kube-manifest tarball. It is executed through a systemd service
19+
# in cluster/gce/gci/<master/node>.yaml. The env variables come from an env
20+
# file provided by the systemd service.
21+
22+
# This script is a slightly adjusted version of
23+
# https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh
24+
# Adjustments are:
25+
# * Kubelet health port is 10248 not 10255
26+
# * Removal of all references to the KUBE_ENV file
27+
28+
set -o nounset
29+
set -o pipefail
30+
31+
# We simply kill the process when there is a failure. Another systemd service will
32+
# automatically restart the process.
33+
#######################################
# Monitors the container runtime and kills it through systemd whenever a
# health check fails. The restart itself is not performed here.
# Globals:
#   KUBE_HOME (read)              - crictl is looked up at ${KUBE_HOME}/bin/crictl.
#   CONTAINER_RUNTIME (read)      - defaults to "docker"; any other value
#                                   switches the health check to "crictl pods".
#   CONTAINER_RUNTIME_NAME (read) - defaults to "docker"; name of the systemd
#                                   unit that gets killed on failure.
#   SLEEP_SECONDS (read)          - pause between successful health checks.
# Arguments: None
# Outputs:   Progress and failure messages on stdout.
# Returns:   Never returns; the monitoring loop runs forever.
#######################################
function container_runtime_monitoring {
  local -r max_attempts=5
  local attempt=1
  local delay=0
  local -r crictl="${KUBE_HOME}/bin/crictl"
  local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}"
  # We still need to use 'docker ps' when container runtime is "docker". This is because
  # dockershim is still part of kubelet today. When kubelet is down, crictl pods
  # will also fail, and docker will be killed. This is undesirable especially when
  # docker live restore is disabled.
  # The command is kept as an array so that it can be expanded quoted.
  local -a healthcheck_command=(docker ps)
  if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then
    healthcheck_command=("${crictl}" pods)
  fi
  # Container runtime startup takes time. Make initial attempts before starting
  # killing the container runtime.
  until timeout 60 "${healthcheck_command[@]}" > /dev/null; do
    if (( attempt == max_attempts )); then
      echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness."
      break
    fi
    # Exponential backoff: 2, 4, 8, 16 seconds. Report the real delay.
    delay=$(( 2 ** attempt ))
    echo "$attempt initial attempt \"${healthcheck_command[*]}\"! Trying again in ${delay} seconds..."
    sleep "${delay}"
    attempt=$(( attempt + 1 ))
  done
  while true; do
    if ! timeout 60 "${healthcheck_command[@]}" > /dev/null; then
      echo "Container runtime ${container_runtime_name} failed!"
      if [[ "$container_runtime_name" == "docker" ]]; then
        # Dump stack of docker daemon for investigation.
        # Log file name looks like goroutine-stacks-TIMESTAMP and will be saved to
        # the exec root directory, which is /var/run/docker/ on Ubuntu and COS.
        pkill -SIGUSR1 dockerd
      fi
      systemctl kill --kill-who=main "${container_runtime_name}"
      # Wait for a while, as we don't want to kill it again before it is really up.
      sleep 120
    else
      sleep "${SLEEP_SECONDS}"
    fi
  done
}
73+
74+
#######################################
# Polls the kubelet healthz endpoint on 127.0.0.1:10248 and kills the kubelet
# through systemd whenever the probe fails. The restart is not performed here.
# Globals:
#   SLEEP_SECONDS (read) - pause between successful health checks.
# Arguments: None
# Outputs:   Startup notice, plus the curl response/error and a failure
#            message on stdout for every failed probe.
# Returns:   Never returns; the monitoring loop runs forever.
#######################################
function kubelet_monitoring {
  echo "Wait for 2 minutes for kubelet to be functional"
  # TODO(andyzheng0831): replace it with a more reliable method if possible.
  sleep 120
  local -r max_seconds=10
  local output=""
  while true; do
    # -f turns HTTP errors into a non-zero exit; -s -S stays quiet but still
    # reports errors, which 2>&1 folds into the captured output.
    if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then
      # Print the response and/or errors verbatim. Quoting prevents word
      # splitting and glob expansion of whatever curl returned.
      printf '%s\n' "${output}"
      echo "Kubelet is unhealthy!"
      systemctl kill kubelet
      # Wait for a while, as we don't want to kill it again before it is really up.
      sleep 60
    else
      sleep "${SLEEP_SECONDS}"
    fi
  done
}
93+
94+
95+
############## Main Function ################
96+
# Entry point: expects exactly one argument naming the component to monitor
# ("container-runtime" or "kubelet").
if [[ "$#" -ne 1 ]]; then
  echo "Usage: health-monitor.sh <container-runtime/kubelet>"
  exit 1
fi

# Globals read by the monitoring functions.
KUBE_HOME="/home/kubernetes"
SLEEP_SECONDS=10

component=$1
echo "Start kubernetes health monitoring for ${component}"
case "${component}" in
  container-runtime)
    container_runtime_monitoring
    ;;
  kubelet)
    kubelet_monitoring
    ;;
  *)
    # Keep the expansion inside the quotes so the name is printed verbatim,
    # and fail explicitly instead of exiting 0 for an unknown component.
    echo "Health monitoring for component ${component} is not supported!"
    exit 1
    ;;
esac

0 commit comments

Comments
 (0)