|
| 1 | +#!/usr/bin/env bash |
| 2 | + |
| 3 | +# Copyright 2016 The Kubernetes Authors. |
| 4 | +# |
| 5 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | +# you may not use this file except in compliance with the License. |
| 7 | +# You may obtain a copy of the License at |
| 8 | +# |
| 9 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +# |
| 11 | +# Unless required by applicable law or agreed to in writing, software |
| 12 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | +# See the License for the specific language governing permissions and |
| 15 | +# limitations under the License. |
| 16 | + |
| 17 | +# This script is for master and node instance health monitoring, which is |
| 18 | +# packed in kube-manifest tarball. It is executed through a systemd service |
| 19 | +# in cluster/gce/gci/<master/node>.yaml. The env variables come from an env |
| 20 | +# file provided by the systemd service. |
| 21 | + |
| 22 | +# This script is a slightly adjusted version of |
| 23 | +# https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh |
| 24 | +# Adjustments are: |
| 25 | +# * Kubelet health port is 10248 not 10255 |
| 26 | +# * Removal of all references to the KUBE_ENV file |
| 27 | + |
| 28 | +set -o nounset |
| 29 | +set -o pipefail |
| 30 | + |
#######################################
# Monitor the container runtime and kill it whenever its health check fails.
# We simply kill the process when there is a failure. Another systemd service
# will automatically restart the process.
# Globals:
#   KUBE_HOME              (read) directory whose bin/ holds crictl
#   CONTAINER_RUNTIME      (read) selects the health check; default "docker"
#   CONTAINER_RUNTIME_NAME (read) systemd unit to kill; default "docker"
#   SLEEP_SECONDS          (read) pause between successful health checks
# Arguments:
#   None
# Returns:
#   Never returns; the monitoring loop runs until the script is terminated.
#######################################
function container_runtime_monitoring {
  local -r max_attempts=5
  local attempt=1
  local backoff_seconds
  local -r crictl="${KUBE_HOME}/bin/crictl"
  local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}"
  # We still need to use 'docker ps' when container runtime is "docker". This is because
  # dockershim is still part of kubelet today. When kubelet is down, crictl pods
  # will also fail, and docker will be killed. This is undesirable especially when
  # docker live restore is disabled.
  # The command is kept as an array so every word reaches `timeout` intact,
  # even if a path ever contains whitespace.
  local -a healthcheck_command=(docker ps)
  if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then
    healthcheck_command=("${crictl}" pods)
  fi
  # Container runtime startup takes time. Make initial attempts before starting
  # killing the container runtime.
  until timeout 60 "${healthcheck_command[@]}" > /dev/null; do
    if (( attempt == max_attempts )); then
      echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness."
      break
    fi
    # Exponential backoff: 2, 4, 8, ... seconds. Compute it once so the logged
    # delay is the delay we actually sleep for.
    backoff_seconds=$(( 2 ** attempt ))
    echo "${attempt} initial attempt \"${healthcheck_command[*]}\"! Trying again in ${backoff_seconds} seconds..."
    sleep "${backoff_seconds}"
    attempt=$(( attempt + 1 ))
  done
  while true; do
    if ! timeout 60 "${healthcheck_command[@]}" > /dev/null; then
      echo "Container runtime ${container_runtime_name} failed!"
      if [[ "${container_runtime_name}" == "docker" ]]; then
        # Dump stack of docker daemon for investigation.
        # Log file name looks like goroutine-stacks-TIMESTAMP and will be saved to
        # the exec root directory, which is /var/run/docker/ on Ubuntu and COS.
        pkill -SIGUSR1 dockerd
      fi
      systemctl kill --kill-who=main "${container_runtime_name}"
      # Wait for a while, as we don't want to kill it again before it is really up.
      sleep 120
    else
      sleep "${SLEEP_SECONDS}"
    fi
  done
}
| 73 | + |
#######################################
# Monitor the kubelet via its local healthz endpoint and kill it when the
# probe fails, so that it gets restarted.
# Globals:
#   SLEEP_SECONDS (read) pause between successful health checks
# Arguments:
#   None
# Outputs:
#   Writes the probe's response/errors and a status line to stdout on failure.
# Returns:
#   Never returns; the monitoring loop runs until the script is terminated.
#######################################
function kubelet_monitoring {
  echo "Wait for 2 minutes for kubelet to be functional"
  # TODO(andyzheng0831): replace it with a more reliable method if possible.
  sleep 120
  local -r max_seconds=10
  local output=""
  while true; do
    # -f makes curl fail on HTTP errors, -s -S hides progress but keeps error
    # messages, and 2>&1 folds those messages into the captured output.
    if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then
      # Print the response and/or errors verbatim; quoting prevents the text
      # from being word-split or glob-expanded before it is logged.
      printf '%s\n' "${output}"
      echo "Kubelet is unhealthy!"
      systemctl kill kubelet
      # Wait for a while, as we don't want to kill it again before it is really up.
      sleep 60
    else
      sleep "${SLEEP_SECONDS}"
    fi
  done
}
| 93 | + |
| 94 | + |
| 95 | +############## Main Function ################ |
# Entry point: expects exactly one argument naming the component to monitor
# ("container-runtime" or "kubelet") and dispatches to the matching monitor.
if [[ "$#" -ne 1 ]]; then
  echo "Usage: health-monitor.sh <container-runtime/kubelet>"
  exit 1
fi

# Read by container_runtime_monitoring to locate the crictl binary.
KUBE_HOME="/home/kubernetes"

# Pause, in seconds, between successful health checks in both monitors.
SLEEP_SECONDS=10
component=$1
echo "Start kubernetes health monitoring for ${component}"
if [[ "${component}" == "container-runtime" ]]; then
  container_runtime_monitoring
elif [[ "${component}" == "kubelet" ]]; then
  kubelet_monitoring
else
  # The inner quotes are escaped so the component name stays inside the
  # string and is printed quoted, instead of being expanded unquoted.
  echo "Health monitoring for component \"${component}\" is not supported!"
  # An unsupported component is a usage error, like a wrong argument count.
  exit 1
fi
0 commit comments