@@ -58,11 +58,12 @@ CURL_COMMAND="curl -s http://${LOCALDNS_NODE_LISTENER_IP}:8181/ready"
5858# This is used by disable_dhcp_use_clusterlistener and cleanup_localdns_configs functions.
5959NETWORKCTL_RELOAD_CMD=" networkctl reload"
6060
61- # The health check is a DNS request to the localdns service IPs.
62- HEALTH_CHECK_DNS_REQUEST=$' health-check.localdns.local @' " ${LOCALDNS_NODE_LISTENER_IP} " $' \n health-check.localdns.local @' " ${LOCALDNS_CLUSTER_LISTENER_IP} "
63-
6461START_LOCALDNS_TIMEOUT=10
6562
63+ # DNS health check settings: per-query timeout (passed to dig +timeout) and number of attempts (passed to dig +tries).
64+ DNS_HEALTH_CHECK_TIMEOUT=2
65+ DNS_HEALTH_CHECK_TRIES=2
66+
6667# Function definitions used in this file.
6768# functions defined until "${__SOURCED__:+return}" are sourced and tested in -
6869# spec/parts/linux/cloud-init/artifacts/localdns_spec.sh.
@@ -509,11 +510,50 @@ start_localdns_watchdog() {
509510 # five times in every watchdog interval, and thus need to fail five checks to get restarted.
510511 HEALTH_CHECK_INTERVAL=$(( ${WATCHDOG_USEC:- 5000000} * 20 / 100 / 1000000 ))
511512 echo " Starting watchdog loop at ${HEALTH_CHECK_INTERVAL} second intervals."
513+
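514+ # Sliding window failure detection: 10 failures within 10 minutes (600 seconds) of the first failure in the window.
515+ # A new window starts at the first failure seen after the previous window expires; successful checks do not reset the count, so intermittent but frequent failures that are not consecutive are still caught.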
516+ max_sliding_window_failures=10
517+ sliding_window_duration_in_seconds=600
518+ sliding_window_failure_count=0
519+ sliding_window_start_time=0
520+
521+ # If the health check fails 5 consecutive times, or fails 10 times within a 10-minute sliding window, the watchdog restarts the systemd unit.
512522 while true ; do
513- if [ " $( $CURL_COMMAND ) " = " OK" ] && dig +short +timeout=1 +tries=1 -f <( printf ' %s\n' " $HEALTH_CHECK_DNS_REQUEST " ) ; then
523+ health_check_passed=true
524+ if [ " $( $CURL_COMMAND ) " != " OK" ]; then
525+ echo " Health check failed: HTTP ready endpoint not responding."
526+ health_check_passed=false
527+ fi
528+ if ! dig +short +timeout=${DNS_HEALTH_CHECK_TIMEOUT} +tries=${DNS_HEALTH_CHECK_TRIES} health-check.localdns.local @${LOCALDNS_NODE_LISTENER_IP} > /dev/null 2>&1 ; then
529+ echo " Health check failed: DNS query to ${LOCALDNS_NODE_LISTENER_IP} failed."
530+ health_check_passed=false
531+ fi
532+ if ! dig +short +timeout=${DNS_HEALTH_CHECK_TIMEOUT} +tries=${DNS_HEALTH_CHECK_TRIES} health-check.localdns.local @${LOCALDNS_CLUSTER_LISTENER_IP} > /dev/null 2>&1 ; then
533+ echo " Health check failed: DNS query to ${LOCALDNS_CLUSTER_LISTENER_IP} failed."
534+ health_check_passed=false
535+ fi
536+
537+ if [ " $health_check_passed " = true ]; then
514538 systemd-notify WATCHDOG=1
515539 else
516- echo " Localdns health check failed - will be restarted."
540+ echo " localdns health check failed - will be restarted if failures persist."
541+
542+ current_time=$( date +%s)
543+ # If this is the first failure or the window has expired, start a new window.
544+ if [ " $sliding_window_start_time " -eq 0 ] || [ $(( current_time - sliding_window_start_time)) -gt " $sliding_window_duration_in_seconds " ]; then
545+ sliding_window_start_time=$current_time
546+ sliding_window_failure_count=1
547+ else
548+ sliding_window_failure_count=$(( sliding_window_failure_count + 1 ))
549+ fi
550+
551+ # Check if sliding window threshold is exceeded
552+ if [ " $sliding_window_failure_count " -ge " $max_sliding_window_failures " ]; then
553+ echo " max sliding window failures (${max_sliding_window_failures} in ${sliding_window_duration_in_seconds} s) reached. Triggering restart."
554+ systemd-notify WATCHDOG=trigger
555+ exit $ERR_LOCALDNS_FAIL
556+ fi
517557 fi
518558 sleep " ${HEALTH_CHECK_INTERVAL} "
519559 done
0 commit comments