Skip to content

Commit 4712cf2

Browse files
authored
fix: update localdns watchdog logic (#7546)
1 parent 85588da commit 4712cf2

File tree

1 file changed

+45
-5
lines changed

1 file changed

+45
-5
lines changed

parts/linux/cloud-init/artifacts/localdns.sh

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,11 +58,12 @@ CURL_COMMAND="curl -s http://${LOCALDNS_NODE_LISTENER_IP}:8181/ready"
5858
# This is used by disable_dhcp_use_clusterlistener and cleanup_localdns_configs functions.
5959
NETWORKCTL_RELOAD_CMD="networkctl reload"
6060

61-
# The health check is a DNS request to the localdns service IPs.
62-
HEALTH_CHECK_DNS_REQUEST=$'health-check.localdns.local @'"${LOCALDNS_NODE_LISTENER_IP}"$'\nhealth-check.localdns.local @'"${LOCALDNS_CLUSTER_LISTENER_IP}"
63-
6461
START_LOCALDNS_TIMEOUT=10
6562

63+
# DNS health check settings: per-query timeout (in seconds) and number of tries passed to dig.
64+
DNS_HEALTH_CHECK_TIMEOUT=2
65+
DNS_HEALTH_CHECK_TRIES=2
66+
6667
# Function definitions used in this file.
6768
# functions defined until "${__SOURCED__:+return}" are sourced and tested in -
6869
# spec/parts/linux/cloud-init/artifacts/localdns_spec.sh.
@@ -509,11 +510,50 @@ start_localdns_watchdog() {
509510
# five times in every watchdog interval, and thus need to fail five checks to get restarted.
510511
HEALTH_CHECK_INTERVAL=$((${WATCHDOG_USEC:-5000000} * 20 / 100 / 1000000))
511512
echo "Starting watchdog loop at ${HEALTH_CHECK_INTERVAL} second intervals."
513+
514+
# Sliding window failure detection: 10 failures within 10 minutes (600 seconds). The window starts at the first failure and is restarted once it has expired.
515+
# This catches intermittent but frequent failures that might not be consecutive.
516+
max_sliding_window_failures=10
517+
sliding_window_duration_in_seconds=600
518+
sliding_window_failure_count=0
519+
sliding_window_start_time=0
520+
521+
# If the health check fails 5 consecutive times, or fails 10 times within a 10-minute sliding window, the watchdog restarts the systemd unit.
512522
while true; do
513-
if [ "$($CURL_COMMAND)" = "OK" ] && dig +short +timeout=1 +tries=1 -f <(printf '%s\n' "$HEALTH_CHECK_DNS_REQUEST"); then
523+
health_check_passed=true
524+
if [ "$($CURL_COMMAND)" != "OK" ]; then
525+
echo "Health check failed: HTTP ready endpoint not responding."
526+
health_check_passed=false
527+
fi
528+
if ! dig +short +timeout=${DNS_HEALTH_CHECK_TIMEOUT} +tries=${DNS_HEALTH_CHECK_TRIES} health-check.localdns.local @${LOCALDNS_NODE_LISTENER_IP} >/dev/null 2>&1; then
529+
echo "Health check failed: DNS query to ${LOCALDNS_NODE_LISTENER_IP} failed."
530+
health_check_passed=false
531+
fi
532+
if ! dig +short +timeout=${DNS_HEALTH_CHECK_TIMEOUT} +tries=${DNS_HEALTH_CHECK_TRIES} health-check.localdns.local @${LOCALDNS_CLUSTER_LISTENER_IP} >/dev/null 2>&1; then
533+
echo "Health check failed: DNS query to ${LOCALDNS_CLUSTER_LISTENER_IP} failed."
534+
health_check_passed=false
535+
fi
536+
537+
if [ "$health_check_passed" = true ]; then
514538
systemd-notify WATCHDOG=1
515539
else
516-
echo "Localdns health check failed - will be restarted."
540+
echo "localdns health check failed - will be restarted if failures persist."
541+
542+
current_time=$(date +%s)
543+
# If this is the first failure or the window has expired, start a new window.
544+
if [ "$sliding_window_start_time" -eq 0 ] || [ $((current_time - sliding_window_start_time)) -gt "$sliding_window_duration_in_seconds" ]; then
545+
sliding_window_start_time=$current_time
546+
sliding_window_failure_count=1
547+
else
548+
sliding_window_failure_count=$((sliding_window_failure_count + 1))
549+
fi
550+
551+
# Check whether the sliding window failure threshold has been reached.
552+
if [ "$sliding_window_failure_count" -ge "$max_sliding_window_failures" ]; then
553+
echo "max sliding window failures (${max_sliding_window_failures} in ${sliding_window_duration_in_seconds}s) reached. Triggering restart."
554+
systemd-notify WATCHDOG=trigger
555+
exit $ERR_LOCALDNS_FAIL
556+
fi
517557
fi
518558
sleep "${HEALTH_CHECK_INTERVAL}"
519559
done

0 commit comments

Comments
 (0)