From 84abf878f46d9ea0c51beff855464cfc037c4513 Mon Sep 17 00:00:00 2001 From: Saewon Kwak <23280628+saewoni@users.noreply.github.com> Date: Wed, 28 Jan 2026 22:58:54 +0000 Subject: [PATCH 01/14] fix(localdns): wait for resolv.conf update after networkctl reload to prevent race condition --- parts/linux/cloud-init/artifacts/localdns.sh | 58 ++++++++++ .../cloud-init/artifacts/localdns_spec.sh | 108 ++++++++++++++++++ 2 files changed, 166 insertions(+) diff --git a/parts/linux/cloud-init/artifacts/localdns.sh b/parts/linux/cloud-init/artifacts/localdns.sh index 7f22a6518fb..4df56b6e821 100644 --- a/parts/linux/cloud-init/artifacts/localdns.sh +++ b/parts/linux/cloud-init/artifacts/localdns.sh @@ -388,6 +388,48 @@ add_iptable_rules_to_skip_conntrack_from_pods(){ done } +# Wait for DNS configuration to be applied after networkctl reload. +# This function polls the resolv.conf to verify the expected DNS server is present. +# Arguments: +# $1: expected_dns_ip - The DNS IP that should appear in resolv.conf. +# $2: should_contain - "true" if the IP should be present, "false" if it should be absent. +# $3: max_wait_seconds - Maximum time to wait for the change (default: 10). +wait_for_dns_config_applied() { + local expected_dns_ip="$1" + local should_contain="$2" + local max_wait_seconds="${3:-10}" + local elapsed=0 + + echo "Waiting for DNS configuration to be applied (expecting ${expected_dns_ip} to be ${should_contain})..." + + while [ "$elapsed" -lt "$max_wait_seconds" ]; do + # Get current DNS servers from resolv.conf. + local current_dns + current_dns=$(awk '/^nameserver/ {print $2}' "$RESOLV_CONF" 2>/dev/null | paste -sd' ') + + if [ "$should_contain" = "true" ]; then + # Check if the expected DNS IP is present. + if echo "$current_dns" | grep -qw "$expected_dns_ip"; then + echo "DNS configuration applied successfully. Current DNS: ${current_dns}" + return 0 + fi + else + # Check if the expected DNS IP is absent. + if ! echo "$current_dns" | grep -qw "$expected_dns_ip"; then + echo "DNS configuration reverted successfully. Current DNS: ${current_dns}" + return 0 + fi + fi + + sleep 1 + elapsed=$((elapsed + 1)) + done + + echo "Timed out waiting for DNS configuration to be applied after ${max_wait_seconds} seconds." + echo "Expected ${expected_dns_ip} to be ${should_contain}, current DNS: $(awk '/^nameserver/ {print $2}' "$RESOLV_CONF" 2>/dev/null | paste -sd' ')" + return 1 +} + # Disable DNS provided by DHCP and point the system at localdns. disable_dhcp_use_clusterlistener() { mkdir -p "${NETWORK_DROPIN_DIR}" @@ -412,6 +454,14 @@ EOF echo "Failed to reload networkctl." return 1 fi + + # Wait for the DNS configuration to be applied. + # This ensures systemd-resolved has updated resolv.conf before we proceed. + if ! wait_for_dns_config_applied "${LOCALDNS_NODE_LISTENER_IP}" "true" 10; then + echo "Warning: DNS configuration may not have been fully applied." + return 1 + fi + return 0 } @@ -472,6 +522,14 @@ cleanup_iptables_and_dns() { fi echo "Reloading network configuration succeeded." + # Wait for the DNS configuration to be reverted. + # This ensures systemd-resolved has removed localdns from resolv.conf before we proceed. + # This is called both at startup (to clean up leftover state) and during shutdown. + if ! wait_for_dns_config_applied "${LOCALDNS_NODE_LISTENER_IP}" "false" 10; then + echo "Warning: DNS configuration may not have been fully reverted." + # Don't fail for this - the localdns IP might not have been configured previously. + fi + return 0 } diff --git a/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh b/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh index 374aee1233a..3f86b9186ff 100644 --- a/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh @@ -1110,4 +1110,112 @@ EOF The status should be success End End + + +# This section tests - wait_for_dns_config_applied +# This function is defined in parts/linux/cloud-init/artifacts/localdns.sh file. +#------------------------------------------------------------------------------------------------------------------------------------ + Describe 'wait_for_dns_config_applied' + setup() { + Include "./parts/linux/cloud-init/artifacts/localdns.sh" + TEST_DIR="/tmp/localdnstest" + RESOLV_CONF="${TEST_DIR}/run/systemd/resolve/resolv.conf" + mkdir -p "$(dirname "$RESOLV_CONF")" + } + cleanup() { + rm -rf "$TEST_DIR" + } + BeforeEach 'setup' + AfterEach 'cleanup' + + #------------------------- wait_for_dns_config_applied (should_contain=true) ---------------------------------- + It 'should return success immediately if expected IP is present (should_contain=true)' + cat > "$RESOLV_CONF" < "$RESOLV_CONF" < "$RESOLV_CONF" < "$RESOLV_CONF" < "$RESOLV_CONF" < "$RESOLV_CONF" + When run wait_for_dns_config_applied "169.254.10.10" "false" 5 + The status should be success + The stdout should include "DNS configuration reverted successfully" + End + + #------------------------- wait_for_dns_config_applied (default timeout) -------------------------------------- + It 'should use default timeout of 10 seconds when not specified' + cat > "$RESOLV_CONF" < "$RESOLV_CONF" < Date: Thu, 29 Jan 2026 20:46:08 +0000 Subject: [PATCH 02/14] update the tests --- .../cloud-init/artifacts/localdns_spec.sh | 103 +++++++++++++++++- 1 file changed, 100 insertions(+), 3 deletions(-) diff --git a/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh b/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh index 3f86b9186ff..873282b8ced 100644 --- a/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh @@ -631,25 +631,34 @@ EOF #------------------------------------------------------------------------------------------------------------------------------------ Describe 'disable_dhcp_use_clusterlistener' setup() { - NETWORK_DROPIN_DIR="/tmp/test-systemd-network" + Include "./parts/linux/cloud-init/artifacts/localdns.sh" + + TEST_DIR="/tmp/test-disable-dhcp-$$" + NETWORK_DROPIN_DIR="${TEST_DIR}/systemd-network" NETWORK_DROPIN_FILE="${NETWORK_DROPIN_DIR}/10-localdns.conf" LOCALDNS_NODE_LISTENER_IP="169.254.10.10" - Include "./parts/linux/cloud-init/artifacts/localdns.sh" + # Setup RESOLV_CONF for wait_for_dns_config_applied (must be after Include) + mkdir -p "${TEST_DIR}/run/systemd/resolve" + RESOLV_CONF="${TEST_DIR}/run/systemd/resolve/resolv.conf" } cleanup() { - rm -rf "$NETWORK_DROPIN_DIR" + rm -rf "$TEST_DIR" } BeforeEach 'setup' AfterEach 'cleanup' #------------------------- disable_dhcp_use_clusterlistener ------------------------------------------------- It 'should update network configuration and reload networkctl' + # Pre-populate resolv.conf with expected IP so wait succeeds immediately + echo "nameserver 169.254.10.10" > "$RESOLV_CONF" NETWORKCTL_RELOAD_CMD="true" When call disable_dhcp_use_clusterlistener The status should be success The file "${NETWORK_DROPIN_FILE}" should be exist The contents of file "${NETWORK_DROPIN_FILE}" should include "UseDNS=false" The contents of file "${NETWORK_DROPIN_FILE}" should include "DNS=169.254.10.10" + The stdout should include "DNS configuration applied successfully" + The contents of file "${RESOLV_CONF}" should include "nameserver 169.254.10.10" End It 'should fail if networkctl reload fails' @@ -803,6 +812,88 @@ EOF End +# This section tests - async networkctl reload behavior with wait_for_dns_config_applied +# These tests verify that cleanup_iptables_and_dns properly waits for resolv.conf to be updated +#------------------------------------------------------------------------------------------------------------------------------------ + Describe 'cleanup_iptables_and_dns_with_async_resolv_update' + setup() { + # Mock iptables to return no rules (must be defined before Include) + iptables() { + case "$1" in + "-w") + if [ "$2" = "-t" ] && [ "$3" = "raw" ] && [ "$4" = "-L" ]; then + echo "Chain OUTPUT (policy ACCEPT 0 packets, 0 bytes)" + echo "Chain PREROUTING (policy ACCEPT 0 packets, 0 bytes)" + fi + ;; + esac + return 0 + } + + Include "./parts/linux/cloud-init/artifacts/localdns.sh" + + TEST_DIR="/tmp/localdns-async-test-$$" + mkdir -p "${TEST_DIR}/run/systemd/resolve" + mkdir -p "${TEST_DIR}/network.d" + + # Set RESOLV_CONF AFTER including the script to override the default + RESOLV_CONF="${TEST_DIR}/run/systemd/resolve/resolv.conf" + NETWORK_DROPIN_FILE="${TEST_DIR}/test-network-dropin.conf" + DEFAULT_ROUTE_INTERFACE="eth0" + NETWORK_FILE="/etc/systemd/network/eth0.network" + NETWORK_DROPIN_DIR="${TEST_DIR}/network.d" + } + cleanup() { + rm -rf "$TEST_DIR" + } + BeforeEach 'setup' + AfterEach 'cleanup' + + It 'should wait for resolv.conf update after networkctl reload (simulating async behavior)' + # Setup resolv.conf with localdns IP present + cat > "$RESOLV_CONF" < "$ASYNC_SCRIPT" <