# Wait for the localdns IP to be removed from resolv.conf after networkctl reload.
# Globals:
#   LOCALDNS_NODE_LISTENER_IP (read) - IP that must no longer be listed as a nameserver.
#   RESOLV_CONF (read)               - path of the resolv.conf file that is polled.
# Arguments:
#   $1: max_wait_seconds - Maximum time to wait for the change, as a non-negative integer (default: 5).
#       Any other value is reported and replaced by the default.
# Outputs:
#   Progress and result messages on stdout.
# Returns:
#   0 as soon as the localdns IP is no longer listed as a nameserver, 1 if it is still listed
#   once the wait budget is exhausted.
wait_for_localdns_removed_from_resolv_conf() {
    local max_wait_seconds="${1:-5}"
    local sleep_interval=0.25
    local max_iterations
    local iteration=0
    local current_dns=""

    # Only plain digits are safe to feed into the arithmetic expansion below.
    case "$max_wait_seconds" in
        *[!0-9]*)
            echo "Invalid max_wait_seconds '${max_wait_seconds}', using default of 5 seconds."
            max_wait_seconds=5
            ;;
    esac

    # 4 iterations per second with 0.25s sleep; 10# keeps values such as "08" from being read as octal.
    max_iterations=$((10#$max_wait_seconds * 4))

    echo "Waiting for localdns (${LOCALDNS_NODE_LISTENER_IP}) to be removed from resolv.conf..."

    while :; do
        # An unreadable or missing file yields an empty list (awk errors are intentionally silenced).
        current_dns=$(awk '/^nameserver/ {print $2}' "$RESOLV_CONF" 2>/dev/null | paste -sd' ')

        # Use word boundary matching (-w) with fixed string (-F) to avoid partial IP matches.
        if ! grep -qwF -- "$LOCALDNS_NODE_LISTENER_IP" <<< "$current_dns"; then
            echo "DNS configuration refreshed successfully. Current DNS: ${current_dns}"
            return 0
        fi

        # Test the budget after the check, not before it: the file is then always inspected at
        # least once (even with a 0 second budget) and once more after the final sleep.
        if [ "$iteration" -ge "$max_iterations" ]; then
            break
        fi

        sleep "$sleep_interval"
        iteration=$((iteration + 1))
    done

    echo "Timed out waiting for localdns to be removed from resolv.conf after ${max_wait_seconds} seconds."
    echo "Current DNS: ${current_dns}"
    return 1
}
# During startup, block until the DNS configuration has been fully refreshed.
# networkctl reload is asynchronous: systemd-resolved updates /run/systemd/resolve/resolv.conf
# only after a delay, and the following step (replace_azurednsip_in_corefile) reads resolv.conf
# to obtain the upstream DNS servers. If the localdns IP (169.254.10.10) were still listed as a
# nameserver at that point, the corefile would end up with a circular dependency.
# The shutdown path needs no such wait because it does not read resolv.conf afterward; it only
# cleans up and exits, so systemd-resolved can finish the update asynchronously.
wait_for_localdns_removed_from_resolv_conf 5 || {
    echo "Error: DNS configuration was not refreshed within timeout."
    exit $ERR_LOCALDNS_FAIL
}

# Replace AzureDNSIP in corefile with VNET DNS ServerIPs.
# ---------------------------------------------------------------------------------------------------------------------
replace_azurednsip_in_corefile || exit $ERR_LOCALDNS_FAIL
+#------------------------------------------------------------------------------------------------------------------------------------ + Describe 'wait_for_localdns_removed_from_resolv_conf' + setup() { + Include "./parts/linux/cloud-init/artifacts/localdns.sh" + TEST_DIR="/tmp/localdnstest-$$" + RESOLV_CONF="${TEST_DIR}/run/systemd/resolve/resolv.conf" + mkdir -p "$(dirname "$RESOLV_CONF")" + } + cleanup() { + rm -rf "$TEST_DIR" + } + BeforeEach 'setup' + AfterEach 'cleanup' + + #------------------------- wait_for_localdns_removed_from_resolv_conf ------------------------------------------ + It 'should return success immediately if localdns IP is absent' + cat > "$RESOLV_CONF" < "$RESOLV_CONF" < "$RESOLV_CONF" + When run wait_for_localdns_removed_from_resolv_conf 5 + The status should be success + The stdout should include "DNS configuration refreshed successfully" + End + + It 'should use default timeout of 5 seconds when not specified' + cat > "$RESOLV_CONF" < "$RESOLV_CONF" < "$RESOLV_CONF" < "$RESOLV_CONF" < "$RESOLV_CONF") & + When run wait_for_localdns_removed_from_resolv_conf 5 + The status should be success + The stdout should include "DNS configuration refreshed successfully" + End + + It 'should ignore commented lines in resolv.conf' + cat > "$RESOLV_CONF" < "$RESOLV_CONF" < "$RESOLV_CONF" < "$RESOLV_CONF" < "$RESOLV_CONF" + When run wait_for_localdns_removed_from_resolv_conf 2 + The status should be success + The stdout should include "DNS configuration refreshed successfully" + End + End End