Skip to content

Commit 6e5f086

Browse files
saewoni and Copilot authored
fix(localdns): wait for resolv.conf update after networkctl reload to prevent race condition (#7749)
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Devin Wong <wongsiosun@outlook.com>
1 parent d823cda commit 6e5f086

File tree

2 files changed

+194
-0
lines changed

2 files changed

+194
-0
lines changed

parts/linux/cloud-init/artifacts/localdns.sh

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,36 @@ add_iptable_rules_to_skip_conntrack_from_pods(){
388388
done
389389
}
390390

391+
# Wait for localdns IP to be removed from resolv.conf after networkctl reload.
# Polls the nameserver entries of resolv.conf every 0.25 seconds until the localdns
# listener IP is no longer listed or the timeout expires.
# Globals:
#   LOCALDNS_NODE_LISTENER_IP (read) - address that must no longer appear as a nameserver.
#   RESOLV_CONF (read)               - path of the resolv.conf file to poll.
# Arguments:
#   $1: max_wait_seconds - Maximum time to wait for the change, in whole seconds (default: 5).
#       A value of 0 performs exactly one check without sleeping.
# Outputs:
#   Progress and result messages on stdout; a warning on stderr if $1 is not a whole number.
# Returns:
#   0 once the localdns IP is absent (a missing or empty file yields no nameservers and
#   therefore counts as absent), 1 if it is still present when the timeout expires.
wait_for_localdns_removed_from_resolv_conf() {
    local max_wait_seconds="${1:-5}"
    local sleep_interval=0.25
    local max_iterations
    local iteration=0
    local current_dns=""

    # Arithmetic expansion below would fail (or silently evaluate to 0) on non-integer input.
    if ! [[ "$max_wait_seconds" =~ ^[0-9]+$ ]]; then
        echo "Invalid max_wait_seconds '${max_wait_seconds}'; falling back to 5 seconds." >&2
        max_wait_seconds=5
    fi
    # 4 iterations per second with 0.25s sleep; 10# avoids octal parsing of values like "08".
    max_iterations=$((10#$max_wait_seconds * 4))

    echo "Waiting for localdns (${LOCALDNS_NODE_LISTENER_IP}) to be removed from resolv.conf..."

    # Check first, then decide whether to sleep: this guarantees at least one check
    # (even with a timeout of 0) and one final check after the last sleep.
    while true; do
        current_dns=$(awk '/^nameserver/ {print $2}' "$RESOLV_CONF" 2>/dev/null | paste -sd' ')

        # Use word boundary matching (-w) with fixed string (-F) to avoid partial IP matches.
        if ! grep -qwF -- "$LOCALDNS_NODE_LISTENER_IP" <<< "$current_dns"; then
            echo "DNS configuration refreshed successfully. Current DNS: ${current_dns}"
            return 0
        fi

        if [ "$iteration" -ge "$max_iterations" ]; then
            break
        fi

        sleep "$sleep_interval"
        iteration=$((iteration + 1))
    done

    echo "Timed out waiting for localdns to be removed from resolv.conf after ${max_wait_seconds} seconds."
    # Report the value seen by the final check rather than re-reading the file.
    echo "Current DNS: ${current_dns}"
    return 1
}
420+
391421
# Disable DNS provided by DHCP and point the system at localdns.
392422
disable_dhcp_use_clusterlistener() {
393423
mkdir -p "${NETWORK_DROPIN_DIR}"
@@ -621,6 +651,19 @@ initialize_network_variables || exit $ERR_LOCALDNS_FAIL
621651
# ---------------------------------------------------------------------------------------------------------------------
622652
cleanup_iptables_and_dns || exit $ERR_LOCALDNS_FAIL
623653

654+
# Startup only: block until resolv.conf no longer lists the localdns IP.
# networkctl reload is asynchronous, so systemd-resolved rewrites
# /run/systemd/resolve/resolv.conf some time after the reload returns. The following step
# (replace_azurednsip_in_corefile) reads resolv.conf to discover the upstream DNS servers;
# if 169.254.10.10 (localdns IP) were still listed there, the corefile would end up
# forwarding to itself (circular dependency).
# The shutdown path skips this wait: it does not read resolv.conf afterward, it only cleans
# up and exits, so systemd-resolved may finish the update asynchronously.
wait_for_localdns_removed_from_resolv_conf 5 || {
    echo "Error: DNS configuration was not refreshed within timeout."
    exit $ERR_LOCALDNS_FAIL
}
666+
624667
# Replace AzureDNSIP in corefile with VNET DNS ServerIPs.
625668
# ---------------------------------------------------------------------------------------------------------------------
626669
replace_azurednsip_in_corefile || exit $ERR_LOCALDNS_FAIL

spec/parts/linux/cloud-init/artifacts/localdns_spec.sh

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1110,4 +1110,155 @@ EOF
11101110
The status should be success
11111111
End
11121112
End
1113+
1114+
1115+
# This section tests - wait_for_localdns_removed_from_resolv_conf
# This function is defined in parts/linux/cloud-init/artifacts/localdns.sh file.
# Each example writes a throwaway resolv.conf under a private temp directory and points
# RESOLV_CONF at it, so the real system file is never read.
#------------------------------------------------------------------------------------------------------------------------------------
Describe 'wait_for_localdns_removed_from_resolv_conf'
    setup() {
        Include "./parts/linux/cloud-init/artifacts/localdns.sh"
        # mktemp -d gives an unpredictable, freshly created directory (no collisions between
        # concurrent runs and no reuse of a pre-existing /tmp path).
        TEST_DIR="$(mktemp -d)"
        RESOLV_CONF="${TEST_DIR}/run/systemd/resolve/resolv.conf"
        mkdir -p "$(dirname "$RESOLV_CONF")"
    }
    cleanup() {
        # ${TEST_DIR:?} refuses to run rm -rf with an empty path.
        rm -rf -- "${TEST_DIR:?}"
    }
    BeforeEach 'setup'
    AfterEach 'cleanup'

    #------------------------- wait_for_localdns_removed_from_resolv_conf ------------------------------------------
    It 'should return success immediately if localdns IP is absent'
        cat > "$RESOLV_CONF" <<EOF
nameserver 10.0.0.1
nameserver 10.0.0.2
EOF
        When run wait_for_localdns_removed_from_resolv_conf 5
        The status should be success
        The stdout should include "DNS configuration refreshed successfully"
        The stdout should include "Current DNS: 10.0.0.1 10.0.0.2"
    End

    It 'should timeout if localdns IP is still present'
        cat > "$RESOLV_CONF" <<EOF
nameserver 169.254.10.10
nameserver 10.0.0.1
EOF
        When run wait_for_localdns_removed_from_resolv_conf 2
        The status should be failure
        The stdout should include "Timed out waiting for localdns to be removed from resolv.conf after 2 seconds"
        The stdout should include "Current DNS:"
    End

    It 'should return success if resolv.conf is empty'
        : > "$RESOLV_CONF"
        When run wait_for_localdns_removed_from_resolv_conf 5
        The status should be success
        The stdout should include "DNS configuration refreshed successfully"
    End

    It 'should use default timeout of 5 seconds when not specified'
        cat > "$RESOLV_CONF" <<EOF
nameserver 10.0.0.1
EOF
        When run wait_for_localdns_removed_from_resolv_conf
        The status should be success
        The stdout should include "DNS configuration refreshed successfully"
    End

    It 'should handle resolv.conf not existing gracefully'
        rm -f -- "$RESOLV_CONF"
        When run wait_for_localdns_removed_from_resolv_conf 2
        The status should be success
        The stdout should include "DNS configuration refreshed successfully"
    End

    It 'should not match partial IP addresses'
        cat > "$RESOLV_CONF" <<EOF
nameserver 169.254.10.100
EOF
        # 169.254.10.100 should NOT match 169.254.10.10
        When run wait_for_localdns_removed_from_resolv_conf 2
        The status should be success
        The stdout should include "DNS configuration refreshed successfully"
    End

    It 'should detect localdns IP among multiple nameservers'
        cat > "$RESOLV_CONF" <<EOF
nameserver 10.0.0.1
nameserver 169.254.10.10
nameserver 10.0.0.2
EOF
        When run wait_for_localdns_removed_from_resolv_conf 2
        The status should be failure
        The stdout should include "Timed out waiting for localdns to be removed"
    End

    It 'should succeed when localdns IP is removed during wait (async removal)'
        # Start with localdns IP present
        cat > "$RESOLV_CONF" <<EOF
nameserver 169.254.10.10
nameserver 10.0.0.1
EOF
        # Create background process that removes localdns IP after 2 seconds
        (sleep 2 && echo "nameserver 10.0.0.1" > "$RESOLV_CONF") &
        When run wait_for_localdns_removed_from_resolv_conf 5
        The status should be success
        The stdout should include "DNS configuration refreshed successfully"
    End

    It 'should ignore commented lines in resolv.conf'
        cat > "$RESOLV_CONF" <<EOF
# nameserver 169.254.10.10
nameserver 10.0.0.1
# This is a comment
nameserver 10.0.0.2
EOF
        When run wait_for_localdns_removed_from_resolv_conf 2
        The status should be success
        The stdout should include "DNS configuration refreshed successfully"
    End

    It 'should timeout when only localdns IP is present'
        cat > "$RESOLV_CONF" <<EOF
nameserver 169.254.10.10
EOF
        When run wait_for_localdns_removed_from_resolv_conf 2
        The status should be failure
        The stdout should include "Timed out waiting for localdns to be removed"
    End

    It 'should handle IPv6 nameservers mixed with IPv4'
        cat > "$RESOLV_CONF" <<EOF
nameserver 10.0.0.1
nameserver 2001:4860:4860::8888
nameserver 10.0.0.2
EOF
        When run wait_for_localdns_removed_from_resolv_conf 2
        The status should be success
        The stdout should include "DNS configuration refreshed successfully"
    End

    It 'should handle resolv.conf with search and options directives'
        cat > "$RESOLV_CONF" <<EOF
search example.com local
nameserver 10.0.0.1
nameserver 10.0.0.2
options timeout:2 attempts:3
EOF
        When run wait_for_localdns_removed_from_resolv_conf 2
        The status should be success
        The stdout should include "DNS configuration refreshed successfully"
        The stdout should include "Current DNS: 10.0.0.1 10.0.0.2"
    End

    It 'should handle whitespace variations in resolv.conf'
        # Use tabs and extra spaces
        printf "nameserver\t10.0.0.1\nnameserver 10.0.0.2\n" > "$RESOLV_CONF"
        When run wait_for_localdns_removed_from_resolv_conf 2
        The status should be success
        The stdout should include "DNS configuration refreshed successfully"
    End
End
11131264
End

0 commit comments

Comments
 (0)