Skip to content

Commit dc2b238

Browse files
authored
Merge pull request #149 from ddermendzhiev/fix/setup-policy-routes-sysfs-timeout
setup-policy-routes start: infinite sysfs wait loop causes unbounded process accumulation on ECS hosts
2 parents 4172b7d + 65fbb03 commit dc2b238

File tree

3 files changed

+21
-2
lines changed

3 files changed

+21
-2
lines changed

bin/setup-policy-routes.sh

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,18 @@ refresh)
5050
start)
5151
register_networkd_reloader
5252
counter=0
53+
max_wait=3000 # 5 minute timeout to avoid infinite loop if sysfs node never appears
5354
while [ ! -e "/sys/class/net/${iface}" ]; do
5455
if ((counter % 1000 == 0)); then
5556
debug "Waiting for sysfs node to exist for ${iface} (iteration $counter)"
5657
fi
5758
sleep 0.1
58-
((counter++))
59+
((counter++)) || true
60+
if ((counter >= max_wait)); then
61+
error "Timed out waiting for sysfs node for ${iface} after $((counter / 10)) seconds"
62+
/usr/bin/systemctl disable --now refresh-policy-routes@${iface}.timer 2>/dev/null || true
63+
exit 2
64+
fi
5965
done
6066
debug "Starting configuration for $iface"
6167
debug /lib/systemd/systemd-networkd-wait-online -i "$iface"

lib/lib.sh

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -631,10 +631,22 @@ maybe_reload_networkd() {
631631

632632
register_networkd_reloader() {
633633
local -i registered=1 cnt=0
634-
local -i max=10000
634+
local -i max=3000 # 300s (3000 × 0.1s); matches sysfs wait timeout in setup-policy-routes.sh
635635
local -r lockfile="${lockdir}/${iface}"
636636
local old_opts=$-
637637

638+
# If the existing lock owner is no longer alive, remove the stale lockfile
639+
# so subsequent invocations don't spin for up to 300 seconds waiting on a
640+
# process that will never release it.
641+
if [ -f "${lockfile}" ]; then
642+
local existing_pid
643+
existing_pid=$(cat "${lockfile}" 2>/dev/null)
644+
if [ -n "$existing_pid" ] && ! kill -0 "$existing_pid" 2>/dev/null; then
645+
debug "Removing stale lock from dead process $existing_pid for ${iface}"
646+
rm -f "${lockfile}"
647+
fi
648+
fi
649+
638650
# Disable -o errexit in the following block so we can capture
639651
# nonzero exit codes from a redirect without considering them
640652
# fatal errors

systemd/system/policy-routes@.service

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,5 @@ User=root
1717
ExecStart=/usr/bin/setup-policy-routes %i start
1818
Restart=on-failure
1919
RestartSec=1
20+
RestartPreventExitStatus=2
2021
KillMode=process

0 commit comments

Comments
 (0)