amazonlinux · ddermendzhiev · Mar 31, 2026 · Apr 2, 2026 · Apr 2, 2026 · joeysk2012
diff --git a/bin/setup-policy-routes.sh b/bin/setup-policy-routes.sh
@@ -50,12 +50,17 @@ refresh)
 start)
     register_networkd_reloader
     counter=0
+    max_wait=3000   # max loop iterations: 3000 x 0.1s sleep = 5 minute timeout, so we don't loop forever if the sysfs node never appears
     while [ ! -e "/sys/class/net/${iface}" ]; do
         if ((counter % 1000 == 0)); then
             debug "Waiting for sysfs node to exist for ${iface} (iteration $counter)"
         fi
         sleep 0.1
         ((counter++))
+        if ((counter >= max_wait)); then
+            error "Timed out waiting for sysfs node for ${iface} after $((counter / 10)) seconds"
+            exit 1
+        fi
     done
     debug "Starting configuration for $iface"
     debug /lib/systemd/systemd-networkd-wait-online -i "$iface"

diff --git a/lib/lib.sh b/lib/lib.sh
@@ -631,6 +631,18 @@ register_networkd_reloader() {
     local -r lockfile="${lockdir}/${iface}"
     local old_opts=$-
 
+    # If the existing lock owner is no longer alive, remove the stale lockfile
+    # so subsequent invocations don't spin for up to 1000 seconds waiting on a
+    # process that will never release it.
+    if [ -f "${lockfile}" ]; then
+        local existing_pid
+        existing_pid=$(cat "${lockfile}" 2>/dev/null)
+        if [ -n "$existing_pid" ] && ! kill -0 "$existing_pid" 2>/dev/null; then
+            debug "Removing stale lock from dead process $existing_pid for ${iface}"
+            rm -f "${lockfile}"
+        fi
+    fi
+
     # Disable -o errexit in the following block so we can capture
     # nonzero exit codes from a redirect without considering them
     # fatal errors