Skip to content

Commit 04274bc

Browse files
committed
feat: monitor DuetScreen for crashes and deadlocks, restart program if they occur
1 parent c108e98 commit 04274bc

File tree

2 files changed

+225
-0
lines changed

2 files changed

+225
-0
lines changed

board/duet3d/duetscreen/rootfs_overlay/etc/init.d/S20DuetScreen renamed to board/duet3d/duetscreen/rootfs_overlay/etc/init.d/20DuetScreen

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,8 @@
#!/bin/sh

# This init script is only used to start/stop the DuetScreen application for development purposes.
# S21DuetScreenMonitor is used in normal operation to restart DuetScreen if it crashes.

# Absolute path of the DuetScreen binary.
DAEMON=/usr/bin/DuetScreen
# Empty by default. NOTE(review): presumably extra arguments for $DAEMON; the
# code that consumes it is outside this hunk - confirm.
DAEMON_OPTS=""
# NOTE(review): presumably used for status messages and/or the pidfile name by
# the remainder of the script, which is not visible here.
NAME=DuetScreen
Lines changed: 222 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,222 @@
#!/bin/sh

# This init script contains an embedded monitor that keeps /usr/bin/DuetScreen running.
# The 'start' action backgrounds the embedded monitor using start-stop-daemon.

NAME=DuetScreenMonitor
PIDFILE=/var/run/$NAME.pid

# Monitor configuration and helpers
APP="/usr/bin/DuetScreen"
LOG_FILE="/var/log/duetscreen-monitor.log"
TAG="DuetScreen-monitor"

# int_or_default VALUE FALLBACK
# Print VALUE when it is a plain decimal integer (optional leading '-', no
# leading zeros); otherwise print FALLBACK.
# The tuning variables below are later fed to `[ ... -gt ... ]` and `$(( ))`.
# A malformed value (empty, "abc", "1.5") makes those tests error out, and a
# value such as "08" is an invalid octal literal inside `$(( ))`, so anything
# that is not a clean integer is replaced by the documented default here.
int_or_default() {
  case "$1" in
    '' | '-' | *[!0-9-]* | ?*-* | 0?* | -0?*)
      printf '%s\n' "$2"
      ;;
    *)
      printf '%s\n' "$1"
      ;;
  esac
}

# Watchdog file: if stale, force-restart the app
WATCHDOG_FILE="${WATCHDOG_FILE:-/tmp/duetscreen-watchdog}"
# Seconds; if 0 or negative, watchdog check is disabled
WATCHDOG_TIMEOUT="$(int_or_default "${WATCHDOG_TIMEOUT:-5}" 5)"

# Optional tuning via environment variables (defaults favor immediate restarts)
MIN_UPTIME="$(int_or_default "${MIN_UPTIME:-0}" 0)"
BACKOFF_START="$(int_or_default "${BACKOFF_START:-0}" 0)"
BACKOFF_MAX="$(int_or_default "${BACKOFF_MAX:-0}" 0)"
RESTART_LIMIT="$(int_or_default "${RESTART_LIMIT:-0}" 0)"

# Best effort: the log directory/file may be unwritable; logging then
# degrades to whatever log_msg can still reach instead of aborting the script.
mkdir -p "$(dirname "$LOG_FILE")" 2>/dev/null || true
touch "$LOG_FILE" 2>/dev/null || true
# log_msg MESSAGE...
# Record one monitor log line.
#   Globals: TAG (read), LOG_FILE (read); sets the globals `ts` and `msg`.
#   Output:  the message is handed to `logger` (tagged with $TAG) when that
#            command exists, and "<timestamp> <message>" is always appended
#            to $LOG_FILE.
#   Returns: always 0 - logging is best effort and must never abort the
#            monitor.
log_msg() {
  # Prefer the -Iseconds form; fall back to an explicit format for `date`
  # implementations that do not accept -I.
  ts="$(date -Iseconds 2>/dev/null || date '+%Y-%m-%dT%H:%M:%S%z')"
  msg="$ts $*"
  if command -v logger >/dev/null 2>&1; then
    logger -t "$TAG" -- "$*"
  fi
  # The braces matter: when $LOG_FILE cannot be opened the *shell* reports the
  # failed redirection, and only a redirection on the enclosing group
  # silences that message.
  { printf '%s\n' "$msg" >> "$LOG_FILE"; } 2>/dev/null || true
  return 0
}
# run_monitor [ARGS...]
# Supervise "$APP": start it with ARGS, wait for it to exit, log why it
# exited, and start it again - forever, or until RESTART_LIMIT restarts have
# been performed.
#
#   Globals read:  APP, WATCHDOG_FILE, WATCHDOG_TIMEOUT, MIN_UPTIME,
#                  BACKOFF_START, BACKOFF_MAX, RESTART_LIMIT, and the optional
#                  WATCHDOG_KILL_AFTER / STOP_GRACE (both default to 3).
#   Calls:         log_msg (defined elsewhere in this script).
#   Signals:       TERM/INT stop the child, the watchdog watcher and the
#                  monitor itself (exit 0).
#   Exit status:   never returns to the caller; exits 0 when stopped and 1
#                  when the restart limit is reached.
#
# While the child runs, a background watcher checks the mtime of
# $WATCHDOG_FILE once per second. If the file is older than WATCHDOG_TIMEOUT
# seconds the child is sent SIGTERM; if it is still alive after
# WATCHDOG_KILL_AFTER such attempts it is sent SIGKILL, so that a process that
# is deadlocked inside (or ignoring) its TERM handler is still restarted.
run_monitor() {
  if [ ! -x "$APP" ]; then
    log_msg "ERROR: $APP not found or not executable"
    # Continue looping; it may appear later
  fi

  running=1
  stopping=0
  child_pid=""
  watcher_pid=""
  restart_count=0
  backoff="$BACKOFF_START"

  # Number of SIGTERM attempts the watchdog makes before using SIGKILL.
  kill_after="${WATCHDOG_KILL_AFTER:-3}"
  [ "$kill_after" -gt 0 ] 2>/dev/null || kill_after=3
  # Seconds a stopping monitor lets the child react to SIGTERM before SIGKILL.
  # Keep this below the TERM timeout used by the init script's stop action,
  # otherwise the monitor itself is killed first and the child is orphaned.
  stop_grace="${STOP_GRACE:-3}"
  [ "$stop_grace" -gt 0 ] 2>/dev/null || stop_grace=3

  # Signal handler: shut down the child and the watcher, then exit.
  terminate() {
    stopping=1
    running=0
    # Forward TERM to child if running
    if [ -n "$child_pid" ] && kill -0 "$child_pid" 2>/dev/null; then
      kill -TERM "$child_pid" 2>/dev/null || true
      # Bounded grace period: a hung child must not block the shutdown
      # forever, so a helper sends SIGKILL once the grace time has elapsed.
      (
        sleep "$stop_grace"
        kill -KILL "$child_pid" 2>/dev/null
      ) &
      killer_pid=$!
      wait "$child_pid" 2>/dev/null || true
      # The child is gone; cancel the pending SIGKILL helper.
      kill -TERM "$killer_pid" 2>/dev/null || true
    fi
    # Stop watchdog if running
    if [ -n "$watcher_pid" ] && kill -0 "$watcher_pid" 2>/dev/null; then
      kill -TERM "$watcher_pid" 2>/dev/null || true
      wait "$watcher_pid" 2>/dev/null || true
    fi
    log_msg "Monitor stopping"
    exit 0
  }

  trap terminate TERM INT

  log_msg "Monitor started (pid $$)"

  while [ "$running" -eq 1 ]; do
    start_ts=$(date +%s 2>/dev/null || echo 0)
    # Give the freshly started app a full timeout before it must refresh the
    # watchdog file itself.
    touch "$WATCHDOG_FILE" 2>/dev/null || true

    "$APP" "$@" &
    child_pid=$!

    # Start a lightweight background watcher that monitors the watchdog file
    # and restarts the child if the file is stale.
    if [ "$WATCHDOG_TIMEOUT" -gt 0 ] 2>/dev/null; then
      (
        term_attempts=0
        # Subshell loop ends automatically when child exits
        while kill -0 "$child_pid" 2>/dev/null; do
          # Only check if the file exists; if it's not present, skip to avoid
          # false restarts on initial startup or systems that don't use the file.
          if [ -e "$WATCHDOG_FILE" ]; then
            now=$(date +%s 2>/dev/null || echo 0)
            # Use BusyBox-compatible way to read mtime; fall back to 0 on error
            mtime=$(date -r "$WATCHDOG_FILE" +%s 2>/dev/null || echo 0)
            # Ensure numeric comparison (guard if mtime failed)
            if [ "$mtime" -gt 0 ] 2>/dev/null; then
              age=$((now - mtime))
              if [ "$age" -gt "$WATCHDOG_TIMEOUT" ] 2>/dev/null; then
                if [ "$term_attempts" -lt "$kill_after" ]; then
                  log_msg "Watchdog stale: $WATCHDOG_FILE age=${age}s > ${WATCHDOG_TIMEOUT}s; restarting DuetScreen"
                  # Ask the child to terminate; the main loop's wait will observe exit
                  kill -TERM "$child_pid" 2>/dev/null || true
                  term_attempts=$((term_attempts + 1))
                else
                  # TERM was not honoured - the situation a deadlock watchdog
                  # exists for - so force the exit.
                  log_msg "DuetScreen still running after $term_attempts SIGTERM attempts; sending SIGKILL"
                  kill -KILL "$child_pid" 2>/dev/null || true
                fi
                # Give a brief moment between checks to avoid tight loop
                sleep 1
              else
                # Heartbeat is fresh again; start a new escalation cycle.
                term_attempts=0
              fi
            fi
          fi
          sleep 1
        done
      ) &
      watcher_pid=$!
    else
      watcher_pid=""
    fi

    # Wait for child to exit; $? must be captured immediately afterwards.
    wait "$child_pid"
    status=$?
    end_ts=$(date +%s 2>/dev/null || echo 0)
    runtime=$((end_ts - start_ts))

    # Ensure the watcher is stopped after the child exits
    if [ -n "$watcher_pid" ] && kill -0 "$watcher_pid" 2>/dev/null; then
      kill -TERM "$watcher_pid" 2>/dev/null || true
      wait "$watcher_pid" 2>/dev/null || true
      watcher_pid=""
    fi

    # If we're stopping due to a signal, do not restart. (terminate normally
    # exits on its own; this is a defensive second line.)
    if [ "$stopping" -eq 1 ]; then
      break
    fi

    # Determine reason; statuses of 128 and above are reported as
    # 128 + signal number.
    if [ "$status" -eq 0 ]; then
      reason="exited normally (code 0)"
    elif [ "$status" -ge 128 ]; then
      sig=$((status - 128))
      reason="terminated by signal $sig (status $status)"
    else
      reason="exited with code $status"
    fi

    # Log crashes and exits
    if [ "$status" -ne 0 ]; then
      log_msg "DuetScreen crashed: $reason; uptime=${runtime}s; restart_count=$restart_count"
    else
      log_msg "DuetScreen exited cleanly; uptime=${runtime}s; restarting"
    fi

    # If binary is missing or exec failed, avoid busy loop
    if [ "$status" -eq 127 ] || [ ! -x "$APP" ]; then
      sleep 1
    fi

    # Enforce restart limit if configured
    if [ "$RESTART_LIMIT" -gt 0 ] && [ "$restart_count" -ge "$RESTART_LIMIT" ]; then
      log_msg "Restart limit ($RESTART_LIMIT) reached; giving up"
      exit 1
    fi

    restart_count=$((restart_count + 1))

    # Apply backoff only when runtime is less than MIN_UPTIME (rapid crash loops)
    if [ "$MIN_UPTIME" -gt 0 ] && [ "$runtime" -lt "$MIN_UPTIME" ]; then
      # Backoff grows up to BACKOFF_MAX
      if [ "$BACKOFF_MAX" -gt 0 ] && [ "$backoff" -gt "$BACKOFF_MAX" ]; then
        backoff="$BACKOFF_MAX"
      fi
      if [ "$backoff" -gt 0 ]; then
        log_msg "Backing off ${backoff}s (runtime ${runtime}s < MIN_UPTIME ${MIN_UPTIME}s)"
        sleep "$backoff"
      fi
      # Increase backoff for next time, up to max (if max>0)
      if [ "$BACKOFF_MAX" -gt 0 ] && [ "$backoff" -lt "$BACKOFF_MAX" ]; then
        next=$((backoff + 1))
        if [ "$next" -le "$BACKOFF_MAX" ]; then
          backoff="$next"
        else
          backoff="$BACKOFF_MAX"
        fi
      fi
    else
      # Reset backoff when app ran long enough or MIN_UPTIME is 0
      backoff="$BACKOFF_START"
    fi
  done

  exit 0
}
# start
# Launch the monitor in the background.
#   Globals: NAME (read), PIDFILE (read).
#   Output:  "Starting $NAME: OK" or "Starting $NAME: ERROR" on stdout.
#   Returns: 0 when start-stop-daemon succeeded, 1 otherwise.
start() {
  printf 'Starting %s: ' "$NAME"
  # Run this script in 'run' mode under start-stop-daemon so it is
  # backgrounded (-b) and its pid is written to $PIDFILE (-m -p).
  # The status is tested directly in the `if`, so no intervening command can
  # overwrite $? before it is inspected.
  if start-stop-daemon -S -b -q -p "$PIDFILE" -m -x "$0" -- run; then
    echo "OK"
  else
    echo "ERROR"
    return 1
  fi
}
# stop
# Stop the background monitor recorded in $PIDFILE.
#   Globals: NAME (read), PIDFILE (read; the file is removed on success).
#   Output:  "Stopping $NAME: OK" or "Stopping $NAME: ERROR" on stdout.
#   Returns: 0 when start-stop-daemon succeeded, 1 otherwise.
stop() {
  printf 'Stopping %s: ' "$NAME"
  # --retry: send TERM, wait up to 5 s, then send KILL and wait 2 s more.
  # The status is tested directly in the `if`, so no intervening command can
  # overwrite $? before it is inspected.
  if start-stop-daemon -K -q -p "$PIDFILE" --retry=TERM/5/KILL/2; then
    # The pidfile was created by `start-stop-daemon -m`; drop it so a stale
    # pid cannot be mistaken for a running monitor later.
    rm -f -- "$PIDFILE"
    echo "OK"
  else
    echo "ERROR"
    return 1
  fi
}
# restart
# Stop the monitor, then start it again.
#   Returns: the status of start. A failing stop (for example because the
#            monitor was not running) does not prevent the start.
restart() {
  stop
  start
}
# Dispatch on the action given as the first argument.
case "$1" in
  start)
    start
    ;;
  stop)
    stop
    ;;
  restart)
    restart
    ;;
  run)
    # Internal action: start() re-executes this script with 'run' under
    # start-stop-daemon. Remaining arguments are handed to run_monitor, which
    # passes them on to the application.
    shift
    run_monitor "$@"
    ;;
  *)
    echo "Usage: $0 {start|stop|restart}"
    exit 1
    ;;
esac

exit 0

0 commit comments

Comments
 (0)