board: common: Add a watchdogd configuration

mattiaswal · mattiaswal · commit 67817a182ca7 · 2025-07-11T17:06:36.000+02:00
Also increase the timeout and kick more often (required by RPI4)
diff --git a/board/common/rootfs/etc/watchdogd.conf b/board/common/rootfs/etc/watchdogd.conf
@@ -0,0 +1,165 @@
+# /etc/watchdogd.conf sample
+# Commented out values are program defaults.
+#
+# The checker/monitor `warning` and `critical` levels are 0.00-1.00,
+# i.e. 0-100%, except for load average which can vary a lot between
+# systems and use-cases, not just because of the number of CPU cores.
+# Use the `script = ...` setting to call script when `warning` and
+# `critical` are reached for a monitor.  In `critical` the monitor
+# otherwise triggers an unconditional reboot.
+#
+# NOTE: `critical` is optional, omitting it disables the reboot action.
+#
+
+### Watchdogs ##########################################################
+# Global settings that can be overridden per watchdog
+
+# Do not set WDT timeout and kick interval too low, the daemon runs at
+# SCHED_OTHER level with all other tasks, unless the process supervisor
+# is enabled.  The monitor plugins (below) need CPU time as well.
+#timeout   = 20
+#interval  = 10
+
+# With safe-exit enabled (true) the daemon will ask the driver disable
+# the WDT before exiting (SIGINT).  However, some WDT drivers (or HW)
+# may not support this.
+#safe-exit = true
+
+# Multiple watchdogs can be kicked, the default, even if no .conf file
+# is found or device node given on the command line, is /dev/watchdog
+device /dev/watchdog {
+    timeout    = 60
+    interval   = 5
+    safe-exit  = true
+}
+
+#device /dev/watchdog2 {
+#    timeout    = 20
+#    interval   = 10
+#    safe-exit  = true
+#}
+
+### Supervisor #########################################################
+# Instrumented processes can have their main loop supervised.  Processes
+# subscribe to this service using the libwdog API, see the docs for more
+# on this.  When the supervisor is enabled and the priority is set to a
+# value > 0, watchdogd runs as a SCHED_RR process with elevated realtime
+# priority.  When disabled, or the priority is set to zero (0), it runs
+# as a regular SCHED_OTHER process, this is the default.
+#
+# When a supervised process fails to meet its deadline, the daemon will
+# perform an unconditional reset having saved the reset reason.  If a
+# script is provided in this section it will be called instead.  The
+# script is called as:
+#
+#    script.sh supervisor CODE PID LABEL
+#
+# Availabel CODEs for the reset reason are avilable in wdog.h
+#
+#supervisor {
+#    !!!REMEMBER TO ENABLE reset-reason (below) AS WELL!!!
+#    enabled  = true
+#    priority = 98
+#    script = "/path/to/supervisor-script.sh"
+#}
+
+### Reset reason #######################################################
+# The following section controls if/how the reset reason & reset counter
+# is tracked.  By default this is disabled, since not all systems allow
+# writing to disk, e.g. embedded systems using MTD devices with limited
+# number of write cycles.
+#
+# The default file setting is a non-volatile path, according to the FHS.
+# It can be changed to another location, but make sure that location is
+# writable first.
+reset-reason {
+    enabled = true
+    file    = "/var/lib/misc/watchdogd.state"
+}
+
+### Checkers/Monitors ##################################################
+#
+# Script or command to run instead of reboot when a monitor plugin
+# reaches any of its critical or warning level.  Setting this will
+# disable the built-in reboot on critical, it is therefore up to the
+# script to perform reboot, if needed.  The script is called as:
+#
+#    script.sh {filenr, fsmon, loadavg, meminfo} {crit, warn} VALUE
+#
+#script = "/path/to/reboot-action.sh"
+
+# Monitors file descriptor leaks based on /proc/sys/fs/file-nr
+filenr {
+#    enabled = true
+    interval = 300
+    logmark  = false
+    warning  = 0.9
+    critical = 1.0
+#    script = "/path/to/alt-reboot-action.sh"
+}
+
+# Monitors a file system, blocks and inode usage against watermarks
+# The script is called with fsmon as the first argument and there
+# are two environment variables FSMON_NAME, for the monitored path,
+# and FSMON_TYPE indicating either 'blocks' or 'inodes'.
+#fsmon /var {
+#    enabled = true
+#    interval = 300
+#    logmark  = false
+#    warning  = 0.95
+#    critical = 1.0
+#    script = "/path/to/alt-reboot-action.sh"
+#}
+
+# Monitors load average based on sysinfo() from /proc/loadavg
+# The level is composed from the average of the 1 and 5 min marks.
+loadavg {
+#    enabled = true
+    interval = 300
+    logmark  = false
+    warning  = 1.0
+    critical = 2.0
+#    script = "/path/to/alt-reboot-action.sh"
+}
+
+# Monitors free RAM based on data from /proc/meminfo
+meminfo {
+#    enabled = true
+    interval = 300
+    logmark  = false
+    warning  = 0.9
+    critical = 0.95
+#    script = "/path/to/alt-reboot-action.sh"
+}
+
+# Monitor temperature.  The critical value is unset by default, so no
+# action is taken at that watermark (by default).  Both the critical and
+# warning watermarks are relative to the trip/critical/max value from
+# sysfs.  The warning is default 0.9, i.e., 90% of critical.  Use script
+# to to reset the fan controller or poweroff(8) the system.
+#
+# Each temp monitor caches the last 10 values, calculates the mean, and
+# compares that to the warning and critical levels.  Logging is only
+# done every 10 x interval (if enabled).
+#tempmon /sys/class/hwmon/hwmon0/temp1_input {
+#    enabled  = true
+#    interval = 30
+#    warning  = 0.9
+#    critical = 0.95
+#    logmark  = true
+#    script   = "/script/to/log/and/poweroff.sh"
+#}
+
+# Monitor a generic script, executes 'monitor-script' every 'interval'
+# seconds, with a max runtime of 'timeout' seconds.  When the exit code
+# of the monitor script is above the critical level watchdogd either
+# starts the reboot, or calls the alternate 'script' to determin the
+# next cause of action.
+#generic /path/to/monitor-script.sh {
+#    enabled = true
+#    interval = 300
+#    timeout = 60
+#    warning  = 1
+#    critical = 10
+#    script = "/path/to/alt-reboot-action.sh"
+#}