-
Notifications
You must be signed in to change notification settings - Fork 726
Expand file tree
/
Copy pathsentinel.sh
More file actions
executable file
·166 lines (143 loc) · 4.38 KB
/
sentinel.sh
File metadata and controls
executable file
·166 lines (143 loc) · 4.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/env bash
# sentinel.sh — Watchdog for AutoResearchClaw pipeline process.
#
# Monitors the pipeline heartbeat file and auto-restarts on crash.
# Inspired by Sibyl's sentinel watchdog design.
#
# Usage:
# ./sentinel.sh <run_dir> [--python <python_path>]
#
# The pipeline runner writes heartbeat.json after each stage. If the
# heartbeat goes stale (>5 min) and the PID is dead, sentinel restarts.
#
# Configuration via environment:
# SENTINEL_CHECK_INTERVAL — seconds between checks (default: 60)
# SENTINEL_STALE_THRESHOLD — seconds before heartbeat is stale (default: 300)
# SENTINEL_MAX_RETRIES — max restart attempts (default: 5)
# SENTINEL_COOLDOWN — seconds to wait after 3 consecutive failures (default: 360)
set -euo pipefail

# --- Arguments ---
# $1 (required) is the pipeline run directory; the heartbeat file, the PID
# file and the sentinel logs are all looked up inside it.
# --python <path> (optional) selects the interpreter used to relaunch the
# pipeline; it defaults to "python".
RUN_DIR="${1:?Usage: sentinel.sh <run_dir> [--python <path>]}"
PYTHON_PATH="python"
shift

while [[ $# -gt 0 ]]; do
  case "$1" in
    --python)
      # Without this guard `set -u` aborts on "$2" with a bare
      # "unbound variable" message that never mentions --python, and an
      # empty value would be accepted and only fail at restart time.
      if [[ $# -lt 2 || -z "${2:-}" ]]; then
        echo "Missing value for --python" >&2
        echo "Usage: sentinel.sh <run_dir> [--python <path>]" >&2
        exit 1
      fi
      PYTHON_PATH="$2"
      shift 2
      ;;
    *)
      echo "Unknown argument: $1" >&2
      exit 1
      ;;
  esac
done

# The recovery log is appended to inside RUN_DIR, so a missing directory
# would otherwise surface as a redirect failure on the first log line.
# Fail up front with a message that names the real problem.
if [[ ! -d "$RUN_DIR" ]]; then
  echo "Run directory not found: ${RUN_DIR}" >&2
  exit 1
fi
# --- Configuration ---
# Tunables are read from SENTINEL_* environment variables; each falls back
# to the default shown when the variable is unset or empty.
CHECK_INTERVAL="${SENTINEL_CHECK_INTERVAL:-60}"     # seconds between checks
STALE_THRESHOLD="${SENTINEL_STALE_THRESHOLD:-300}"  # heartbeat age limit, seconds
MAX_RETRIES="${SENTINEL_MAX_RETRIES:-5}"            # total restart attempts allowed
COOLDOWN="${SENTINEL_COOLDOWN:-360}"                # pause after 3 failures in a row

# STALE_THRESHOLD and MAX_RETRIES are used in integer comparisons, where a
# non-numeric value only fails later, inside the monitoring loop. Reject
# anything that is not a plain non-negative integer right away.
# CHECK_INTERVAL and COOLDOWN are handed to `sleep` unchanged and are left
# to `sleep` to validate.
for _sentinel_var in STALE_THRESHOLD MAX_RETRIES; do
  if [[ ! "${!_sentinel_var}" =~ ^[0-9]+$ ]]; then
    echo "SENTINEL_${_sentinel_var} must be a non-negative integer, got: ${!_sentinel_var}" >&2
    exit 1
  fi
done
unset _sentinel_var

# Force base 10 so a value with leading zeros (e.g. "08") is not treated
# as an invalid octal literal by shell arithmetic.
STALE_THRESHOLD=$((10#${STALE_THRESHOLD}))
MAX_RETRIES=$((10#${MAX_RETRIES}))

# Files the sentinel reads and writes, all inside the run directory.
HEARTBEAT_FILE="${RUN_DIR}/heartbeat.json"
RECOVERY_LOG="${RUN_DIR}/sentinel_recovery.log"
FAILED_LOG="${RUN_DIR}/sentinel_failed.log"

# Mutable state for the monitoring loop.
retry_count=0           # restarts performed so far
consecutive_failures=0  # failures since the pipeline was last seen alive
# Emit one timestamped sentinel message.
# Globals:   RECOVERY_LOG (read) - file the message is appended to
# Arguments: $1 - message text
# Outputs:   "[sentinel <YYYY-MM-DDTHH:MM:SS>] <message>" on stdout, and the
#            same line appended to RECOVERY_LOG
log() {
  local entry
  # Build the line once so stdout and the log file get the same timestamp.
  printf -v entry '[sentinel %s] %s' "$(date '+%Y-%m-%dT%H:%M:%S')" "$1"
  printf '%s\n' "$entry"
  printf '%s\n' "$entry" >> "$RECOVERY_LOG"
}
# --- Check if heartbeat is stale ---
# Globals: HEARTBEAT_FILE (read), STALE_THRESHOLD (read)
# Returns: 0 (stale) when the heartbeat file is missing, cannot be parsed,
#          or its "timestamp" is more than STALE_THRESHOLD seconds old;
#          1 (fresh) otherwise.
is_stale() {
  if [[ ! -f "$HEARTBEAT_FILE" ]]; then
    return 0 # No heartbeat = stale
  fi

  local now
  now=$(date +%s)

  # Extract the heartbeat timestamp as epoch seconds. The path is passed
  # via argv rather than spliced into the Python source, so a quote or
  # backslash in the run directory can neither break the snippet nor be
  # executed as code. Any failure yields 0, which reads as "very old".
  local hb_ts
  hb_ts=$(python3 -c '
import json
import sys
from datetime import datetime

try:
    with open(sys.argv[1]) as fh:
        raw = json.load(fh)["timestamp"]
    # fromisoformat() before Python 3.11 rejects a trailing "Z".
    if raw.endswith("Z"):
        raw = raw[:-1] + "+00:00"
    print(int(datetime.fromisoformat(raw).timestamp()))
except Exception:
    print(0)
' "$HEARTBEAT_FILE" 2>/dev/null || echo 0)

  # Never feed unexpected interpreter output into shell arithmetic.
  [[ "$hb_ts" =~ ^-?[0-9]+$ ]] || hb_ts=0

  local age=$(( now - hb_ts ))
  [[ $age -gt $STALE_THRESHOLD ]]
}
# --- Check if PID is alive ---
# Globals: RUN_DIR (read) - pipeline.pid is looked up inside it
# Returns: 0 when pipeline.pid holds a positive integer PID that accepts
#          signal 0; 1 when the file is missing, unreadable, empty, not a
#          positive integer, or the probe fails.
pid_alive() {
  local pid_file="${RUN_DIR}/pipeline.pid"
  [[ -f "$pid_file" ]] || return 1

  local pid
  pid=$(cat -- "$pid_file" 2>/dev/null) || return 1

  # Trim surrounding whitespace so a padded PID is still accepted.
  pid="${pid#"${pid%%[![:space:]]*}"}"
  pid="${pid%"${pid##*[![:space:]]}"}"

  # Only a positive integer names a single process. "0" or a negative
  # value would make kill address a process group or every process, so the
  # probe would succeed with no pipeline running.
  [[ "$pid" =~ ^[1-9][0-9]*$ ]] || return 1

  kill -0 "$pid" 2>/dev/null
}
# --- Check for active subprocesses ---
# Globals: RUN_DIR (read) - pipeline.pid is looked up inside it
# Returns: 0 when pgrep finds at least one process whose parent is the PID
#          recorded in pipeline.pid; non-zero when the file is missing,
#          unreadable, empty, not a positive integer, or nothing matches.
# NOTE(review): the main loop only calls this after pid_alive has failed.
# Children of a dead parent are normally re-parented, so matching on the
# old parent PID may find nothing - confirm this check does what is intended.
has_active_children() {
  local pid_file="${RUN_DIR}/pipeline.pid"
  [[ -f "$pid_file" ]] || return 1

  local pid
  pid=$(cat -- "$pid_file" 2>/dev/null) || return 1

  # Trim surrounding whitespace so a padded PID is still accepted.
  pid="${pid#"${pid%%[![:space:]]*}"}"
  pid="${pid%"${pid##*[![:space:]]}"}"

  # Reject anything but a positive integer. In particular "0" would match
  # top-level system processes and block restarts indefinitely.
  [[ "$pid" =~ ^[1-9][0-9]*$ ]] || return 1

  # Check if any child processes exist
  pgrep -P "$pid" > /dev/null 2>&1
}
# --- Restart pipeline ---
# Relaunches the pipeline in the background with --resume and records the
# new PID.
# Globals: PYTHON_PATH, RUN_DIR, MAX_RETRIES (read); retry_count (incremented)
# Outputs: two log lines (attempt announcement and new PID); overwrites
#          ${RUN_DIR}/pipeline.pid with the PID of the background job
restart_pipeline() {
  log "Attempting pipeline restart (attempt $((retry_count + 1))/${MAX_RETRIES})"

  # Quoted so an interpreter path containing spaces or glob characters is
  # executed as a single word instead of being split.
  "$PYTHON_PATH" -m researchclaw run --resume --output "$RUN_DIR" &
  local new_pid=$!

  printf '%s\n' "$new_pid" > "${RUN_DIR}/pipeline.pid"
  log "Pipeline restarted with PID ${new_pid}"
  retry_count=$((retry_count + 1))
}
# --- Main loop ---
# Logs the effective settings once, then polls forever. Each pass sleeps
# CHECK_INTERVAL seconds and restarts the pipeline only when the recorded
# PID is dead, the heartbeat is stale, and no child processes are reported.
# The script exits 1 once retry_count has reached MAX_RETRIES.
log "Sentinel started for ${RUN_DIR}"
log "Check interval: ${CHECK_INTERVAL}s, Stale threshold: ${STALE_THRESHOLD}s"
log "Max retries: ${MAX_RETRIES}, Cooldown: ${COOLDOWN}s"

while :; do
  sleep "$CHECK_INTERVAL"

  # A live pipeline clears the failure streak; nothing else to do this pass.
  if pid_alive; then
    consecutive_failures=0
    continue
  fi

  # Dead PID but fresh heartbeat: the pipeline might have just exited
  # normally, so hold off for now.
  # NOTE(review): once that heartbeat ages past STALE_THRESHOLD the restart
  # path below still runs; no completion marker is checked here - confirm
  # that is intended.
  is_stale || continue

  # Don't interrupt active subprocesses.
  if has_active_children; then
    log "Active subprocesses detected — skipping restart"
    continue
  fi

  # Restart budget exhausted: record the failure and stop.
  if (( retry_count >= MAX_RETRIES )); then
    log "Max retries (${MAX_RETRIES}) reached — sentinel giving up"
    printf '%s\n' "Sentinel failed after ${MAX_RETRIES} retries at $(date)" >> "$FAILED_LOG"
    exit 1
  fi

  # Every third failure in a row pauses for COOLDOWN seconds before the
  # restart, after which the streak starts over.
  consecutive_failures=$(( consecutive_failures + 1 ))
  if (( consecutive_failures >= 3 )); then
    log "3 consecutive failures — cooling down for ${COOLDOWN}s"
    sleep "$COOLDOWN"
    consecutive_failures=0
  fi

  restart_pipeline
done