Skip to content

Commit 4d670e8

Browse files
Add comprehensive auto-diagnostics and self-healing to health checks
When MCP servers fail, the installer now automatically:
1. Checks if Node.js is installed (with instructions to fix)
2. Checks if node_modules exist (auto-runs npm install if missing)
3. Checks if launchd loaded the service
4. Checks if process is running but port not responding
5. Checks if another process is hogging the port
6. Shows filtered error logs (real errors, not startup messages)
7. Auto-recovers: kills stale port, starts server directly via nohup
8. Tests Docker→host connectivity
9. Reports final status with actionable fix instructions

No more back-and-forth debugging with users.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 6255f9e commit 4d670e8

1 file changed

Lines changed: 124 additions & 16 deletions

File tree

install.sh

Lines changed: 124 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -985,11 +985,38 @@ start_and_show() {
985985
echo ""
986986
bash "$IBEX_DIR/scripts/launchd-service.sh" install
987987

988-
# ── Verify MCP servers are healthy ──────────────────────────
988+
# ── Verify MCP servers are healthy (with full auto-diagnostics) ──
989989
echo ""
990990
echo " Checking MCP server health..."
991991
echo ""
992992

993+
# Pre-flight: check node is available
994+
local node_path
995+
node_path=$(which node 2>/dev/null)
996+
if [ -z "$node_path" ]; then
997+
for p in /opt/homebrew/bin/node /usr/local/bin/node; do
998+
[ -x "$p" ] && node_path="$p" && break
999+
done
1000+
fi
1001+
if [ -z "$node_path" ]; then
1002+
printf " ${RED}✗ Node.js is not installed!${NC}\n"
1003+
printf " MCP servers require Node.js. Install it:\n"
1004+
printf " brew install node\n"
1005+
printf " Then re-run the installer.\n\n"
1006+
return 1
1007+
fi
1008+
printf " ${GREEN}${NC} Node.js: $node_path ($(${node_path} --version))\n"
1009+
1010+
# Pre-flight: check node_modules exist
1011+
if [ ! -d "$IBEX_DIR/servers/node_modules" ]; then
1012+
printf " ${RED}✗ node_modules missing!${NC}\n"
1013+
printf " Running npm install...\n"
1014+
(cd "$IBEX_DIR/servers" && npm install --production 2>&1) | tail -3
1015+
else
1016+
printf " ${GREEN}${NC} node_modules present\n"
1017+
fi
1018+
1019+
echo ""
9931020
local servers_ok=0 servers_fail=0
9941021

9951022
for port in 3001 3002 3003 3005 3006; do
@@ -998,14 +1025,17 @@ start_and_show() {
9981025
3001) sname="Slack" ;; 3002) sname="Notion" ;; 3003) sname="Jira" ;;
9991026
3005) sname="ServiceNow" ;; 3006) sname="Percona Docs" ;;
10001027
esac
1028+
local sname_lower
1029+
sname_lower=$(echo "$sname" | tr '[:upper:]' '[:lower:]')
1030+
10011031
# Check if this server is configured (has a launchd plist)
10021032
if ! ls ~/Library/LaunchAgents/com.ibex.mcp-*.plist 2>/dev/null | xargs grep -l "\"$port\"" >/dev/null 2>&1; then
10031033
continue # not configured, skip
10041034
fi
10051035

1006-
# Wait up to 5 seconds for the server to respond
1036+
# Wait up to 8 seconds for the server to respond
10071037
local healthy=false
1008-
for i in 1 2 3 4 5; do
1038+
for i in 1 2 3 4 5 6 7 8; do
10091039
if curl -sf --connect-timeout 1 "http://localhost:${port}/health" >/dev/null 2>&1; then
10101040
healthy=true
10111041
break
@@ -1018,22 +1048,97 @@ start_and_show() {
10181048
servers_ok=$((servers_ok + 1))
10191049
else
10201050
printf " ${RED}${NC} %s server (port %s) — NOT responding\n" "$sname" "$port"
1021-
# Show actual error from logs to help diagnose
1022-
local sname_lower=$(echo "$sname" | tr '[:upper:]' '[:lower:]')
1051+
1052+
# ── Auto-diagnose why ──
1053+
local label="com.ibex.mcp.${sname_lower}"
10231054
local errlog="$HOME/.ibex-logs/${sname_lower}.err"
10241055
local outlog="$HOME/.ibex-logs/${sname_lower}.log"
1025-
if [ -f "$errlog" ] && [ -s "$errlog" ]; then
1026-
printf " Last error: %s\n" "$(tail -1 "$errlog")"
1027-
elif [ -f "$outlog" ] && [ -s "$outlog" ]; then
1028-
printf " Last log: %s\n" "$(tail -1 "$outlog")"
1056+
1057+
# 1. Is launchd tracking it?
1058+
if ! launchctl list 2>/dev/null | grep -q "$label"; then
1059+
printf " ${YELLOW}${NC} launchd service not loaded\n"
10291060
else
1030-
printf " No logs found — launchd may not have started the service\n"
1061+
local lpid
1062+
lpid=$(launchctl list 2>/dev/null | grep "$label" | awk '{print $1}')
1063+
if [ "$lpid" = "-" ] || [ -z "$lpid" ]; then
1064+
printf " ${YELLOW}${NC} launchd loaded but process not running (crashed)\n"
1065+
else
1066+
printf " ${YELLOW}${NC} process running (PID $lpid) but port not responding\n"
1067+
fi
10311068
fi
1032-
# Check if launchd knows about it
1033-
local label="com.ibex.mcp.${sname_lower}"
1034-
if ! launchctl list 2>/dev/null | grep -q "$label"; then
1035-
printf " launchd service not loaded — re-run: ~/IBEX/scripts/launchd-service.sh install\n"
1069+
1070+
# 2. Is something else using the port?
1071+
local port_owner
1072+
port_owner=$(lsof -ti:${port} 2>/dev/null | head -1)
1073+
if [ -n "$port_owner" ]; then
1074+
local port_cmd
1075+
port_cmd=$(ps -p "$port_owner" -o comm= 2>/dev/null)
1076+
printf " ${YELLOW}${NC} port %s in use by PID %s (%s)\n" "$port" "$port_owner" "$port_cmd"
1077+
fi
1078+
1079+
# 3. Show last meaningful log lines
1080+
if [ -f "$errlog" ] && [ -s "$errlog" ]; then
1081+
# Filter out startup messages, show actual errors
1082+
local real_errors
1083+
real_errors=$(grep -v "Streamable HTTP on" "$errlog" | tail -3)
1084+
if [ -n "$real_errors" ]; then
1085+
printf " ${YELLOW}${NC} error log:\n"
1086+
echo "$real_errors" | while read -r line; do
1087+
printf " %s\n" "$line"
1088+
done
1089+
fi
1090+
fi
1091+
if [ -f "$outlog" ] && [ -s "$outlog" ]; then
1092+
local real_out
1093+
real_out=$(grep -iE "error|fatal|EADDRINUSE|EACCES|MODULE_NOT_FOUND|Cannot find" "$outlog" | tail -3)
1094+
if [ -n "$real_out" ]; then
1095+
printf " ${YELLOW}${NC} stdout errors:\n"
1096+
echo "$real_out" | while read -r line; do
1097+
printf " %s\n" "$line"
1098+
done
1099+
fi
1100+
fi
1101+
1102+
# 4. No logs at all?
1103+
if [ ! -s "$errlog" ] && [ ! -s "$outlog" ]; then
1104+
printf " ${YELLOW}${NC} no logs found — server never started\n"
1105+
printf " check plist: cat ~/Library/LaunchAgents/${label}.plist\n"
10361106
fi
1107+
1108+
# 5. Auto-recovery: try killing port + direct start
1109+
printf " ${YELLOW}${NC} attempting recovery...\n"
1110+
lsof -ti:${port} 2>/dev/null | xargs kill -9 2>/dev/null || true
1111+
sleep 1
1112+
1113+
# Find the server script for this port
1114+
local server_script=""
1115+
case $port in
1116+
3001) server_script="servers/slack.js" ;;
1117+
3002) server_script="servers/notion.js" ;;
1118+
3003) server_script="servers/jira.js" ;;
1119+
3005) server_script="servers/servicenow.js" ;;
1120+
3006) server_script="servers/percona-docs.js" ;;
1121+
esac
1122+
1123+
if [ -n "$server_script" ] && [ -f "$IBEX_DIR/$server_script" ]; then
1124+
# Source env and start directly
1125+
set -a; source "$HOME/.ibex-mcp.env" 2>/dev/null; set +a
1126+
nohup "$node_path" "$IBEX_DIR/$server_script" --http \
1127+
>> "$errlog" 2>&1 &
1128+
local recovery_pid=$!
1129+
sleep 3
1130+
if curl -sf --connect-timeout 2 "http://localhost:${port}/health" >/dev/null 2>&1; then
1131+
printf " ${GREEN}${NC} recovered! running as PID %s\n" "$recovery_pid"
1132+
servers_fail=$((servers_fail - 1)) # undo the fail count
1133+
servers_ok=$((servers_ok + 1))
1134+
else
1135+
printf " ${RED}${NC} recovery failed — last 5 log lines:\n"
1136+
tail -5 "$errlog" 2>/dev/null | while read -r line; do
1137+
printf " %s\n" "$line"
1138+
done
1139+
fi
1140+
fi
1141+
10371142
servers_fail=$((servers_fail + 1))
10381143
fi
10391144
done
@@ -1051,7 +1156,7 @@ start_and_show() {
10511156
printf "\n ${GREEN}${NC} Docker → host networking OK\n"
10521157
else
10531158
printf "\n ${RED}${NC} Docker CANNOT reach MCP servers on host\n"
1054-
printf " This means Open WebUI won't be able to use any tools.\n"
1159+
printf " Open WebUI won't be able to use tools.\n"
10551160
printf " Fix: Docker Desktop → Settings → General → enable 'Allow host networking'\n"
10561161
printf " Then re-run this installer.\n"
10571162
fi
@@ -1060,7 +1165,10 @@ start_and_show() {
10601165

10611166
if [ "$servers_fail" -gt 0 ]; then
10621167
printf "\n ${YELLOW}!${NC} %s server(s) failed health check — tools may not work\n" "$servers_fail"
1063-
printf " Run: ~/IBEX/scripts/launchd-service.sh status\n"
1168+
printf " Full diagnostics: ~/IBEX/scripts/launchd-service.sh status\n"
1169+
printf " Logs: cat ~/.ibex-logs/*.err\n"
1170+
else
1171+
printf "\n ${GREEN}${NC} All %s server(s) healthy\n" "$servers_ok"
10641172
fi
10651173

10661174
# Configure models with system prompt and tools

0 commit comments

Comments
 (0)