Skip to content

Commit 28c448a

Browse files
fix(monitor): add periodic status reporting and fix silent health checks
Two bugs caused zero output between Galera checks:

1. check_pod_health returned silently both when pods were healthy and when no pods matched a selector, leaving no way to distinguish the two cases.
2. The hourly "alive" message used (epoch_seconds % 3600 == 0), which almost never fires (it requires exact second alignment).

Fixes:

- Track pods_checked counter across all selectors per cycle
- Track check_cycle number for log correlation
- Replace broken modulo check with time-delta STATUS_REPORT_INTERVAL (default 600s / 10 minutes, configurable)
- Print periodic summary: check number, pods scanned, pods tracked
- Always log immediately when issues are found
- Suppress stderr on pod listing to avoid noise for missing selectors
1 parent 012276b commit 28c448a

File tree

1 file changed

+15
-5
lines changed

1 file changed

+15
-5
lines changed

openshift/scripts/monitor-pods.sh

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,12 +61,13 @@ oc() { command oc "$@" 2> >(grep -v "^Warning:" >&2); }
6161

6262
# Unified pod health check function
6363
# $1: selector $2: error_patterns $3: restart_enabled (true/false)
64+
# Sets global pods_checked counter as side-effect for status reporting
6465
check_pod_health() {
6566
local selector="$1"
6667
local error_patterns="$2"
6768
local restart_enabled="${3:-false}"
6869

69-
local pods=$(oc get pods -l "$selector" --field-selector=status.phase=Running -o jsonpath='{.items[*].metadata.name}')
70+
local pods=$(oc get pods -l "$selector" --field-selector=status.phase=Running -o jsonpath='{.items[*].metadata.name}' 2>/dev/null)
7071

7172
if [[ -z "$pods" ]]; then
7273
return 0
@@ -102,6 +103,8 @@ check_pod_health() {
102103
error_counts["$pod"]=0
103104
fi
104105

106+
pods_checked=$((pods_checked + 1))
107+
105108
# Restart only for restart-eligible pods at error threshold
106109
if [[ "$restart_enabled" == "true" && ${error_counts["$pod"]:-0} -ge $ERROR_THRESHOLD ]]; then
107110
echo "$(date): Restarting $pod after $ERROR_THRESHOLD consecutive errors"
@@ -118,6 +121,9 @@ check_pod_health() {
118121

119122
# Main monitoring loop
120123
last_galera_check=0
124+
last_status_report=0
125+
check_cycle=0
126+
STATUS_REPORT_INTERVAL=${STATUS_REPORT_INTERVAL:-600} # Status summary every 10 minutes
121127

122128
# Send startup notification
123129
send_notification "MONITORING_START" "Pod Health Monitor Started" "Continuous monitoring active with ${MONITORING_INTERVAL}s intervals. Galera checks every ${GALERA_CHECK_INTERVAL}s." "white_check_mark" "$DEPLOY_NAMESPACE"
@@ -146,6 +152,7 @@ while true; do
146152

147153
# Perform health checks — restart-eligible services
148154
total_issues=0
155+
pods_checked=0
149156
for selector in "${!RESTART_DEPLOYMENTS[@]}"; do
150157
check_pod_health "$selector" "${RESTART_DEPLOYMENTS[$selector]}" "true"
151158
total_issues=$((total_issues + $?))
@@ -157,6 +164,8 @@ while true; do
157164
total_issues=$((total_issues + $?))
158165
done
159166

167+
check_cycle=$((check_cycle + 1))
168+
160169
# Comprehensive Galera check at longer interval
161170
if [[ $((current_time - last_galera_check)) -ge $GALERA_CHECK_INTERVAL ]]; then
162171
echo "$(date): Performing comprehensive Galera health check..."
@@ -184,11 +193,12 @@ while true; do
184193
last_galera_check=$current_time
185194
fi
186195

187-
# Brief status report (don't spam logs)
196+
# Status report — always log issues, periodic summary when healthy
188197
if [[ $total_issues -gt 0 ]]; then
189-
echo "$(date): Health check completed - $total_issues issue(s) found and addressed"
190-
elif [[ $((current_time % 3600)) -eq 0 ]]; then # Hourly "alive" message
191-
echo "$(date): Health monitoring active - all systems nominal"
198+
echo "$(date): Health check #$check_cycle — $total_issues issue(s) found ($pods_checked pods scanned)"
199+
elif [[ $((current_time - last_status_report)) -ge $STATUS_REPORT_INTERVAL ]]; then
200+
echo "$(date): Health check #$check_cycle — all nominal ($pods_checked pods scanned, ${#error_counts[@]} tracked)"
201+
last_status_report=$current_time
192202
fi
193203

194204
# Wait for next check

0 commit comments

Comments
 (0)