fix(monitor): add periodic status reporting and fix silent health checks

warrenchristian1telus · warrenchristian1telus · commit 28c448a23ff8 · 2026-03-22T14:51:12.000-07:00
Two bugs caused zero output between Galera checks:
1. check_pod_health returned silently both when pods were healthy and
   when no pods matched a selector — the two cases were indistinguishable
2. Hourly alive message used (epoch_seconds % 3600 == 0), which
   almost never fires (a check must run on the exact second of an hour boundary)

Fixes:
- Track pods_checked counter across all selectors per cycle
- Track check_cycle number for log correlation
- Replace broken modulo check with time-delta STATUS_REPORT_INTERVAL
  (default 600s / 10 minutes, configurable)
- Print periodic summary: check number, pods scanned, pods tracked
- Always log immediately when issues are found
- Suppress stderr on pod listing to avoid noise for missing selectors
diff --git a/openshift/scripts/monitor-pods.sh b/openshift/scripts/monitor-pods.sh
@@ -61,12 +61,13 @@ oc() { command oc "$@" 2> >(grep -v "^Warning:" >&2); }
 
 # Unified pod health check function
 # $1: selector  $2: error_patterns  $3: restart_enabled (true/false)
+# Side effect: increments the global pods_checked counter (caller resets it each cycle) for status reporting
 check_pod_health() {
   local selector="$1"
   local error_patterns="$2"
   local restart_enabled="${3:-false}"
 
-  local pods=$(oc get pods -l "$selector" --field-selector=status.phase=Running -o jsonpath='{.items[*].metadata.name}')
+  local pods=$(oc get pods -l "$selector" --field-selector=status.phase=Running -o jsonpath='{.items[*].metadata.name}' 2>/dev/null)
 
   if [[ -z "$pods" ]]; then
     return 0
@@ -102,6 +103,8 @@ check_pod_health() {
       error_counts["$pod"]=0
     fi
 
+    pods_checked=$((pods_checked + 1))
+
     # Restart only for restart-eligible pods at error threshold
     if [[ "$restart_enabled" == "true" && ${error_counts["$pod"]:-0} -ge $ERROR_THRESHOLD ]]; then
       echo "$(date): Restarting $pod after $ERROR_THRESHOLD consecutive errors"
@@ -118,6 +121,9 @@ check_pod_health() {
 
 # Main monitoring loop
 last_galera_check=0
+last_status_report=0
+check_cycle=0
+STATUS_REPORT_INTERVAL=${STATUS_REPORT_INTERVAL:-600}  # Status summary every 10 minutes
 
 # Send startup notification
 send_notification "MONITORING_START" "Pod Health Monitor Started" "Continuous monitoring active with ${MONITORING_INTERVAL}s intervals. Galera checks every ${GALERA_CHECK_INTERVAL}s." "white_check_mark" "$DEPLOY_NAMESPACE"
@@ -146,6 +152,7 @@ while true; do
 
   # Perform health checks — restart-eligible services
   total_issues=0
+  pods_checked=0
   for selector in "${!RESTART_DEPLOYMENTS[@]}"; do
     check_pod_health "$selector" "${RESTART_DEPLOYMENTS[$selector]}" "true"
     total_issues=$((total_issues + $?))
@@ -157,6 +164,8 @@ while true; do
     total_issues=$((total_issues + $?))
   done
 
+  check_cycle=$((check_cycle + 1))
+
   # Comprehensive Galera check at longer interval
   if [[ $((current_time - last_galera_check)) -ge $GALERA_CHECK_INTERVAL ]]; then
     echo "$(date): Performing comprehensive Galera health check..."
@@ -184,11 +193,12 @@ while true; do
     last_galera_check=$current_time
   fi
 
-  # Brief status report (don't spam logs)
+  # Status report — always log issues, periodic summary when healthy
   if [[ $total_issues -gt 0 ]]; then
-    echo "$(date): Health check completed - $total_issues issue(s) found and addressed"
-  elif [[ $((current_time % 3600)) -eq 0 ]]; then  # Hourly "alive" message
-    echo "$(date): Health monitoring active - all systems nominal"
+    echo "$(date): Health check #$check_cycle — $total_issues issue(s) found ($pods_checked pods scanned)"
+  elif [[ $((current_time - last_status_report)) -ge $STATUS_REPORT_INTERVAL ]]; then
+    echo "$(date): Health check #$check_cycle — all nominal ($pods_checked pods scanned, ${#error_counts[@]} tracked)"
+    last_status_report=$current_time
   fi
 
   # Wait for next check