
Commit 5f8c36e

Merge pull request #183 from DrDroidLab/prateek/feat/health_monitoring_&_early_degradation_detection
feat: Added comprehensive health monitoring with early degradation de…
2 parents 41cfb02 + d8f11ed commit 5f8c36e

2 files changed: +507, -10 lines changed

agent/health_monitor.py

Lines changed: 350 additions & 0 deletions
@@ -0,0 +1,350 @@
"""
Enhanced health monitoring utilities for VPC Agent pods.
Provides degradation detection and early warning signals.
"""
import re
from enum import Enum
from typing import Dict, List, Optional, Tuple
from datetime import datetime, timezone


class HealthStatus(Enum):
    """Health status levels for containers and pods"""
    HEALTHY = "healthy"        # All metrics normal
    DEGRADED = "degraded"      # Warning signs detected
    CRITICAL = "critical"      # Severe issues, failure imminent
    UNHEALTHY = "unhealthy"    # Failed or terminated


class HealthMetrics:
    """Calculate health metrics and scores for containers/pods"""

    # Resource usage thresholds
    CPU_WARNING_THRESHOLD = 80      # %
    CPU_CRITICAL_THRESHOLD = 95     # %
    MEMORY_WARNING_THRESHOLD = 80   # %
    MEMORY_CRITICAL_THRESHOLD = 95  # %

    # Restart count thresholds
    RESTART_WARNING_THRESHOLD = 3
    RESTART_CRITICAL_THRESHOLD = 10

    # Critical event reasons that indicate imminent failure
    CRITICAL_EVENT_REASONS = {
        'OOMKilling', 'OOMKilled', 'Evicted', 'Failed',
        'FailedMount', 'FailedAttachVolume', 'FailedScheduling',
        'CrashLoopBackOff', 'ImagePullBackOff', 'ErrImagePull'
    }

    # Warning event reasons
    WARNING_EVENT_REASONS = {
        'BackOff', 'Unhealthy', 'ProbeWarning', 'NodeNotReady',
        'Pulling', 'BackoffLimitExceeded'
    }

    @staticmethod
    def parse_resource_value(value: str, resource_type: str) -> Optional[float]:
        """
        Parse Kubernetes resource values to numeric form.

        Args:
            value: Resource value like "14271868n", "212900Ki", "1500m"
            resource_type: 'cpu' or 'memory'

        Returns:
            Normalized value (CPU in millicores, Memory in MiB)
        """
        if not value or value == 'unknown':
            return None

        try:
            if resource_type == 'cpu':
                # CPU: handle 'n' (nanocores), 'm' (millicores), or plain number
                if value.endswith('n'):
                    # nanocores to millicores: divide by 1,000,000
                    return float(value[:-1]) / 1_000_000
                elif value.endswith('m'):
                    # already in millicores
                    return float(value[:-1])
                else:
                    # cores to millicores: multiply by 1000
                    return float(value) * 1000

            elif resource_type == 'memory':
                # Memory: handle Ki, Mi, Gi, K, M, G
                units = {
                    'Ki': 1024,
                    'Mi': 1024 * 1024,
                    'Gi': 1024 * 1024 * 1024,
                    'K': 1000,
                    'M': 1000 * 1000,
                    'G': 1000 * 1000 * 1000,
                }

                for suffix, multiplier in units.items():
                    if value.endswith(suffix):
                        bytes_val = float(value[:-len(suffix)]) * multiplier
                        # Convert to MiB for consistency
                        return bytes_val / (1024 * 1024)

                # Plain bytes
                return float(value) / (1024 * 1024)

        except (ValueError, AttributeError):
            return None

        return None

    @staticmethod
    def calculate_usage_percentage(usage: str, limit: str, resource_type: str) -> Optional[float]:
        """Calculate resource usage as percentage of limit"""
        usage_val = HealthMetrics.parse_resource_value(usage, resource_type)
        limit_val = HealthMetrics.parse_resource_value(limit, resource_type)

        if usage_val is None or limit_val is None or limit_val == 0:
            return None

        return (usage_val / limit_val) * 100

    @staticmethod
    def assess_container_health(
        container_state: str,
        ready: bool,
        restart_count: int,
        cpu_usage: Optional[str] = None,
        memory_usage: Optional[str] = None,
        cpu_limit: Optional[str] = None,
        memory_limit: Optional[str] = None,
        exit_code: Optional[int] = None,
        termination_reason: Optional[str] = None
    ) -> Tuple[HealthStatus, List[str], int]:
        """
        Assess container health and return status, issues, and score.

        Args:
            container_state: Current state (running/terminated/waiting)
            ready: Ready status
            restart_count: Number of restarts
            cpu_usage: Current CPU usage
            memory_usage: Current memory usage
            cpu_limit: CPU limit
            memory_limit: Memory limit
            exit_code: Last exit code if terminated
            termination_reason: Reason for termination

        Returns:
            Tuple of (HealthStatus, issues_list, health_score)
            Health score: 0-100 (100 = perfect health)
        """
        issues = []
        score = 100
        status = HealthStatus.HEALTHY

        # Check container state
        if container_state == "terminated":
            status = HealthStatus.UNHEALTHY
            score = 0

            if termination_reason == "OOMKilled":
                issues.append("Container was OOM killed - increase memory limit")
            elif exit_code and exit_code != 0:
                issues.append(f"Container terminated with exit code {exit_code}")
            else:
                issues.append("Container is terminated")

        elif container_state == "waiting":
            # Waiting state indicates startup problems
            if termination_reason:
                if "CrashLoopBackOff" in termination_reason:
                    status = HealthStatus.CRITICAL
                    score = 10
                    issues.append("Container in CrashLoopBackOff - check logs for startup errors")
                elif "ImagePullBackOff" in termination_reason or "ErrImagePull" in termination_reason:
                    status = HealthStatus.CRITICAL
                    score = 15
                    issues.append("Image pull failed - check image name and registry access")
                else:
                    status = HealthStatus.DEGRADED
                    score = 40
                    issues.append(f"Container waiting: {termination_reason}")
            else:
                status = HealthStatus.DEGRADED
                score = 50
                issues.append("Container in waiting state")

        elif container_state == "running":
            # Check readiness
            if not ready:
                status = HealthStatus.DEGRADED
                score -= 30
                issues.append("Container not ready - readiness probe failing")

            # Check restart count
            if restart_count >= HealthMetrics.RESTART_CRITICAL_THRESHOLD:
                if status == HealthStatus.HEALTHY:
                    status = HealthStatus.CRITICAL
                score -= 40
                issues.append(f"Excessive restarts ({restart_count}) - container unstable")
            elif restart_count >= HealthMetrics.RESTART_WARNING_THRESHOLD:
                if status == HealthStatus.HEALTHY:
                    status = HealthStatus.DEGRADED
                score -= 20
                issues.append(f"Elevated restart count ({restart_count})")

            # Check CPU usage
            if cpu_usage and cpu_limit:
                cpu_pct = HealthMetrics.calculate_usage_percentage(cpu_usage, cpu_limit, 'cpu')
                if cpu_pct is not None:
                    if cpu_pct >= HealthMetrics.CPU_CRITICAL_THRESHOLD:
                        if status == HealthStatus.HEALTHY:
                            status = HealthStatus.CRITICAL
                        score -= 30
                        issues.append(f"CPU usage critical ({cpu_pct:.1f}% of limit)")
                    elif cpu_pct >= HealthMetrics.CPU_WARNING_THRESHOLD:
                        if status == HealthStatus.HEALTHY:
                            status = HealthStatus.DEGRADED
                        score -= 15
                        issues.append(f"CPU usage high ({cpu_pct:.1f}% of limit)")

            # Check memory usage
            if memory_usage and memory_limit:
                mem_pct = HealthMetrics.calculate_usage_percentage(memory_usage, memory_limit, 'memory')
                if mem_pct is not None:
                    if mem_pct >= HealthMetrics.MEMORY_CRITICAL_THRESHOLD:
                        if status == HealthStatus.HEALTHY:
                            status = HealthStatus.CRITICAL
                        score -= 30
                        issues.append(f"Memory usage critical ({mem_pct:.1f}% of limit) - OOM risk")
                    elif mem_pct >= HealthMetrics.MEMORY_WARNING_THRESHOLD:
                        if status == HealthStatus.HEALTHY:
                            status = HealthStatus.DEGRADED
                        score -= 15
                        issues.append(f"Memory usage high ({mem_pct:.1f}% of limit)")

        # Ensure score doesn't go negative
        score = max(0, score)

        return status, issues, score

    @staticmethod
    def analyze_events(events: List[Dict]) -> Tuple[List[str], bool, bool]:
        """
        Analyze Kubernetes events for warning signs.

        Args:
            events: List of event dictionaries

        Returns:
            Tuple of (critical_events, has_warnings, has_oom_killed)
        """
        critical_events = []
        has_warnings = False
        has_oom_killed = False

        for event in events:
            event_type = event.get('type', '')
            reason = event.get('reason', '')
            message = event.get('message', '')

            # Check for OOMKilled
            if reason in ['OOMKilling', 'OOMKilled']:
                has_oom_killed = True
                critical_events.append(f"OOM Event: {message}")

            # Check for critical events
            elif reason in HealthMetrics.CRITICAL_EVENT_REASONS:
                critical_events.append(f"{reason}: {message}")

            # Check for warning events
            elif reason in HealthMetrics.WARNING_EVENT_REASONS or event_type == 'Warning':
                has_warnings = True

        return critical_events, has_warnings, has_oom_killed

    @staticmethod
    def calculate_pod_health(
        containers_health: List[Dict],
        events: List[Dict],
        pod_phase: str
    ) -> Tuple[HealthStatus, List[str], int]:
        """
        Calculate overall pod health based on container health and events.

        Args:
            containers_health: List of container health assessments
            events: Pod events
            pod_phase: Pod phase (Running/Pending/Failed)

        Returns:
            Tuple of (HealthStatus, issues_list, health_score)
        """
        issues = []

        # Check pod phase first
        if pod_phase in ['Failed', 'Unknown']:
            return HealthStatus.UNHEALTHY, ["Pod in Failed state"], 0

        elif pod_phase == 'Pending':
            return HealthStatus.DEGRADED, ["Pod stuck in Pending state"], 30

        # Analyze events
        critical_events, has_warnings, has_oom_killed = HealthMetrics.analyze_events(events)

        if critical_events:
            issues.extend(critical_events)

        if has_oom_killed:
            issues.append("Pod experienced OOM kills - increase memory limits")

        # Get worst container status
        worst_status = HealthStatus.HEALTHY
        min_score = 100
        container_issues_count = 0

        for container_health in containers_health:
            status_str = container_health.get('health_status')
            score = container_health.get('health_score', 100)
            container_issues = container_health.get('issues', [])

            if container_issues:
                container_issues_count += len(container_issues)

            # Convert string status to enum
            try:
                status = HealthStatus(status_str)
            except (ValueError, TypeError):
                status = HealthStatus.HEALTHY

            # Track worst status
            status_priority = {
                HealthStatus.UNHEALTHY: 4,
                HealthStatus.CRITICAL: 3,
                HealthStatus.DEGRADED: 2,
                HealthStatus.HEALTHY: 1
            }

            if status_priority.get(status, 0) > status_priority.get(worst_status, 0):
                worst_status = status

            min_score = min(min_score, score)

        # Overall pod status is worst container status
        pod_status = worst_status

        # Apply event-based adjustments
        if has_oom_killed and pod_status.value in ['healthy', 'degraded']:
            pod_status = HealthStatus.CRITICAL
            min_score = min(min_score, 20)

        elif critical_events and pod_status == HealthStatus.HEALTHY:
            pod_status = HealthStatus.DEGRADED
            min_score = min(min_score, 60)

        elif has_warnings and pod_status == HealthStatus.HEALTHY:
            min_score = min(min_score, 80)

        # Add summary issue
        if container_issues_count > 0:
            issues.append(f"Total issues detected across containers: {container_issues_count}")

        return pod_status, issues, min_score
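
For reference, a minimal usage sketch of the new utilities. It is not part of the diff above; the import path follows the file location (agent/health_monitor.py), and the example inputs and dict keys are assumptions taken from the docstrings and code in this commit.

# Minimal usage sketch (hypothetical; assumes the module is importable as agent.health_monitor)
from agent.health_monitor import HealthMetrics

# parse_resource_value normalizes Kubernetes resource strings:
# nanocores -> millicores, Ki/Mi/Gi -> MiB
print(HealthMetrics.parse_resource_value("14271868n", "cpu"))    # ~14.27 millicores
print(HealthMetrics.parse_resource_value("212900Ki", "memory"))  # ~207.91 MiB

# Assess one running container with an elevated restart count and high memory usage
status, issues, score = HealthMetrics.assess_container_health(
    container_state="running",
    ready=True,
    restart_count=4,          # >= RESTART_WARNING_THRESHOLD (3) -> -20
    cpu_usage="500m",         # 50% of the 1000m limit -> no deduction
    memory_usage="900Mi",     # ~87.9% of the 1Gi limit -> memory warning, -15
    cpu_limit="1000m",
    memory_limit="1Gi",
)
print(status, score, issues)  # HealthStatus.DEGRADED, 65, two issues

# Roll the container assessment up to pod level together with recent events
pod_status, pod_issues, pod_score = HealthMetrics.calculate_pod_health(
    containers_health=[{
        "health_status": status.value,  # calculate_pod_health expects the string form
        "health_score": score,
        "issues": issues,
    }],
    events=[{"type": "Warning", "reason": "BackOff",
             "message": "Back-off restarting failed container"}],
    pod_phase="Running",
)
print(pod_status, pod_score, pod_issues)  # HealthStatus.DEGRADED, 65, summary issue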

0 commit comments
