"""
Enhanced health monitoring utilities for VPC Agent pods.
Provides degradation detection and early warning signals.
"""
from enum import Enum
from typing import Dict, List, Optional, Tuple


class HealthStatus(Enum):
    """Health status levels for containers and pods."""
    HEALTHY = "healthy"      # All metrics normal
    DEGRADED = "degraded"    # Warning signs detected
    CRITICAL = "critical"    # Severe issues, failure imminent
    UNHEALTHY = "unhealthy"  # Failed or terminated


class HealthMetrics:
    """Calculate health metrics and scores for containers/pods."""

    # Resource usage thresholds
    CPU_WARNING_THRESHOLD = 80  # %
    CPU_CRITICAL_THRESHOLD = 95  # %
    MEMORY_WARNING_THRESHOLD = 80  # %
    MEMORY_CRITICAL_THRESHOLD = 95  # %

    # Restart count thresholds
    RESTART_WARNING_THRESHOLD = 3
    RESTART_CRITICAL_THRESHOLD = 10

    # Critical event reasons that indicate imminent failure
    CRITICAL_EVENT_REASONS = {
        'OOMKilling', 'OOMKilled', 'Evicted', 'Failed',
        'FailedMount', 'FailedAttachVolume', 'FailedScheduling',
        'CrashLoopBackOff', 'ImagePullBackOff', 'ErrImagePull'
    }

    # Warning event reasons
    WARNING_EVENT_REASONS = {
        'BackOff', 'Unhealthy', 'ProbeWarning', 'NodeNotReady',
        'Pulling', 'BackoffLimitExceeded'
    }

    @staticmethod
    def parse_resource_value(value: str, resource_type: str) -> Optional[float]:
        """
        Parse Kubernetes resource values to numeric form.

        Args:
            value: Resource value like "14271868n", "212900Ki", "1500m"
            resource_type: 'cpu' or 'memory'

        Returns:
            Normalized value (CPU in millicores, memory in MiB), or None if
            the value cannot be parsed.
        """
        if not value or value == 'unknown':
            return None

        try:
            if resource_type == 'cpu':
                # CPU: handle 'n' (nanocores), 'm' (millicores), or plain cores
                if value.endswith('n'):
                    # nanocores to millicores: divide by 1,000,000
                    return float(value[:-1]) / 1_000_000
                elif value.endswith('m'):
                    # already in millicores
                    return float(value[:-1])
                else:
                    # cores to millicores: multiply by 1000
                    return float(value) * 1000

            elif resource_type == 'memory':
                # Memory: handle binary (Ki, Mi, Gi) and decimal (k/K, M, G) suffixes.
                # Two-character suffixes must be checked before single-character ones.
                units = {
                    'Ki': 1024,
                    'Mi': 1024 * 1024,
                    'Gi': 1024 * 1024 * 1024,
                    'k': 1000,
                    'K': 1000,
                    'M': 1000 * 1000,
                    'G': 1000 * 1000 * 1000,
                }

                for suffix, multiplier in units.items():
                    if value.endswith(suffix):
                        bytes_val = float(value[:-len(suffix)]) * multiplier
                        # Convert to MiB for consistency
                        return bytes_val / (1024 * 1024)

                # Plain bytes
                return float(value) / (1024 * 1024)

        except (ValueError, AttributeError):
            return None

        return None

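    # A quick sketch of the conversions above, using the sample values from the
    # docstring (results approximate):
    #   parse_resource_value("14271868n", "cpu")   -> ~14.27  (millicores)
    #   parse_resource_value("1500m", "cpu")       -> 1500.0  (millicores)
    #   parse_resource_value("212900Ki", "memory") -> ~207.9  (MiB)
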
    @staticmethod
    def calculate_usage_percentage(usage: str, limit: str, resource_type: str) -> Optional[float]:
        """Calculate resource usage as percentage of limit"""
        usage_val = HealthMetrics.parse_resource_value(usage, resource_type)
        limit_val = HealthMetrics.parse_resource_value(limit, resource_type)

        if usage_val is None or limit_val is None or limit_val == 0:
            return None

        return (usage_val / limit_val) * 100

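    # For example (hypothetical values): usage "850m" against a limit of "1"
    # core normalizes to 850 / 1000 millicores and yields 85.0 percent.
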
    @staticmethod
    def assess_container_health(
        container_state: str,
        ready: bool,
        restart_count: int,
        cpu_usage: Optional[str] = None,
        memory_usage: Optional[str] = None,
        cpu_limit: Optional[str] = None,
        memory_limit: Optional[str] = None,
        exit_code: Optional[int] = None,
        termination_reason: Optional[str] = None
    ) -> Tuple[HealthStatus, List[str], int]:
        """
        Assess container health and return status, issues, and score.

        Args:
            container_state: Current state (running/terminated/waiting)
            ready: Ready status
            restart_count: Number of restarts
            cpu_usage: Current CPU usage
            memory_usage: Current memory usage
            cpu_limit: CPU limit
            memory_limit: Memory limit
            exit_code: Last exit code if terminated
            termination_reason: Reason for termination, or the waiting reason
                (e.g. CrashLoopBackOff) when the container is in a waiting state

        Returns:
            Tuple of (HealthStatus, issues_list, health_score)
            Health score: 0-100 (100 = perfect health)
        """
        issues = []
        score = 100
        status = HealthStatus.HEALTHY

        # Check container state
        if container_state == "terminated":
            status = HealthStatus.UNHEALTHY
            score = 0

            if termination_reason == "OOMKilled":
                issues.append("Container was OOM killed - increase memory limit")
            elif exit_code and exit_code != 0:
                issues.append(f"Container terminated with exit code {exit_code}")
            else:
                issues.append("Container is terminated")

        elif container_state == "waiting":
            # Waiting state indicates startup problems
            if termination_reason:
                if "CrashLoopBackOff" in termination_reason:
                    status = HealthStatus.CRITICAL
                    score = 10
                    issues.append("Container in CrashLoopBackOff - check logs for startup errors")
                elif "ImagePullBackOff" in termination_reason or "ErrImagePull" in termination_reason:
                    status = HealthStatus.CRITICAL
                    score = 15
                    issues.append("Image pull failed - check image name and registry access")
                else:
                    status = HealthStatus.DEGRADED
                    score = 40
                    issues.append(f"Container waiting: {termination_reason}")
            else:
                status = HealthStatus.DEGRADED
                score = 50
                issues.append("Container in waiting state")

        elif container_state == "running":
            # Check readiness
            if not ready:
                status = HealthStatus.DEGRADED
                score -= 30
                issues.append("Container not ready - readiness probe failing")

            # Check restart count
            if restart_count >= HealthMetrics.RESTART_CRITICAL_THRESHOLD:
                if status == HealthStatus.HEALTHY:
                    status = HealthStatus.CRITICAL
                score -= 40
                issues.append(f"Excessive restarts ({restart_count}) - container unstable")
            elif restart_count >= HealthMetrics.RESTART_WARNING_THRESHOLD:
                if status == HealthStatus.HEALTHY:
                    status = HealthStatus.DEGRADED
                score -= 20
                issues.append(f"Elevated restart count ({restart_count})")

            # Check CPU usage
            if cpu_usage and cpu_limit:
                cpu_pct = HealthMetrics.calculate_usage_percentage(cpu_usage, cpu_limit, 'cpu')
                if cpu_pct is not None:
                    if cpu_pct >= HealthMetrics.CPU_CRITICAL_THRESHOLD:
                        if status == HealthStatus.HEALTHY:
                            status = HealthStatus.CRITICAL
                        score -= 30
                        issues.append(f"CPU usage critical ({cpu_pct:.1f}% of limit)")
                    elif cpu_pct >= HealthMetrics.CPU_WARNING_THRESHOLD:
                        if status == HealthStatus.HEALTHY:
                            status = HealthStatus.DEGRADED
                        score -= 15
                        issues.append(f"CPU usage high ({cpu_pct:.1f}% of limit)")

            # Check memory usage
            if memory_usage and memory_limit:
                mem_pct = HealthMetrics.calculate_usage_percentage(memory_usage, memory_limit, 'memory')
                if mem_pct is not None:
                    if mem_pct >= HealthMetrics.MEMORY_CRITICAL_THRESHOLD:
                        if status == HealthStatus.HEALTHY:
                            status = HealthStatus.CRITICAL
                        score -= 30
                        issues.append(f"Memory usage critical ({mem_pct:.1f}% of limit) - OOM risk")
                    elif mem_pct >= HealthMetrics.MEMORY_WARNING_THRESHOLD:
                        if status == HealthStatus.HEALTHY:
                            status = HealthStatus.DEGRADED
                        score -= 15
                        issues.append(f"Memory usage high ({mem_pct:.1f}% of limit)")

        # Ensure score doesn't go negative
        score = max(0, score)

        return status, issues, score

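    # Worked example (hypothetical inputs): a running container that is not
    # ready and has 5 restarts scores 100 - 30 (not ready) - 20 (restart
    # warning) = 50 and is reported as DEGRADED.
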
    @staticmethod
    def analyze_events(events: List[Dict]) -> Tuple[List[str], bool, bool]:
        """
        Analyze Kubernetes events for warning signs.

        Args:
            events: List of event dictionaries

        Returns:
            Tuple of (critical_events, has_warnings, has_oom_killed)
        """
        critical_events = []
        has_warnings = False
        has_oom_killed = False

        for event in events:
            event_type = event.get('type', '')
            reason = event.get('reason', '')
            message = event.get('message', '')

            # Check for OOMKilled
            if reason in ['OOMKilling', 'OOMKilled']:
                has_oom_killed = True
                critical_events.append(f"OOM Event: {message}")

            # Check for critical events
            elif reason in HealthMetrics.CRITICAL_EVENT_REASONS:
                critical_events.append(f"{reason}: {message}")

            # Check for warning events
            elif reason in HealthMetrics.WARNING_EVENT_REASONS or event_type == 'Warning':
                has_warnings = True

        return critical_events, has_warnings, has_oom_killed

    @staticmethod
    def calculate_pod_health(
        containers_health: List[Dict],
        events: List[Dict],
        pod_phase: str
    ) -> Tuple[HealthStatus, List[str], int]:
        """
        Calculate overall pod health based on container health and events.

        Args:
            containers_health: List of container health assessments
            events: Pod events
            pod_phase: Pod phase (Running/Pending/Failed/Unknown)

        Returns:
            Tuple of (HealthStatus, issues_list, health_score)
        """
        issues = []

        # Check pod phase first
        if pod_phase in ['Failed', 'Unknown']:
            return HealthStatus.UNHEALTHY, [f"Pod in {pod_phase} state"], 0

        elif pod_phase == 'Pending':
            return HealthStatus.DEGRADED, ["Pod stuck in Pending state"], 30

        # Analyze events
        critical_events, has_warnings, has_oom_killed = HealthMetrics.analyze_events(events)

        if critical_events:
            issues.extend(critical_events)

        if has_oom_killed:
            issues.append("Pod experienced OOM kills - increase memory limits")

        # Get worst container status
        status_priority = {
            HealthStatus.UNHEALTHY: 4,
            HealthStatus.CRITICAL: 3,
            HealthStatus.DEGRADED: 2,
            HealthStatus.HEALTHY: 1
        }

        worst_status = HealthStatus.HEALTHY
        min_score = 100
        container_issues_count = 0

        for container_health in containers_health:
            status_str = container_health.get('health_status')
            score = container_health.get('health_score', 100)
            container_issues = container_health.get('issues', [])

            if container_issues:
                container_issues_count += len(container_issues)

            # Convert string status to enum
            try:
                status = HealthStatus(status_str)
            except (ValueError, TypeError):
                status = HealthStatus.HEALTHY

            # Track worst status
            if status_priority.get(status, 0) > status_priority.get(worst_status, 0):
                worst_status = status

            min_score = min(min_score, score)

        # Overall pod status is worst container status
        pod_status = worst_status

        # Apply event-based adjustments
        if has_oom_killed and pod_status in (HealthStatus.HEALTHY, HealthStatus.DEGRADED):
            pod_status = HealthStatus.CRITICAL
            min_score = min(min_score, 20)

        elif critical_events and pod_status == HealthStatus.HEALTHY:
            pod_status = HealthStatus.DEGRADED
            min_score = min(min_score, 60)

        elif has_warnings and pod_status == HealthStatus.HEALTHY:
            min_score = min(min_score, 80)

        # Add summary issue
        if container_issues_count > 0:
            issues.append(f"Total issues detected across containers: {container_issues_count}")

        return pod_status, issues, min_score
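
# A minimal usage sketch (illustrative only): the metric strings and the event
# below are hypothetical sample values, not output captured from a cluster.
if __name__ == "__main__":
    status, issues, score = HealthMetrics.assess_container_health(
        container_state="running",
        ready=True,
        restart_count=4,
        cpu_usage="850m",
        memory_usage="900Mi",
        cpu_limit="1000m",
        memory_limit="1Gi",
    )
    print(f"container: {status.value} (score={score})")
    for issue in issues:
        print(f"  - {issue}")

    pod_status, pod_issues, pod_score = HealthMetrics.calculate_pod_health(
        containers_health=[{
            "health_status": status.value,
            "health_score": score,
            "issues": issues,
        }],
        events=[{"type": "Warning", "reason": "BackOff",
                 "message": "Back-off restarting failed container"}],
        pod_phase="Running",
    )
    print(f"pod: {pod_status.value} (score={pod_score})")
    for issue in pod_issues:
        print(f"  - {issue}")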