Skip to content

Commit a0e5873

Browse files
committed
Added more metrics to the Grafana dashboard and their collection in the backend: pod creation failure rate, execution queue depth, wait times, resource utilization, and security events (access to the outer web), plus corresponding alerts in Alertmanager
1 parent 1c693f1 commit a0e5873

File tree

6 files changed

+1016
-70
lines changed

6 files changed

+1016
-70
lines changed

backend/alertmanager/alertmanager.yml

Lines changed: 75 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,85 @@ global:
33
smtp_from: 'alertmanager@example.com'
44

55
route:
6-
group_by: ['alertname']
6+
group_by: ['alertname', 'severity']
77
group_wait: 30s
88
group_interval: 5m
99
repeat_interval: 4h
10-
receiver: 'web.hook'
10+
receiver: 'default'
11+
routes:
12+
- match:
13+
severity: critical
14+
match_re:
15+
alertname: '(NetworkPolicyViolations|PrivilegeEscalationAttempts|CriticalMemoryUtilization|CriticalCPUUtilization)'
16+
group_wait: 10s
17+
group_interval: 1m
18+
repeat_interval: 30m
19+
receiver: 'critical-security'
20+
21+
- match:
22+
severity: critical
23+
group_wait: 15s
24+
group_interval: 2m
25+
repeat_interval: 1h
26+
receiver: 'critical-infrastructure'
27+
28+
- match:
29+
severity: warning
30+
group_wait: 1m
31+
group_interval: 10m
32+
repeat_interval: 6h
33+
receiver: 'warning'
1134

1235
receivers:
13-
- name: 'web.hook'
36+
- name: 'default'
37+
webhook_configs:
38+
- url: 'http://backend:443/api/v1/health'
39+
send_resolved: true
40+
title: 'Integr8sCode Alert: {{ .GroupLabels.alertname }}'
41+
text: |
42+
{{ range .Alerts }}
43+
Alert: {{ .Annotations.summary }}
44+
Description: {{ .Annotations.description }}
45+
Severity: {{ .Labels.severity }}
46+
{{ end }}
47+
48+
- name: 'critical-security'
49+
webhook_configs:
50+
- url: 'http://backend:443/api/v1/health'
51+
send_resolved: true
52+
title: '🚨 CRITICAL SECURITY ALERT: {{ .GroupLabels.alertname }}'
53+
text: |
54+
IMMEDIATE ATTENTION REQUIRED - Security incident detected!
55+
56+
{{ range .Alerts }}
57+
Alert: {{ .Annotations.summary }}
58+
Description: {{ .Annotations.description }}
59+
Labels: {{ range .Labels.SortedPairs }}{{ .Name }}={{ .Value }} {{ end }}
60+
{{ end }}
61+
62+
- name: 'critical-infrastructure'
63+
webhook_configs:
64+
- url: 'http://backend:443/api/v1/health'
65+
send_resolved: true
66+
title: '🔥 CRITICAL INFRASTRUCTURE ALERT: {{ .GroupLabels.alertname }}'
67+
text: |
68+
Critical system issue requiring immediate attention!
69+
70+
{{ range .Alerts }}
71+
Alert: {{ .Annotations.summary }}
72+
Description: {{ .Annotations.description }}
73+
Severity: {{ .Labels.severity }}
74+
{{ end }}
75+
76+
- name: 'warning'
1477
webhook_configs:
1578
- url: 'http://backend:443/api/v1/health'
16-
send_resolved: true
79+
send_resolved: true
80+
title: '⚠️ WARNING: {{ .GroupLabels.alertname }}'
81+
text: |
82+
Warning condition detected - monitoring recommended.
83+
84+
{{ range .Alerts }}
85+
Alert: {{ .Annotations.summary }}
86+
Description: {{ .Annotations.description }}
87+
{{ end }}

backend/app/core/metrics.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,3 +43,55 @@ def labels(self, *args: Any, **kwargs: Any) -> 'BoundedLabelsCounter':
4343
"Total number of script errors by type",
4444
["error_type"], # Limited to predefined error types
4545
)
46+
47+
# Pod creation failure metrics
48+
POD_CREATION_FAILURES = Counter(
49+
"pod_creation_failures_total",
50+
"Total number of pod creation failures by reason",
51+
["failure_reason"],
52+
)
53+
54+
# Queue depth and wait time metrics
55+
QUEUE_DEPTH = Gauge(
56+
"execution_queue_depth",
57+
"Current number of executions waiting in queue"
58+
)
59+
60+
QUEUE_WAIT_TIME = Histogram(
61+
"execution_queue_wait_time_seconds",
62+
"Time spent waiting in queue before execution starts",
63+
["lang_and_version"],
64+
buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0],
65+
)
66+
67+
# Resource utilization trend metrics
68+
CPU_UTILIZATION = Gauge(
69+
"script_cpu_utilization_millicores",
70+
"CPU utilization in millicores per script execution",
71+
["lang_and_version"]
72+
)
73+
74+
MEMORY_UTILIZATION_PERCENT = Gauge(
75+
"script_memory_utilization_percent",
76+
"Memory utilization as percentage of available memory",
77+
["lang_and_version"]
78+
)
79+
80+
# Security event tracking metrics
81+
SECURITY_EVENTS = Counter(
82+
"security_events_total",
83+
"Total number of security events by type",
84+
["event_type"],
85+
)
86+
87+
NETWORK_POLICY_VIOLATIONS = Counter(
88+
"network_policy_violations_total",
89+
"Total number of network policy violations",
90+
["policy_name"],
91+
)
92+
93+
PRIVILEGE_ESCALATION_ATTEMPTS = Counter(
94+
"privilege_escalation_attempts_total",
95+
"Total number of privilege escalation attempts",
96+
["execution_id"],
97+
)

backend/app/services/execution_service.py

Lines changed: 49 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,14 @@
88
from app.core.logging import logger
99
from app.core.metrics import (
1010
ACTIVE_EXECUTIONS,
11+
CPU_UTILIZATION,
1112
ERROR_COUNTER,
1213
EXECUTION_DURATION,
1314
MEMORY_USAGE,
15+
MEMORY_UTILIZATION_PERCENT,
16+
POD_CREATION_FAILURES,
17+
QUEUE_DEPTH,
18+
QUEUE_WAIT_TIME,
1419
SCRIPT_EXECUTIONS,
1520
)
1621
from app.db.repositories.execution_repository import (
@@ -25,7 +30,6 @@
2530
get_kubernetes_service,
2631
)
2732
from fastapi import Depends
28-
from kubernetes.client.rest import ApiException
2933

3034

3135
class ExecutionStatus(str, Enum):
@@ -53,8 +57,6 @@ async def get_k8s_resource_limits(self) -> Dict[str, Any]:
5357
"supported_runtimes": self.settings.SUPPORTED_RUNTIMES,
5458
}
5559

56-
# for whatever reason mypy is dumb and can't defer type of EXAMPLE_SCRIPTS
57-
# -> ignoring type
5860
async def get_example_scripts(self) -> Dict[str, str]:
5961
return self.settings.EXAMPLE_SCRIPTS # type: ignore
6062

@@ -89,11 +91,11 @@ async def _mark_running_when_scheduled(
8991
)
9092

9193
async def _start_k8s_execution(
92-
self,
93-
execution_id_str: str,
94-
script: str,
95-
lang: str,
96-
lang_version: str,
94+
self,
95+
execution_id_str: str,
96+
script: str,
97+
lang: str,
98+
lang_version: str,
9799
) -> None:
98100
"""
99101
1. Ask KubernetesService to create the Pod.
@@ -111,7 +113,6 @@ async def _start_k8s_execution(
111113
config_map_data={runtime_cfg.file_name: script},
112114
)
113115

114-
# Start background poller ⤵ — we do **not** await it here
115116
asyncio.create_task(
116117
self._mark_running_when_scheduled(pod_name, execution_id_str)
117118
)
@@ -123,6 +124,9 @@ async def _start_k8s_execution(
123124

124125
except Exception as e:
125126
logger.error(f"Failed to request K8s pod creation: {str(e)}", exc_info=True)
127+
128+
POD_CREATION_FAILURES.labels(failure_reason=type(e).__name__).inc()
129+
126130
await self.execution_repo.update_execution(
127131
execution_id_str,
128132
ExecutionUpdate(
@@ -132,34 +136,6 @@ async def _start_k8s_execution(
132136
)
133137
raise IntegrationException(status_code=500, detail="Container creation failed") from e
134138

135-
async def _get_k8s_execution_output(
136-
self,
137-
execution_id_str: str
138-
) -> tuple[Optional[str], Optional[str], Optional[str], Optional[dict]]:
139-
output: Optional[str] = None
140-
error_msg: Optional[str] = None
141-
# assume error unless the try succeeds
142-
phase: Optional[str] = ExecutionStatus.ERROR
143-
resource_usage: Optional[dict] = None
144-
145-
try:
146-
output, phase, resource_usage = await self.k8s_service.get_pod_logs(execution_id_str)
147-
logger.info(
148-
f"Retrieved K8s results for {execution_id_str}. "
149-
f"Phase: {phase}. Resource usage found: {resource_usage is not None}"
150-
)
151-
except KubernetesPodError as e:
152-
error_msg = str(e)
153-
logger.error(f"Error retrieving pod results for {execution_id_str}: {error_msg}")
154-
except ApiException as e:
155-
error_msg = f"Kubernetes API error for {execution_id_str}: {e.status} {e.reason}"
156-
logger.error(error_msg, exc_info=True)
157-
except Exception as e:
158-
error_msg = f"Unexpected error retrieving K8s results for {execution_id_str}: {e}"
159-
logger.error(error_msg, exc_info=True)
160-
161-
return output, error_msg, phase, resource_usage
162-
163139
async def _try_finalize_execution(self, execution: ExecutionInDB) -> Optional[ExecutionInDB]:
164140
try:
165141
metrics, final_phase = await self.k8s_service.get_pod_logs(execution.id)
@@ -175,33 +151,35 @@ async def _try_finalize_execution(self, execution: ExecutionInDB) -> Optional[Ex
175151
logger.info(f"Successfully parsed metrics from pod: {metrics}")
176152

177153
exit_code = metrics.get("exit_code")
178-
res_usage = metrics.get("resource_usage")
179-
180-
if not res_usage:
181-
return None # waiting for results
154+
res_usage = metrics.get("resource_usage", {})
182155

183-
wall_s = res_usage.get("execution_time_wall_seconds") or 0
156+
wall_s = res_usage.get("execution_time_wall_seconds") or 0.0
184157
jiffies = float(res_usage.get("cpu_time_jiffies", 0) or 0)
185158
hertz = float(res_usage.get("clk_tck_hertz", 100) or 100)
186-
cpu_s = jiffies / hertz if hertz > 0 else 0.0 # total CPU-time
159+
cpu_s = jiffies / hertz if hertz > 0 else 0.0
187160

188-
# average CPU in millicores: (CPU-seconds / wall-seconds) × 1000
189161
cpu_millicores = (cpu_s / wall_s * 1000) if wall_s else 0.0
190-
191-
# VmHWM is k*ibi*bytes → MiB = KiB / 1024
192162
peak_kib = float(res_usage.get("peak_memory_kb", 0) or 0)
193163
peak_mib = peak_kib / 1024.0
194164

195-
MEMORY_USAGE.labels(lang_and_version=execution.lang + "-" + execution.lang_version).set(
196-
peak_mib * 1024 * 1024)
165+
lang_and_version: str = f"{execution.lang}-{execution.lang_version}"
166+
167+
EXECUTION_DURATION.labels(lang_and_version=lang_and_version).observe(wall_s)
168+
MEMORY_USAGE.labels(lang_and_version=lang_and_version).set(peak_mib * 1024 * 1024)
169+
CPU_UTILIZATION.labels(lang_and_version=lang_and_version).set(cpu_millicores)
170+
171+
# in settings, pod limit is a string of type <digits><2 letters for unit>
172+
memory_limit_mib = float(self.settings.K8S_POD_MEMORY_LIMIT[:-2])
173+
mem_util_pct = (peak_mib / memory_limit_mib) * 100
174+
MEMORY_UTILIZATION_PERCENT.labels(lang_and_version=lang_and_version).set(mem_util_pct)
197175

198176
final_resource_usage = {
199177
"execution_time": round(wall_s, 6),
200178
"cpu_usage": round(cpu_millicores, 2),
201179
"memory_usage": round(peak_mib, 2),
180+
"pod_phase": final_phase,
202181
}
203182

204-
final_resource_usage["pod_phase"] = final_phase
205183
status: ExecutionStatus = ExecutionStatus.COMPLETED if exit_code == 0 else ExecutionStatus.ERROR
206184
if status == ExecutionStatus.ERROR:
207185
ERROR_COUNTER.labels(error_type="NonZeroExitCode").inc()
@@ -223,8 +201,8 @@ async def _try_finalize_execution(self, execution: ExecutionInDB) -> Optional[Ex
223201
raise IntegrationException(status_code=500, detail="Failed to retrieve execution after update.")
224202

225203
status_label = "success" if updated_execution.status == ExecutionStatus.COMPLETED else "error"
226-
SCRIPT_EXECUTIONS.labels(status=status_label,
227-
lang_and_version=execution.lang + "-" + execution.lang_version).inc()
204+
lang_version_label = f"{updated_execution.lang}-{updated_execution.lang_version}"
205+
SCRIPT_EXECUTIONS.labels(status=status_label, lang_and_version=lang_version_label).inc()
228206
if status_label == "error":
229207
ERROR_COUNTER.labels(error_type="ScriptExecutionError").inc()
230208

@@ -236,33 +214,37 @@ async def execute_script(
236214
lang_version: str = "3.11"
237215
) -> ExecutionInDB:
238216
ACTIVE_EXECUTIONS.inc()
239-
start_time = time()
217+
QUEUE_DEPTH.inc()
240218
inserted_oid = None
219+
lang_and_version = f"{lang}-{lang_version}"
220+
221+
start_time = time()
241222

242223
try:
243-
if lang not in self.settings.SUPPORTED_RUNTIMES.keys():
224+
if lang not in self.settings.SUPPORTED_RUNTIMES:
244225
raise IntegrationException(status_code=400, detail=f"Language '{lang}' not supported.")
245226

246227
if lang_version not in self.settings.SUPPORTED_RUNTIMES[lang]:
247228
raise IntegrationException(status_code=400, detail=f"Language version '{lang_version}' not supported.")
248229

249230
execution_create = ExecutionCreate(
250-
script=script,
251-
lang=lang,
252-
lang_version=lang_version,
253-
status=ExecutionStatus.QUEUED,
231+
script=script, lang=lang, lang_version=lang_version, status=ExecutionStatus.QUEUED
254232
)
255233
execution_to_insert = ExecutionInDB(**execution_create.model_dump())
256234
inserted_oid = await self.execution_repo.create_execution(execution_to_insert)
257235
logger.info(f"Created execution record {inserted_oid} with status QUEUED.")
258236

259237
await self._start_k8s_execution(
260-
execution_id_str=inserted_oid, script=script,
261-
lang=lang, lang_version=lang_version
238+
execution_id_str=str(inserted_oid), script=script, lang=lang, lang_version=lang_version
262239
)
240+
241+
queue_wait_duration = time() - start_time
242+
QUEUE_WAIT_TIME.labels(lang_and_version=lang_and_version).observe(queue_wait_duration)
243+
244+
# Allow a brief moment for the background poller to potentially catch the Running state
263245
await asyncio.sleep(0.1)
264246

265-
final_execution_state = await self.execution_repo.get_execution(inserted_oid)
247+
final_execution_state = await self.execution_repo.get_execution(str(inserted_oid))
266248
if not final_execution_state:
267249
raise IntegrationException(status_code=500, detail="Failed to retrieve execution record after creation")
268250
return final_execution_state
@@ -273,14 +255,16 @@ async def execute_script(
273255
if inserted_oid:
274256
await self.execution_repo.update_execution(
275257
str(inserted_oid),
276-
ExecutionUpdate(status=ExecutionStatus.ERROR,
277-
errors="Script execution failed").model_dump(exclude_unset=True)
258+
ExecutionUpdate(status=ExecutionStatus.ERROR, errors="Script execution failed").model_dump(
259+
exclude_unset=True)
278260
)
279-
raise IntegrationException(status_code=500,
280-
detail="Script execution failed") from e
261+
raise IntegrationException(status_code=500, detail="Script execution failed") from e
281262
finally:
282-
EXECUTION_DURATION.labels(lang_and_version=lang + "-" + lang_version).observe(time() - start_time)
263+
# These metrics track the overall API call, not just script runtime
283264
ACTIVE_EXECUTIONS.dec()
265+
# QUEUE_DEPTH is decremented here to ensure it's always called once,
266+
# avoiding the previous double-decrement bug.
267+
QUEUE_DEPTH.dec()
284268

285269
async def get_execution_result(self, execution_id: str) -> ExecutionInDB:
286270
execution = await self.execution_repo.get_execution(execution_id)

0 commit comments

Comments
 (0)