Skip to content

Commit 1dbc1d9

Browse files
committed
- updated notifications and plots (moved alerting to Prometheus and deleted it from Grafana, added more plots to Grafana)
- added more precise calculation of selected metrics
1 parent 15e2555 commit 1dbc1d9

File tree

12 files changed

+433
-164
lines changed

12 files changed

+433
-164
lines changed
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
global:
2+
smtp_smarthost: 'localhost:587'
3+
smtp_from: '[email protected]'
4+
5+
route:
6+
group_by: ['alertname']
7+
group_wait: 30s
8+
group_interval: 5m
9+
repeat_interval: 4h
10+
receiver: 'web.hook'
11+
12+
receivers:
13+
- name: 'web.hook'
14+
webhook_configs:
15+
- url: 'http://backend:443/api/v1/health'
16+
send_resolved: true

backend/app/core/metrics.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def labels(self, *args: Any, **kwargs: Any) -> 'BoundedLabelsCounter':
3535

3636
# Resource usage metrics
3737
MEMORY_USAGE = Gauge(
38-
"script_memory_usage_bytes", "Memory usage per script execution", ["lang_and_version"]
38+
"script_memory_usage_mib", "Memory usage (MiB) per script execution", ["lang_and_version"]
3939
)
4040

4141
ERROR_COUNTER = Counter(

backend/app/services/execution_service.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
ACTIVE_EXECUTIONS,
1111
ERROR_COUNTER,
1212
EXECUTION_DURATION,
13+
MEMORY_USAGE,
1314
SCRIPT_EXECUTIONS,
1415
)
1516
from app.db.repositories.execution_repository import (
@@ -143,16 +144,22 @@ async def _try_finalize_execution(self, execution: ExecutionInDB) -> Optional[Ex
143144
peak_kib = float(res_usage.get("peak_memory_kb", 0) or 0)
144145
peak_mib = peak_kib / 1024.0
145146

147+
MEMORY_USAGE.labels(lang_and_version=execution.lang + "-" + execution.lang_version).set(
148+
peak_mib * 1024 * 1024)
149+
146150
final_resource_usage = {
147151
"execution_time": round(wall_s, 6),
148152
"cpu_usage": round(cpu_millicores, 2),
149153
"memory_usage": round(peak_mib, 2),
150154
}
151155

152156
final_resource_usage["pod_phase"] = final_phase
157+
status: ExecutionStatus = ExecutionStatus.COMPLETED if exit_code == 0 else ExecutionStatus.ERROR
158+
if status == ExecutionStatus.ERROR:
159+
ERROR_COUNTER.labels(error_type="NonZeroExitCode").inc()
153160

154161
update_data = {
155-
"status": ExecutionStatus.COMPLETED if exit_code == 0 else ExecutionStatus.ERROR,
162+
"status": status,
156163
"output": metrics.get("stdout", ""),
157164
"errors": metrics.get("stderr", ""),
158165
"resource_usage": final_resource_usage,

backend/grafana/grafana.ini

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@ allow_sign_up = false
77
allow_org_create = false
88
auto_assign_org = true
99

10-
# idk why need host, took idea from https://github.com/grafana/grafana/blob/main/conf/defaults.ini
11-
# if removed - "SMTP not configured, check your grafana.ini config file's [smtp] section"
1210
[smtp]
1311
enabled = false
1412

@@ -24,10 +22,7 @@ org_role = Viewer
2422
enabled = false
2523

2624
[unified_alerting]
27-
enabled = true
28-
29-
[alerting]
3025
enabled = false
3126

32-
;[paths]
33-
;provisioning = /etc/grafana/provisioning
27+
[alerting]
28+
enabled = false

backend/grafana/provisioning/alerting/alerts.yml

Lines changed: 0 additions & 92 deletions
This file was deleted.

backend/grafana/provisioning/alerting/contact-points.yml

Lines changed: 0 additions & 9 deletions
This file was deleted.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
apiVersion: 1
2+
contactPoints: []
3+
policies: []

backend/grafana/provisioning/alerting/notification-policies.yml

Lines changed: 0 additions & 8 deletions
This file was deleted.

0 commit comments

Comments
 (0)