Skip to content

Commit ce6a12d

Browse files
authored
Add observability dashboards and alerting configuration (#98)
* Add observability dashboards and alerting configuration * Fix Grafana alert routing and tracing filters
1 parent 0a63db3 commit ce6a12d

File tree

6 files changed

+746
-0
lines changed

6 files changed

+746
-0
lines changed

ops/grafana/README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Grafana Provisioning
2+
3+
This directory contains provisioning files for Grafana dashboards and alerting configuration used by Paform.
4+
5+
## Dashboards
6+
7+
* `provisioning/dashboards/` includes JSON definitions for backend, sync pipeline, frontend UX, and tracing explorer dashboards.
8+
* The root `dashboards/dashboards.yaml` file points Grafana at this directory so the dashboards are loaded automatically on startup.
9+
10+
## Alerting Contact Points
11+
12+
* `provisioning/alerting/contact-points.yaml` defines Slack and email receivers. Set the `SLACK_WEBHOOK_URL` environment variable before starting Grafana so the webhook can be injected at runtime.
13+
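* `provisioning/alerting/notification-policies.yaml` routes alerts by their `severity` label: by default, `critical` alerts go to the `Slack - Oncall` receiver and `warning` alerts go to the `Email - SRE` receiver. Update the `severity` matchers if your alert labels or on-call topology change.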
14+
15+
## Prometheus Alerts
16+
17+
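* `../prometheus/alerts.yml` contains the alerting rules that surface in Grafana. Make this file available to Prometheus (for example, by mounting it into the Prometheus container) and list its path under `rule_files` in `prometheus.yml` so the rules are loaded and the alerts become active.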
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
apiVersion: 1
2+
policies:
3+
- orgId: 1
4+
receiver: Slack - Oncall
5+
group_by:
6+
- alertname
7+
- service
8+
object_matchers:
9+
- - severity
10+
- =
11+
- critical
12+
continue: true
13+
- orgId: 1
14+
receiver: Email - SRE
15+
group_by:
16+
- alertname
17+
- service
18+
object_matchers:
19+
- - severity
20+
- =
21+
- warning
22+
continue: false
Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
{
2+
"annotations": {
3+
"list": [
4+
{
5+
"builtIn": 1,
6+
"datasource": {
7+
"type": "datasource",
8+
"uid": "grafana"
9+
},
10+
"enable": true,
11+
"hide": true,
12+
"iconColor": "rgba(0, 211, 255, 1)",
13+
"name": "Annotations & Alerts",
14+
"type": "dashboard"
15+
}
16+
]
17+
},
18+
"editable": true,
19+
"fiscalYearStartMonth": 0,
20+
"graphTooltip": 0,
21+
"id": null,
22+
"links": [],
23+
"liveNow": false,
24+
"panels": [
25+
{
26+
"datasource": {
27+
"type": "prometheus",
28+
"uid": "Prometheus"
29+
},
30+
"fieldConfig": {
31+
"defaults": {
32+
"unit": "req/s"
33+
},
34+
"overrides": []
35+
},
36+
"gridPos": {
37+
"h": 8,
38+
"w": 12,
39+
"x": 0,
40+
"y": 0
41+
},
42+
"id": 1,
43+
"options": {
44+
"legend": {
45+
"calcs": [],
46+
"displayMode": "table",
47+
"placement": "bottom"
48+
}
49+
},
50+
"targets": [
51+
{
52+
"expr": "sum(rate(http_requests_total{job=\"backend\"}[5m]))",
53+
"legendFormat": "Request rate",
54+
"refId": "A"
55+
}
56+
],
57+
"title": "Request Rate",
58+
"type": "timeseries"
59+
},
60+
{
61+
"datasource": {
62+
"type": "prometheus",
63+
"uid": "Prometheus"
64+
},
65+
"fieldConfig": {
66+
"defaults": {
67+
"unit": "percentunit"
68+
},
69+
"overrides": []
70+
},
71+
"gridPos": {
72+
"h": 8,
73+
"w": 12,
74+
"x": 12,
75+
"y": 0
76+
},
77+
"id": 2,
78+
"options": {
79+
"legend": {
80+
"calcs": [],
81+
"displayMode": "list",
82+
"placement": "bottom"
83+
}
84+
},
85+
"targets": [
86+
{
87+
"expr": "sum(rate(http_requests_total{job=\"backend\",status=~\"5..\"}[5m])) / sum(rate(http_requests_total{job=\"backend\"}[5m]))",
88+
"legendFormat": "Error rate",
89+
"refId": "A"
90+
}
91+
],
92+
"title": "Error Rate",
93+
"type": "timeseries"
94+
},
95+
{
96+
"datasource": {
97+
"type": "prometheus",
98+
"uid": "Prometheus"
99+
},
100+
"fieldConfig": {
101+
"defaults": {
102+
"unit": "s"
103+
},
104+
"overrides": []
105+
},
106+
"gridPos": {
107+
"h": 8,
108+
"w": 12,
109+
"x": 0,
110+
"y": 8
111+
},
112+
"id": 3,
113+
"options": {
114+
"legend": {
115+
"calcs": [],
116+
"displayMode": "list",
117+
"placement": "bottom"
118+
},
119+
"tooltip": {
120+
"mode": "multi"
121+
}
122+
},
123+
"targets": [
124+
{
125+
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=\"backend\"}[5m])) by (le))",
126+
"legendFormat": "P95 latency",
127+
"refId": "A"
128+
}
129+
],
130+
"title": "Latency (P95)",
131+
"type": "timeseries"
132+
},
133+
{
134+
"datasource": {
135+
"type": "prometheus",
136+
"uid": "Prometheus"
137+
},
138+
"fieldConfig": {
139+
"defaults": {
140+
"unit": "none"
141+
},
142+
"overrides": []
143+
},
144+
"gridPos": {
145+
"h": 8,
146+
"w": 12,
147+
"x": 12,
148+
"y": 8
149+
},
150+
"id": 4,
151+
"options": {
152+
"legend": {
153+
"displayMode": "list",
154+
"placement": "bottom"
155+
}
156+
},
157+
"targets": [
158+
{
159+
"expr": "avg(go_goroutines{job=\"backend\"})",
160+
"legendFormat": "Goroutines",
161+
"refId": "A"
162+
},
163+
{
164+
"expr": "avg(process_resident_memory_bytes{job=\"backend\"})",
165+
"legendFormat": "RSS",
166+
"refId": "B"
167+
}
168+
],
169+
"title": "Saturation",
170+
"type": "timeseries"
171+
}
172+
],
173+
"refresh": "30s",
174+
"schemaVersion": 39,
175+
"style": "dark",
176+
"tags": [
177+
"golden-signals",
178+
"backend"
179+
],
180+
"templating": {
181+
"list": [
182+
{
183+
"current": {
184+
"selected": false,
185+
"text": "All",
186+
"value": ""
187+
},
188+
"datasource": {
189+
"type": "prometheus",
190+
"uid": "Prometheus"
191+
},
192+
"definition": "label_values(http_requests_total{job=\"backend\"}, service)",
193+
"hide": 0,
194+
"includeAll": true,
195+
"label": "Service",
196+
"multi": true,
197+
"name": "service",
198+
"options": [],
199+
"query": "label_values(http_requests_total{job=\"backend\"}, service)",
200+
"refresh": 1,
201+
"regex": "",
202+
"type": "query"
203+
}
204+
]
205+
},
206+
"time": {
207+
"from": "now-6h",
208+
"to": "now"
209+
},
210+
"timepicker": {},
211+
"timezone": "browser",
212+
"title": "Backend Golden Signals",
213+
"uid": "backend-golden-signals",
214+
"version": 1,
215+
"weekStart": ""
216+
}

0 commit comments

Comments
 (0)