Skip to content

Commit 873a4db

Browse files
authored
Tighten API latency budget to 300ms (#532)
1 parent 900bcc0 commit 873a4db

File tree

5 files changed

+30
-9
lines changed

5 files changed

+30
-9
lines changed

docs/release-checklist.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ This checklist ties together continuous integration signal, Grafana alerting, an
1515
- `SUPABASE_URL`
1616
- `SUPABASE_SERVICE_ROLE_KEY`
1717
- Optional: `SHOPIFY_MODULE_CONFIG_TABLE` and `SHOPIFY_ORDER_TABLE` when the Supabase schema uses custom table names.
18-
- Confirm that the **observability-budgets** job has passed. It queries Prometheus and Tempo spanmetrics using `observability-budgets.yml` and fails when P95 latency or error-rate thresholds are exceeded compared to the previous day.
18+
- Confirm that the **observability-budgets** job has passed. It queries Prometheus and Tempo spanmetrics using `observability-budgets.yml` and fails when P95 latency or error-rate thresholds are exceeded compared to the previous day. The API latency SLO is currently **P95 ≤ 300 ms**, so any P95 latency above 300 ms should block the release until it is resolved.
1919
- Export any new failure signatures into the on-call runbook.
2020

2121
## 2. Review Grafana Dashboards

observability-budgets.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ prometheus:
1010
description: "P95 API latency across all HTTP handlers"
1111
# P95 latency in seconds over a 5m rate window, excluding the healthcheck route.
# Single-quoted so the PromQL label matchers keep plain double quotes; in an
# unquoted YAML scalar a backslash is literal and `\"` would reach Prometheus as-is.
query: 'histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service="paform-api",route!~"/healthcheck"}[5m])) by (le))'
1212
unit: ms
13-
threshold: 3000
13+
threshold: 300
1414
scale: 1000
1515
- id: api_error_rate
1616
description: "5xx error rate for the API"

ops/grafana/provisioning/dashboards/backend-golden-signals.json

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,24 @@
9999
},
100100
"fieldConfig": {
101101
"defaults": {
102-
"unit": "s"
102+
"thresholds": {
103+
"mode": "absolute",
104+
"steps": [
105+
{
106+
"color": "green",
107+
"value": null
108+
},
109+
{
110+
"color": "orange",
111+
"value": 250
112+
},
113+
{
114+
"color": "red",
115+
"value": 300
116+
}
117+
]
118+
},
119+
"unit": "ms"
103120
},
104121
"overrides": []
105122
},
@@ -122,7 +139,7 @@
122139
},
123140
"targets": [
124141
{
125-
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=\"backend\"}[5m])) by (le))",
142+
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=\"backend\"}[5m])) by (le)) * 1000",
126143
"legendFormat": "P95 latency",
127144
"refId": "A"
128145
}

ops/grafana/provisioning/dashboards/backend_golden_signals.json

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,13 +103,17 @@
103103
"color": "green",
104104
"value": null
105105
},
106+
{
107+
"color": "orange",
108+
"value": 250
109+
},
106110
{
107111
"color": "red",
108-
"value": 0.3
112+
"value": 300
109113
}
110114
]
111115
},
112-
"unit": "s"
116+
"unit": "ms"
113117
},
114118
"overrides": []
115119
},
@@ -134,7 +138,7 @@
134138
},
135139
"targets": [
136140
{
137-
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service=\"backend\"}[5m])) by (le))",
141+
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service=\"backend\"}[5m])) by (le)) * 1000",
138142
"legendFormat": "p95 latency",
139143
"refId": "A"
140144
}

ops/prometheus/alerts.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,14 @@ groups:
2020
histogram_quantile(
2121
0.95,
2222
sum(rate(http_request_duration_seconds_bucket{job="backend"}[5m])) by (le)
23-
) > 0.75
23+
) > 0.3
2424
for: 10m
2525
labels:
2626
severity: critical
2727
annotations:
2828
summary: "Backend p95 latency is elevated"
2929
description: |
30-
The 95th percentile API latency has been above 750ms for more than 10 minutes.
30+
The 95th percentile API latency has been above 300ms for more than 10 minutes.
3131
3232
- name: frontend-alerts
3333
rules:

0 commit comments

Comments
 (0)