Update website/docs/api/router.md

samzong · tao12345666333 · samzong · commit 19a6ba7cd9f4 · 2025-09-19T23:20:43.000+08:00
Co-authored-by: Jintao Zhang <zhangjintao9020@gmail.com>
Signed-off-by: samzong <samzong.lu@gmail.com>
diff --git a/website/docs/api/router.md b/website/docs/api/router.md
@@ -253,9 +253,9 @@ sum by (model) (increase(llm_model_cost_total{currency="USD"}[1h]))
 sum by (reason_code) (increase(llm_routing_reason_codes_total[15m]))
 ```
 
-### Error Metrics
+### Request Error Metrics
 
-The router tracks request errors categorized by failure reason for monitoring and debugging.
+The router tracks request-level failures by model and reason so you can monitor both absolute error throughput and the share of requests that fail.
 
 - `llm_request_errors_total{model, reason}`
   - Description: Total number of request errors categorized by failure reason
@@ -269,9 +269,14 @@ Example PromQL queries:
 # Total errors by reason over the last hour
 sum by (reason) (increase(llm_request_errors_total[1h]))
 
-# Error rate by model over the last 15 minutes
-sum by (model) (increase(llm_request_errors_total[15m])) /
-sum by (model) (increase(llm_model_requests_total[15m]))
+# Error throughput (errors/sec) by model over the last 15 minutes.
+# Helpful for incident response because it shows how many failing requests are impacting users.
+sum by (model) (rate(llm_request_errors_total[15m]))
+
+# Error ratio (% of requests failing) by model over the last 15 minutes.
+# Use increase() to align numerator and denominator with the same lookback window.
+100 * sum by (model) (increase(llm_request_errors_total[15m])) /
+    sum by (model) (increase(llm_model_requests_total[15m]))
 
 # PII policy blocks over the last 24 hours
 sum(increase(llm_request_errors_total{reason="pii_policy_denied"}[24h]))