metrics: Add TTFT/TPOT p95 dashboard (#250)

tao12345666333 · rootfs · web-flow · commit cc1dfc31da10 · 2025-09-27T12:17:55.000-04:00
Signed-off-by: Jintao Zhang <zhangjintao9020@gmail.com>
Co-authored-by: Huamin Chen <rootfs@users.noreply.github.com>
diff --git a/deploy/llm-router-dashboard.json b/deploy/llm-router-dashboard.json
@@ -405,6 +405,206 @@
       ],
       "title": "Model Completion Latency (p95)",
       "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "febzoy4cplt6oe"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "Seconds",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 10,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "smooth",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "s"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 16
+      },
+      "id": 5,
+      "options": {
+        "legend": {
+          "calcs": [
+            "mean",
+            "max",
+            "lastNotNull"
+          ],
+          "displayMode": "table",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "multi",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.5.1",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "febzoy4cplt6oe"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.95, sum(rate(llm_model_ttft_seconds_bucket[5m])) by (le, model))",
+          "legendFormat": "TTFT p95 {{model}}",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "TTFT (p95) by Model",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "febzoy4cplt6oe"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "Seconds per token",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 10,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "smooth",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "s"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 16
+      },
+      "id": 6,
+      "options": {
+        "legend": {
+          "calcs": [
+            "mean",
+            "max",
+            "lastNotNull"
+          ],
+          "displayMode": "table",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "multi",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.5.1",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "febzoy4cplt6oe"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.95, sum(rate(llm_model_tpot_seconds_bucket[5m])) by (le, model))",
+          "legendFormat": "TPOT p95 {{model}}",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "TPOT (p95) by Model (sec/token)",
+      "type": "timeseries"
     }
   ],
   "preload": false,
@@ -438,6 +638,6 @@
   "timezone": "",
   "title": "LLM Router Metrics",
   "uid": "llm-router-metrics",
-  "version": 12,
+  "version": 14,
   "weekStart": ""
 }
diff --git a/website/docs/api/router.md b/website/docs/api/router.md
@@ -327,6 +327,30 @@ sum by (model) (rate(llm_request_errors_total[15m]))
 sum(increase(llm_request_errors_total{reason="pii_policy_denied"}[24h]))
 ```
 
+### TTFT and TPOT Metrics
+
+Time-to-first-token (TTFT) and time-per-output-token (TPOT) are exported as Prometheus histograms, so their p95 values can be computed and visualized with `histogram_quantile`.
+
+- `llm_model_ttft_seconds{model}`
+  - Histogram: Exposes `_bucket`, `_sum`, `_count`
+  - Description: Time to first token, measured from when the router starts processing the request
+  - Example p95 (last 5m) by model:
+
+```prometheus
+histogram_quantile(0.95, sum(rate(llm_model_ttft_seconds_bucket[5m])) by (le, model))
+```
+
+- `llm_model_tpot_seconds{model}`
+  - Histogram: Exposes `_bucket`, `_sum`, `_count`
+  - Description: Seconds per output token (completion latency / completion tokens)
+  - Example p95 (last 5m) by model:
+
+```prometheus
+histogram_quantile(0.95, sum(rate(llm_model_tpot_seconds_bucket[5m])) by (le, model))
+```
+
+Both metrics are included in the provided Grafana dashboard at `deploy/llm-router-dashboard.json` as the panels “TTFT (p95) by Model” and “TPOT (p95) by Model (sec/token)”.
+
 ### Pricing Configuration
 
 Provide per-1M pricing for your models so the router can compute request cost and emit metrics/logs.