Skip to content

Commit cc1dfc3

Browse files
metrics: Add TTFT/TPOT p95 dashboard (#250)
Signed-off-by: Jintao Zhang <[email protected]> Co-authored-by: Huamin Chen <[email protected]>
1 parent a160bfa commit cc1dfc3

File tree

2 files changed

+225
-1
lines changed

2 files changed

+225
-1
lines changed

deploy/llm-router-dashboard.json

Lines changed: 201 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,206 @@
405405
],
406406
"title": "Model Completion Latency (p95)",
407407
"type": "timeseries"
408+
},
409+
{
410+
"datasource": {
411+
"type": "prometheus",
412+
"uid": "febzoy4cplt6oe"
413+
},
414+
"fieldConfig": {
415+
"defaults": {
416+
"color": {
417+
"mode": "palette-classic"
418+
},
419+
"custom": {
420+
"axisBorderShow": false,
421+
"axisCenteredZero": false,
422+
"axisColorMode": "text",
423+
"axisLabel": "Seconds",
424+
"axisPlacement": "auto",
425+
"barAlignment": 0,
426+
"barWidthFactor": 0.6,
427+
"drawStyle": "line",
428+
"fillOpacity": 10,
429+
"gradientMode": "none",
430+
"hideFrom": {
431+
"legend": false,
432+
"tooltip": false,
433+
"viz": false
434+
},
435+
"insertNulls": false,
436+
"lineInterpolation": "smooth",
437+
"lineWidth": 1,
438+
"pointSize": 5,
439+
"scaleDistribution": {
440+
"type": "linear"
441+
},
442+
"showPoints": "auto",
443+
"spanNulls": false,
444+
"stacking": {
445+
"group": "A",
446+
"mode": "none"
447+
},
448+
"thresholdsStyle": {
449+
"mode": "off"
450+
}
451+
},
452+
"mappings": [],
453+
"thresholds": {
454+
"mode": "absolute",
455+
"steps": [
456+
{
457+
"color": "green",
458+
"value": null
459+
},
460+
{
461+
"color": "red",
462+
"value": 80
463+
}
464+
]
465+
},
466+
"unit": "s"
467+
},
468+
"overrides": []
469+
},
470+
"gridPos": {
471+
"h": 8,
472+
"w": 12,
473+
"x": 0,
474+
"y": 16
475+
},
476+
"id": 5,
477+
"options": {
478+
"legend": {
479+
"calcs": [
480+
"mean",
481+
"max",
482+
"lastNotNull"
483+
],
484+
"displayMode": "table",
485+
"placement": "bottom",
486+
"showLegend": true
487+
},
488+
"tooltip": {
489+
"hideZeros": false,
490+
"mode": "multi",
491+
"sort": "none"
492+
}
493+
},
494+
"pluginVersion": "11.5.1",
495+
"targets": [
496+
{
497+
"datasource": {
498+
"type": "prometheus",
499+
"uid": "febzoy4cplt6oe"
500+
},
501+
"editorMode": "code",
502+
"expr": "histogram_quantile(0.95, sum(rate(llm_model_ttft_seconds_bucket[5m])) by (le, model))",
503+
"legendFormat": "TTFT p95 {{model}}",
504+
"range": true,
505+
"refId": "A"
506+
}
507+
],
508+
"title": "TTFT (p95) by Model",
509+
"type": "timeseries"
510+
},
511+
{
512+
"datasource": {
513+
"type": "prometheus",
514+
"uid": "febzoy4cplt6oe"
515+
},
516+
"fieldConfig": {
517+
"defaults": {
518+
"color": {
519+
"mode": "palette-classic"
520+
},
521+
"custom": {
522+
"axisBorderShow": false,
523+
"axisCenteredZero": false,
524+
"axisColorMode": "text",
525+
"axisLabel": "Seconds per token",
526+
"axisPlacement": "auto",
527+
"barAlignment": 0,
528+
"barWidthFactor": 0.6,
529+
"drawStyle": "line",
530+
"fillOpacity": 10,
531+
"gradientMode": "none",
532+
"hideFrom": {
533+
"legend": false,
534+
"tooltip": false,
535+
"viz": false
536+
},
537+
"insertNulls": false,
538+
"lineInterpolation": "smooth",
539+
"lineWidth": 1,
540+
"pointSize": 5,
541+
"scaleDistribution": {
542+
"type": "linear"
543+
},
544+
"showPoints": "auto",
545+
"spanNulls": false,
546+
"stacking": {
547+
"group": "A",
548+
"mode": "none"
549+
},
550+
"thresholdsStyle": {
551+
"mode": "off"
552+
}
553+
},
554+
"mappings": [],
555+
"thresholds": {
556+
"mode": "absolute",
557+
"steps": [
558+
{
559+
"color": "green",
560+
"value": null
561+
}
562+
]
563+
},
564+
"unit": "s"
565+
},
566+
"overrides": []
567+
},
568+
"gridPos": {
569+
"h": 8,
570+
"w": 12,
571+
"x": 12,
572+
"y": 16
573+
},
574+
"id": 6,
575+
"options": {
576+
"legend": {
577+
"calcs": [
578+
"mean",
579+
"max",
580+
"lastNotNull"
581+
],
582+
"displayMode": "table",
583+
"placement": "bottom",
584+
"showLegend": true
585+
},
586+
"tooltip": {
587+
"hideZeros": false,
588+
"mode": "multi",
589+
"sort": "none"
590+
}
591+
},
592+
"pluginVersion": "11.5.1",
593+
"targets": [
594+
{
595+
"datasource": {
596+
"type": "prometheus",
597+
"uid": "febzoy4cplt6oe"
598+
},
599+
"editorMode": "code",
600+
"expr": "histogram_quantile(0.95, sum(rate(llm_model_tpot_seconds_bucket[5m])) by (le, model))",
601+
"legendFormat": "TPOT p95 {{model}}",
602+
"range": true,
603+
"refId": "A"
604+
}
605+
],
606+
"title": "TPOT (p95) by Model (sec/token)",
607+
"type": "timeseries"
408608
}
409609
],
410610
"preload": false,
@@ -438,6 +638,6 @@
438638
"timezone": "",
439639
"title": "LLM Router Metrics",
440640
"uid": "llm-router-metrics",
441-
"version": 12,
641+
"version": 14,
442642
"weekStart": ""
443643
}

website/docs/api/router.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,30 @@ sum by (model) (rate(llm_request_errors_total[15m]))
327327
sum(increase(llm_request_errors_total{reason="pii_policy_denied"}[24h]))
328328
```
329329

330+
### TTFT and TPOT Metrics
331+
332+
Time-to-first-token (TTFT) and time-per-output-token (TPOT) are exported as Prometheus histograms and can be visualized at the 95th percentile (p95) with `histogram_quantile`.
333+
334+
- `llm_model_ttft_seconds{model}`
335+
- Histogram: Exposes `_bucket`, `_sum`, `_count`
336+
- Description: Elapsed time from when the router starts processing the request until the first token
337+
- Example p95 (last 5m) by model:
338+
339+
```prometheus
340+
histogram_quantile(0.95, sum(rate(llm_model_ttft_seconds_bucket[5m])) by (le, model))
341+
```
342+
343+
- `llm_model_tpot_seconds{model}`
344+
- Histogram: Exposes `_bucket`, `_sum`, `_count`
345+
- Description: Seconds per output token (completion latency / completion tokens)
346+
- Example p95 (last 5m) by model:
347+
348+
```prometheus
349+
histogram_quantile(0.95, sum(rate(llm_model_tpot_seconds_bucket[5m])) by (le, model))
350+
```
351+
352+
Both metrics are included in the provided Grafana dashboard at `deploy/llm-router-dashboard.json` as the panels “TTFT (p95) by Model” and “TPOT (p95) by Model (sec/token)”.
353+
330354
### Pricing Configuration
331355

332356
Provide per-1M pricing for your models so the router can compute request cost and emit metrics/logs.

0 commit comments

Comments
 (0)