Skip to content

Commit 65e114c

Browse files
o11y: Add TTFT and TPOT histograms for SLOs (#126)
* o11y: Add TTFT and TPOT histograms for SLOs Signed-off-by: Jintao Zhang <[email protected]> * go mod tidy Signed-off-by: Jintao Zhang <[email protected]> --------- Signed-off-by: Jintao Zhang <[email protected]>
1 parent ea63386 commit 65e114c

File tree

8 files changed

+189
-6
lines changed

8 files changed

+189
-6
lines changed

src/semantic-router/go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ require (
1919
github.com/onsi/gomega v1.38.0
2020
github.com/openai/openai-go v1.12.0
2121
github.com/prometheus/client_golang v1.23.0
22+
github.com/prometheus/client_model v0.6.2
2223
github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000
2324
go.uber.org/zap v1.27.0
2425
google.golang.org/grpc v1.71.1
@@ -47,7 +48,6 @@ require (
4748
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
4849
github.com/pkg/errors v0.9.1 // indirect
4950
github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect
50-
github.com/prometheus/client_model v0.6.2 // indirect
5151
github.com/prometheus/common v0.65.0 // indirect
5252
github.com/prometheus/procfs v0.16.1 // indirect
5353
github.com/rogpeppe/go-internal v1.12.0 // indirect
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
package extproc
2+
3+
import (
4+
"encoding/json"
5+
"time"
6+
7+
. "github.com/onsi/ginkgo/v2"
8+
. "github.com/onsi/gomega"
9+
10+
core "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
11+
ext_proc "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
12+
"github.com/prometheus/client_golang/prometheus"
13+
dto "github.com/prometheus/client_model/go"
14+
)
15+
16+
func getHistogramSampleCount(metricName, model string) uint64 {
17+
mf, _ := prometheus.DefaultGatherer.Gather()
18+
for _, fam := range mf {
19+
if fam.GetName() != metricName || fam.GetType() != dto.MetricType_HISTOGRAM {
20+
continue
21+
}
22+
for _, m := range fam.GetMetric() {
23+
labels := m.GetLabel()
24+
match := false
25+
for _, l := range labels {
26+
if l.GetName() == "model" && l.GetValue() == model {
27+
match = true
28+
break
29+
}
30+
}
31+
if match {
32+
h := m.GetHistogram()
33+
if h != nil && h.SampleCount != nil {
34+
return h.GetSampleCount()
35+
}
36+
}
37+
}
38+
}
39+
return 0
40+
}
41+
42+
var _ = Describe("Metrics recording", func() {
43+
var router *OpenAIRouter
44+
45+
BeforeEach(func() {
46+
// Use a minimal router that doesn't require external models
47+
router = &OpenAIRouter{}
48+
// Initialize internal maps used by handlers
49+
router.InitializeForTesting()
50+
})
51+
52+
It("records TTFT on response headers", func() {
53+
ctx := &RequestContext{
54+
RequestModel: "model-a",
55+
ProcessingStartTime: time.Now().Add(-75 * time.Millisecond),
56+
}
57+
58+
before := getHistogramSampleCount("llm_model_ttft_seconds", ctx.RequestModel)
59+
60+
respHeaders := &ext_proc.ProcessingRequest_ResponseHeaders{
61+
ResponseHeaders: &ext_proc.HttpHeaders{
62+
Headers: &core.HeaderMap{Headers: []*core.HeaderValue{{Key: "content-type", Value: "application/json"}}},
63+
},
64+
}
65+
66+
response, err := router.handleResponseHeaders(respHeaders, ctx)
67+
Expect(err).NotTo(HaveOccurred())
68+
Expect(response.GetResponseHeaders()).NotTo(BeNil())
69+
70+
after := getHistogramSampleCount("llm_model_ttft_seconds", ctx.RequestModel)
71+
Expect(after).To(BeNumerically(">", before))
72+
Expect(ctx.TTFTRecorded).To(BeTrue())
73+
Expect(ctx.TTFTSeconds).To(BeNumerically(">", 0))
74+
})
75+
76+
It("records TPOT on response body", func() {
77+
ctx := &RequestContext{
78+
RequestID: "tpot-test-1",
79+
RequestModel: "model-a",
80+
StartTime: time.Now().Add(-1 * time.Second),
81+
}
82+
83+
before := getHistogramSampleCount("llm_model_tpot_seconds", ctx.RequestModel)
84+
85+
openAIResponse := map[string]interface{}{
86+
"id": "chatcmpl-xyz",
87+
"object": "chat.completion",
88+
"created": time.Now().Unix(),
89+
"model": ctx.RequestModel,
90+
"usage": map[string]interface{}{
91+
"prompt_tokens": 10,
92+
"completion_tokens": 5,
93+
"total_tokens": 15,
94+
},
95+
"choices": []map[string]interface{}{
96+
{
97+
"message": map[string]interface{}{"role": "assistant", "content": "Hello"},
98+
"finish_reason": "stop",
99+
},
100+
},
101+
}
102+
respBodyJSON, err := json.Marshal(openAIResponse)
103+
Expect(err).NotTo(HaveOccurred())
104+
105+
respBody := &ext_proc.ProcessingRequest_ResponseBody{
106+
ResponseBody: &ext_proc.HttpBody{Body: respBodyJSON},
107+
}
108+
109+
response, err := router.handleResponseBody(respBody, ctx)
110+
Expect(err).NotTo(HaveOccurred())
111+
Expect(response.GetResponseBody()).NotTo(BeNil())
112+
113+
after := getHistogramSampleCount("llm_model_tpot_seconds", ctx.RequestModel)
114+
Expect(after).To(BeNumerically(">", before))
115+
})
116+
})

src/semantic-router/pkg/extproc/processor.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ func (r *OpenAIRouter) Process(stream ext_proc.ExternalProcessor_ProcessServer)
7474
}
7575

7676
case *ext_proc.ProcessingRequest_ResponseHeaders:
77-
response, err := r.handleResponseHeaders(v)
77+
response, err := r.handleResponseHeaders(v, ctx)
7878
if err != nil {
7979
return err
8080
}

src/semantic-router/pkg/extproc/request_handler.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,10 @@ type RequestContext struct {
108108
RequestQuery string
109109
StartTime time.Time
110110
ProcessingStartTime time.Time
111+
112+
// TTFT tracking
113+
TTFTRecorded bool
114+
TTFTSeconds float64
111115
}
112116

113117
// handleRequestHeaders processes the request headers

src/semantic-router/pkg/extproc/request_processing_test.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,13 @@ var _ = Describe("Request Processing", func() {
342342
},
343343
}
344344

345-
response, err := router.HandleResponseHeaders(responseHeaders)
345+
ctx := &extproc.RequestContext{
346+
Headers: make(map[string]string),
347+
RequestModel: "model-a",
348+
ProcessingStartTime: time.Now().Add(-50 * time.Millisecond),
349+
}
350+
351+
response, err := router.HandleResponseHeaders(responseHeaders, ctx)
346352
Expect(err).NotTo(HaveOccurred())
347353
Expect(response).NotTo(BeNil())
348354

src/semantic-router/pkg/extproc/response_handler.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,16 @@ import (
1313
)
1414

1515
// handleResponseHeaders processes the response headers
16-
func (r *OpenAIRouter) handleResponseHeaders(_ *ext_proc.ProcessingRequest_ResponseHeaders) (*ext_proc.ProcessingResponse, error) {
16+
func (r *OpenAIRouter) handleResponseHeaders(_ *ext_proc.ProcessingRequest_ResponseHeaders, ctx *RequestContext) (*ext_proc.ProcessingResponse, error) {
17+
// Best-effort TTFT measurement: record on first response headers if we have a start time and model
18+
if ctx != nil && !ctx.TTFTRecorded && !ctx.ProcessingStartTime.IsZero() && ctx.RequestModel != "" {
19+
ttft := time.Since(ctx.ProcessingStartTime).Seconds()
20+
if ttft > 0 {
21+
metrics.RecordModelTTFT(ctx.RequestModel, ttft)
22+
ctx.TTFTSeconds = ttft
23+
ctx.TTFTRecorded = true
24+
}
25+
}
1726

1827
// Allow the response to continue without modification
1928
response := &ext_proc.ProcessingResponse{
@@ -53,6 +62,12 @@ func (r *OpenAIRouter) handleResponseBody(v *ext_proc.ProcessingRequest_Response
5362
)
5463
metrics.RecordModelCompletionLatency(ctx.RequestModel, completionLatency.Seconds())
5564

65+
// Record TPOT (time per output token) if completion tokens are available
66+
if completionTokens > 0 {
67+
timePerToken := completionLatency.Seconds() / float64(completionTokens)
68+
metrics.RecordModelTPOT(ctx.RequestModel, timePerToken)
69+
}
70+
5671
// Compute and record cost if pricing is configured
5772
if r.Config != nil {
5873
promptRatePer1M, completionRatePer1M, currency, ok := r.Config.GetModelPricing(ctx.RequestModel)

src/semantic-router/pkg/extproc/testing_helpers_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@ func (r *OpenAIRouter) HandleRequestBody(v *ext_proc.ProcessingRequest_RequestBo
1717
}
1818

1919
// HandleResponseHeaders exposes handleResponseHeaders for testing
20-
func (r *OpenAIRouter) HandleResponseHeaders(v *ext_proc.ProcessingRequest_ResponseHeaders) (*ext_proc.ProcessingResponse, error) {
21-
return r.handleResponseHeaders(v)
20+
func (r *OpenAIRouter) HandleResponseHeaders(v *ext_proc.ProcessingRequest_ResponseHeaders, ctx *RequestContext) (*ext_proc.ProcessingResponse, error) {
21+
return r.handleResponseHeaders(v, ctx)
2222
}
2323

2424
// HandleResponseBody exposes handleResponseBody for testing

src/semantic-router/pkg/metrics/metrics.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,26 @@ var (
166166
[]string{"model"},
167167
)
168168

169+
// ModelTTFT tracks time to first token by model
170+
ModelTTFT = promauto.NewHistogramVec(
171+
prometheus.HistogramOpts{
172+
Name: "llm_model_ttft_seconds",
173+
Help: "Time to first token for LLM model responses in seconds",
174+
Buckets: prometheus.DefBuckets,
175+
},
176+
[]string{"model"},
177+
)
178+
179+
// ModelTPOT tracks time per output token by model
180+
ModelTPOT = promauto.NewHistogramVec(
181+
prometheus.HistogramOpts{
182+
Name: "llm_model_tpot_seconds",
183+
Help: "Time per output token (completion latency / completion tokens) for LLM model responses in seconds",
184+
Buckets: prometheus.DefBuckets,
185+
},
186+
[]string{"model"},
187+
)
188+
169189
// ModelRoutingLatency tracks the latency of model routing
170190
ModelRoutingLatency = promauto.NewHistogram(
171191
prometheus.HistogramOpts{
@@ -384,6 +404,28 @@ func RecordModelCompletionLatency(model string, seconds float64) {
384404
ModelCompletionLatency.WithLabelValues(model).Observe(seconds)
385405
}
386406

407+
// RecordModelTTFT records time to first token for a model
408+
func RecordModelTTFT(model string, seconds float64) {
409+
if seconds <= 0 {
410+
return
411+
}
412+
if model == "" {
413+
model = "unknown"
414+
}
415+
ModelTTFT.WithLabelValues(model).Observe(seconds)
416+
}
417+
418+
// RecordModelTPOT records time per output token (seconds per token) for a model
419+
func RecordModelTPOT(model string, secondsPerToken float64) {
420+
if secondsPerToken <= 0 {
421+
return
422+
}
423+
if model == "" {
424+
model = "unknown"
425+
}
426+
ModelTPOT.WithLabelValues(model).Observe(secondsPerToken)
427+
}
428+
387429
// RecordModelRoutingLatency records the latency of model routing
388430
func RecordModelRoutingLatency(seconds float64) {
389431
ModelRoutingLatency.Observe(seconds)

0 commit comments

Comments
 (0)