Skip to content

Commit 0970b49

Browse files
sallyomclaude
andcommitted
update tracing for true TTFT
Co-Authored-By: Claude <[email protected]> Signed-off-by: sallyom <[email protected]>
1 parent 1a1aef0 commit 0970b49

File tree

4 files changed

+47
-27
lines changed

4 files changed

+47
-27
lines changed

pkg/sidecar/proxy/chat_completions.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,14 @@ func (s *Server) chatCompletionsHandler(w http.ResponseWriter, r *http.Request)
7676
}
7777
}
7878

79+
// Add span for header parsing (always executed)
80+
_, headerSpan := tracer.Start(ctx, "llm_d.pd_proxy.parse_headers")
81+
headerSpan.SetAttributes(
82+
attribute.Int("llm_d.pd_proxy.prefill_headers_count", numHosts),
83+
attribute.Bool("llm_d.pd_proxy.prefiller_sampling_enabled", s.config.EnablePrefillerSampling),
84+
)
85+
headerSpan.End()
86+
7987
if len(prefillHostPort) == 0 {
8088
s.logger.V(4).Info("skip disaggregated prefill")
8189
span.SetAttributes(

pkg/sidecar/proxy/connector_lmcache.go

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -135,25 +135,32 @@ func (s *Server) runLMCacheProtocol(w http.ResponseWriter, r *http.Request, pref
135135
decodeSpan.SetAttributes(attribute.Float64("llm_d.pd_proxy.decode.duration_ms", float64(decodeDuration.Milliseconds())))
136136
decodeSpan.SetStatus(codes.Ok, "")
137137

138-
// Calculate end-to-end P/D metrics and add to parent span
138+
// Calculate end-to-end P/D metrics and add to decode span
139139
// These metrics represent the "true" TTFT and latency from the coordinator's perspective
140-
if parentSpan := trace.SpanFromContext(ctx); parentSpan.SpanContext().IsValid() {
140+
// Note: After tracer.Start() above, ctx contains the decode span, so SpanFromContext returns it
141+
if currentSpan := trace.SpanFromContext(ctx); currentSpan.SpanContext().IsValid() {
141142
// Get request start time from context
142143
var totalDuration time.Duration
144+
var trueTTFT time.Duration
143145
if requestStartValue := ctx.Value("request_start_time"); requestStartValue != nil {
144146
if requestStart, ok := requestStartValue.(time.Time); ok {
145147
totalDuration = time.Since(requestStart)
148+
149+
// The "true TTFT" in P/D mode is the time until the decoder can start generating
150+
// This includes: gateway routing + scheduling + prefill time + KV transfer coordination overhead
151+
// The decode vLLM will report a low TTFT (since KV is already transferred),
152+
// but this captures the real end-to-end TTFT from the client's perspective
153+
//
154+
// True TTFT = time from gateway request start to decode start
155+
// This includes all coordinator overhead that vLLM-level metrics miss
156+
trueTTFT = decodeStart.Sub(requestStart)
146157
}
147158
}
148159

149-
// The "true TTFT" in P/D mode is the time until the decoder can start generating
150-
// This includes: prefill time + KV transfer coordination overhead
151-
trueTTFT := prefillDuration
152-
153160
// KV transfer overhead: time between prefill completion and decode start
154161
kvTransferOverhead := decodeStart.Sub(prefillStart.Add(prefillDuration))
155162

156-
parentSpan.SetAttributes(
163+
currentSpan.SetAttributes(
157164
// End-to-end P/D timing metrics
158165
attribute.Float64("llm_d.pd_proxy.total_duration_ms", float64(totalDuration.Milliseconds())),
159166
attribute.Float64("llm_d.pd_proxy.true_ttft_ms", float64(trueTTFT.Milliseconds())),

pkg/sidecar/proxy/connector_nixlv2.go

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -223,25 +223,28 @@ func (s *Server) runNIXLProtocolV2(w http.ResponseWriter, r *http.Request, prefi
223223
decodeSpan.SetAttributes(attribute.Float64("llm_d.pd_proxy.decode.duration_ms", float64(decodeDuration.Milliseconds())))
224224
decodeSpan.SetStatus(codes.Ok, "")
225225

226-
// Calculate end-to-end P/D metrics and add to parent span
226+
// Calculate end-to-end P/D metrics and add to decode span
227227
// These metrics represent the "true" TTFT and latency from the coordinator's perspective
228-
if parentSpan := trace.SpanFromContext(ctx); parentSpan.SpanContext().IsValid() {
228+
// Note: After tracer.Start() above, ctx contains the decode span, so SpanFromContext returns it
229+
if currentSpan := trace.SpanFromContext(ctx); currentSpan.SpanContext().IsValid() {
229230
// Get request start time from context
230231
var totalDuration time.Duration
232+
var trueTTFT time.Duration
231233
if requestStartValue := ctx.Value("request_start_time"); requestStartValue != nil {
232234
if requestStart, ok := requestStartValue.(time.Time); ok {
233235
totalDuration = time.Since(requestStart)
236+
237+
// The "true TTFT" in P/D mode is the time until the decoder can start generating
238+
// This includes: gateway routing + scheduling + prefill time + KV transfer coordination overhead
239+
// The decode vLLM will report a low TTFT (since KV is already transferred),
240+
// but this captures the real end-to-end TTFT from the client's perspective
241+
//
242+
// True TTFT = time from gateway request start to decode start
243+
// This includes all coordinator overhead that vLLM-level metrics miss
244+
trueTTFT = decodeStart.Sub(requestStart)
234245
}
235246
}
236247

237-
// The "true TTFT" in P/D mode is the time until the decoder can start generating
238-
// This includes: prefill time + KV transfer coordination overhead
239-
// The decode vLLM will report a low TTFT (since KV is already transferred),
240-
// but this captures the real end-to-end TTFT from the client's perspective
241-
//
242-
// True TTFT = prefill duration (includes model prefill + KV cache transfer)
243-
trueTTFT := prefillDuration
244-
245248
// KV transfer overhead: time between prefill vLLM completion and decode request start
246249
// This captures the coordination overhead between prefill and decode stages
247250
// Note: This is an approximation - ideally we'd measure from prefill vLLM completion
@@ -253,7 +256,7 @@ func (s *Server) runNIXLProtocolV2(w http.ResponseWriter, r *http.Request, prefi
253256
// 2. Calculate: (total_decode_time - decode_ttft) / (num_output_tokens - 1)
254257
// This is complex and requires response intercepting, so we defer to trace analysis
255258

256-
parentSpan.SetAttributes(
259+
currentSpan.SetAttributes(
257260
// End-to-end P/D timing metrics
258261
// These are the metrics that should be used instead of per-instance vLLM metrics
259262
attribute.Float64("llm_d.pd_proxy.total_duration_ms", float64(totalDuration.Milliseconds())),

pkg/sidecar/proxy/connector_sglang.go

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -150,24 +150,26 @@ func (s *Server) sendSGLangConcurrentRequests(w http.ResponseWriter, r *http.Req
150150
)
151151
decodeSpan.SetStatus(codes.Ok, "")
152152

153-
// Calculate end-to-end P/D metrics and add to parent span
153+
// Calculate end-to-end P/D metrics and add to decode span
154154
// Note: SGLang runs prefill and decode concurrently, so timing is different from sequential P/D
155-
if parentSpan := trace.SpanFromContext(ctx); parentSpan.SpanContext().IsValid() {
155+
// Note: After tracer.Start() above, ctx contains the decode span, so SpanFromContext returns it
156+
if currentSpan := trace.SpanFromContext(ctx); currentSpan.SpanContext().IsValid() {
156157
// Get request start time from context
157158
var totalDuration time.Duration
159+
var trueTTFT time.Duration
158160
if requestStartValue := ctx.Value("request_start_time"); requestStartValue != nil {
159161
if requestStart, ok := requestStartValue.(time.Time); ok {
160162
totalDuration = time.Since(requestStart)
163+
164+
// For SGLang, prefill and decode run concurrently, but True TTFT still needs to capture
165+
// the full coordinator overhead from gateway start to when decode can begin generating.
166+
// This includes: gateway routing + scheduling overhead + time to start decode request
167+
// Note: In concurrent mode, this is different from sequential P/D where we wait for prefill
168+
trueTTFT = decodeStart.Sub(requestStart)
161169
}
162170
}
163171

164-
// For SGLang, since prefill is async and decode runs concurrently:
165-
// - True TTFT is dominated by decode start time (prefill runs in parallel)
166-
// - Total duration is primarily decode duration (not prefill + decode)
167-
// - Prefill duration is tracked separately in the async goroutine
168-
trueTTFT := decodeDuration // In concurrent mode, TTFT is the decode time
169-
170-
parentSpan.SetAttributes(
172+
currentSpan.SetAttributes(
171173
// End-to-end P/D timing metrics for concurrent P/D
172174
attribute.Float64("llm_d.pd_proxy.total_duration_ms", float64(totalDuration.Milliseconds())),
173175
attribute.Float64("llm_d.pd_proxy.true_ttft_ms", float64(trueTTFT.Milliseconds())),

0 commit comments

Comments
 (0)