update tracing for true TTFT

sallyom · claude · sallyom · commit 0970b4903ba3 · 2026-01-05T14:49:48.000-05:00
Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: sallyom <somalley@redhat.com>
diff --git a/pkg/sidecar/proxy/chat_completions.go b/pkg/sidecar/proxy/chat_completions.go
@@ -76,6 +76,14 @@ func (s *Server) chatCompletionsHandler(w http.ResponseWriter, r *http.Request)
 		}
 	}
 
+	// Add span for header parsing (always executed)
+	_, headerSpan := tracer.Start(ctx, "llm_d.pd_proxy.parse_headers")
+	headerSpan.SetAttributes(
+		attribute.Int("llm_d.pd_proxy.prefill_headers_count", numHosts),
+		attribute.Bool("llm_d.pd_proxy.prefiller_sampling_enabled", s.config.EnablePrefillerSampling),
+	)
+	headerSpan.End()
+
 	if len(prefillHostPort) == 0 {
 		s.logger.V(4).Info("skip disaggregated prefill")
 		span.SetAttributes(
diff --git a/pkg/sidecar/proxy/connector_lmcache.go b/pkg/sidecar/proxy/connector_lmcache.go
@@ -135,25 +135,32 @@ func (s *Server) runLMCacheProtocol(w http.ResponseWriter, r *http.Request, pref
 	decodeSpan.SetAttributes(attribute.Float64("llm_d.pd_proxy.decode.duration_ms", float64(decodeDuration.Milliseconds())))
 	decodeSpan.SetStatus(codes.Ok, "")
 
-	// Calculate end-to-end P/D metrics and add to parent span
+	// Calculate end-to-end P/D metrics and add to decode span
 	// These metrics represent the "true" TTFT and latency from the coordinator's perspective
-	if parentSpan := trace.SpanFromContext(ctx); parentSpan.SpanContext().IsValid() {
+	// Note: After tracer.Start() above, ctx contains the decode span, so SpanFromContext returns it
+	if currentSpan := trace.SpanFromContext(ctx); currentSpan.SpanContext().IsValid() {
 		// Get request start time from context
 		var totalDuration time.Duration
+		var trueTTFT time.Duration
 		if requestStartValue := ctx.Value("request_start_time"); requestStartValue != nil {
 			if requestStart, ok := requestStartValue.(time.Time); ok {
 				totalDuration = time.Since(requestStart)
+
+				// The "true TTFT" in P/D mode is the time until the decoder can start generating
+				// This includes: gateway routing + scheduling + prefill time + KV transfer coordination overhead
+				// The decode vLLM will report a low TTFT (since KV is already transferred),
+				// but this captures the real end-to-end TTFT from the client's perspective
+				//
+				// True TTFT = time from gateway request start to decode start
+				// This includes all coordinator overhead that vLLM-level metrics miss
+				trueTTFT = decodeStart.Sub(requestStart)
 			}
 		}
 
-		// The "true TTFT" in P/D mode is the time until the decoder can start generating
-		// This includes: prefill time + KV transfer coordination overhead
-		trueTTFT := prefillDuration
-
 		// KV transfer overhead: time between prefill completion and decode start
 		kvTransferOverhead := decodeStart.Sub(prefillStart.Add(prefillDuration))
 
-		parentSpan.SetAttributes(
+		currentSpan.SetAttributes(
 			// End-to-end P/D timing metrics
 			attribute.Float64("llm_d.pd_proxy.total_duration_ms", float64(totalDuration.Milliseconds())),
 			attribute.Float64("llm_d.pd_proxy.true_ttft_ms", float64(trueTTFT.Milliseconds())),
diff --git a/pkg/sidecar/proxy/connector_nixlv2.go b/pkg/sidecar/proxy/connector_nixlv2.go
@@ -223,25 +223,28 @@ func (s *Server) runNIXLProtocolV2(w http.ResponseWriter, r *http.Request, prefi
 	decodeSpan.SetAttributes(attribute.Float64("llm_d.pd_proxy.decode.duration_ms", float64(decodeDuration.Milliseconds())))
 	decodeSpan.SetStatus(codes.Ok, "")
 
-	// Calculate end-to-end P/D metrics and add to parent span
+	// Calculate end-to-end P/D metrics and add to decode span
 	// These metrics represent the "true" TTFT and latency from the coordinator's perspective
-	if parentSpan := trace.SpanFromContext(ctx); parentSpan.SpanContext().IsValid() {
+	// Note: After tracer.Start() above, ctx contains the decode span, so SpanFromContext returns it
+	if currentSpan := trace.SpanFromContext(ctx); currentSpan.SpanContext().IsValid() {
 		// Get request start time from context
 		var totalDuration time.Duration
+		var trueTTFT time.Duration
 		if requestStartValue := ctx.Value("request_start_time"); requestStartValue != nil {
 			if requestStart, ok := requestStartValue.(time.Time); ok {
 				totalDuration = time.Since(requestStart)
+
+				// The "true TTFT" in P/D mode is the time until the decoder can start generating
+				// This includes: gateway routing + scheduling + prefill time + KV transfer coordination overhead
+				// The decode vLLM will report a low TTFT (since KV is already transferred),
+				// but this captures the real end-to-end TTFT from the client's perspective
+				//
+				// True TTFT = time from gateway request start to decode start
+				// This includes all coordinator overhead that vLLM-level metrics miss
+				trueTTFT = decodeStart.Sub(requestStart)
 			}
 		}
 
-		// The "true TTFT" in P/D mode is the time until the decoder can start generating
-		// This includes: prefill time + KV transfer coordination overhead
-		// The decode vLLM will report a low TTFT (since KV is already transferred),
-		// but this captures the real end-to-end TTFT from the client's perspective
-		//
-		// True TTFT = prefill duration (includes model prefill + KV cache transfer)
-		trueTTFT := prefillDuration
-
 		// KV transfer overhead: time between prefill vLLM completion and decode request start
 		// This captures the coordination overhead between prefill and decode stages
 		// Note: This is an approximation - ideally we'd measure from prefill vLLM completion
@@ -253,7 +256,7 @@ func (s *Server) runNIXLProtocolV2(w http.ResponseWriter, r *http.Request, prefi
 		// 2. Calculate: (total_decode_time - decode_ttft) / (num_output_tokens - 1)
 		// This is complex and requires response intercepting, so we defer to trace analysis
 
-		parentSpan.SetAttributes(
+		currentSpan.SetAttributes(
 			// End-to-end P/D timing metrics
 			// These are the metrics that should be used instead of per-instance vLLM metrics
 			attribute.Float64("llm_d.pd_proxy.total_duration_ms", float64(totalDuration.Milliseconds())),
diff --git a/pkg/sidecar/proxy/connector_sglang.go b/pkg/sidecar/proxy/connector_sglang.go
@@ -150,24 +150,26 @@ func (s *Server) sendSGLangConcurrentRequests(w http.ResponseWriter, r *http.Req
 	)
 	decodeSpan.SetStatus(codes.Ok, "")
 
-	// Calculate end-to-end P/D metrics and add to parent span
+	// Calculate end-to-end P/D metrics and add to decode span
 	// Note: SGLang runs prefill and decode concurrently, so timing is different from sequential P/D
-	if parentSpan := trace.SpanFromContext(ctx); parentSpan.SpanContext().IsValid() {
+	// Note: After tracer.Start() above, ctx contains the decode span, so SpanFromContext returns it
+	if currentSpan := trace.SpanFromContext(ctx); currentSpan.SpanContext().IsValid() {
 		// Get request start time from context
 		var totalDuration time.Duration
+		var trueTTFT time.Duration
 		if requestStartValue := ctx.Value("request_start_time"); requestStartValue != nil {
 			if requestStart, ok := requestStartValue.(time.Time); ok {
 				totalDuration = time.Since(requestStart)
+
+				// For SGLang, prefill and decode run concurrently, but True TTFT still needs to capture
+				// the full coordinator overhead from gateway start to when decode can begin generating.
+				// This includes: gateway routing + scheduling overhead + time to start decode request
+				// Note: In concurrent mode, this is different from sequential P/D where we wait for prefill
+				trueTTFT = decodeStart.Sub(requestStart)
 			}
 		}
 
-		// For SGLang, since prefill is async and decode runs concurrently:
-		// - True TTFT is dominated by decode start time (prefill runs in parallel)
-		// - Total duration is primarily decode duration (not prefill + decode)
-		// - Prefill duration is tracked separately in the async goroutine
-		trueTTFT := decodeDuration // In concurrent mode, TTFT is the decode time
-
-		parentSpan.SetAttributes(
+		currentSpan.SetAttributes(
 			// End-to-end P/D timing metrics for concurrent P/D
 			attribute.Float64("llm_d.pd_proxy.total_duration_ms", float64(totalDuration.Milliseconds())),
 			attribute.Float64("llm_d.pd_proxy.true_ttft_ms", float64(trueTTFT.Milliseconds())),

Original file line number	Diff line number	Diff line change
`@@ -76,6 +76,14 @@ func (s *Server) chatCompletionsHandler(w http.ResponseWriter, r *http.Request)`
`76`	`76`	`}`
`77`	`77`	`}`
`78`	`78`
	`79`	`+ // Add span for header parsing (always executed)`
	`80`	`+ _, headerSpan := tracer.Start(ctx, "llm_d.pd_proxy.parse_headers")`
	`81`	`+ headerSpan.SetAttributes(`
	`82`	`+ attribute.Int("llm_d.pd_proxy.prefill_headers_count", numHosts),`
	`83`	`+ attribute.Bool("llm_d.pd_proxy.prefiller_sampling_enabled", s.config.EnablePrefillerSampling),`
	`84`	`+ )`
	`85`	`+ headerSpan.End()`
	`86`	`+`
`79`	`87`	`if len(prefillHostPort) == 0 {`
`80`	`88`	`s.logger.V(4).Info("skip disaggregated prefill")`
`81`	`89`	`span.SetAttributes(`