@@ -223,25 +223,28 @@ func (s *Server) runNIXLProtocolV2(w http.ResponseWriter, r *http.Request, prefi
223223 decodeSpan .SetAttributes (attribute .Float64 ("llm_d.pd_proxy.decode.duration_ms" , float64 (decodeDuration .Milliseconds ())))
224224 decodeSpan .SetStatus (codes .Ok , "" )
225225
226- // Calculate end-to-end P/D metrics and add to parent span
226+ // Calculate end-to-end P/D metrics and add to decode span
227227 // These metrics represent the "true" TTFT and latency from the coordinator's perspective
228- if parentSpan := trace .SpanFromContext (ctx ); parentSpan .SpanContext ().IsValid () {
228+ // Note: After tracer.Start() above, ctx contains the decode span, so SpanFromContext returns it
229+ if currentSpan := trace .SpanFromContext (ctx ); currentSpan .SpanContext ().IsValid () {
229230 // Get request start time from context
230231 var totalDuration time.Duration
232+ var trueTTFT time.Duration
231233 if requestStartValue := ctx .Value ("request_start_time" ); requestStartValue != nil {
232234 if requestStart , ok := requestStartValue .(time.Time ); ok {
233235 totalDuration = time .Since (requestStart )
236+
237+ // The "true TTFT" in P/D mode is the time until the decoder can start generating
238+ // This includes: gateway routing + scheduling + prefill time + KV transfer coordination overhead
239+ // The decode vLLM will report a low TTFT (since KV is already transferred),
240+ // but this captures the real end-to-end TTFT from the client's perspective
241+ //
242+ // True TTFT = time from gateway request start to decode start
243+ // This includes all coordinator overhead that vLLM-level metrics miss
244+ trueTTFT = decodeStart .Sub (requestStart )
234245 }
235246 }
236247
237- // The "true TTFT" in P/D mode is the time until the decoder can start generating
238- // This includes: prefill time + KV transfer coordination overhead
239- // The decode vLLM will report a low TTFT (since KV is already transferred),
240- // but this captures the real end-to-end TTFT from the client's perspective
241- //
242- // True TTFT = prefill duration (includes model prefill + KV cache transfer)
243- trueTTFT := prefillDuration
244-
245248 // KV transfer overhead: time between prefill vLLM completion and decode request start
246249 // This captures the coordination overhead between prefill and decode stages
247250 // Note: This is an approximation - ideally we'd measure from prefill vLLM completion
@@ -253,7 +256,7 @@ func (s *Server) runNIXLProtocolV2(w http.ResponseWriter, r *http.Request, prefi
253256 // 2. Calculate: (total_decode_time - decode_ttft) / (num_output_tokens - 1)
254257 // This is complex and requires response intercepting, so we defer to trace analysis
255258
256- parentSpan .SetAttributes (
259+ currentSpan .SetAttributes (
257260 // End-to-end P/D timing metrics
258261 // These are the metrics that should be used instead of per-instance vLLM metrics
259262 attribute .Float64 ("llm_d.pd_proxy.total_duration_ms" , float64 (totalDuration .Milliseconds ())),
0 commit comments