@@ -3,6 +3,7 @@ package extproc
 import (
 	"encoding/json"
 	"strconv"
+	"strings"
 	"time"
 
 	core "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
@@ -17,6 +18,9 @@ import (
 func (r *OpenAIRouter) handleResponseHeaders(v *ext_proc.ProcessingRequest_ResponseHeaders, ctx *RequestContext) (*ext_proc.ProcessingResponse, error) {
 	// Detect upstream HTTP status and record non-2xx as errors
 	if v != nil && v.ResponseHeaders != nil && v.ResponseHeaders.Headers != nil {
+		// Determine if the response is streaming based on Content-Type
+		ctx.IsStreamingResponse = isStreamingContentType(v.ResponseHeaders.Headers)
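+		// The flag is read again in handleResponseBody so streamed chunks can be
+		// passed through without JSON parsing or caching.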
+
 		if statusCode := getStatusFromHeaders(v.ResponseHeaders.Headers); statusCode != 0 {
 			if statusCode >= 500 {
 				metrics.RecordRequestError(getModelFromCtx(ctx), "upstream_5xx")
@@ -26,8 +30,10 @@ func (r *OpenAIRouter) handleResponseHeaders(v *ext_proc.ProcessingRequest_Respo
 		}
 	}
 
-	// Best-effort TTFT measurement: record on first response headers if we have a start time and model
-	if ctx != nil && !ctx.TTFTRecorded && !ctx.ProcessingStartTime.IsZero() && ctx.RequestModel != "" {
+	// Best-effort TTFT measurement:
+	// - For non-streaming responses, record on first response headers (approx TTFB ~= TTFT)
+	// - For streaming responses (SSE), defer TTFT until the first response body chunk arrives
+	if ctx != nil && !ctx.IsStreamingResponse && !ctx.TTFTRecorded && !ctx.ProcessingStartTime.IsZero() && ctx.RequestModel != "" {
 		ttft := time.Since(ctx.ProcessingStartTime).Seconds()
 		if ttft > 0 {
 			metrics.RecordModelTTFT(ctx.RequestModel, ttft)
@@ -79,13 +85,58 @@ func getModelFromCtx(ctx *RequestContext) string {
 	return ctx.RequestModel
 }
 
+// isStreamingContentType checks if the response content-type indicates streaming (SSE)
+func isStreamingContentType(headerMap *core.HeaderMap) bool {
+	if headerMap == nil {
+		return false
+	}
+	for _, hv := range headerMap.Headers {
+		if strings.ToLower(hv.Key) == "content-type" {
+			val := hv.Value
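+			// Depending on the Envoy version/config, header values may arrive in
+			// RawValue (bytes) rather than Value, so fall back when Value is empty.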
+			if val == "" && len(hv.RawValue) > 0 {
+				val = string(hv.RawValue)
+			}
+			if strings.Contains(strings.ToLower(val), "text/event-stream") {
+				return true
+			}
+		}
+	}
+	return false
+}
+
 // handleResponseBody processes the response body
 func (r *OpenAIRouter) handleResponseBody(v *ext_proc.ProcessingRequest_ResponseBody, ctx *RequestContext) (*ext_proc.ProcessingResponse, error) {
 	completionLatency := time.Since(ctx.StartTime)
 
 	// Process the response for caching
 	responseBody := v.ResponseBody.Body
 
+	// If this is a streaming response (e.g., SSE), record TTFT on the first body chunk
+	// and skip JSON parsing/caching, which are not applicable to SSE chunks.
+	if ctx.IsStreamingResponse {
+		if !ctx.TTFTRecorded && !ctx.ProcessingStartTime.IsZero() && ctx.RequestModel != "" {
+			ttft := time.Since(ctx.ProcessingStartTime).Seconds()
+			if ttft > 0 {
+				metrics.RecordModelTTFT(ctx.RequestModel, ttft)
+				ctx.TTFTSeconds = ttft
+				ctx.TTFTRecorded = true
+				observability.Infof("Recorded TTFT on first streamed body chunk: %.3fs", ttft)
+			}
+		}
+
+		// For streaming chunks, just continue (no token parsing or cache update)
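+		// CONTINUE returns the chunk unmodified so the SSE stream reaches the client untouched.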
+		response := &ext_proc.ProcessingResponse{
+			Response: &ext_proc.ProcessingResponse_ResponseBody{
+				ResponseBody: &ext_proc.BodyResponse{
+					Response: &ext_proc.CommonResponse{
+						Status: ext_proc.CommonResponse_CONTINUE,
+					},
+				},
+			},
+		}
+		return response, nil
+	}
+
 	// Parse tokens from the response JSON using OpenAI SDK types
 	var parsed openai.ChatCompletion
 	if err := json.Unmarshal(responseBody, &parsed); err != nil {