diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go index 0fb7f7ba9..6e5fb2f0a 100644 --- a/pkg/epp/handlers/server.go +++ b/pkg/epp/handlers/server.go @@ -83,6 +83,7 @@ type RequestContext struct { Usage Usage ResponseSize int ResponseComplete bool + FirstTokenReceived bool ResponseStatusCode string RequestRunning bool Request *Request diff --git a/pkg/epp/requestcontrol/director.go b/pkg/epp/requestcontrol/director.go index 844f0523b..69aa6ce57 100644 --- a/pkg/epp/requestcontrol/director.go +++ b/pkg/epp/requestcontrol/director.go @@ -298,10 +298,17 @@ func (d *Director) HandleResponseReceived(ctx context.Context, reqCtx *handlers. func (d *Director) HandleResponseBodyStreaming(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) { logger := log.FromContext(ctx).WithValues("stage", "bodyChunk") logger.V(logutil.TRACE).Info("Entering HandleResponseBodyChunk") + + isFirstToken := !reqCtx.FirstTokenReceived + if isFirstToken { + reqCtx.FirstTokenReceived = true + } + response := &Response{ - RequestId: reqCtx.Request.Headers[requtil.RequestIdHeaderKey], - Headers: reqCtx.Response.Headers, - EndOfStream: reqCtx.ResponseComplete, + RequestId: reqCtx.Request.Headers[requtil.RequestIdHeaderKey], + Headers: reqCtx.Response.Headers, + IsFirstToken: isFirstToken, + EndOfStream: reqCtx.ResponseComplete, } d.runResponseStreamingPlugins(ctx, reqCtx.SchedulingRequest, response, reqCtx.TargetPod) diff --git a/pkg/epp/requestcontrol/types.go b/pkg/epp/requestcontrol/types.go index 7a6725678..4cc50034b 100644 --- a/pkg/epp/requestcontrol/types.go +++ b/pkg/epp/requestcontrol/types.go @@ -26,6 +26,10 @@ type Response struct { Body string // IsStreaming indicates whether or not the response is being streamed by the model IsStreaming bool + // IsFirstToken when true indicates this is the first chunk of a streaming response. + // This is useful for plugins that need to perform actions at the time-to-first-token (TTFT) moment, + // such as marking prefill completion in disaggregated inference architectures. + IsFirstToken bool // EndOfStream when true indicates that this invocation contains the last chunk of the response EndOfStream bool // ReqMetadata is a map of metadata that can be passed from Envoy.