Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pkg/epp/handlers/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ type RequestContext struct {
Usage Usage
ResponseSize int
ResponseComplete bool
FirstTokenReceived bool
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: can we make this a UNIX timestamp so it's dual-purpose:

  • Informing that the token has been received
  • TTFT calculation

ResponseStatusCode string
RequestRunning bool
Request *Request
Expand Down
13 changes: 10 additions & 3 deletions pkg/epp/requestcontrol/director.go
Original file line number Diff line number Diff line change
Expand Up @@ -298,10 +298,17 @@ func (d *Director) HandleResponseReceived(ctx context.Context, reqCtx *handlers.
func (d *Director) HandleResponseBodyStreaming(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) {
logger := log.FromContext(ctx).WithValues("stage", "bodyChunk")
logger.V(logutil.TRACE).Info("Entering HandleResponseBodyChunk")

isFirstToken := !reqCtx.FirstTokenReceived
if isFirstToken {
reqCtx.FirstTokenReceived = true
}

response := &Response{
RequestId: reqCtx.Request.Headers[requtil.RequestIdHeaderKey],
Headers: reqCtx.Response.Headers,
EndOfStream: reqCtx.ResponseComplete,
RequestId: reqCtx.Request.Headers[requtil.RequestIdHeaderKey],
Headers: reqCtx.Response.Headers,
IsFirstToken: isFirstToken,
EndOfStream: reqCtx.ResponseComplete,
}

d.runResponseStreamingPlugins(ctx, reqCtx.SchedulingRequest, response, reqCtx.TargetPod)
Expand Down
4 changes: 4 additions & 0 deletions pkg/epp/requestcontrol/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ type Response struct {
Body string
// IsStreaming indicates whether or not the response is being streamed by the model
IsStreaming bool
// IsFirstToken when true indicates this is the first chunk of a streaming response.
// This is useful for plugins that need to perform actions at the time-to-first-token (TTFT) moment,
// such as marking prefill completion in disaggregated inference architectures.
IsFirstToken bool
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

// EndOfStream when true indicates that this invocation contains the last chunk of the response
EndOfStream bool
// ReqMetadata is a map of metadata that can be passed from Envoy.
Expand Down