Skip to content

Commit 321ee6b

Browse files
committed
feat: add native TTFT, TPOT, ITL, and E2E latency tracking to framework
-Implements critical inference metrics directly within the IGW framework, removing the dependency on the SLO predictor plugin for observability. -Framework now natively tracks Time to First Token (TTFT), Time Per Output Token (TPOT), Inter-Token Latency (ITL), Decode Duration and End-to-End (E2E) latency for all inference requests. -Added tests to validate metrics tracking Signed-off-by: Sathvik <[email protected]>
1 parent e5b1a41 commit 321ee6b

File tree

5 files changed

+185
-3
lines changed

5 files changed

+185
-3
lines changed

pkg/epp/framework/plugins/scheduling/scorer/predictedlatency/requestcontrol_hooks.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ func (t *PredictedLatency) ResponseComplete(ctx context.Context, request *schedu
218218

219219
if predictedLatencyCtx.ttft > 0 {
220220
logger.V(logutil.TRACE).Info("Averages calculated", "avgActualTTFT", predictedLatencyCtx.ttft, "avgPredictedTTFT", predictedLatencyCtx.predictedTTFT)
221-
metrics.RecordRequestTTFT(ctx, predictedLatencyCtx.incomingModelName, request.TargetModel, predictedLatencyCtx.ttft/1000)
221+
// metrics.RecordRequestTTFT(ctx, predictedLatencyCtx.incomingModelName, request.TargetModel, predictedLatencyCtx.ttft/1000)
222222
metrics.RecordRequestPredictedTTFT(ctx, predictedLatencyCtx.incomingModelName, request.TargetModel, predictedLatencyCtx.predictedTTFT/1000)
223223
if predictedLatencyCtx.ttftSLO > 0 {
224224
metrics.RecordRequestTTFTWithSLO(ctx, predictedLatencyCtx.incomingModelName, request.TargetModel, predictedLatencyCtx.ttft, predictedLatencyCtx.ttftSLO)
@@ -227,7 +227,7 @@ func (t *PredictedLatency) ResponseComplete(ctx context.Context, request *schedu
227227

228228
if predictedLatencyCtx.avgTPOT > 0 {
229229
logger.V(logutil.TRACE).Info("Averages calculated", "avgActualTPOT", predictedLatencyCtx.avgTPOT, "avgPredictedTPOT", predictedLatencyCtx.avgPredictedTPOT)
230-
metrics.RecordRequestTPOT(ctx, predictedLatencyCtx.incomingModelName, request.TargetModel, predictedLatencyCtx.avgTPOT/1000)
230+
// metrics.RecordRequestTPOT(ctx, predictedLatencyCtx.incomingModelName, request.TargetModel, predictedLatencyCtx.avgTPOT/1000)
231231
metrics.RecordRequestPredictedTPOT(ctx, predictedLatencyCtx.incomingModelName, request.TargetModel, predictedLatencyCtx.avgPredictedTPOT/1000)
232232
if predictedLatencyCtx.avgTPOTSLO > 0 {
233233
metrics.RecordRequestTPOTWithSLO(ctx, predictedLatencyCtx.incomingModelName, request.TargetModel, predictedLatencyCtx.avgTPOT, predictedLatencyCtx.avgTPOTSLO)

pkg/epp/handlers/response.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"encoding/json"
2222
"fmt"
2323
"strings"
24+
"time"
2425

2526
configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
2627
extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
@@ -133,6 +134,22 @@ func (s *StreamingServer) HandleResponseBodyModelStreaming(ctx context.Context,
133134
logger.Error(err, "error in HandleResponseBodyStreaming")
134135
}
135136

137+
// Record TTFT on the first token chunk.
138+
// We check for "data: " prefix to ensure it's a data chunk, and exclude "[DONE]" message.
139+
if reqCtx.GeneratedTokenCount == 0 && strings.Contains(responseText, streamingRespPrefix) && !strings.Contains(responseText, streamingEndMsg) {
140+
ttft := time.Since(reqCtx.RequestReceivedTimestamp).Seconds()
141+
reqCtx.TTFT = ttft
142+
metrics.RecordRequestTTFT(ctx, reqCtx.IncomingModelName, reqCtx.TargetModelName, ttft)
143+
reqCtx.GeneratedTokenCount = 1
144+
reqCtx.LastTokenTimestamp = time.Now()
145+
} else if reqCtx.GeneratedTokenCount > 0 && strings.Contains(responseText, streamingRespPrefix) && !strings.Contains(responseText, streamingEndMsg) {
146+
// Record ITL for subsequent tokens
147+
itl := time.Since(reqCtx.LastTokenTimestamp).Seconds()
148+
metrics.RecordRequestITL(ctx, reqCtx.IncomingModelName, reqCtx.TargetModelName, itl)
149+
reqCtx.LastTokenTimestamp = time.Now()
150+
reqCtx.GeneratedTokenCount++
151+
}
152+
136153
// Parse usage on EVERY chunk to catch split streams (where usage and [DONE] are in different chunks).
137154
if resp := parseRespForUsage(ctx, responseText); resp.Usage.TotalTokens > 0 {
138155
reqCtx.Usage = resp.Usage
@@ -147,6 +164,22 @@ func (s *StreamingServer) HandleResponseBodyModelStreaming(ctx context.Context,
147164
cachedToken = reqCtx.Usage.PromptTokenDetails.CachedTokens
148165
}
149166
metrics.RecordPromptCachedTokens(reqCtx.IncomingModelName, reqCtx.TargetModelName, cachedToken)
167+
168+
// Record Time Per Output Token
169+
// TPOT = (Total Duration - TTFT) / (OutputTokens - 1)
170+
if reqCtx.Usage.CompletionTokens > 1 && reqCtx.TTFT > 0 {
171+
totalDuration := time.Since(reqCtx.RequestReceivedTimestamp).Seconds()
172+
generationDuration := totalDuration - reqCtx.TTFT
173+
metrics.RecordRequestDecodeDuration(ctx, reqCtx.IncomingModelName, reqCtx.TargetModelName, generationDuration)
174+
metrics.RecordRequestE2ELatency(ctx, reqCtx.IncomingModelName, reqCtx.TargetModelName, totalDuration)
175+
176+
// Avoid division by zero just in case
177+
if count := float64(reqCtx.Usage.CompletionTokens - 1); count > 0 {
178+
avgTPOT := generationDuration / count
179+
reqCtx.TPOT = avgTPOT
180+
metrics.RecordRequestTPOT(ctx, reqCtx.IncomingModelName, reqCtx.TargetModelName, avgTPOT)
181+
}
182+
}
150183
}
151184
}
152185

pkg/epp/handlers/response_test.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"context"
2121
"encoding/json"
2222
"testing"
23+
"time"
2324

2425
"github.com/google/go-cmp/cmp"
2526
"github.com/stretchr/testify/assert"
@@ -327,3 +328,68 @@ func TestGenerateResponseHeaders_Sanitization(t *testing.T) {
327328
assert.NotContains(t, gotHeaders, metadata.DestinationEndpointKey)
328329
assert.NotContains(t, gotHeaders, "content-length")
329330
}
331+
332+
func TestHandleResponseBodyModelStreaming_Metrics(t *testing.T) {
333+
t.Parallel()
334+
ctx := context.Background()
335+
336+
t.Run("TTFT Recording", func(t *testing.T) {
337+
server := &StreamingServer{director: &mockDirector{}}
338+
reqCtx := &RequestContext{
339+
RequestReceivedTimestamp: time.Now().Add(-100 * time.Millisecond),
340+
IncomingModelName: "model-a",
341+
TargetModelName: "model-b",
342+
}
343+
344+
chunk := `data: {"choices":[{"text":"First token"}]}`
345+
server.HandleResponseBodyModelStreaming(ctx, reqCtx, chunk)
346+
347+
assert.Greater(t, reqCtx.TTFT, 0.0, "TTFT should be recorded and greater than 0")
348+
assert.Equal(t, 1, reqCtx.GeneratedTokenCount, "GeneratedTokenCount should be 1")
349+
assert.False(t, reqCtx.LastTokenTimestamp.IsZero(), "LastTokenTimestamp should be set")
350+
})
351+
352+
t.Run("ITL Recording", func(t *testing.T) {
353+
server := &StreamingServer{director: &mockDirector{}}
354+
reqCtx := &RequestContext{
355+
RequestReceivedTimestamp: time.Now().Add(-1 * time.Second),
356+
IncomingModelName: "model-a",
357+
TargetModelName: "model-b",
358+
// Simulate first token already received
359+
GeneratedTokenCount: 1,
360+
LastTokenTimestamp: time.Now().Add(-50 * time.Millisecond),
361+
TTFT: 0.1,
362+
}
363+
364+
chunk := `data: {"choices":[{"text":"Second token"}]}`
365+
server.HandleResponseBodyModelStreaming(ctx, reqCtx, chunk)
366+
367+
// ITL is not stored in ReqCtx, but we can verify state updates
368+
assert.Equal(t, 2, reqCtx.GeneratedTokenCount, "GeneratedTokenCount should increment")
369+
assert.True(t, time.Since(reqCtx.LastTokenTimestamp) < 10*time.Millisecond, "LastTokenTimestamp should be updated to Now")
370+
})
371+
372+
t.Run("TPOT and E2E Recording", func(t *testing.T) {
373+
server := &StreamingServer{director: &mockDirector{}}
374+
reqCtx := &RequestContext{
375+
RequestReceivedTimestamp: time.Now().Add(-1 * time.Second),
376+
IncomingModelName: "model-a",
377+
TargetModelName: "model-b",
378+
TTFT: 0.1,
379+
GeneratedTokenCount: 10,
380+
}
381+
382+
// Usage that triggers TPOT calc
383+
chunk := `data: {"usage":{"prompt_tokens":5,"completion_tokens":11,"total_tokens":16}}` + "\n" + `data: [DONE]`
384+
server.HandleResponseBodyModelStreaming(ctx, reqCtx, chunk)
385+
386+
assert.True(t, reqCtx.ResponseComplete, "Response should be complete")
387+
assert.Greater(t, reqCtx.TPOT, 0.0, "TPOT should be calculated")
388+
389+
// Expected TPOT calc: (TotalDuration - TTFT) / (CompletionTokens - 1)
390+
// TotalDuration ~ 1.0s, TTFT = 0.1s -> GenDuration ~ 0.9s
391+
// Tokens - 1 = 10
392+
// TPOT ~ 0.09
393+
assert.InDelta(t, 0.09, reqCtx.TPOT, 0.05, "TPOT should be approximately correct")
394+
})
395+
}

pkg/epp/handlers/server.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,12 @@ type RequestContext struct {
9999

100100
Response *Response
101101

102+
// Metrics
103+
TTFT float64
104+
TPOT float64
105+
LastTokenTimestamp time.Time
106+
GeneratedTokenCount int
107+
102108
reqHeaderResp *extProcPb.ProcessingResponse
103109
reqBodyResp []*extProcPb.ProcessingResponse
104110
reqTrailerResp *extProcPb.ProcessingResponse
@@ -145,7 +151,8 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
145151
// Create request context to share states during life time of an HTTP request.
146152
// See https://github.com/envoyproxy/envoy/issues/17540.
147153
reqCtx := &RequestContext{
148-
RequestState: RequestReceived,
154+
RequestState: RequestReceived,
155+
RequestReceivedTimestamp: time.Now(),
149156
Request: &Request{
150157
Headers: make(map[string]string),
151158
Body: make(map[string]any),

pkg/epp/metrics/metrics.go

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,10 @@ const (
5757
TypeTTFTPredictionDuration = "ttft_prediction_duration"
5858
TypeTTFTSLOViolation = "ttft_slo_violation"
5959
TypeTTFTSLOThreshold = "ttft_slo_threshold"
60+
61+
TypeITL = "itl"
62+
TypeDecodeDuration = "decode_duration"
63+
TypeE2ELatency = "e2e_latency"
6064
)
6165

6266
var (
@@ -176,6 +180,36 @@ var (
176180
ModelLabels,
177181
)
178182

183+
requestITL = prometheus.NewHistogramVec(
184+
prometheus.HistogramOpts{
185+
Subsystem: InferenceObjectiveComponent,
186+
Name: "request_itl_seconds",
187+
Help: metricsutil.HelpMsgWithStability("Inference model Inter-Token Latency distribution in seconds for each model and target model.", compbasemetrics.ALPHA),
188+
Buckets: TPOTBuckets,
189+
},
190+
ModelLabels,
191+
)
192+
193+
requestDecodeDuration = prometheus.NewHistogramVec(
194+
prometheus.HistogramOpts{
195+
Subsystem: InferenceObjectiveComponent,
196+
Name: "request_decode_duration_seconds",
197+
Help: metricsutil.HelpMsgWithStability("Inference model Decode Duration distribution in seconds for each model and target model.", compbasemetrics.ALPHA),
198+
Buckets: GeneralLatencyBuckets,
199+
},
200+
ModelLabels,
201+
)
202+
203+
requestE2ELatency = prometheus.NewHistogramVec(
204+
prometheus.HistogramOpts{
205+
Subsystem: InferenceObjectiveComponent,
206+
Name: "request_e2e_latency_seconds",
207+
Help: metricsutil.HelpMsgWithStability("Inference model E2E Latency (TTFT + Decode) distribution in seconds for each model and target model.", compbasemetrics.ALPHA),
208+
Buckets: GeneralLatencyBuckets,
209+
},
210+
ModelLabels,
211+
)
212+
179213
sloViolationCounter = prometheus.NewCounterVec(
180214
prometheus.CounterOpts{
181215
Subsystem: InferenceObjectiveComponent,
@@ -443,6 +477,9 @@ func Register(customCollectors ...prometheus.Collector) {
443477
metrics.Registry.MustRegister(requestPredictedTTFT)
444478
metrics.Registry.MustRegister(requestTPOTPredictionDuration)
445479
metrics.Registry.MustRegister(requestTTFTPredictionDuration)
480+
metrics.Registry.MustRegister(requestITL)
481+
metrics.Registry.MustRegister(requestDecodeDuration)
482+
metrics.Registry.MustRegister(requestE2ELatency)
446483

447484
// Register SLO violation counters
448485
metrics.Registry.MustRegister(sloViolationCounter)
@@ -490,6 +527,9 @@ func Reset() {
490527
requestPredictedTTFT.Reset()
491528
requestTPOTPredictionDuration.Reset()
492529
requestTTFTPredictionDuration.Reset()
530+
requestITL.Reset()
531+
requestDecodeDuration.Reset()
532+
requestE2ELatency.Reset()
493533

494534
// Reset SLO violation counter
495535
sloViolationCounter.Reset()
@@ -667,6 +707,42 @@ func RecordRequestTTFTPredictionDuration(ctx context.Context, modelName, targetM
667707
return true
668708
}
669709

710+
// RecordRequestITL records the Inter-Token Latency.
711+
func RecordRequestITL(ctx context.Context, modelName, targetModelName string, itl float64) bool {
712+
if itl < 0 {
713+
log.FromContext(ctx).V(logutil.DEFAULT).Error(nil, "ITL value must be non-negative",
714+
"modelName", modelName, "targetModelName", targetModelName, "itl", itl)
715+
return false
716+
}
717+
requestITL.WithLabelValues(modelName, targetModelName).Observe(itl)
718+
inferenceGauges.WithLabelValues(modelName, targetModelName, TypeITL).Set(itl)
719+
return true
720+
}
721+
722+
// RecordRequestDecodeDuration records the Decode Duration.
723+
func RecordRequestDecodeDuration(ctx context.Context, modelName, targetModelName string, duration float64) bool {
724+
if duration < 0 {
725+
log.FromContext(ctx).V(logutil.DEFAULT).Error(nil, "Decode duration value must be non-negative",
726+
"modelName", modelName, "targetModelName", targetModelName, "duration", duration)
727+
return false
728+
}
729+
requestDecodeDuration.WithLabelValues(modelName, targetModelName).Observe(duration)
730+
inferenceGauges.WithLabelValues(modelName, targetModelName, TypeDecodeDuration).Set(duration)
731+
return true
732+
}
733+
734+
// RecordRequestE2ELatency records the E2E Latency (TTFT + Decode).
735+
func RecordRequestE2ELatency(ctx context.Context, modelName, targetModelName string, duration float64) bool {
736+
if duration < 0 {
737+
log.FromContext(ctx).V(logutil.DEFAULT).Error(nil, "E2E latency value must be non-negative",
738+
"modelName", modelName, "targetModelName", targetModelName, "duration", duration)
739+
return false
740+
}
741+
requestE2ELatency.WithLabelValues(modelName, targetModelName).Observe(duration)
742+
inferenceGauges.WithLabelValues(modelName, targetModelName, TypeE2ELatency).Set(duration)
743+
return true
744+
}
745+
670746
// RecordResponseSizes records the response sizes.
671747
func RecordResponseSizes(modelName, targetModelName string, size int) {
672748
responseSizes.WithLabelValues(modelName, targetModelName).Observe(float64(size))

0 commit comments

Comments (0)