Skip to content

Commit f578dc0

Browse files
committed
Add e2e request latency histogram to prometheus metrics.
Add a reportHistogramValue function to be used for reporting values in histogram metrics.

Signed-off-by: Maya Barnea <[email protected]>
1 parent 7e913bc commit f578dc0

File tree

4 files changed

+59
-18
lines changed

4 files changed

+59
-18
lines changed

pkg/common/config.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,13 @@ type Metrics struct {
253253
RequestParamsMaxTokens []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
254254
// RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
255255
RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`
256+
// E2ERequestLatencyBucketValues is an array of values for e2e request latency buckets,
257+
// each value in this array is a value for the corresponding bucket.
258+
// The array may contain fewer values than the number of buckets; all trailing missing values are assumed to be 0.
259+
// Buckets upper boundaries in seconds are:
260+
// 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
261+
// 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0, +Inf
262+
E2ERequestLatencyBucketValues []int `yaml:"e2erl-buckets-values" json:"e2erl-buckets-values"`
256263
}
257264

258265
type LorasMetrics struct {

pkg/common/utils.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ var TTFTBucketsBoundaries = []float64{0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08
3232
var TPOTBucketsBoundaries = []float64{0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
3333
1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0}
3434

35+
// E2ERequestLatencyBucketsBoundaries holds the upper boundaries, in seconds, of the
// buckets of the end-to-end request latency histogram (vllm:e2e_request_latency_seconds).
// Prometheus implicitly appends a final +Inf bucket after the last boundary.
var E2ERequestLatencyBucketsBoundaries = []float64{0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
	20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0}
37+
3538
// ValidateContextWindow checks if the request fits within the model's context window
3639
// Returns validation result, actual completion tokens, and total tokens
3740
func ValidateContextWindow(promptTokens int, maxCompletionTokens *int64, maxModelLen int) (bool, int64, int64) {

pkg/llm-d-inference-sim/metrics.go

Lines changed: 39 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,21 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
111111
return err
112112
}
113113

114+
s.metrics.e2eReqLatency = prometheus.NewHistogramVec(
115+
prometheus.HistogramOpts{
116+
Subsystem: "",
117+
Name: "vllm:e2e_request_latency_seconds",
118+
Help: "Histogram of end to end request latency in seconds.",
119+
Buckets: common.E2ERequestLatencyBucketsBoundaries,
120+
},
121+
[]string{vllmapi.PromLabelModelName},
122+
)
123+
124+
if err := s.metrics.registry.Register(s.metrics.e2eReqLatency); err != nil {
125+
s.logger.Error(err, "Prometheus end to end request latency histogram register failed")
126+
return err
127+
}
128+
114129
s.metrics.kvCacheUsagePercentage = prometheus.NewGaugeVec(
115130
prometheus.GaugeOpts{
116131
Subsystem: "",
@@ -215,6 +230,10 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() {
215230
for reason, requestSuccessTotal := range s.config.FakeMetrics.RequestSuccessTotal {
216231
s.metrics.requestSuccessTotal.WithLabelValues(modelName, reason).Add(float64(requestSuccessTotal))
217232
}
233+
234+
// Seed the e2e request latency histogram from the configured fake bucket values.
// BUG FIX: the original passed s.metrics.tpot here (copy-paste from the TPOT
// initialization), which would load the e2e-latency fake values into the wrong
// histogram and leave e2eReqLatency empty. The target must be s.metrics.e2eReqLatency.
if s.config.FakeMetrics.E2ERequestLatencyBucketValues != nil {
	s.initFakeHistogram(s.metrics.e2eReqLatency, common.E2ERequestLatencyBucketsBoundaries, s.config.FakeMetrics.E2ERequestLatencyBucketValues)
}
218237
}
219238

220239
s.metrics.runningRequests.WithLabelValues(modelName).Set(nRunningReqs)
@@ -317,25 +336,14 @@ func (s *VllmSimulator) reportWaitingRequests() {
317336
}
318337
}
319338

320-
// reportTTFT sets information about time to first token
321-
func (s *VllmSimulator) reportTTFT(ttftInSecs float64) {
322-
if s.config.FakeMetrics != nil {
323-
return
324-
}
325-
if s.metrics.ttft != nil {
326-
s.metrics.ttft.WithLabelValues(
327-
s.getDisplayedModelName(s.config.Model)).Observe(ttftInSecs)
328-
}
329-
}
330-
331-
// reportTPOT sets information about time per output token
332-
func (s *VllmSimulator) reportTPOT(tpotInSecs float64) {
339+
// reportHistogramValue sets the given value in the given histogram
340+
func (s *VllmSimulator) reportHistogramValue(hist *prometheus.HistogramVec, val float64) {
333341
if s.config.FakeMetrics != nil {
334342
return
335343
}
336-
if s.metrics.tpot != nil {
337-
s.metrics.tpot.WithLabelValues(
338-
s.getDisplayedModelName(s.config.Model)).Observe(tpotInSecs)
344+
if hist != nil {
345+
hist.WithLabelValues(
346+
s.getDisplayedModelName(s.config.Model)).Observe(val)
339347
}
340348
}
341349

@@ -359,6 +367,7 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) {
359367
go s.ttftUpdater(ctx)
360368
go s.tpotUpdater(ctx)
361369
go s.recordRequestUpdater(ctx)
370+
go s.e2eReqLatencyUpdater(ctx)
362371
}
363372

364373
// waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel
@@ -406,7 +415,7 @@ func (s *VllmSimulator) ttftUpdater(ctx context.Context) {
406415
case <-ctx.Done():
407416
return
408417
case value := <-s.metrics.ttftChan:
409-
s.reportTTFT(value)
418+
s.reportHistogramValue(s.metrics.ttft, value)
410419
}
411420
}
412421
}
@@ -418,7 +427,19 @@ func (s *VllmSimulator) tpotUpdater(ctx context.Context) {
418427
case <-ctx.Done():
419428
return
420429
case value := <-s.metrics.tpotChan:
421-
s.reportTPOT(value)
430+
s.reportHistogramValue(s.metrics.tpot, value)
431+
}
432+
}
433+
}
434+
435+
// tpotUpdater updates the time per output token metric by listening on the relevant channel
436+
func (s *VllmSimulator) e2eReqLatencyUpdater(ctx context.Context) {
437+
for {
438+
select {
439+
case <-ctx.Done():
440+
return
441+
case value := <-s.metrics.e2eReqLatencyChan:
442+
s.reportHistogramValue(s.metrics.e2eReqLatency, value)
422443
}
423444
}
424445
}

pkg/llm-d-inference-sim/simulator.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ type metricsData struct {
9393
ttftChan chan float64
9494
// tpotChan is a channel to update time per output token
9595
tpotChan chan float64
96+
// e2eReqLatencyChan is a channel to update request e2e latency
97+
e2eReqLatencyChan chan float64
9698
// kvCacheUsageChan is a channel to update kvCacheUsagePercentage
9799
kvCacheUsageChan chan float64
98100
// registry is a Prometheus registry
@@ -107,6 +109,8 @@ type metricsData struct {
107109
ttft *prometheus.HistogramVec
108110
// tpot is prometheus histogram for time per output token in seconds
109111
tpot *prometheus.HistogramVec
112+
// e2eReqLatency is prometheus histogram of end to end request latency in seconds
113+
e2eReqLatency *prometheus.HistogramVec
110114
// kvCacheUsagePercentage is prometheus gauge
111115
kvCacheUsagePercentage *prometheus.GaugeVec
112116
// requestPromptTokens is prometheus histogram for number of input (prompt) tokens in request
@@ -271,6 +275,7 @@ func (s *VllmSimulator) initializeSim(ctx context.Context) error {
271275
s.metrics.kvCacheUsageChan = make(chan float64, maxNumberOfRequests)
272276
s.metrics.ttftChan = make(chan float64, maxNumberOfRequests)
273277
s.metrics.tpotChan = make(chan float64, maxNumberOfRequests)
278+
s.metrics.e2eReqLatencyChan = make(chan float64, maxNumberOfRequests)
274279
s.metrics.requestSuccessChan = make(chan requestSuccessEvent, maxNumberOfRequests)
275280

276281
s.newRequests = make(chan *openaiserverapi.CompletionReqCtx, maxNumberOfRequests)
@@ -460,6 +465,11 @@ func (s *VllmSimulator) addRequestToQueue(reqCtx *openaiserverapi.CompletionReqC
460465

461466
// handleCompletions general completion requests handler, support both text and chat completion APIs
462467
func (s *VllmSimulator) handleCompletions(ctx *fasthttp.RequestCtx, isChatCompletion bool) {
468+
startTime := time.Now()
469+
defer func() {
470+
s.metrics.e2eReqLatencyChan <- time.Since(startTime).Seconds()
471+
}()
472+
463473
// Check if we should inject a failure
464474
if shouldInjectFailure(s.config) {
465475
failure := getRandomFailure(s.config)

0 commit comments

Comments
 (0)