@@ -95,6 +95,14 @@ type metricsData struct {
 	tpotChan chan float64
 	// e2eReqLatencyChan is a channel to update request e2e latency
 	e2eReqLatencyChan chan float64
+	// reqQueueTimeChan is a channel to update request queue time
+	reqQueueTimeChan chan float64
+	// reqInferenceTimeChan is a channel to update request inference time
+	reqInferenceTimeChan chan float64
+	// reqPrefillTimeChan is a channel to update request prefill time
+	reqPrefillTimeChan chan float64
+	// reqDecodeTimeChan is a channel to update request decode time
+	reqDecodeTimeChan chan float64
 	// kvCacheUsageChan is a channel to update kvCacheUsagePercentage
 	kvCacheUsageChan chan float64
 	// registry is a Prometheus registry
@@ -111,6 +119,14 @@ type metricsData struct {
 	tpot *prometheus.HistogramVec
 	// e2eReqLatency is prometheus histogram of end to end request latency in seconds
 	e2eReqLatency *prometheus.HistogramVec
+	// reqQueueTime is prometheus histogram of request queue time in seconds
+	reqQueueTime *prometheus.HistogramVec
+	// reqInferenceTime is prometheus histogram of request inference time in seconds
+	reqInferenceTime *prometheus.HistogramVec
+	// reqPrefillTime is prometheus histogram of request prefill time in seconds
+	reqPrefillTime *prometheus.HistogramVec
+	// reqDecodeTime is prometheus histogram of request decode time in seconds
+	reqDecodeTime *prometheus.HistogramVec
 	// kvCacheUsagePercentage is prometheus gauge
 	kvCacheUsagePercentage *prometheus.GaugeVec
 	// requestPromptTokens is prometheus histogram for number of input (prompt) tokens in request
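
For orientation, below is a minimal sketch of how one of the new timing histograms could be constructed and registered; the actual registration code is not shown in this excerpt. The metric name, help text, bucket boundaries, and the single `model_name` label are illustrative assumptions, not taken from this commit.

```go
import "github.com/prometheus/client_golang/prometheus"

// newTimingHistogram sketches the construction of one of the new per-request
// timing histograms. The buckets and the "model_name" label are assumptions.
func newTimingHistogram(registry *prometheus.Registry, name, help string) (*prometheus.HistogramVec, error) {
	hv := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    name,
			Help:    help,
			Buckets: prometheus.ExponentialBuckets(0.001, 2, 16), // 1ms .. ~33s, illustrative
		},
		[]string{"model_name"},
	)
	if err := registry.Register(hv); err != nil {
		return nil, err
	}
	return hv, nil
}
```

A caller would then wire the result into the corresponding `metricsData` field, e.g. a hypothetical `newTimingHistogram(reg, "vllm:request_queue_time_seconds", "Request queue time in seconds.")` for `reqQueueTime`.
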
@@ -139,6 +155,11 @@ type requestCompleted struct {
 	model string
 }
 
+type waitingQueueItem struct {
+	reqCtx      *openaiserverapi.CompletionReqCtx
+	enqueueTime time.Time
+}
+
 // VllmSimulator simulates vLLM server supporting OpenAI API
 type VllmSimulator struct {
 	// logger is used for information and errors logging
@@ -276,6 +297,10 @@ func (s *VllmSimulator) initializeSim(ctx context.Context) error {
 	s.metrics.ttftChan = make(chan float64, maxNumberOfRequests)
 	s.metrics.tpotChan = make(chan float64, maxNumberOfRequests)
 	s.metrics.e2eReqLatencyChan = make(chan float64, maxNumberOfRequests)
+	s.metrics.reqQueueTimeChan = make(chan float64, maxNumberOfRequests)
+	s.metrics.reqInferenceTimeChan = make(chan float64, maxNumberOfRequests)
+	s.metrics.reqPrefillTimeChan = make(chan float64, maxNumberOfRequests)
+	s.metrics.reqDecodeTimeChan = make(chan float64, maxNumberOfRequests)
 	s.metrics.requestSuccessChan = make(chan requestSuccessEvent, maxNumberOfRequests)
 
 	s.newRequests = make(chan *openaiserverapi.CompletionReqCtx, maxNumberOfRequests)
@@ -575,28 +600,32 @@ func (s *VllmSimulator) createCompletionResponse(isChatCompletion bool, respToke
 // from --served-model-name (for a base-model request) or the LoRA adapter name (for a LoRA request).
 // finishReason - a pointer to string that represents finish reason, can be nil, stop, length, or tools
 // usageData - usage (tokens statistics) for this response
-func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, respTokens []string, toolCalls []openaiserverapi.ToolCall,
-	modelName string, finishReason string, usageData *openaiserverapi.Usage) {
+func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, respTokens []string,
+	toolCalls []openaiserverapi.ToolCall, modelName string, finishReason string, usageData *openaiserverapi.Usage) {
 	resp := s.createCompletionResponse(reqCtx.IsChatCompletion, respTokens, toolCalls, &finishReason, usageData, modelName,
 		reqCtx.CompletionReq.IsDoRemoteDecode())
 
 	// calculate how long to wait before returning the response, time is based on number of tokens
 	nCachedPromptTokens := reqCtx.CompletionReq.GetNumberOfCachedPromptTokens()
+	startPrefill := time.Now()
 	ttft := s.getWaitTimeToFirstToken(usageData.PromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill())
 	time.Sleep(time.Duration(ttft) * time.Millisecond)
 
 	// report ttft in seconds
 	common.WriteToChannel(s.metrics.ttftChan, (float64(ttft) / 1000), s.logger, "metrics.ttftChan")
+	common.WriteToChannel(s.metrics.reqPrefillTimeChan, time.Since(startPrefill).Seconds(), s.logger, "metrics.reqPrefillTimeChan")
 
+	startDecode := time.Now()
 	for range usageData.CompletionTokens - 1 {
 		perTokenLatency := s.getInterTokenLatency()
 		time.Sleep(time.Duration(perTokenLatency) * time.Millisecond)
 
 		// report tpot in seconds
 		common.WriteToChannel(s.metrics.tpotChan, (float64(perTokenLatency) / 1000), s.logger, "metrics.tpotChan")
 	}
-	s.sendCompletionResponse(reqCtx.HTTPReqCtx, resp)
+	s.metrics.reqDecodeTimeChan <- time.Since(startDecode).Seconds()
 
+	s.sendCompletionResponse(reqCtx.HTTPReqCtx, resp)
 	s.responseSentCallback(modelName, reqCtx.IsChatCompletion, reqCtx.CompletionReq.GetRequestID())
 }
 
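
The hunk above only pushes the prefill and decode durations into their channels; the goroutine that drains them and records observations is outside this excerpt. Below is a hedged sketch of what such a consumer could look like. The method name, the single `model_name` label, and the context-driven shutdown are assumptions for illustration, not the simulator's actual reporting loop.

```go
// drainTimingChannels is a sketch of a metrics consumer; it is not part of
// this commit. Label handling and shutdown semantics are assumptions.
func (s *VllmSimulator) drainTimingChannels(ctx context.Context, modelName string) {
	for {
		select {
		case <-ctx.Done():
			return
		case v := <-s.metrics.reqQueueTimeChan:
			s.metrics.reqQueueTime.WithLabelValues(modelName).Observe(v)
		case v := <-s.metrics.reqPrefillTimeChan:
			s.metrics.reqPrefillTime.WithLabelValues(modelName).Observe(v)
		case v := <-s.metrics.reqDecodeTimeChan:
			s.metrics.reqDecodeTime.WithLabelValues(modelName).Observe(v)
		case v := <-s.metrics.reqInferenceTimeChan:
			s.metrics.reqInferenceTime.WithLabelValues(modelName).Observe(v)
		}
	}
}
```
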
@@ -639,7 +668,7 @@ func (s *VllmSimulator) enqueue(req *openaiserverapi.CompletionReqCtx) error {
 	if s.waitingQueue.Len() >= s.queueCapacity {
 		return errors.New("waiting requests queue is full")
 	}
-	s.waitingQueue.PushBack(req)
+	s.waitingQueue.PushBack(waitingQueueItem{req, time.Now()})
 	return nil
 }
 
@@ -650,20 +679,22 @@ func (s *VllmSimulator) dequeue() *openaiserverapi.CompletionReqCtx {
 
 	// Find first request for a loaded LoRA
 	for elem := s.waitingQueue.Front(); elem != nil; elem = elem.Next() {
-		req, ok := elem.Value.(*openaiserverapi.CompletionReqCtx)
-		if ok && req != nil && s.loraIsLoaded(req.CompletionReq.GetModel()) {
+		item, ok := elem.Value.(waitingQueueItem)
+		if ok && item.reqCtx != nil && s.loraIsLoaded(item.reqCtx.CompletionReq.GetModel()) {
 			s.waitingQueue.Remove(elem)
-			s.incrementLora(req.CompletionReq.GetModel())
-			return req
+			s.incrementLora(item.reqCtx.CompletionReq.GetModel())
+			s.metrics.reqQueueTimeChan <- time.Since(item.enqueueTime).Seconds()
+			return item.reqCtx
 		}
 	}
 
 	// All the requests require a LoRA that is not loaded, check if we can load a LoRA
 	for elem := s.waitingQueue.Front(); elem != nil; elem = elem.Next() {
-		req, ok := elem.Value.(*openaiserverapi.CompletionReqCtx)
-		if ok && req != nil && s.loadLora(req.CompletionReq.GetModel()) {
+		item, ok := elem.Value.(waitingQueueItem)
+		if ok && item.reqCtx != nil && s.loadLora(item.reqCtx.CompletionReq.GetModel()) {
 			s.waitingQueue.Remove(elem)
-			return req
+			s.metrics.reqQueueTimeChan <- time.Since(item.enqueueTime).Seconds()
+			return item.reqCtx
 		}
 	}
 
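
The queue-time measurement is simply the wall-clock span between PushBack in enqueue and Remove in dequeue. Below is a self-contained illustration of the same pattern with a plain container/list queue; the types and names are hypothetical, not the simulator's.

```go
package main

import (
	"container/list"
	"fmt"
	"time"
)

// queueItem mirrors the idea of waitingQueueItem: carry the enqueue timestamp
// alongside the payload so the dequeuer can report how long the item waited.
type queueItem struct {
	id          string
	enqueueTime time.Time
}

func main() {
	q := list.New()
	q.PushBack(queueItem{id: "req-1", enqueueTime: time.Now()})

	time.Sleep(50 * time.Millisecond) // simulated wait in the queue

	item := q.Remove(q.Front()).(queueItem)
	fmt.Printf("queue time: %.3fs\n", time.Since(item.enqueueTime).Seconds())
}
```
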