
Commit 1876128

Additional metrics - vllm:request_queue_time_seconds, vllm:request_inference_time_seconds, vllm:request_prefill_time_seconds, and vllm:request_decode_time_seconds
Signed-off-by: Maya Barnea <[email protected]>
1 parent f578dc0 commit 1876128

File tree
5 files changed: +169 −15 lines
  pkg/common/utils.go
  pkg/llm-d-inference-sim/metrics.go
  pkg/llm-d-inference-sim/simulator.go
  pkg/llm-d-inference-sim/streaming.go
  pkg/llm-d-inference-sim/worker.go
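
The four new histograms decompose a request's lifetime the way the Help strings below describe it: queue time covers the WAITING phase (enqueue to dequeue), prefill time covers the simulated time-to-first-token delay, decode time covers the remaining inter-token delays, and inference time covers the whole RUNNING phase from dequeue to response. A minimal sketch of that decomposition, with fixed delays standing in for the simulator's latency model:

package main

import (
    "fmt"
    "time"
)

// Hypothetical request timeline; the simulator derives these delays from its
// latency model, here they are fixed for illustration only.
func main() {
    enqueued := time.Now()

    time.Sleep(20 * time.Millisecond) // request waits in the queue
    dequeued := time.Now()
    queueTime := dequeued.Sub(enqueued) // -> vllm:request_queue_time_seconds

    startInference := time.Now()
    time.Sleep(50 * time.Millisecond)          // simulated TTFT (prefill + first token)
    prefillTime := time.Since(startInference)  // -> vllm:request_prefill_time_seconds

    startDecode := time.Now()
    for i := 0; i < 4; i++ { // remaining output tokens
        time.Sleep(10 * time.Millisecond) // simulated inter-token latency
    }
    decodeTime := time.Since(startDecode)       // -> vllm:request_decode_time_seconds
    inferenceTime := time.Since(startInference) // -> vllm:request_inference_time_seconds

    fmt.Println(queueTime, prefillTime, decodeTime, inferenceTime)
}

Queue time plus inference time roughly adds up to the existing vllm:e2e_request_latency_seconds, and prefill plus decode roughly adds up to inference time.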

pkg/common/utils.go

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ var TTFTBucketsBoundaries = []float64{0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08
 var TPOTBucketsBoundaries = []float64{0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
     1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0}

-var E2ERequestLatencyBucketsBoundaries = []float64{0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
+var RequestLatencyBucketsBoundaries = []float64{0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
     20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0}

 // ValidateContextWindow checks if the request fits within the model's context window

pkg/llm-d-inference-sim/metrics.go

Lines changed: 115 additions & 3 deletions
@@ -116,7 +116,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
             Subsystem: "",
             Name:      "vllm:e2e_request_latency_seconds",
             Help:      "Histogram of end to end request latency in seconds.",
-            Buckets:   common.E2ERequestLatencyBucketsBoundaries,
+            Buckets:   common.RequestLatencyBucketsBoundaries,
         },
         []string{vllmapi.PromLabelModelName},
     )
@@ -126,6 +126,66 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
         return err
     }

+    s.metrics.reqQueueTime = prometheus.NewHistogramVec(
+        prometheus.HistogramOpts{
+            Subsystem: "",
+            Name:      "vllm:request_queue_time_seconds",
+            Help:      "Histogram of time spent in WAITING phase for request.",
+            Buckets:   common.RequestLatencyBucketsBoundaries,
+        },
+        []string{vllmapi.PromLabelModelName},
+    )
+
+    if err := s.metrics.registry.Register(s.metrics.reqQueueTime); err != nil {
+        s.logger.Error(err, "Prometheus request queue time histogram register failed")
+        return err
+    }
+
+    s.metrics.reqInferenceTime = prometheus.NewHistogramVec(
+        prometheus.HistogramOpts{
+            Subsystem: "",
+            Name:      "vllm:request_inference_time_seconds",
+            Help:      "Histogram of time spent in RUNNING phase for request.",
+            Buckets:   common.RequestLatencyBucketsBoundaries,
+        },
+        []string{vllmapi.PromLabelModelName},
+    )
+
+    if err := s.metrics.registry.Register(s.metrics.reqInferenceTime); err != nil {
+        s.logger.Error(err, "Prometheus request inference time histogram register failed")
+        return err
+    }
+
+    s.metrics.reqPrefillTime = prometheus.NewHistogramVec(
+        prometheus.HistogramOpts{
+            Subsystem: "",
+            Name:      "vllm:request_prefill_time_seconds",
+            Help:      "Histogram of time spent in PREFILL phase for request.",
+            Buckets:   common.RequestLatencyBucketsBoundaries,
+        },
+        []string{vllmapi.PromLabelModelName},
+    )
+
+    if err := s.metrics.registry.Register(s.metrics.reqPrefillTime); err != nil {
+        s.logger.Error(err, "Prometheus request prefill time histogram register failed")
+        return err
+    }
+
+    s.metrics.reqDecodeTime = prometheus.NewHistogramVec(
+        prometheus.HistogramOpts{
+            Subsystem: "",
+            Name:      "vllm:request_decode_time_seconds",
+            Help:      "Histogram of time spent in DECODE phase for request.",
+            Buckets:   common.RequestLatencyBucketsBoundaries,
+        },
+        []string{vllmapi.PromLabelModelName},
+    )
+
+    if err := s.metrics.registry.Register(s.metrics.reqDecodeTime); err != nil {
+        s.logger.Error(err, "Prometheus request decode time histogram register failed")
+        return err
+    }
+
     s.metrics.kvCacheUsagePercentage = prometheus.NewGaugeVec(
         prometheus.GaugeOpts{
             Subsystem: "",
@@ -232,7 +292,7 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() {
     }

     if s.config.FakeMetrics.E2ERequestLatencyBucketValues != nil {
-        s.initFakeHistogram(s.metrics.tpot, common.E2ERequestLatencyBucketsBoundaries, s.config.FakeMetrics.E2ERequestLatencyBucketValues)
+        s.initFakeHistogram(s.metrics.tpot, common.RequestLatencyBucketsBoundaries, s.config.FakeMetrics.E2ERequestLatencyBucketValues)
     }
 }

@@ -368,6 +428,10 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) {
     go s.tpotUpdater(ctx)
     go s.recordRequestUpdater(ctx)
     go s.e2eReqLatencyUpdater(ctx)
+    go s.reqQueueTimeUpdater(ctx)
+    go s.reqInferenceTimeUpdater(ctx)
+    go s.reqPrefillTimeUpdater(ctx)
+    go s.reqDecodeTimeUpdater(ctx)
 }

 // waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel
@@ -432,7 +496,7 @@ func (s *VllmSimulator) tpotUpdater(ctx context.Context) {
     }
 }

-// tpotUpdater updates the time per output token metric by listening on the relevant channel
+// e2eReqLatencyUpdater updates the e2e request latency metric by listening on the relevant channel
 func (s *VllmSimulator) e2eReqLatencyUpdater(ctx context.Context) {
     for {
         select {
@@ -444,6 +508,54 @@ func (s *VllmSimulator) e2eReqLatencyUpdater(ctx context.Context) {
     }
 }

+// reqQueueTimeUpdater updates the request queue time metric by listening on the relevant channel
+func (s *VllmSimulator) reqQueueTimeUpdater(ctx context.Context) {
+    for {
+        select {
+        case <-ctx.Done():
+            return
+        case value := <-s.metrics.reqQueueTimeChan:
+            s.reportHistogramValue(s.metrics.reqQueueTime, value)
+        }
+    }
+}
+
+// reqInferenceTimeUpdater updates the request inference time metric by listening on the relevant channel
+func (s *VllmSimulator) reqInferenceTimeUpdater(ctx context.Context) {
+    for {
+        select {
+        case <-ctx.Done():
+            return
+        case value := <-s.metrics.reqInferenceTimeChan:
+            s.reportHistogramValue(s.metrics.reqInferenceTime, value)
+        }
+    }
+}
+
+// reqPrefillTimeUpdater updates the request prefill time metric by listening on the relevant channel
+func (s *VllmSimulator) reqPrefillTimeUpdater(ctx context.Context) {
+    for {
+        select {
+        case <-ctx.Done():
+            return
+        case value := <-s.metrics.reqPrefillTimeChan:
+            s.reportHistogramValue(s.metrics.reqPrefillTime, value)
+        }
+    }
+}
+
+// reqDecodeTimeUpdater updates the request decode time metric by listening on the relevant channel
+func (s *VllmSimulator) reqDecodeTimeUpdater(ctx context.Context) {
+    for {
+        select {
+        case <-ctx.Done():
+            return
+        case value := <-s.metrics.reqDecodeTimeChan:
+            s.reportHistogramValue(s.metrics.reqDecodeTime, value)
+        }
+    }
+}
+
 // lorasUpdater updates the running loras metric by listening on the relevant channel
 // one function updates both waiting and running loras since they a part of the same prometheus gauge
 func (s *VllmSimulator) lorasUpdater(ctx context.Context) {
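
Each registration above follows the same pattern: build a HistogramVec over the shared latency buckets, register it on the simulator's registry, and return the error if registration fails. A self-contained sketch of that pattern, and of how it can be checked with prometheus/testutil; the literal "model_name" label, the shortened bucket list, and the main() driver stand in for vllmapi.PromLabelModelName, common.RequestLatencyBucketsBoundaries, and the repo's real tests:

package main

import (
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
    registry := prometheus.NewRegistry()

    // Same shape as the registrations above, with a trimmed bucket list.
    reqDecodeTime := prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "vllm:request_decode_time_seconds",
            Help:    "Histogram of time spent in DECODE phase for request.",
            Buckets: []float64{0.3, 0.5, 1.0, 5.0, 30.0, 120.0},
        },
        []string{"model_name"},
    )
    if err := registry.Register(reqDecodeTime); err != nil {
        panic(err)
    }

    // Observe one value, then confirm the collector exposes the metric.
    reqDecodeTime.WithLabelValues("my-model").Observe(0.42)
    fmt.Println(testutil.CollectAndCount(reqDecodeTime, "vllm:request_decode_time_seconds")) // 1
}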

pkg/llm-d-inference-sim/simulator.go

Lines changed: 42 additions & 11 deletions
@@ -95,6 +95,14 @@ type metricsData struct {
     tpotChan chan float64
     // e2eReqLatencyChan is a channel to update request e2e latency
     e2eReqLatencyChan chan float64
+    // reqQueueTimeChan is a channel to update request queue time
+    reqQueueTimeChan chan float64
+    // reqInferenceTimeChan is a channel to update request inference time
+    reqInferenceTimeChan chan float64
+    // reqPrefillTimeChan is a channel to update request prefill time
+    reqPrefillTimeChan chan float64
+    // reqDecodeTimeChan is a channel to update request decode time
+    reqDecodeTimeChan chan float64
     // kvCacheUsageChan is a channel to update kvCacheUsagePercentage
     kvCacheUsageChan chan float64
     // registry is a Prometheus registry
@@ -111,6 +119,14 @@ type metricsData struct {
     tpot *prometheus.HistogramVec
     // e2eReqLatency is prometheus histogram of end to end request latency in seconds
     e2eReqLatency *prometheus.HistogramVec
+    // reqQueueTime is prometheus histogram of request queue time in seconds
+    reqQueueTime *prometheus.HistogramVec
+    // reqInferenceTime is prometheus histogram of request inference time in seconds
+    reqInferenceTime *prometheus.HistogramVec
+    // reqPrefillTime is prometheus histogram of request prefill time in seconds
+    reqPrefillTime *prometheus.HistogramVec
+    // reqDecodeTime is prometheus histogram of request decode time in seconds
+    reqDecodeTime *prometheus.HistogramVec
     // kvCacheUsagePercentage is prometheus gauge
     kvCacheUsagePercentage *prometheus.GaugeVec
     // requestPromptTokens is prometheus histogram for number of input (prompt) tokens in request
@@ -139,6 +155,11 @@ type requestCompleted struct {
     model string
 }

+type waitingQueueItem struct {
+    reqCtx      *openaiserverapi.CompletionReqCtx
+    enqueueTime time.Time
+}
+
 // VllmSimulator simulates vLLM server supporting OpenAI API
 type VllmSimulator struct {
     // logger is used for information and errors logging
@@ -276,6 +297,10 @@ func (s *VllmSimulator) initializeSim(ctx context.Context) error {
     s.metrics.ttftChan = make(chan float64, maxNumberOfRequests)
     s.metrics.tpotChan = make(chan float64, maxNumberOfRequests)
     s.metrics.e2eReqLatencyChan = make(chan float64, maxNumberOfRequests)
+    s.metrics.reqQueueTimeChan = make(chan float64, maxNumberOfRequests)
+    s.metrics.reqInferenceTimeChan = make(chan float64, maxNumberOfRequests)
+    s.metrics.reqPrefillTimeChan = make(chan float64, maxNumberOfRequests)
+    s.metrics.reqDecodeTimeChan = make(chan float64, maxNumberOfRequests)
     s.metrics.requestSuccessChan = make(chan requestSuccessEvent, maxNumberOfRequests)

     s.newRequests = make(chan *openaiserverapi.CompletionReqCtx, maxNumberOfRequests)
@@ -575,28 +600,32 @@ func (s *VllmSimulator) createCompletionResponse(isChatCompletion bool, respToke
 // from --served-model-name (for a base-model request) or the LoRA adapter name (for a LoRA request).
 // finishReason - a pointer to string that represents finish reason, can be nil, stop, length, or tools
 // usageData - usage (tokens statistics) for this response
-func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, respTokens []string, toolCalls []openaiserverapi.ToolCall,
-    modelName string, finishReason string, usageData *openaiserverapi.Usage) {
+func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, respTokens []string,
+    toolCalls []openaiserverapi.ToolCall, modelName string, finishReason string, usageData *openaiserverapi.Usage) {
     resp := s.createCompletionResponse(reqCtx.IsChatCompletion, respTokens, toolCalls, &finishReason, usageData, modelName,
         reqCtx.CompletionReq.IsDoRemoteDecode())

     // calculate how long to wait before returning the response, time is based on number of tokens
     nCachedPromptTokens := reqCtx.CompletionReq.GetNumberOfCachedPromptTokens()
+    startPrefill := time.Now()
     ttft := s.getWaitTimeToFirstToken(usageData.PromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill())
     time.Sleep(time.Duration(ttft) * time.Millisecond)

     // report ttft in seconds
     common.WriteToChannel(s.metrics.ttftChan, (float64(ttft) / 1000), s.logger, "metrics.ttftChan")
+    common.WriteToChannel(s.metrics.reqPrefillTimeChan, time.Since(startPrefill).Seconds(), s.logger, "metrics.reqPrefillTimeChan")

+    startDecode := time.Now()
     for range usageData.CompletionTokens - 1 {
         perTokenLatency := s.getInterTokenLatency()
         time.Sleep(time.Duration(perTokenLatency) * time.Millisecond)

         // report tpot in seconds
         common.WriteToChannel(s.metrics.tpotChan, (float64(perTokenLatency) / 1000), s.logger, "metrics.tpotChan")
     }
-    s.sendCompletionResponse(reqCtx.HTTPReqCtx, resp)
+    s.metrics.reqDecodeTimeChan <- time.Since(startDecode).Seconds()

+    s.sendCompletionResponse(reqCtx.HTTPReqCtx, resp)
     s.responseSentCallback(modelName, reqCtx.IsChatCompletion, reqCtx.CompletionReq.GetRequestID())
 }

@@ -639,7 +668,7 @@ func (s *VllmSimulator) enqueue(req *openaiserverapi.CompletionReqCtx) error {
     if s.waitingQueue.Len() >= s.queueCapacity {
         return errors.New("waiting requests queue is full")
     }
-    s.waitingQueue.PushBack(req)
+    s.waitingQueue.PushBack(waitingQueueItem{req, time.Now()})
     return nil
 }

@@ -650,20 +679,22 @@ func (s *VllmSimulator) dequeue() *openaiserverapi.CompletionReqCtx {

     // Find first request for a loaded LoRA
     for elem := s.waitingQueue.Front(); elem != nil; elem = elem.Next() {
-        req, ok := elem.Value.(*openaiserverapi.CompletionReqCtx)
-        if ok && req != nil && s.loraIsLoaded(req.CompletionReq.GetModel()) {
+        item, ok := elem.Value.(waitingQueueItem)
+        if ok && item.reqCtx != nil && s.loraIsLoaded(item.reqCtx.CompletionReq.GetModel()) {
             s.waitingQueue.Remove(elem)
-            s.incrementLora(req.CompletionReq.GetModel())
-            return req
+            s.incrementLora(item.reqCtx.CompletionReq.GetModel())
+            s.metrics.reqQueueTimeChan <- time.Since(item.enqueueTime).Seconds()
+            return item.reqCtx
         }
     }

     // All the requests require a LoRA that is not loaded, check if we can load a LoRA
     for elem := s.waitingQueue.Front(); elem != nil; elem = elem.Next() {
-        req, ok := elem.Value.(*openaiserverapi.CompletionReqCtx)
-        if ok && req != nil && s.loadLora(req.CompletionReq.GetModel()) {
+        item, ok := elem.Value.(waitingQueueItem)
+        if ok && item.reqCtx != nil && s.loadLora(item.reqCtx.CompletionReq.GetModel()) {
             s.waitingQueue.Remove(elem)
-            return req
+            s.metrics.reqQueueTimeChan <- time.Since(item.enqueueTime).Seconds()
+            return item.reqCtx
         }
     }
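
Queue time is measured by wrapping each waiting request in a waitingQueueItem that carries its enqueue timestamp, so dequeue can report the WAITING-phase duration on the channel drained by reqQueueTimeUpdater. A self-contained sketch of that pattern with container/list; the string payload and the queueTimes channel stand in for *openaiserverapi.CompletionReqCtx and s.metrics.reqQueueTimeChan:

package main

import (
    "container/list"
    "fmt"
    "time"
)

// queuedItem mirrors waitingQueueItem: the payload plus its enqueue timestamp.
type queuedItem struct {
    payload     string
    enqueueTime time.Time
}

func main() {
    queueTimes := make(chan float64, 1) // stand-in for s.metrics.reqQueueTimeChan
    waiting := list.New()

    // enqueue: stamp the item when it enters the WAITING phase
    waiting.PushBack(queuedItem{payload: "req-1", enqueueTime: time.Now()})

    time.Sleep(15 * time.Millisecond) // request sits in the queue

    // dequeue: recover the item and report how long it waited
    elem := waiting.Front()
    if item, ok := elem.Value.(queuedItem); ok {
        waiting.Remove(elem)
        queueTimes <- time.Since(item.enqueueTime).Seconds()
    }

    fmt.Printf("queue time: %.3fs\n", <-queueTimes)
}

In the diff the value is sent directly on the channel rather than through common.WriteToChannel; since the channel is created with capacity maxNumberOfRequests, the send should not block under normal load.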

pkg/llm-d-inference-sim/streaming.go

Lines changed: 5 additions & 0 deletions
@@ -102,12 +102,15 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons
 // sendTokenChunks creates and sends response chunks
 func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, genTokens []string,
     tc *openaiserverapi.ToolCall, finishReason string) {
+    startPrefill := time.Now()
     // time to first token delay
     ttft := s.getWaitTimeToFirstToken(context.nPromptTokens, context.nCachedPromptTokens, context.doRemotePrefill)
     time.Sleep(time.Duration(ttft) * time.Millisecond)
     // report ttft in seconds
     common.WriteToChannel(s.metrics.ttftChan, (float64(ttft) / 1000), s.logger, "metrics.ttftChan")
+    common.WriteToChannel(s.metrics.reqPrefillTimeChan, time.Since(startPrefill).Seconds(), s.logger, "metrics.reqPrefillTimeChan")

+    startDecode := time.Now()
     for i, token := range genTokens {
         if i != 0 {
             interTokenLat := s.getInterTokenLatency()
@@ -148,6 +151,8 @@ func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writ
         }
     }

+    s.metrics.reqDecodeTimeChan <- time.Since(startDecode).Seconds()
+
     // send the last chunk if finish reason is stop
     var chunk openaiserverapi.CompletionRespChunk
     if finishReason == dataset.StopFinishReason {

pkg/llm-d-inference-sim/worker.go

Lines changed: 6 additions & 0 deletions
@@ -19,6 +19,7 @@ package llmdinferencesim

 import (
     "context"
+    "time"

     "github.com/go-logr/logr"
     "github.com/llm-d/llm-d-inference-sim/pkg/common"
@@ -59,6 +60,11 @@ type requestProcessor interface {
 }

 func (s *VllmSimulator) processRequest(reqCtx *openaiserverapi.CompletionReqCtx) {
+    start := time.Now()
+    defer func() {
+        s.metrics.reqInferenceTimeChan <- time.Since(start).Seconds()
+    }()
+
     req := reqCtx.CompletionReq
     model := req.GetModel()
     displayModel := s.getDisplayedModelName(model)
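
Inference time is captured with a deferred observation around processRequest, so the RUNNING-phase duration is reported on every return path. A minimal sketch of the same defer pattern; measureInference and the inferenceTimes channel are stand-ins for the simulator's processRequest and s.metrics.reqInferenceTimeChan:

package main

import (
    "fmt"
    "time"
)

// measureInference stamps the start time and lets the deferred function report
// the elapsed RUNNING-phase time even if the wrapped work returns early.
func measureInference(inferenceTimes chan<- float64, work func()) {
    start := time.Now()
    defer func() {
        inferenceTimes <- time.Since(start).Seconds()
    }()
    work()
}

func main() {
    inferenceTimes := make(chan float64, 1) // stand-in for s.metrics.reqInferenceTimeChan
    measureInference(inferenceTimes, func() {
        time.Sleep(30 * time.Millisecond) // simulated prefill + decode
    })
    fmt.Printf("inference time: %.3fs\n", <-inferenceTimes)
}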
