@@ -95,6 +95,14 @@ type metricsData struct {
 	tpotChan chan float64
 	// e2eReqLatencyChan is a channel to update request e2e latency
 	e2eReqLatencyChan chan float64
+	// reqQueueTimeChan is a channel to update request queue time
+	reqQueueTimeChan chan float64
+	// reqInferenceTimeChan is a channel to update request inference time
+	reqInferenceTimeChan chan float64
+	// reqPrefillTimeChan is a channel to update request prefill time
+	reqPrefillTimeChan chan float64
+	// reqDecodeTimeChan is a channel to update request decode time
+	reqDecodeTimeChan chan float64
 	// kvCacheUsageChan is a channel to update kvCacheUsagePercentage
 	kvCacheUsageChan chan float64
 	// registry is a Prometheus registry
@@ -111,6 +119,14 @@ type metricsData struct {
 	tpot *prometheus.HistogramVec
 	// e2eReqLatency is prometheus histogram of end to end request latency in seconds
 	e2eReqLatency *prometheus.HistogramVec
+	// reqQueueTime is prometheus histogram of request queue time in seconds
+	reqQueueTime *prometheus.HistogramVec
+	// reqInferenceTime is prometheus histogram of request inference time in seconds
+	reqInferenceTime *prometheus.HistogramVec
+	// reqPrefillTime is prometheus histogram of request prefill time in seconds
+	reqPrefillTime *prometheus.HistogramVec
+	// reqDecodeTime is prometheus histogram of request decode time in seconds
+	reqDecodeTime *prometheus.HistogramVec
 	// kvCacheUsagePercentage is prometheus gauge
 	kvCacheUsagePercentage *prometheus.GaugeVec
 	// requestPromptTokens is prometheus histogram for number of input (prompt) tokens in request
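
For orientation, below is a minimal sketch of how one of the new timing histograms could be constructed and registered; the actual registration code is not shown in this excerpt. The metric name, help text, bucket boundaries, and the single `model_name` label are illustrative assumptions, not taken from this commit.

```go
import "github.com/prometheus/client_golang/prometheus"

// newTimingHistogram sketches the construction of one of the new per-request
// timing histograms. The buckets and the "model_name" label are assumptions.
func newTimingHistogram(registry *prometheus.Registry, name, help string) (*prometheus.HistogramVec, error) {
	hv := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    name,
			Help:    help,
			Buckets: prometheus.ExponentialBuckets(0.001, 2, 16), // 1ms .. ~33s, illustrative
		},
		[]string{"model_name"},
	)
	if err := registry.Register(hv); err != nil {
		return nil, err
	}
	return hv, nil
}
```

A caller would then wire the result into the corresponding `metricsData` field, e.g. a hypothetical `newTimingHistogram(reg, "vllm:request_queue_time_seconds", "Request queue time in seconds.")` for `reqQueueTime`.
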
@@ -139,6 +155,11 @@ type requestCompleted struct {
 	model string
 }
 
+type waitingQueueItem struct {
+	reqCtx      *openaiserverapi.CompletionReqCtx
+	enqueueTime time.Time
+}
+
 // VllmSimulator simulates vLLM server supporting OpenAI API
 type VllmSimulator struct {
 	// logger is used for information and errors logging
@@ -276,6 +297,10 @@ func (s *VllmSimulator) initializeSim(ctx context.Context) error {
 	s.metrics.ttftChan = make(chan float64, maxNumberOfRequests)
 	s.metrics.tpotChan = make(chan float64, maxNumberOfRequests)
 	s.metrics.e2eReqLatencyChan = make(chan float64, maxNumberOfRequests)
+	s.metrics.reqQueueTimeChan = make(chan float64, maxNumberOfRequests)
+	s.metrics.reqInferenceTimeChan = make(chan float64, maxNumberOfRequests)
+	s.metrics.reqPrefillTimeChan = make(chan float64, maxNumberOfRequests)
+	s.metrics.reqDecodeTimeChan = make(chan float64, maxNumberOfRequests)
 	s.metrics.requestSuccessChan = make(chan requestSuccessEvent, maxNumberOfRequests)
 
 	s.newRequests = make(chan *openaiserverapi.CompletionReqCtx, maxNumberOfRequests)
@@ -575,28 +600,32 @@ func (s *VllmSimulator) createCompletionResponse(isChatCompletion bool, respToke
 // from --served-model-name (for a base-model request) or the LoRA adapter name (for a LoRA request).
 // finishReason - a pointer to string that represents finish reason, can be nil, stop, length, or tools
 // usageData - usage (tokens statistics) for this response
-func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, respTokens []string, toolCalls []openaiserverapi.ToolCall,
-	modelName string, finishReason string, usageData *openaiserverapi.Usage) {
+func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, respTokens []string,
+	toolCalls []openaiserverapi.ToolCall, modelName string, finishReason string, usageData *openaiserverapi.Usage) {
 	resp := s.createCompletionResponse(reqCtx.IsChatCompletion, respTokens, toolCalls, &finishReason, usageData, modelName,
 		reqCtx.CompletionReq.IsDoRemoteDecode())
 
 	// calculate how long to wait before returning the response, time is based on number of tokens
 	nCachedPromptTokens := reqCtx.CompletionReq.GetNumberOfCachedPromptTokens()
+	startPrefill := time.Now()
 	ttft := s.getWaitTimeToFirstToken(usageData.PromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill())
 	time.Sleep(time.Duration(ttft) * time.Millisecond)
 
 	// report ttft in seconds
 	common.WriteToChannel(s.metrics.ttftChan, (float64(ttft) / 1000), s.logger, "metrics.ttftChan")
+	common.WriteToChannel(s.metrics.reqPrefillTimeChan, time.Since(startPrefill).Seconds(), s.logger, "metrics.reqPrefillTimeChan")
 
+	startDecode := time.Now()
 	for range usageData.CompletionTokens - 1 {
 		perTokenLatency := s.getInterTokenLatency()
 		time.Sleep(time.Duration(perTokenLatency) * time.Millisecond)
 
 		// report tpot in seconds
 		common.WriteToChannel(s.metrics.tpotChan, (float64(perTokenLatency) / 1000), s.logger, "metrics.tpotChan")
 	}
-	s.sendCompletionResponse(reqCtx.HTTPReqCtx, resp)
+	s.metrics.reqDecodeTimeChan <- time.Since(startDecode).Seconds()
 
+	s.sendCompletionResponse(reqCtx.HTTPReqCtx, resp)
 	s.responseSentCallback(modelName, reqCtx.IsChatCompletion, reqCtx.CompletionReq.GetRequestID())
 }
 
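
The hunk above only pushes the prefill and decode durations into their channels; the goroutine that drains them and records observations is outside this excerpt. Below is a hedged sketch of what such a consumer could look like. The method name, the single `model_name` label, and the context-driven shutdown are assumptions for illustration, not the simulator's actual reporting loop.

```go
// drainTimingChannels is a sketch of a metrics consumer; it is not part of
// this commit. Label handling and shutdown semantics are assumptions.
func (s *VllmSimulator) drainTimingChannels(ctx context.Context, modelName string) {
	for {
		select {
		case <-ctx.Done():
			return
		case v := <-s.metrics.reqQueueTimeChan:
			s.metrics.reqQueueTime.WithLabelValues(modelName).Observe(v)
		case v := <-s.metrics.reqPrefillTimeChan:
			s.metrics.reqPrefillTime.WithLabelValues(modelName).Observe(v)
		case v := <-s.metrics.reqDecodeTimeChan:
			s.metrics.reqDecodeTime.WithLabelValues(modelName).Observe(v)
		case v := <-s.metrics.reqInferenceTimeChan:
			s.metrics.reqInferenceTime.WithLabelValues(modelName).Observe(v)
		}
	}
}
```
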
@@ -639,7 +668,7 @@ func (s *VllmSimulator) enqueue(req *openaiserverapi.CompletionReqCtx) error {
 	if s.waitingQueue.Len() >= s.queueCapacity {
 		return errors.New("waiting requests queue is full")
 	}
-	s.waitingQueue.PushBack(req)
+	s.waitingQueue.PushBack(waitingQueueItem{req, time.Now()})
 	return nil
 }
 
@@ -650,20 +679,22 @@ func (s *VllmSimulator) dequeue() *openaiserverapi.CompletionReqCtx {
 
 	// Find first request for a loaded LoRA
 	for elem := s.waitingQueue.Front(); elem != nil; elem = elem.Next() {
-		req, ok := elem.Value.(*openaiserverapi.CompletionReqCtx)
-		if ok && req != nil && s.loraIsLoaded(req.CompletionReq.GetModel()) {
+		item, ok := elem.Value.(waitingQueueItem)
+		if ok && item.reqCtx != nil && s.loraIsLoaded(item.reqCtx.CompletionReq.GetModel()) {
 			s.waitingQueue.Remove(elem)
-			s.incrementLora(req.CompletionReq.GetModel())
-			return req
+			s.incrementLora(item.reqCtx.CompletionReq.GetModel())
+			s.metrics.reqQueueTimeChan <- time.Since(item.enqueueTime).Seconds()
+			return item.reqCtx
 		}
 	}
 
 	// All the requests require a LoRA that is not loaded, check if we can load a LoRA
 	for elem := s.waitingQueue.Front(); elem != nil; elem = elem.Next() {
-		req, ok := elem.Value.(*openaiserverapi.CompletionReqCtx)
-		if ok && req != nil && s.loadLora(req.CompletionReq.GetModel()) {
+		item, ok := elem.Value.(waitingQueueItem)
+		if ok && item.reqCtx != nil && s.loadLora(item.reqCtx.CompletionReq.GetModel()) {
 			s.waitingQueue.Remove(elem)
-			return req
+			s.metrics.reqQueueTimeChan <- time.Since(item.enqueueTime).Seconds()
+			return item.reqCtx
 		}
 	}
 
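
The queue-time measurement is simply the wall-clock span between PushBack in enqueue and Remove in dequeue. Below is a self-contained illustration of the same pattern with a plain container/list queue; the types and names are hypothetical, not the simulator's.

```go
package main

import (
	"container/list"
	"fmt"
	"time"
)

// queueItem mirrors the idea of waitingQueueItem: carry the enqueue timestamp
// alongside the payload so the dequeuer can report how long the item waited.
type queueItem struct {
	id          string
	enqueueTime time.Time
}

func main() {
	q := list.New()
	q.PushBack(queueItem{id: "req-1", enqueueTime: time.Now()})

	time.Sleep(50 * time.Millisecond) // simulated wait in the queue

	item := q.Remove(q.Front()).(queueItem)
	fmt.Printf("queue time: %.3fs\n", time.Since(item.enqueueTime).Seconds())
}
```
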