@@ -20,6 +20,7 @@ package llmdinferencesim
2020
2121import  (
2222	"context" 
23+ 	"math" 
2324	"strconv" 
2425	"strings" 
2526	"sync" 
@@ -65,6 +66,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
6566		return  err 
6667	}
6768
69+ 	// not supported for now, reports constant value 
6870	s .waitingRequests  =  prometheus .NewGaugeVec (
6971		prometheus.GaugeOpts {
7072			Subsystem : "" ,
@@ -123,6 +125,61 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
123125		return  err 
124126	}
125127
128+ 	s .requestPromptTokens  =  prometheus .NewHistogramVec (
129+ 		prometheus.HistogramOpts {
130+ 			Subsystem : "" ,
131+ 			Name :      "vllm:request_prompt_tokens" ,
132+ 			Help :      "Number of prefill tokens processed." ,
133+ 			Buckets :   build125Buckets (s .config .MaxModelLen ),
134+ 		},
135+ 		[]string {vllmapi .PromLabelModelName },
136+ 	)
137+ 	if  err  :=  s .registry .Register (s .requestPromptTokens ); err  !=  nil  {
138+ 		s .logger .Error (err , "Prometheus request_prompt_tokens histogram register failed" )
139+ 		return  err 
140+ 	}
141+ 
142+ 	s .requestGenerationTokens  =  prometheus .NewHistogramVec (
143+ 		prometheus.HistogramOpts {
144+ 			Subsystem : "" ,
145+ 			Name :      "vllm:request_generation_tokens" ,
146+ 			Help :      "Number of generation tokens processed." ,
147+ 			Buckets :   build125Buckets (s .config .MaxModelLen ),
148+ 		},
149+ 		[]string {vllmapi .PromLabelModelName },
150+ 	)
151+ 	if  err  :=  s .registry .Register (s .requestGenerationTokens ); err  !=  nil  {
152+ 		s .logger .Error (err , "Prometheus request_generation_tokens histogram register failed" )
153+ 		return  err 
154+ 	}
155+ 
156+ 	s .requestParamsMaxTokens  =  prometheus .NewHistogramVec (
157+ 		prometheus.HistogramOpts {
158+ 			Subsystem : "" ,
159+ 			Name :      "vllm:request_params_max_tokens" ,
160+ 			Help :      "Histogram of the max_tokens request parameter." ,
161+ 			Buckets :   build125Buckets (s .config .MaxModelLen ),
162+ 		},
163+ 		[]string {vllmapi .PromLabelModelName },
164+ 	)
165+ 	if  err  :=  s .registry .Register (s .requestParamsMaxTokens ); err  !=  nil  {
166+ 		s .logger .Error (err , "Prometheus request_params_max_tokens histogram register failed" )
167+ 		return  err 
168+ 	}
169+ 
170+ 	s .requestSuccessTotal  =  prometheus .NewCounterVec (
171+ 		prometheus.CounterOpts {
172+ 			Subsystem : "" ,
173+ 			Name :      "vllm:request_success_total" ,
174+ 			Help :      "Count of successfully processed requests." ,
175+ 		},
176+ 		[]string {vllmapi .PromLabelModelName , vllmapi .PromLabelFinishReason },
177+ 	)
178+ 	if  err  :=  s .registry .Register (s .requestSuccessTotal ); err  !=  nil  {
179+ 		s .logger .Error (err , "Prometheus request_success_total counter register failed" )
180+ 		return  err 
181+ 	}
182+ 
126183	s .setInitialPrometheusMetrics ()
127184
128185	return  nil 
@@ -132,6 +189,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
132189// the fake metrics if set 
133190func  (s  * VllmSimulator ) setInitialPrometheusMetrics () {
134191	var  nRunningReqs , nWaitingReqs , kvCacheUsage  float64 
192+ 	modelName  :=  s .getDisplayedModelName (s .config .Model )
135193	if  s .config .FakeMetrics  !=  nil  {
136194		nRunningReqs  =  float64 (s .config .FakeMetrics .RunningRequests )
137195		nWaitingReqs  =  float64 (s .config .FakeMetrics .WaitingRequests )
@@ -144,9 +202,21 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() {
144202		if  s .config .FakeMetrics .TPOTBucketValues  !=  nil  {
145203			s .initFakeHistogram (s .tpot , common .TPOTBucketsBoundaries , s .config .FakeMetrics .TPOTBucketValues )
146204		}
205+ 		for  _ , requestPromptToken  :=  range  s .config .FakeMetrics .RequestPromptTokens  {
206+ 			s .requestPromptTokens .WithLabelValues (modelName ).Observe (requestPromptToken )
207+ 		}
208+ 		for  _ , requestGenerationToken  :=  range  s .config .FakeMetrics .RequestGenerationTokens  {
209+ 			s .requestGenerationTokens .WithLabelValues (modelName ).Observe (requestGenerationToken )
210+ 		}
211+ 		for  _ , requestParamsMaxToken  :=  range  s .config .FakeMetrics .RequestParamsMaxTokens  {
212+ 			s .requestParamsMaxTokens .WithLabelValues (modelName ).Observe (requestParamsMaxToken )
213+ 		}
214+ 		for  reason , requestSuccessTotal  :=  range  s .config .FakeMetrics .RequestSuccessTotal  {
215+ 			s .requestSuccessTotal .WithLabelValues (modelName , reason ).Add (float64 (requestSuccessTotal ))
216+ 		}
217+ 
147218	}
148219
149- 	modelName  :=  s .getDisplayedModelName (s .config .Model )
150220	s .runningRequests .WithLabelValues (modelName ).Set (nRunningReqs )
151221	s .waitingRequests .WithLabelValues (modelName ).Set (nWaitingReqs )
152222	s .kvCacheUsagePercentage .WithLabelValues (modelName ).Set (kvCacheUsage )
@@ -288,6 +358,7 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) {
288358	go  s .kvCacheUsageUpdater (ctx )
289359	go  s .ttftUpdater (ctx )
290360	go  s .tpotUpdater (ctx )
361+ 	go  s .recordRequestUpdater (ctx )
291362}
292363
293364// waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel 
@@ -396,3 +467,75 @@ func (s *VllmSimulator) decrementLoraRefCount(lora string, theMap *sync.Map) {
396467		s .logger .Error (nil , "Zero model reference" , "model" , lora )
397468	}
398469}
470+ 
471+ // recordRequestUpdater listens on requestSuccessChan and drives the Prometheus metric 
472+ // for successfully completed requests. 
473+ func  (s  * VllmSimulator ) recordRequestUpdater (ctx  context.Context ) {
474+ 	for  {
475+ 		select  {
476+ 		case  <- ctx .Done ():
477+ 			return 
478+ 		case  event  :=  <- s .requestSuccessChan :
479+ 			s .recordRequestMetricsOnSuccess (
480+ 				event .promptTokens ,
481+ 				event .generationTokens ,
482+ 				event .maxTokens ,
483+ 				event .finishReason ,
484+ 			)
485+ 		}
486+ 	}
487+ }
488+ 
// requestSuccessEvent carries the data of one successfully completed
// request; it is sent over requestSuccessChan so metrics can be recorded
// asynchronously by recordRequestUpdater.
type requestSuccessEvent struct {
	// promptTokens is the number of input (prompt) tokens in the request.
	promptTokens int
	// generationTokens is the number of output tokens generated for the response.
	generationTokens int
	// maxTokens is the request's max_tokens parameter; nil when the caller did not set it.
	maxTokens *int64
	// finishReason records why generation stopped (e.g. "stop", "length", "tool_calls").
	finishReason string
}
501+ 
502+ // recordRequestMetricsOnSuccess records metrics for a successfully completed request 
503+ func  (s  * VllmSimulator ) recordRequestMetricsOnSuccess (promptTokens ,
504+ 	generationTokens  int , maxTokens  * int64 , finishReason  string ) {
505+ 	modelName  :=  s .getDisplayedModelName (s .config .Model )
506+ 	s .requestPromptTokens .WithLabelValues (modelName ).Observe (float64 (promptTokens ))
507+ 	s .requestGenerationTokens .WithLabelValues (modelName ).Observe (float64 (generationTokens ))
508+ 	if  maxTokens  !=  nil  {
509+ 		s .requestParamsMaxTokens .WithLabelValues (modelName ).Observe (float64 (* maxTokens ))
510+ 	}
511+ 	s .requestSuccessTotal .WithLabelValues (modelName , finishReason ).Inc ()
512+ }
513+ 
// build125Buckets generates histogram buckets in powers of 10 scaled by
// [1, 2, 5], up to and including maxValue. This matches vLLM's
// build_1_2_5_buckets() in metrics.py.
//
// A non-positive maxValue yields an empty (non-nil) slice.
//
// The comparison is done in float64 rather than int: the previous
// int(math.Pow10(exponent)) conversion overflows int once the exponent
// exceeds 18, which could make the loop spin forever for a huge maxValue.
// float64 scales grow monotonically to +Inf, so termination is guaranteed.
//
// Reference: https://github.com/vllm-project/vllm/blob/main/vllm/engine/metrics.py#L175
func build125Buckets(maxValue int) []float64 {
	if maxValue <= 0 {
		return []float64{}
	}
	var buckets []float64
	limit := float64(maxValue)
	for exponent := 0; ; exponent++ {
		// Hoist the power-of-ten computation out of the mantissa loop.
		scale := math.Pow10(exponent)
		added := false
		for _, m := range []int{1, 2, 5} {
			value := float64(m) * scale
			if value <= limit {
				buckets = append(buckets, value)
				added = true
			}
		}
		// Once no mantissa fits under maxValue, every larger decade is out too.
		if !added {
			return buckets
		}
	}
}
0 commit comments