@@ -20,6 +20,7 @@ package llmdinferencesim
 
 import (
 	"context"
+	"math"
 	"strconv"
 	"strings"
 	"sync"
@@ -94,6 +95,61 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 		return err
 	}
 
+	s.requestPromptTokens = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:request_prompt_tokens",
+			Help:      "Number of prefill tokens processed.",
+			Buckets:   build125Buckets(s.config.MaxModelLen),
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestPromptTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_prompt_tokens histogram register failed")
+		return err
+	}
+
+	s.requestGenerationTokens = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:request_generation_tokens",
+			Help:      "Number of generation tokens processed.",
+			Buckets:   build125Buckets(s.config.MaxModelLen),
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestGenerationTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_generation_tokens histogram register failed")
+		return err
+	}
+
+	s.requestParamsMaxTokens = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:request_params_max_tokens",
+			Help:      "Histogram of the max_tokens request parameter.",
+			Buckets:   build125Buckets(s.config.MaxModelLen),
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestParamsMaxTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_params_max_tokens histogram register failed")
+		return err
+	}
+
+	s.requestSuccessTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: "",
+			Name:      "vllm:request_success_total",
+			Help:      "Count of successfully processed requests.",
+		},
+		[]string{vllmapi.PromLabelModelName, vllmapi.PromLabelFinishReason},
+	)
+	if err := s.registry.Register(s.requestSuccessTotal); err != nil {
+		s.logger.Error(err, "Prometheus request_success_total counter register failed")
+		return err
+	}
+
 	s.setInitialPrometheusMetrics()
 
 	return nil
@@ -103,12 +159,25 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 // the fake metrics if set
 func (s *VllmSimulator) setInitialPrometheusMetrics() {
 	var nRunningReqs, nWaitingReqs, kvCacheUsage float64
+	modelName := s.getDisplayedModelName(s.config.Model)
 	if s.config.FakeMetrics != nil {
 		nRunningReqs = float64(s.config.FakeMetrics.RunningRequests)
 		nWaitingReqs = float64(s.config.FakeMetrics.WaitingRequests)
 		kvCacheUsage = float64(s.config.FakeMetrics.KVCacheUsagePercentage)
+		for _, requestPromptToken := range s.config.FakeMetrics.RequestPromptTokens {
+			s.requestPromptTokens.WithLabelValues(modelName).Observe(requestPromptToken)
+		}
+		for _, requestGenerationToken := range s.config.FakeMetrics.RequestGenerationTokens {
+			s.requestGenerationTokens.WithLabelValues(modelName).Observe(requestGenerationToken)
+		}
+		for _, requestParamsMaxToken := range s.config.FakeMetrics.RequestParamsMaxTokens {
+			s.requestParamsMaxTokens.WithLabelValues(modelName).Observe(requestParamsMaxToken)
+		}
+		for reason, requestSuccessTotal := range s.config.FakeMetrics.RequestSuccessTotal {
+			s.requestSuccessTotal.WithLabelValues(modelName, reason).Add(float64(requestSuccessTotal))
+		}
+
 	}
-	modelName := s.getDisplayedModelName(s.config.Model)
 	s.runningRequests.WithLabelValues(modelName).Set(nRunningReqs)
 	s.waitingRequests.WithLabelValues(modelName).Set(nWaitingReqs)
 	s.kvCacheUsagePercentage.WithLabelValues(modelName).Set(kvCacheUsage)
@@ -198,6 +267,7 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) {
 	go s.runningRequestsUpdater(ctx)
 	go s.lorasUpdater(ctx)
 	go s.kvCacheUsageUpdater(ctx)
+	go s.recordRequestUpdater(ctx)
 }
 
 // waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel
@@ -282,3 +352,75 @@ func (s *VllmSimulator) decrementLoraRefCount(lora string, theMap *sync.Map) {
 		s.logger.Error(nil, "Zero model reference", "model", lora)
 	}
 }
+
+// recordRequestUpdater listens on requestSuccessChan and updates the Prometheus metrics
+// for successfully completed requests.
+func (s *VllmSimulator) recordRequestUpdater(ctx context.Context) {
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case event := <-s.requestSuccessChan:
+			s.recordRequestMetricsOnSuccess(
+				event.promptTokens,
+				event.generationTokens,
+				event.maxTokens,
+				event.finishReason,
+			)
+		}
+	}
+}
+
+// requestSuccessEvent represents the data associated with a successfully completed request,
+// which is sent through the requestSuccessChan for asynchronous metrics recording.
+type requestSuccessEvent struct {
+	// promptTokens is the number of input (prompt) tokens in the request
+	promptTokens int
+	// generationTokens is the number of generated (output) tokens in the response
+	generationTokens int
+	// maxTokens is the maximum number of tokens allowed for generation (if specified in the request)
+	maxTokens *int64
+	// finishReason indicates why the generation stopped (e.g., "stop", "length", "tool_calls")
+	finishReason string
+}
+
+// recordRequestMetricsOnSuccess records metrics for a successfully completed request
+func (s *VllmSimulator) recordRequestMetricsOnSuccess(promptTokens,
+	generationTokens int, maxTokens *int64, finishReason string) {
+	modelName := s.getDisplayedModelName(s.config.Model)
+	s.requestPromptTokens.WithLabelValues(modelName).Observe(float64(promptTokens))
+	s.requestGenerationTokens.WithLabelValues(modelName).Observe(float64(generationTokens))
+	if maxTokens != nil {
+		s.requestParamsMaxTokens.WithLabelValues(modelName).Observe(float64(*maxTokens))
+	}
+	s.requestSuccessTotal.WithLabelValues(modelName, finishReason).Inc()
+}
+
+// build125Buckets generates histogram buckets in powers of 10 scaled by [1,2,5].
+// This matches vLLM's build_1_2_5_buckets() in metrics.py.
+//
+// Reference: https://github.com/vllm-project/vllm/blob/main/vllm/engine/metrics.py#L175
+func build125Buckets(maxValue int) []float64 {
+	if maxValue <= 0 {
+		return []float64{}
+	}
+	var buckets []float64
+	exponent := 0
+	mantissa := []int{1, 2, 5}
+
+	for {
+		complete := true
+		for _, m := range mantissa {
+			value := m * int(math.Pow10(exponent))
+			if value <= maxValue {
+				buckets = append(buckets, float64(value))
+				complete = false
+			}
+		}
+		if complete {
+			break
+		}
+		exponent++
+	}
+	return buckets
+}
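
To make the bucket layout concrete, here is a minimal standalone sketch; the helper's logic is copied from the diff above, and 4096 is only an illustrative stand-in for s.config.MaxModelLen.

package main

import (
	"fmt"
	"math"
)

// build125Buckets: same logic as the helper added in the diff above.
func build125Buckets(maxValue int) []float64 {
	if maxValue <= 0 {
		return []float64{}
	}
	var buckets []float64
	exponent := 0
	mantissa := []int{1, 2, 5}
	for {
		complete := true
		for _, m := range mantissa {
			value := m * int(math.Pow10(exponent))
			if value <= maxValue {
				buckets = append(buckets, float64(value))
				complete = false
			}
		}
		if complete {
			break
		}
		exponent++
	}
	return buckets
}

func main() {
	// 4096 stands in for s.config.MaxModelLen; any positive value works.
	fmt.Println(build125Buckets(4096))
	// Prints: [1 2 5 10 20 50 100 200 500 1000 2000 4000]
}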
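The diff adds only the consumer side of requestSuccessChan; where and how events are produced is not shown. The self-contained sketch below models the same producer/consumer pattern with a trimmed-down copy of the event type — the channel construction and the producer call site are assumptions for illustration, not the simulator's actual code.

package main

import (
	"context"
	"fmt"
)

// Trimmed-down copy of the event type from the diff.
type requestSuccessEvent struct {
	promptTokens     int
	generationTokens int
	maxTokens        *int64
	finishReason     string
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	// Unbuffered here for a deterministic demo; the simulator's actual
	// channel setup is not part of this diff.
	ch := make(chan requestSuccessEvent)

	done := make(chan struct{})
	// Consumer goroutine, mirroring recordRequestUpdater: drain events
	// until the context is cancelled.
	go func() {
		defer close(done)
		for {
			select {
			case <-ctx.Done():
				return
			case ev := <-ch:
				fmt.Printf("observe prompt=%d generation=%d finish=%s\n",
					ev.promptTokens, ev.generationTokens, ev.finishReason)
			}
		}
	}()

	// Producer side, standing in for a request handler on completion.
	maxTokens := int64(256)
	ch <- requestSuccessEvent{promptTokens: 128, generationTokens: 42, maxTokens: &maxTokens, finishReason: "stop"}
	ch <- requestSuccessEvent{promptTokens: 64, generationTokens: 256, finishReason: "length"}

	cancel()
	<-done
}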
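A quick way to sanity-check the new collectors in a unit test is prometheus/testutil. The sketch below registers a counter shaped like vllm:request_success_total on a fresh registry and reads single series back; the label names "model_name" and "finish_reason" are assumed stand-ins for vllmapi.PromLabelModelName and vllmapi.PromLabelFinishReason.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	reg := prometheus.NewRegistry()
	successTotal := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "vllm:request_success_total",
			Help: "Count of successfully processed requests.",
		},
		[]string{"model_name", "finish_reason"},
	)
	reg.MustRegister(successTotal)

	successTotal.WithLabelValues("my-model", "stop").Inc()
	successTotal.WithLabelValues("my-model", "stop").Inc()
	successTotal.WithLabelValues("my-model", "length").Inc()

	// testutil.ToFloat64 reads back a single labelled series; handy for asserts.
	fmt.Println(testutil.ToFloat64(successTotal.WithLabelValues("my-model", "stop")))   // 2
	fmt.Println(testutil.ToFloat64(successTotal.WithLabelValues("my-model", "length"))) // 1
}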