@@ -20,6 +20,7 @@ package llmdinferencesim
 
 import (
 	"context"
+	"math"
 	"strconv"
 	"strings"
 	"sync"
@@ -94,6 +95,61 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 		return err
 	}
 
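+	// The token histograms below share 1-2-5 buckets capped at the model's maximum context length.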
+	s.requestPromptTokens = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:request_prompt_tokens",
+			Help:      "Number of input prompt tokens in the request.",
+			Buckets:   build125Buckets(s.config.MaxModelLen),
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestPromptTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_prompt_tokens histogram register failed")
+		return err
+	}
+
+	s.requestGenerationTokens = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:request_generation_tokens",
+			Help:      "Number of generation tokens processed.",
+			Buckets:   build125Buckets(s.config.MaxModelLen),
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestGenerationTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_generation_tokens histogram register failed")
+		return err
+	}
+
+	s.requestParamsMaxTokens = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:request_params_max_tokens",
+			Help:      "Histogram of the max_tokens request parameter.",
+			Buckets:   build125Buckets(s.config.MaxModelLen),
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestParamsMaxTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_params_max_tokens histogram register failed")
+		return err
+	}
+
+	s.requestSuccessTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: "",
+			Name:      "vllm:request_success_total",
+			Help:      "Count of successfully processed requests.",
+		},
+		[]string{vllmapi.PromLabelModelName, vllmapi.PromLabelFinishReason},
+	)
+	if err := s.registry.Register(s.requestSuccessTotal); err != nil {
+		s.logger.Error(err, "Prometheus request_success_total counter register failed")
+		return err
+	}
+
 	s.setInitialPrometheusMetrics()
 
 	return nil
@@ -102,16 +158,18 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 // setInitialPrometheusMetrics sends the default values to prometheus or
 // the fake metrics if set
 func (s *VllmSimulator) setInitialPrometheusMetrics() {
-	var nRunningReqs, nWaitingReqs, kvCacheUsage float64
+	var nRunningReqs, nWaitingReqs, kvCacheUsage, requestSuccessTotal float64
 	if s.config.FakeMetrics != nil {
 		nRunningReqs = float64(s.config.FakeMetrics.RunningRequests)
 		nWaitingReqs = float64(s.config.FakeMetrics.WaitingRequests)
 		kvCacheUsage = float64(s.config.FakeMetrics.KVCacheUsagePercentage)
+		requestSuccessTotal = float64(s.config.FakeMetrics.RequestSuccessTotal)
 	}
 	modelName := s.getDisplayedModelName(s.config.Model)
 	s.runningRequests.WithLabelValues(modelName).Set(nRunningReqs)
 	s.waitingRequests.WithLabelValues(modelName).Set(nWaitingReqs)
 	s.kvCacheUsagePercentage.WithLabelValues(modelName).Set(kvCacheUsage)
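+	// Fake successes are all attributed to the "stop" finish reason.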
+	s.requestSuccessTotal.WithLabelValues(modelName, "stop").Add(requestSuccessTotal)
 
 	if s.config.FakeMetrics != nil && len(s.config.FakeMetrics.LoraMetrics) != 0 {
 		for _, metrics := range s.config.FakeMetrics.LoraMetrics {
@@ -198,6 +256,7 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) {
 	go s.runningRequestsUpdater(ctx)
 	go s.lorasUpdater(ctx)
 	go s.kvCacheUsageUpdater(ctx)
+	go s.recordRequestUpdater(ctx)
 }
 
 // waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel
@@ -282,3 +341,71 @@ func (s *VllmSimulator) decrementLoraRefCount(lora string, theMap *sync.Map) {
 		s.logger.Error(nil, "Zero model reference", "model", lora)
 	}
 }
+
+// recordRequestUpdater updates the request success metrics by listening on the relevant channel
+func (s *VllmSimulator) recordRequestUpdater(ctx context.Context) {
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case event := <-s.requestSuccessChan:
+			s.recordRequestMetricsOnSuccess(
+				event.PromptTokens,
+				event.GenerationTokens,
+				event.MaxTokens,
+				event.FinishReason,
+			)
+		}
+	}
+}
+
+// requestSuccessEvent represents the data associated with a successfully completed request,
+// which is sent through the requestSuccessChan for asynchronous metrics recording.
type requestSuccessEvent struct {
+	// PromptTokens is the number of input (prompt) tokens in the request
+	PromptTokens int
+	// GenerationTokens is the number of generated (output) tokens in the response
+	GenerationTokens int
+	// MaxTokens is the maximum number of tokens allowed for generation (if specified in the request)
+	MaxTokens *int64
+	// FinishReason indicates why the generation stopped (e.g., "stop", "length", "tool_calls")
+	FinishReason string
+}
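+
+// Illustrative producer side (a sketch, not part of this change; variable names assumed):
+//
+//	s.requestSuccessChan <- requestSuccessEvent{
+//		PromptTokens:     promptTokens,
+//		GenerationTokens: generationTokens,
+//		MaxTokens:        req.MaxTokens,
+//		FinishReason:     "stop",
+//	}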
+
+// recordRequestMetricsOnSuccess records metrics for a successfully completed request
+func (s *VllmSimulator) recordRequestMetricsOnSuccess(promptTokens,
+	generationTokens int, maxTokens *int64, finishReason string) {
+	modelName := s.getDisplayedModelName(s.config.Model)
+	s.requestPromptTokens.WithLabelValues(modelName).Observe(float64(promptTokens))
+	s.requestGenerationTokens.WithLabelValues(modelName).Observe(float64(generationTokens))
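+	// max_tokens is optional in the request; observe it only when it was set.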
+	if maxTokens != nil {
+		s.requestParamsMaxTokens.WithLabelValues(modelName).Observe(float64(*maxTokens))
+	}
+	s.requestSuccessTotal.WithLabelValues(modelName, finishReason).Inc()
+}
+
+// build125Buckets generates histogram buckets in powers of 10 scaled by [1, 2, 5].
+// This matches vLLM's build_1_2_5_buckets() in metrics.py.
+//
+// Reference: https://github.com/vllm-project/vllm/blob/main/vllm/engine/metrics.py#L175
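+//
+// For example, build125Buckets(1024) returns
+// [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]; for maxValue < 1 it returns nil.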
+func build125Buckets(maxValue int) []float64 {
+	var buckets []float64
+	exponent := 0
+	mantissa := []int{1, 2, 5}
+
+	for {
+		complete := true
+		for _, m := range mantissa {
+			value := m * int(math.Pow10(exponent))
+			if value <= maxValue {
+				buckets = append(buckets, float64(value))
+				complete = false
+			}
+		}
+		if complete {
+			break
+		}
+		exponent++
+	}
+	return buckets
+}