@@ -19,7 +19,9 @@ package llmdinferencesim
 import (
 	"context"
 	"errors"
+	"fmt"
 	"io"
+	"math"
 	"net/http"
 	"os"
 	"reflect"
@@ -118,7 +120,7 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 
 	It("Should record correct prompt and generation token counts", func() {
 		modelName := "testmodel"
-		prompt := strings.Repeat("hello ", 10)
+		prompt := strings.Repeat("hello ", 25)
 		maxTokens := 25
 
 		ctx := context.TODO()
@@ -153,10 +155,38 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 		data, err := io.ReadAll(metricsResp.Body)
 		Expect(err).NotTo(HaveOccurred())
 		metrics := string(data)
+		// request_prompt_tokens_bucket
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="1"} 0`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="2"} 0`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="5"} 0`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="10"} 0`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="20"} 0`))
 		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="50"} 1`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="100"} 1`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="200"} 1`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="500"} 1`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="1000"} 1`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="+Inf"} 1`))
+		// request_params_max_tokens_bucket
+		Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="1"} 0`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="2"} 0`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="5"} 0`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="10"} 0`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="20"} 0`))
 		Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="50"} 1`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="100"} 1`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="200"} 1`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="500"} 1`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="1000"} 1`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="+Inf"} 1`))
+		// request_generation_tokens
+		// We do not verify the distribution of the number of tokens generated per request,
+		// as the number of generated tokens is unpredictable in this test.
+		// Therefore, we only verify the number of requests and the total number of generated tokens,
+		// and skip the bucket distribution.
 		Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_count{model_name="testmodel"} 1`))
-		Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="stop",model_name="testmodel"} 1`))
+		// request_success_total
+		Expect(metrics).To(MatchRegexp(`vllm:request_success_total{finish_reason="(stop|length)",model_name="testmodel"} 1`))
 	})
 
 	It("Should send correct lora metrics", func() {
@@ -518,7 +548,32 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 			ctx := context.TODO()
 			args := []string{"cmd", "--model", model, "--mode", common.ModeRandom,
 				"--fake-metrics",
-				"{\"running-requests\":10,\"waiting-requests\":30,\"kv-cache-usage\":0.4,\"loras\":[{\"running\":\"lora4,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567},{\"running\":\"lora4,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}]}",
+				`{` +
+					`"running-requests":10,` +
+					`"waiting-requests":30,` +
+					`"kv-cache-usage":0.4,` +
+					`"request-success-total":{` +
+					`"stop":20,` +
+					`"length":0,` +
+					`"tool_calls":0,` +
+					`"remote_decode":0` +
+					`},` +
+					`"request-prompt-tokens":[10,20,30],` +
+					`"request-generation-tokens":[10,20,30],` +
+					`"request-params-max-tokens":[10,20,30],` +
+					`"loras":[` +
+					`{` +
+					`"running":"lora4,lora2",` +
+					`"waiting":"lora3",` +
+					`"timestamp":1257894567` +
+					`},` +
+					`{` +
+					`"running":"lora4,lora3",` +
+					`"waiting":"",` +
+					`"timestamp":1257894569` +
+					`}` +
+					`]` +
+					`}`,
 			}
 
 			client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil)
@@ -536,6 +591,48 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 			Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"my_model\"} 0.4"))
 			Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora2\",waiting_lora_adapters=\"lora3\"} 1.257894567e+09"))
 			Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora3\",waiting_lora_adapters=\"\"} 1.257894569e+09"))
+
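+			// The fake histograms are configured above with per-bucket counts [10, 20, 30].
+			// Prometheus histogram buckets are cumulative, so le="1" is 10,
+			// le="2" is 10+20=30, and every higher bucket (and +Inf) is 10+20+30=60.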
+			Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="1"} 10`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="2"} 30`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="5"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="10"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="20"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="50"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="100"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="200"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="500"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="1000"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="+Inf"} 60`))
+
+			Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="1"} 10`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="2"} 30`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="5"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="10"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="20"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="50"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="100"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="200"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="500"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="1000"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="+Inf"} 60`))
+
+			Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="1"} 10`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="2"} 30`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="5"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="10"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="20"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="50"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="100"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="200"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="500"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="1000"} 60`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="+Inf"} 60`))
+
+			Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="length",model_name="my_model"} 0`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="remote_decode",model_name="my_model"} 0`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="stop",model_name="my_model"} 20`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="tool_calls",model_name="my_model"} 0`))
+
 		})
 	})
 })
@@ -691,3 +788,104 @@ func TestBuild125Buckets(t *testing.T) {
 		})
 	}
 }
+
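+// validateSamplesInBuckets histograms the given samples against boundaries and
+// checks that the result matches the expected per-bucket counts (a short counts
+// slice is first padded to one entry per bucket via padCountsToFull).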
+func validateSamplesInBuckets(t *testing.T, samples []float64, boundaries []float64, counts []float64) {
+	fullCounts := padCountsToFull(boundaries, counts)
+	// Now validate using fullCounts
+	sortedSamples := make([]float64, len(samples))
+	copy(sortedSamples, samples)
+	sort.Float64s(sortedSamples)
+
+	actualCounts := make([]int, len(fullCounts))
+	sampleIndex := 0
+
+	for bucketIndex := range fullCounts {
+		var upperBound float64
+		if bucketIndex == len(boundaries) {
+			upperBound = math.Inf(1)
+		} else {
+			upperBound = boundaries[bucketIndex]
+		}
+
+		for sampleIndex < len(sortedSamples) && sortedSamples[sampleIndex] <= upperBound {
+			actualCounts[bucketIndex]++
+			sampleIndex++
+		}
+	}
+
+	// Verify each bucket
+	for i, want := range fullCounts {
+		if actualCounts[i] != int(want) {
+			var lowerStr, upperStr string
+			if i == 0 {
+				lowerStr = "-Inf"
+			} else {
+				lowerStr = fmt.Sprintf("%.3f", boundaries[i-1])
+			}
+			if i == len(boundaries) {
+				upperStr = "+Inf"
+			} else {
+				upperStr = fmt.Sprintf("%.3f", boundaries[i])
+			}
+			t.Errorf("bucket[%d] (%s, %s]: want %d, got %d",
+				i, lowerStr, upperStr, int(want), actualCounts[i])
+		}
+	}
+
+	totalExpected := 0
+	for _, c := range fullCounts {
+		totalExpected += int(c)
+	}
+	if len(samples) != totalExpected {
+		t.Errorf("total samples: want %d, got %d", totalExpected, len(samples))
+	}
+}
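+
+// padCountsToFull is defined elsewhere in this change and is not shown in this
+// hunk; the tests here only rely on it returning one count per bucket, i.e.
+// len(boundaries)+1 entries. A minimal sketch of that assumption:
+//
+//	func padCountsToFull(boundaries, counts []float64) []float64 {
+//		full := make([]float64, len(boundaries)+1)
+//		copy(full, counts)
+//		return full
+//	}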
+
+func TestGenerateSamplesFromBuckets(t *testing.T) {
+	tests := []struct {
+		name            string
+		boundaries      []float64
+		counts          []float64
+		expectedSamples int
+	}{
+		{
+			name:            "short counts with non-zero in middle",
+			boundaries:      []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000},
+			counts:          []float64{0, 0, 0, 5, 0, 5},
+			expectedSamples: 10,
+		},
+		{
+			name:            "empty boundaries → 1 bucket",
+			boundaries:      []float64{},
+			counts:          []float64{7},
+			expectedSamples: 7,
+		},
+		{
+			name:            "single boundary → 2 buckets, short counts",
+			boundaries:      []float64{10.0},
+			counts:          []float64{3},
+			expectedSamples: 3,
+		},
+		{
+			name:            "full counts provided",
+			boundaries:      []float64{1, 2, 5},
+			counts:          []float64{1, 0, 2, 1},
+			expectedSamples: 4,
+		},
+		{
+			name:            "all zeros (short)",
+			boundaries:      []float64{1, 2, 5},
+			counts:          []float64{},
+			expectedSamples: 0,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			samples := generateSamplesFromBuckets(tt.boundaries, tt.counts)
+			if len(samples) != tt.expectedSamples {
+				t.Fatalf("sample count mismatch: want %d, got %d", tt.expectedSamples, len(samples))
+			}
+			validateSamplesInBuckets(t, samples, tt.boundaries, tt.counts)
+		})
+	}
+}
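+
+// generateSamplesFromBuckets is defined elsewhere in this change and is not
+// shown in this hunk. The sketch below only illustrates the contract the test
+// above assumes (the real implementation may pick values within a bucket
+// differently): for bucket i, emit counts[i] values falling in
+// (boundaries[i-1], boundaries[i]].
+//
+//	func generateSamplesFromBuckets(boundaries, counts []float64) []float64 {
+//		counts = padCountsToFull(boundaries, counts)
+//		samples := make([]float64, 0)
+//		for i, c := range counts {
+//			// pick a representative value inside bucket i: its upper bound,
+//			// or one past the last boundary for the final +Inf bucket
+//			v := 1.0
+//			if i < len(boundaries) {
+//				v = boundaries[i]
+//			} else if len(boundaries) > 0 {
+//				v = boundaries[len(boundaries)-1] + 1
+//			}
+//			for j := 0; j < int(c); j++ {
+//				samples = append(samples, v)
+//			}
+//		}
+//		return samples
+//	}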