@@ -21,6 +21,7 @@ import (
 	"errors"
 	"io"
 	"net/http"
+	"os"
 	"regexp"
 	"sort"
 	"strconv"
@@ -314,6 +315,161 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 		Expect(bothRunningTimestamp <= emptyTimestamp).To(BeTrue())
 	})
 
+	Context("kv cache metrics", func() {
+		tmpDir := "./tests-tmp/"
+		AfterAll(func() {
+			err := os.RemoveAll(tmpDir)
+			Expect(err).NotTo(HaveOccurred())
+		})
+		It("Should send correct kv cache usage metrics", func() {
+			modelName := "Qwen/Qwen2-0.5B"
+			// Three requests; there should be two blocks in the kv cache, because
+			// the first and the second prompts share a block.
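+			// With --block-size 8, the two "Haifa" prompts presumably share an
+			// identical first 8-token block, while the "New York" prompt needs its own.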
+			ctx := context.TODO()
+			args := []string{"cmd", "--model", modelName, "--mode", common.ModeRandom,
+				"--enable-kvcache", "true", "--kv-cache-size", "16", "--block-size", "8",
+				"--time-to-first-token", "5000", "--tokenizers-cache-dir", tmpDir}
+
+			client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil)
+			Expect(err).NotTo(HaveOccurred())
+
+			openaiclient := openai.NewClient(
+				option.WithBaseURL(baseURL),
+				option.WithHTTPClient(client))
+
+			paramsArray := []openai.CompletionNewParams{
+				{
+					Prompt: openai.CompletionNewParamsPromptUnion{
+						OfString: openai.String("What is the weather like in Haifa today? Is it cold?"),
+					},
+					Model: openai.CompletionNewParamsModel(modelName),
+				},
+				{
+					Prompt: openai.CompletionNewParamsPromptUnion{
+						OfString: openai.String("What is the weather like in Haifa today?"),
+					},
+					Model: openai.CompletionNewParamsModel(modelName),
+				},
+				{
+					Prompt: openai.CompletionNewParamsPromptUnion{
+						OfString: openai.String("What is the weather like in New York today?"),
+					},
+					Model: openai.CompletionNewParamsModel(modelName),
+				},
+			}
+
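+			// Fire all three completions concurrently; each stays "running" for the
+			// configured 5s time-to-first-token.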
+			for _, params := range paramsArray {
+				go func() {
+					defer GinkgoRecover()
+					_, err := openaiclient.Completions.New(ctx, params)
+					Expect(err).NotTo(HaveOccurred())
+				}()
+			}
+
+			var wg sync.WaitGroup
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				defer GinkgoRecover()
+
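+				// Sample the metrics while all three requests are still inside their
+				// 5s time-to-first-token window.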
+				time.Sleep(4 * time.Second)
+				metricsResp, err := client.Get(metricsUrl)
+				Expect(err).NotTo(HaveOccurred())
+				Expect(metricsResp.StatusCode).To(Equal(http.StatusOK))
+
+				data, err := io.ReadAll(metricsResp.Body)
+				Expect(err).NotTo(HaveOccurred())
+				metrics := string(data)
+				// Expect three running requests and two blocks in the kv cache - usage 2/16=0.125
+				Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"Qwen/Qwen2-0.5B\"} 3"))
+				Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"Qwen/Qwen2-0.5B\"} 0"))
+				Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"Qwen/Qwen2-0.5B\"} 0.125"))
+
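+				// By now 4s + 3s have elapsed, past the 5s time-to-first-token, so
+				// all requests should have completed.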
+				time.Sleep(3 * time.Second)
+				metricsResp, err = client.Get(metricsUrl)
+				Expect(err).NotTo(HaveOccurred())
+				Expect(metricsResp.StatusCode).To(Equal(http.StatusOK))
+
+				data, err = io.ReadAll(metricsResp.Body)
+				Expect(err).NotTo(HaveOccurred())
+				metrics = string(data)
+				// The requests finished running, expect 0 usage
+				Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"Qwen/Qwen2-0.5B\"} 0"))
+				Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"Qwen/Qwen2-0.5B\"} 0"))
+				Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"Qwen/Qwen2-0.5B\"} 0"))
+			}()
+			wg.Wait()
+		})
+
+		It("Should send correct kv cache usage metrics for sequential requests", func() {
+			modelName := "Qwen/Qwen2-0.5B"
+			ctx := context.TODO()
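+			// --max-num-seqs 2 allows only two requests to run at a time; the third
+			// must wait and has not yet reached the kv cache when metrics are sampled.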
+			args := []string{"cmd", "--model", modelName, "--mode", common.ModeRandom,
+				"--enable-kvcache", "true", "--kv-cache-size", "16", "--block-size", "8",
+				"--time-to-first-token", "5000", "--tokenizers-cache-dir", tmpDir, "--max-num-seqs", "2"}
+
+			client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil)
+			Expect(err).NotTo(HaveOccurred())
+
+			openaiclient := openai.NewClient(
+				option.WithBaseURL(baseURL),
+				option.WithHTTPClient(client))
+
+			paramsArray := []openai.CompletionNewParams{
+				{
+					Prompt: openai.CompletionNewParamsPromptUnion{
+						OfString: openai.String("What is the weather like in Haifa today? Is it cold?"),
+					},
+					Model: openai.CompletionNewParamsModel(modelName),
+				},
+				{
+					Prompt: openai.CompletionNewParamsPromptUnion{
+						OfString: openai.String("What is the weather like in Haifa today?"),
+					},
+					Model: openai.CompletionNewParamsModel(modelName),
+				},
+				{
+					Prompt: openai.CompletionNewParamsPromptUnion{
+						OfString: openai.String("What is the weather like in New York today?"),
+					},
+					Model: openai.CompletionNewParamsModel(modelName),
+				},
+			}
+
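+			// Stagger the requests by 500ms so that they arrive sequentially.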
+			for i, params := range paramsArray {
+				go func() {
+					defer GinkgoRecover()
+					time.Sleep(time.Duration(i*500) * time.Millisecond)
+					_, err := openaiclient.Completions.New(ctx, params)
+					Expect(err).NotTo(HaveOccurred())
+				}()
+			}
+
+			var wg sync.WaitGroup
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				defer GinkgoRecover()
+
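+				// Sample at t=3s: the first two requests (sent at t=0 and t=0.5s) are
+				// still within their 5s time-to-first-token; the third waits for a slot.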
+				time.Sleep(3 * time.Second)
+				metricsResp, err := client.Get(metricsUrl)
+				Expect(err).NotTo(HaveOccurred())
+				Expect(metricsResp.StatusCode).To(Equal(http.StatusOK))
+
+				data, err := io.ReadAll(metricsResp.Body)
+				Expect(err).NotTo(HaveOccurred())
+				metrics := string(data)
+				// The requests were sent at 500-millisecond intervals, and the first two
+				// should still be running. The third is waiting and is not yet in the kv cache.
+				// We expect one block in the kv cache, usage 1/16=0.0625.
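+				// (One block, presumably because the two running "Haifa" prompts share
+				// their identical first block.)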
+				Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"Qwen/Qwen2-0.5B\"} 2"))
+				Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"Qwen/Qwen2-0.5B\"} 1"))
+				Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"Qwen/Qwen2-0.5B\"} 0.0625"))
+			}()
+			wg.Wait()
+		})
+	})
+
 	Context("fake metrics", func() {
 		It("Should respond with fake metrics to /metrics", func() {
 			ctx := context.TODO()