@@ -393,14 +393,14 @@ var _ = Describe("Simulator metrics", Ordered, func() {
393393 modelName := "my_model"
394394 // Send one request, check that ttft and tpot are as defined in the simulator command line params
395395 ctx := context .TODO ()
396- args := []string {"cmd" , "--model" , modelName , "--mode" , common .ModeRandom ,
396+ // use mode echo to be sure that response is more than one token - this makes sure that tpot is reported to prometheus
397+ args := []string {"cmd" , "--model" , modelName , "--mode" , common .ModeEcho ,
397398 "--time-to-first-token" , "200" , "--inter-token-latency" , "100" }
398399
399400 client , err := startServerWithArgs (ctx , common .ModeRandom , args , nil )
400401 Expect (err ).NotTo (HaveOccurred ())
401402
402403 openaiclient , params := getOpenAIClientAndChatParams (client , modelName , userMessage , false )
403- params .MaxTokens = openai .Int (5 )
404404
405405 var reqWg , metricsWg sync.WaitGroup
406406 metricsWg .Add (1 )
@@ -451,7 +451,7 @@ var _ = Describe("Simulator metrics", Ordered, func() {
451451 Expect (metrics ).To (ContainSubstring ("vllm:time_to_first_token_seconds_bucket{model_name=\" my_model\" ,le=\" 640\" } 1" ))
452452 Expect (metrics ).To (ContainSubstring ("vllm:time_to_first_token_seconds_bucket{model_name=\" my_model\" ,le=\" 2560\" } 1" ))
453453 Expect (metrics ).To (ContainSubstring ("vllm:time_to_first_token_seconds_bucket{model_name=\" my_model\" ,le=\" +Inf\" } 1" ))
454- // tpot
454+ // check tpot only if it exists in metrics — when a single output token is returned, tpot is not reported
455455 Expect (metrics ).To (ContainSubstring ("vllm:time_per_output_token_seconds_bucket{model_name=\" my_model\" ,le=\" 0.01\" } 0" ))
456456 Expect (metrics ).To (ContainSubstring ("vllm:time_per_output_token_seconds_bucket{model_name=\" my_model\" ,le=\" 0.025\" } 0" ))
457457 Expect (metrics ).To (ContainSubstring ("vllm:time_per_output_token_seconds_bucket{model_name=\" my_model\" ,le=\" 0.05\" } 0" ))
0 commit comments