@@ -19,7 +19,9 @@ package llmdinferencesim
 import (
 	"context"
 	"errors"
+	"fmt"
 	"io"
+	"math"
 	"net/http"
 	"os"
 	"reflect"
@@ -164,7 +166,7 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 			Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="100"} 1`))
 			Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="200"} 1`))
 			Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="500"} 1`))
-			Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="100"} 1`))
+			Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="1000"} 1`))
 			Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="+Inf"} 1`))
 			// request_params_max_tokens_bucket
 			Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="1"} 0`))
@@ -815,6 +817,93 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 			Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"+Inf\"} 1"))
 		})
 	})
+
+	Context("latency metrics", func() {
+		DescribeTable("should calculate all latency related metrics correctly for a single request",
+			func(testName string, doRemotePrefill bool, doRemoteDecode bool, kvcacheTransferLatency int, kvCacheTransferTimePerToken int,
+				ttft int, prefillTimePerToken int, interTokenLatency int) {
+				// send a single request with a 4-token prompt in echo mode, so the number of output tokens is 4 as well
+				modelName := "my_model"
+				// Send one request, check that ttft and tpot are as defined in the simulator command line params
+				ctx := context.TODO()
+				args := []string{"cmd", "--model", modelName, "--mode", common.ModeEcho,
+					"--kv-cache-transfer-latency", strconv.Itoa(kvcacheTransferLatency),
+					"--kv-cache-transfer-time-per-token", strconv.Itoa(kvCacheTransferTimePerToken),
+					"--time-to-first-token", strconv.Itoa(ttft),
+					"--prefill-time-per-token", strconv.Itoa(prefillTimePerToken),
+					"--inter-token-latency", strconv.Itoa(interTokenLatency),
+				}
+
+				client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil)
+				Expect(err).NotTo(HaveOccurred())
+
+				// TODO - pass isStreaming
+				openaiclient, params := getOpenAIClientAndChatParams(client, modelName, "1 2 3 4", false)
+				// TODO - how to test remote prefill/decode
+
+				var reqWg sync.WaitGroup
+				reqWg.Add(1)
+
+				// send a single request
+				go func() {
+					defer reqWg.Done()
+					defer GinkgoRecover()
+
+					_, err := openaiclient.Chat.Completions.New(ctx, params)
+					Expect(err).NotTo(HaveOccurred())
+				}()
+
+				// wait until request processing has finished, then send a /metrics request
+				reqWg.Wait()
+				time.Sleep(300 * time.Millisecond)
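+				// note: the short sleep is here to give the simulator time to record the metrics
+				// for the finished request before they are scraped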
+				metricsResp, err := client.Get(metricsUrl)
+				Expect(err).NotTo(HaveOccurred())
+				Expect(metricsResp.StatusCode).To(Equal(http.StatusOK))
+
+				data, err := io.ReadAll(metricsResp.Body)
+				Expect(err).NotTo(HaveOccurred())
+				metrics := string(data)
+
+				numOfTokens := 4
+				var expectedPrefillTime float64
+				// TODO take into consideration remote prefill
+				if ttft > 0 {
+					// time-to-first-token overrides the prefill time calculation based on the number of input tokens
+					expectedPrefillTime = float64(ttft) / 1000
+				} else {
+					expectedPrefillTime = float64(numOfTokens*prefillTimePerToken) / 1000
+				}
+				expectedDecodeTime := float64(interTokenLatency*(numOfTokens-1)) / 1000
+				expectedE2ELatency := expectedPrefillTime + expectedDecodeTime
+
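+				// e.g., for the first table entry (ttft=900ms, interTokenLatency=100ms, 4 echoed tokens):
+				// expectedPrefillTime = 0.9s, expectedDecodeTime = 3*0.1s = 0.3s, expectedE2ELatency = 1.2s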
+				prevBoundary := math.Inf(-1)
+
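+				// for every cumulative histogram bucket, the reported count should be 1 when the
+				// expected latency is below the bucket's upper bound, and 0 otherwise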
+				for _, bucketBoundary := range common.RequestLatencyBucketsBoundaries {
+					checkBucketBoundary(metrics, modelName, prefillTimeMetricName, bucketBoundary, prevBoundary, expectedPrefillTime)
+					checkBucketBoundary(metrics, modelName, decodeTimeMetricName, bucketBoundary, prevBoundary, expectedDecodeTime)
+					checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, bucketBoundary, prevBoundary, expectedE2ELatency)
+
+					prevBoundary = bucketBoundary
+				}
+				// check the last (+Inf) bucket
+				lastBoundary := common.RequestLatencyBucketsBoundaries[len(common.RequestLatencyBucketsBoundaries)-1]
+				checkBucketBoundary(metrics, modelName, prefillTimeMetricName, math.Inf(1), lastBoundary, expectedPrefillTime)
+				checkBucketBoundary(metrics, modelName, decodeTimeMetricName, math.Inf(1), lastBoundary, expectedDecodeTime)
+				checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, math.Inf(1), lastBoundary, expectedE2ELatency)
+			},
+			func(testName string, doRemotePrefill bool, doRemoteDecode bool, kvcacheTransferLatency int, kvCacheTransferTimePerToken int,
+				ttft int, prefillTimePerToken int, interTokenLatency int) string {
+				return fmt.Sprintf("%s\ndoRemotePrefill: %v, doRemoteDecode: %v, kvcacheTransferLatency: %d, kvCacheTransferTimePerToken: %d, ttft: %d, prefillTimePerToken: %d, interTokenLatency: %d",
+					testName, doRemotePrefill, doRemoteDecode, kvcacheTransferLatency, kvCacheTransferTimePerToken, ttft, prefillTimePerToken, interTokenLatency)
+			},
+			// note: do not define times close to bucket boundaries, as this can lead to test failures
+			Entry(nil, "constant prefill + inter-token time", false, false, 0, 0, 900, 0, 100),
+			Entry(nil, "prefill per token + inter-token time", false, false, 0, 0, 0, 100, 100),
+		)
+	})
 })
 
 // isLoraMetricPresent checks if a matching metric exists
@@ -1022,3 +1111,29 @@ func TestBuild125Buckets(t *testing.T) {
 		})
 	}
 }
+
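+// getFloatBucketMetricLine builds the expected Prometheus histogram bucket line for the given
+// model, metric name, bucket upper boundary (+Inf for the last bucket), and cumulative count.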
+func getFloatBucketMetricLine(model string, metric string, bucketBoundary float64, count int) string {
+	bucketBoundStr := "+Inf"
+	if bucketBoundary != math.Inf(1) {
+		bucketBoundStr = fmt.Sprintf("%g", bucketBoundary)
+	}
+	return fmt.Sprintf("%s_bucket{model_name=\"%s\",le=\"%s\"} %d", metric, model, bucketBoundStr, count)
+}
+
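+// checkBucketBoundary verifies that the cumulative bucket with upper bound bucketBoundary reports
+// the expected count: 1 if the expected value lies below the boundary, 0 otherwise. Boundaries that
+// are too close to the expected value are skipped to avoid flaky assertions.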
+func checkBucketBoundary(metrics string, modelName string, metricName string, bucketBoundary float64,
+	prevBoundary float64, expectedValue float64) {
+	if expectedValue > prevBoundary && bucketBoundary > expectedValue && (bucketBoundary-expectedValue) < 0.005 {
+		// the expected time is too close to the bucket boundary:
+		// in theory we expect 1 in this bucket, but because of additional processing time we may get 0,
+		// and that situation is acceptable
+		fmt.Printf("Expected value is too close to the boundary - skip test for this bucket (%.4f - %.4f] and expected value %.4f\n",
+			prevBoundary, bucketBoundary, expectedValue)
+		return
+	}
+	expectedCount := 0
+	if bucketBoundary > expectedValue {
+		expectedCount = 1
+	}
+	Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(modelName, metricName, bucketBoundary, expectedCount)))
+}