@@ -820,88 +820,26 @@ var _ = Describe("Simulator metrics", Ordered, func() {
820820
821821 Context ("latency metrics" , func () {
822822 DescribeTable ("should calculate all latency related metrics correctly for a single request" ,
823- func (testName string , doRemotePrefill bool , doRemoteDecode bool , kvcacheTransferLatency int , kvCacheTransferTimePerToken int ,
824- ttft int , prefillTimePerToken int , interTokenLatency int ) {
825- // Expect(true).To(BeFalse())
826- // send a single request with a prompt of 5 token and echo mode, so output tokens number of 5 too
823+ func (testNamePrefix string , ttft int , prefillTimePerToken int , interTokenLatency int ) {
824+ // send a single request with a prompt of 4 tokens and echo mode, so output tokens number of 4 too
827825 modelName := "my_model"
828- // Send one request, check that ttft and tpot are as defined in the simulator command line params
829- ctx := context .TODO ()
830- args := []string {"cmd" , "--model" , modelName , "--mode" , common .ModeEcho ,
831- "--kv-cache-transfer-latency" , strconv .Itoa (kvcacheTransferLatency ),
832- "--kv-cache-transfer-time-per-token" , strconv .Itoa (kvCacheTransferTimePerToken ),
833- "--time-to-first-token" , strconv .Itoa (ttft ),
834- "--prefill-time-per-token" , strconv .Itoa (prefillTimePerToken ),
835- "--inter-token-latency" , strconv .Itoa (interTokenLatency ),
836- }
837-
838- client , err := startServerWithArgs (ctx , common .ModeRandom , args , nil )
839- Expect (err ).NotTo (HaveOccurred ())
840-
841- // TODO - pass isStreaming
842- openaiclient , params := getOpenAIClientAndChatParams (client , modelName , "1 2 3 4" , false )
843- // TODO - how to test remote prefill/decode
844-
845- var reqWg , metricsWg sync.WaitGroup
846- metricsWg .Add (1 )
847- reqWg .Add (1 )
848-
849- // send a single request
850- go func () {
851- defer reqWg .Done ()
852- defer GinkgoRecover ()
853-
854- _ , err := openaiclient .Chat .Completions .New (ctx , params )
855- Expect (err ).NotTo (HaveOccurred ())
856- }()
826+ prompt := "1 2 3 4"
857827
858- // wait untill request processing was finished, send /mertics request
859- reqWg .Wait ()
860- time .Sleep (300 * time .Millisecond )
861- metricsResp , err := client .Get (metricsUrl )
862- Expect (err ).NotTo (HaveOccurred ())
863- Expect (metricsResp .StatusCode ).To (Equal (http .StatusOK ))
828+ client := sendRequest (modelName , prompt , false , ttft , prefillTimePerToken , interTokenLatency )
829+ checkLatencyMertics (client , modelName , prompt , ttft , prefillTimePerToken , interTokenLatency )
864830
865- data , err := io .ReadAll (metricsResp .Body )
866- Expect (err ).NotTo (HaveOccurred ())
867- metrics := string (data )
868-
869- numOfTokens := 4
870- var expectedPrefillTime float64
871- // TODO take into consideration remote prefill
872- if ttft > 0 {
873- // time-to-first-token overwrites calculation of prefill time based on number of input tokens
874- expectedPrefillTime = float64 (ttft ) / 1000
875-
876- } else {
877- expectedPrefillTime = float64 (numOfTokens * prefillTimePerToken ) / 1000
878- }
879- expectedDecodeTime := float64 (interTokenLatency * (numOfTokens - 1 )) / 1000
880- expectedE2ELatency := expectedPrefillTime + expectedDecodeTime
881-
882- prevBoundary := math .Inf (- 1 )
883-
884- for _ , bucketBoudary := range common .RequestLatencyBucketsBoundaries {
885- checkBucketBoundary (metrics , modelName , prefillTimeMetricName , bucketBoudary , prevBoundary , expectedPrefillTime )
886- checkBucketBoundary (metrics , modelName , decodeTimeMetricName , bucketBoudary , prevBoundary , expectedDecodeTime )
887- checkBucketBoundary (metrics , modelName , e2eReqLatencyMetricName , bucketBoudary , prevBoundary , expectedE2ELatency )
888-
889- prevBoundary = bucketBoudary
890- }
891- // check the last bucket
892- lastBoundary := common .RequestLatencyBucketsBoundaries [len (common .RequestLatencyBucketsBoundaries )- 1 ]
893- checkBucketBoundary (metrics , modelName , prefillTimeMetricName , math .Inf (1 ), lastBoundary , expectedPrefillTime )
894- checkBucketBoundary (metrics , modelName , decodeTimeMetricName , math .Inf (1 ), lastBoundary , expectedDecodeTime )
895- checkBucketBoundary (metrics , modelName , e2eReqLatencyMetricName , math .Inf (1 ), lastBoundary , expectedE2ELatency )
831+ // same in streaming mode
832+ client = sendRequest (modelName , prompt , true , ttft , prefillTimePerToken , interTokenLatency )
833+ checkLatencyMertics (client , modelName , prompt , ttft , prefillTimePerToken , interTokenLatency )
896834 },
897- func (testName string , doRemotePrefill bool , doRemoteDecode bool , kvcacheTransferLatency int , kvCacheTransferTimePerToken int ,
898- ttft int , prefillTimePerToken int , interTokenLatency int ) string {
899- return fmt .Sprintf ("%s\n doRemotePrefill: %v, doRemoteDecode: %v, kvcacheTransferLatency: %d, kvCacheTransferTimePerToken: %d, ttft: %d, prefillTimePerToken: %d, interTokenLatency: %d" ,
900- testName , doRemotePrefill , doRemoteDecode , kvcacheTransferLatency , kvCacheTransferTimePerToken , ttft , prefillTimePerToken , interTokenLatency )
835+ func (testNamePrefix string , ttft int , prefillTimePerToken int , interTokenLatency int ) string {
836+ return fmt .Sprintf ("%s\n ttft: %d, prefillTimePerToken: %d, interTokenLatency: %d" , testNamePrefix , ttft , prefillTimePerToken , interTokenLatency )
901837 },
902- // pay attention: do not define times close to bucket boundaries, this can lead to test failure
903- Entry (nil , "constant prefil + inter token time" , false , false , 0 , 0 , 900 , 0 , 100 ),
904- Entry (nil , "prefill per token + inter token time" , false , false , 0 , 0 , 0 , 100 , 100 ),
838+ // Params order: testName, ttft, prefillTimePerToken, interTokenLatency
839+ Entry (nil , "constant prefill + inter token time" , 0 , 0 , 100 ),
840+ Entry (nil , "constant prefill + inter token time" , 900 , 0 , 100 ),
841+ Entry (nil , "constant prefill + inter token time" , 1000 , 0 , 100 ),
842+ Entry (nil , "prefill per token + inter token time" , 0 , 100 , 100 ),
905843 )
906844 })
907845})
@@ -1122,8 +1060,8 @@ func getFloatBucketMetricLine(model string, metric string, bucketBoundary float6
11221060
11231061func checkBucketBoundary (metrics string , modelName string , metricName string , bucketBoudary float64 ,
11241062 prevBoundary float64 , expectedValue float64 ) {
1125- if expectedValue > prevBoundary && bucketBoudary > expectedValue && (bucketBoudary - expectedValue ) < 0.005 {
1126- // expected time is too close to the bucket boudary
1063+ if expectedValue > prevBoundary && bucketBoudary >= expectedValue && (bucketBoudary - expectedValue ) < 0.005 {
1064+ // expected time is too close to the bucket's boundary
11271065 // it's possible that in theory we expect 1 in this bucket but will get 0 and this situation is ok
11281066 // since there is some additional calculation time
11291067 fmt .Printf ("Expected value is too close to the boundary - skip test for this bucket (%.4f - %.4f] and expected value %.4f\n " ,
@@ -1135,5 +1073,67 @@ func checkBucketBoundary(metrics string, modelName string, metricName string, bu
11351073 expectedCount = 1
11361074 }
11371075 Expect (metrics ).To (ContainSubstring (getFloatBucketMetricLine (modelName , metricName , bucketBoudary , expectedCount )))
1076+ }
1077+
1078+ // send a single request with the given prompt and echo mode
1079+ func sendRequest (modelName string , prompt string , isStreaming bool , ttft int , prefillTimePerToken int , interTokenLatency int ) * http.Client {
1080+ ctx := context .TODO ()
1081+ args := []string {"cmd" , "--model" , modelName , "--mode" , common .ModeEcho ,
1082+ // "--kv-cache-transfer-latency", strconv.Itoa(kvcacheTransferLatency),
1083+ // "--kv-cache-transfer-time-per-token", strconv.Itoa(kvCacheTransferTimePerToken),
1084+ "--time-to-first-token" , strconv .Itoa (ttft ),
1085+ "--prefill-time-per-token" , strconv .Itoa (prefillTimePerToken ),
1086+ "--inter-token-latency" , strconv .Itoa (interTokenLatency ),
1087+ }
1088+
1089+ client , err := startServerWithArgs (ctx , common .ModeRandom , args , nil )
1090+ Expect (err ).NotTo (HaveOccurred ())
1091+
1092+ openaiclient , params := getOpenAIClientAndChatParams (client , modelName , prompt , isStreaming )
1093+
1094+ // send a single request in a serial way
1095+ _ , err = openaiclient .Chat .Completions .New (ctx , params )
1096+ Expect (err ).NotTo (HaveOccurred ())
1097+
1098+ return client
1099+ }
1100+
1101+ func checkLatencyMertics (client * http.Client , modelName string , prompt string , ttft int , prefillTimePerToken int , interTokenLatency int ) {
1102+ // wait a little bit and check metrics
1103+ time .Sleep (300 * time .Millisecond )
1104+ metricsResp , err := client .Get (metricsUrl )
1105+ Expect (err ).NotTo (HaveOccurred ())
1106+ Expect (metricsResp .StatusCode ).To (Equal (http .StatusOK ))
1107+
1108+ data , err := io .ReadAll (metricsResp .Body )
1109+ Expect (err ).NotTo (HaveOccurred ())
1110+ metrics := string (data )
11381111
1112+ numOfTokens := len (common .Tokenize (prompt ))
1113+ var expectedPrefillTime float64
1114+ // TODO take into consideration remote prefill
1115+ if ttft > 0 {
1116+ // time-to-first-token overwrites calculation of prefill time based on number of input tokens
1117+ expectedPrefillTime = float64 (ttft ) / 1000
1118+
1119+ } else {
1120+ expectedPrefillTime = float64 (numOfTokens * prefillTimePerToken ) / 1000
1121+ }
1122+ expectedDecodeTime := float64 (interTokenLatency * (numOfTokens - 1 )) / 1000
1123+ expectedE2ELatency := expectedPrefillTime + expectedDecodeTime
1124+
1125+ prevBoundary := math .Inf (- 1 )
1126+
1127+ for _ , bucketBoudary := range common .RequestLatencyBucketsBoundaries {
1128+ checkBucketBoundary (metrics , modelName , prefillTimeMetricName , bucketBoudary , prevBoundary , expectedPrefillTime )
1129+ checkBucketBoundary (metrics , modelName , decodeTimeMetricName , bucketBoudary , prevBoundary , expectedDecodeTime )
1130+ checkBucketBoundary (metrics , modelName , e2eReqLatencyMetricName , bucketBoudary , prevBoundary , expectedE2ELatency )
1131+
1132+ prevBoundary = bucketBoudary
1133+ }
1134+ // check the last bucket
1135+ lastBoundary := common .RequestLatencyBucketsBoundaries [len (common .RequestLatencyBucketsBoundaries )- 1 ]
1136+ checkBucketBoundary (metrics , modelName , prefillTimeMetricName , math .Inf (1 ), lastBoundary , expectedPrefillTime )
1137+ checkBucketBoundary (metrics , modelName , decodeTimeMetricName , math .Inf (1 ), lastBoundary , expectedDecodeTime )
1138+ checkBucketBoundary (metrics , modelName , e2eReqLatencyMetricName , math .Inf (1 ), lastBoundary , expectedE2ELatency )
11391139}
0 commit comments