@@ -20,6 +20,7 @@ import (
2020 "context"
2121 "fmt"
2222 "io"
23+ "math"
2324 "net/http"
2425 "os"
2526 "strings"
@@ -808,7 +809,7 @@ var _ = Describe("Simulator metrics", Ordered, func() {
808809 })
809810 })
810811
811- Context ("latency metrics" , func () {
812+ Context ("single request latency metrics" , func () {
812813 numOfTokens := len (common .Tokenize (testUserMessage ))
813814
814815 DescribeTable ("should calculate all latency related metrics correctly for a single request" ,
@@ -831,6 +832,56 @@ var _ = Describe("Simulator metrics", Ordered, func() {
831832 Entry (nil , "prefill per token + inter token time" , 0 , 100 , 100 ),
832833 )
833834 })
835+
836+ Context ("multiple requests latency metrics" , func () {
837+ It ("should calculate waiting and inference time correctly" , func () {
838+ ctx := context .TODO ()
839+ args := []string {"cmd" , "--model" , testModel , "--mode" , common .ModeEcho ,
840+ "--time-to-first-token" , "1000" , "--max-num-seqs" , "1" ,
841+ }
842+
843+ client , err := startServerWithArgs (ctx , args )
844+ Expect (err ).NotTo (HaveOccurred ())
845+
846+ openaiclient , params := getOpenAIClientAndChatParams (client , testModel , testUserMessage , false )
847+
848+ var reqWg sync.WaitGroup
849+ reqWg .Add (2 )
850+
851+ // send two requests
852+ for range 2 {
853+ go func () {
854+ defer reqWg .Done ()
855+ defer GinkgoRecover ()
856+
857+ _ , err := openaiclient .Chat .Completions .New (ctx , params )
858+ Expect (err ).NotTo (HaveOccurred ())
859+ }()
860+ }
861+
862+ reqWg .Wait ()
863+ time .Sleep (300 * time .Millisecond )
864+ metricsResp , err := client .Get (metricsUrl )
865+ Expect (err ).NotTo (HaveOccurred ())
866+ Expect (metricsResp .StatusCode ).To (Equal (http .StatusOK ))
867+
868+ data , err := io .ReadAll (metricsResp .Body )
869+ Expect (err ).NotTo (HaveOccurred ())
870+ metrics := string (data )
871+
872+ for _ , boundary := range common .RequestLatencyBucketsBoundaries {
873+ if boundary < 1.5 {
874+ Expect (metrics ).To (ContainSubstring (getFloatBucketMetricLine (testModel , reqInferenceTimeMetricName , boundary , 0 )))
875+ Expect (metrics ).To (ContainSubstring (getFloatBucketMetricLine (testModel , reqQueueTimeMetricName , boundary , 0 )))
876+ } else {
877+ Expect (metrics ).To (ContainSubstring (getFloatBucketMetricLine (testModel , reqInferenceTimeMetricName , boundary , 2 )))
878+ Expect (metrics ).To (ContainSubstring (getFloatBucketMetricLine (testModel , reqQueueTimeMetricName , boundary , 1 )))
879+ }
880+ }
881+ Expect (metrics ).To (ContainSubstring (getFloatBucketMetricLine (testModel , reqInferenceTimeMetricName , math .Inf (1 ), 2 )))
882+ Expect (metrics ).To (ContainSubstring (getFloatBucketMetricLine (testModel , reqQueueTimeMetricName , math .Inf (1 ), 1 )))
883+ })
884+ })
834885})
835886
836887var _ = Describe ("build125Buckets" , Ordered , func () {
0 commit comments