Skip to content

Commit 9477a58

Browse files
committed
Fix bug in metrics test + add latency test for streaming mode
Signed-off-by: Maya Barnea <[email protected]>
1 parent a91af3b commit 9477a58

File tree

1 file changed

+79
-79
lines changed

1 file changed

+79
-79
lines changed

pkg/llm-d-inference-sim/metrics_test.go

Lines changed: 79 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -820,88 +820,26 @@ var _ = Describe("Simulator metrics", Ordered, func() {
820820

821821
Context("latency metrics", func() {
822822
DescribeTable("should calculate all latency related metrics correctly for a single request",
823-
func(testName string, doRemotePrefill bool, doRemoteDecode bool, kvcacheTransferLatency int, kvCacheTransferTimePerToken int,
824-
ttft int, prefillTimePerToken int, interTokenLatency int) {
825-
// Expect(true).To(BeFalse())
826-
// send a single request with a prompt of 5 token and echo mode, so output tokens number of 5 too
823+
func(testNamePrefix string, ttft int, prefillTimePerToken int, interTokenLatency int) {
824+
// send a single request with a prompt of 4 tokens in echo mode, so the number of output tokens is 4 as well
827825
modelName := "my_model"
828-
// Send one request, check that ttft and tpot are as defined in the simulator command line params
829-
ctx := context.TODO()
830-
args := []string{"cmd", "--model", modelName, "--mode", common.ModeEcho,
831-
"--kv-cache-transfer-latency", strconv.Itoa(kvcacheTransferLatency),
832-
"--kv-cache-transfer-time-per-token", strconv.Itoa(kvCacheTransferTimePerToken),
833-
"--time-to-first-token", strconv.Itoa(ttft),
834-
"--prefill-time-per-token", strconv.Itoa(prefillTimePerToken),
835-
"--inter-token-latency", strconv.Itoa(interTokenLatency),
836-
}
837-
838-
client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil)
839-
Expect(err).NotTo(HaveOccurred())
840-
841-
// TODO - pass isStreaming
842-
openaiclient, params := getOpenAIClientAndChatParams(client, modelName, "1 2 3 4", false)
843-
// TODO - how to test remote prefill/decode
844-
845-
var reqWg, metricsWg sync.WaitGroup
846-
metricsWg.Add(1)
847-
reqWg.Add(1)
848-
849-
// send a single request
850-
go func() {
851-
defer reqWg.Done()
852-
defer GinkgoRecover()
853-
854-
_, err := openaiclient.Chat.Completions.New(ctx, params)
855-
Expect(err).NotTo(HaveOccurred())
856-
}()
826+
prompt := "1 2 3 4"
857827

858-
// wait untill request processing was finished, send /mertics request
859-
reqWg.Wait()
860-
time.Sleep(300 * time.Millisecond)
861-
metricsResp, err := client.Get(metricsUrl)
862-
Expect(err).NotTo(HaveOccurred())
863-
Expect(metricsResp.StatusCode).To(Equal(http.StatusOK))
828+
client := sendRequest(modelName, prompt, false, ttft, prefillTimePerToken, interTokenLatency)
829+
checkLatencyMertics(client, modelName, prompt, ttft, prefillTimePerToken, interTokenLatency)
864830

865-
data, err := io.ReadAll(metricsResp.Body)
866-
Expect(err).NotTo(HaveOccurred())
867-
metrics := string(data)
868-
869-
numOfTokens := 4
870-
var expectedPrefillTime float64
871-
// TODO take into consideration remote prefill
872-
if ttft > 0 {
873-
// time-to-first-token overwrites calculation of prefill time based on number of input tokens
874-
expectedPrefillTime = float64(ttft) / 1000
875-
876-
} else {
877-
expectedPrefillTime = float64(numOfTokens*prefillTimePerToken) / 1000
878-
}
879-
expectedDecodeTime := float64(interTokenLatency*(numOfTokens-1)) / 1000
880-
expectedE2ELatency := expectedPrefillTime + expectedDecodeTime
881-
882-
prevBoundary := math.Inf(-1)
883-
884-
for _, bucketBoudary := range common.RequestLatencyBucketsBoundaries {
885-
checkBucketBoundary(metrics, modelName, prefillTimeMetricName, bucketBoudary, prevBoundary, expectedPrefillTime)
886-
checkBucketBoundary(metrics, modelName, decodeTimeMetricName, bucketBoudary, prevBoundary, expectedDecodeTime)
887-
checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, bucketBoudary, prevBoundary, expectedE2ELatency)
888-
889-
prevBoundary = bucketBoudary
890-
}
891-
// check the last bucket
892-
lastBoundary := common.RequestLatencyBucketsBoundaries[len(common.RequestLatencyBucketsBoundaries)-1]
893-
checkBucketBoundary(metrics, modelName, prefillTimeMetricName, math.Inf(1), lastBoundary, expectedPrefillTime)
894-
checkBucketBoundary(metrics, modelName, decodeTimeMetricName, math.Inf(1), lastBoundary, expectedDecodeTime)
895-
checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, math.Inf(1), lastBoundary, expectedE2ELatency)
831+
// same in streaming mode
832+
client = sendRequest(modelName, prompt, true, ttft, prefillTimePerToken, interTokenLatency)
833+
checkLatencyMertics(client, modelName, prompt, ttft, prefillTimePerToken, interTokenLatency)
896834
},
897-
func(testName string, doRemotePrefill bool, doRemoteDecode bool, kvcacheTransferLatency int, kvCacheTransferTimePerToken int,
898-
ttft int, prefillTimePerToken int, interTokenLatency int) string {
899-
return fmt.Sprintf("%s\ndoRemotePrefill: %v, doRemoteDecode: %v, kvcacheTransferLatency: %d, kvCacheTransferTimePerToken: %d, ttft: %d, prefillTimePerToken: %d, interTokenLatency: %d",
900-
testName, doRemotePrefill, doRemoteDecode, kvcacheTransferLatency, kvCacheTransferTimePerToken, ttft, prefillTimePerToken, interTokenLatency)
835+
func(testNamePrefix string, ttft int, prefillTimePerToken int, interTokenLatency int) string {
836+
return fmt.Sprintf("%s\nttft: %d, prefillTimePerToken: %d, interTokenLatency: %d", testNamePrefix, ttft, prefillTimePerToken, interTokenLatency)
901837
},
902-
// pay attention: do not define times close to bucket boundaries, this can lead to test failure
903-
Entry(nil, "constant prefil + inter token time", false, false, 0, 0, 900, 0, 100),
904-
Entry(nil, "prefill per token + inter token time", false, false, 0, 0, 0, 100, 100),
838+
// Params order: testName, ttft, prefillTimePerToken, interTokenLatency
839+
Entry(nil, "constant prefill + inter token time", 0, 0, 100),
840+
Entry(nil, "constant prefill + inter token time", 900, 0, 100),
841+
Entry(nil, "constant prefill + inter token time", 1000, 0, 100),
842+
Entry(nil, "prefill per token + inter token time", 0, 100, 100),
905843
)
906844
})
907845
})
@@ -1122,8 +1060,8 @@ func getFloatBucketMetricLine(model string, metric string, bucketBoundary float6
11221060

11231061
func checkBucketBoundary(metrics string, modelName string, metricName string, bucketBoudary float64,
11241062
prevBoundary float64, expectedValue float64) {
1125-
if expectedValue > prevBoundary && bucketBoudary > expectedValue && (bucketBoudary-expectedValue) < 0.005 {
1126-
// expected time is too close to the bucket boudary
1063+
if expectedValue > prevBoundary && bucketBoudary >= expectedValue && (bucketBoudary-expectedValue) < 0.005 {
1064+
// expected time is too close to the bucket's boundary
11271065
// it's possible that, in theory, we expect 1 in this bucket but will get 0, and this situation is ok
11281066
// since there is some additional calculation time
11291067
fmt.Printf("Expected value is too close to the boundary - skip test for this bucket (%.4f - %.4f] and expected value %.4f\n",
@@ -1135,5 +1073,67 @@ func checkBucketBoundary(metrics string, modelName string, metricName string, bu
11351073
expectedCount = 1
11361074
}
11371075
Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(modelName, metricName, bucketBoudary, expectedCount)))
1076+
}
1077+
1078+
// send a single request with the given prompt and echo mode
1079+
func sendRequest(modelName string, prompt string, isStreaming bool, ttft int, prefillTimePerToken int, interTokenLatency int) *http.Client {
1080+
ctx := context.TODO()
1081+
args := []string{"cmd", "--model", modelName, "--mode", common.ModeEcho,
1082+
// "--kv-cache-transfer-latency", strconv.Itoa(kvcacheTransferLatency),
1083+
// "--kv-cache-transfer-time-per-token", strconv.Itoa(kvCacheTransferTimePerToken),
1084+
"--time-to-first-token", strconv.Itoa(ttft),
1085+
"--prefill-time-per-token", strconv.Itoa(prefillTimePerToken),
1086+
"--inter-token-latency", strconv.Itoa(interTokenLatency),
1087+
}
1088+
1089+
client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil)
1090+
Expect(err).NotTo(HaveOccurred())
1091+
1092+
openaiclient, params := getOpenAIClientAndChatParams(client, modelName, prompt, isStreaming)
1093+
1094+
// send a single request in a serial way
1095+
_, err = openaiclient.Chat.Completions.New(ctx, params)
1096+
Expect(err).NotTo(HaveOccurred())
1097+
1098+
return client
1099+
}
1100+
1101+
func checkLatencyMertics(client *http.Client, modelName string, prompt string, ttft int, prefillTimePerToken int, interTokenLatency int) {
1102+
// wait a little bit and check metrics
1103+
time.Sleep(300 * time.Millisecond)
1104+
metricsResp, err := client.Get(metricsUrl)
1105+
Expect(err).NotTo(HaveOccurred())
1106+
Expect(metricsResp.StatusCode).To(Equal(http.StatusOK))
1107+
1108+
data, err := io.ReadAll(metricsResp.Body)
1109+
Expect(err).NotTo(HaveOccurred())
1110+
metrics := string(data)
11381111

1112+
numOfTokens := len(common.Tokenize(prompt))
1113+
var expectedPrefillTime float64
1114+
// TODO take into consideration remote prefill
1115+
if ttft > 0 {
1116+
// time-to-first-token overwrites calculation of prefill time based on number of input tokens
1117+
expectedPrefillTime = float64(ttft) / 1000
1118+
1119+
} else {
1120+
expectedPrefillTime = float64(numOfTokens*prefillTimePerToken) / 1000
1121+
}
1122+
expectedDecodeTime := float64(interTokenLatency*(numOfTokens-1)) / 1000
1123+
expectedE2ELatency := expectedPrefillTime + expectedDecodeTime
1124+
1125+
prevBoundary := math.Inf(-1)
1126+
1127+
for _, bucketBoudary := range common.RequestLatencyBucketsBoundaries {
1128+
checkBucketBoundary(metrics, modelName, prefillTimeMetricName, bucketBoudary, prevBoundary, expectedPrefillTime)
1129+
checkBucketBoundary(metrics, modelName, decodeTimeMetricName, bucketBoudary, prevBoundary, expectedDecodeTime)
1130+
checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, bucketBoudary, prevBoundary, expectedE2ELatency)
1131+
1132+
prevBoundary = bucketBoudary
1133+
}
1134+
// check the last bucket
1135+
lastBoundary := common.RequestLatencyBucketsBoundaries[len(common.RequestLatencyBucketsBoundaries)-1]
1136+
checkBucketBoundary(metrics, modelName, prefillTimeMetricName, math.Inf(1), lastBoundary, expectedPrefillTime)
1137+
checkBucketBoundary(metrics, modelName, decodeTimeMetricName, math.Inf(1), lastBoundary, expectedDecodeTime)
1138+
checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, math.Inf(1), lastBoundary, expectedE2ELatency)
11391139
}

0 commit comments

Comments
 (0)