
Commit a91af3b

Initial tests for new metrics + constants for some of the metric names
Signed-off-by: Maya Barnea <[email protected]>
1 parent: 5325b41

2 files changed: +129 −6 lines

pkg/llm-d-inference-sim/metrics.go

Lines changed: 13 additions & 5 deletions
```diff
@@ -32,6 +32,14 @@ import (
 	vllmapi "github.com/llm-d/llm-d-inference-sim/pkg/vllm-api"
 )
 
+const (
+	e2eReqLatencyMetricName    = "vllm:e2e_request_latency_seconds"
+	reqQueueTimeMetricName     = "vllm:request_queue_time_seconds"
+	reqInferenceTimeMetricName = "vllm:request_inference_time_seconds"
+	prefillTimeMetricName      = "vllm:request_prefill_time_seconds"
+	decodeTimeMetricName       = "vllm:request_decode_time_seconds"
+)
+
 // createAndRegisterPrometheus creates and registers prometheus metrics used by vLLM simulator
 // Metrics reported:
 // - lora_requests_info
@@ -114,7 +122,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.e2eReqLatency = prometheus.NewHistogramVec(
 		prometheus.HistogramOpts{
 			Subsystem: "",
-			Name:      "vllm:e2e_request_latency_seconds",
+			Name:      e2eReqLatencyMetricName,
 			Help:      "Histogram of end to end request latency in seconds.",
 			Buckets:   common.RequestLatencyBucketsBoundaries,
 		},
@@ -129,7 +137,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.reqQueueTime = prometheus.NewHistogramVec(
 		prometheus.HistogramOpts{
 			Subsystem: "",
-			Name:      "vllm:request_queue_time_seconds",
+			Name:      reqQueueTimeMetricName,
 			Help:      "Histogram of time spent in WAITING phase for request.",
 			Buckets:   common.RequestLatencyBucketsBoundaries,
 		},
@@ -144,7 +152,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.reqInferenceTime = prometheus.NewHistogramVec(
 		prometheus.HistogramOpts{
 			Subsystem: "",
-			Name:      "vllm:request_inference_time_seconds",
+			Name:      reqInferenceTimeMetricName,
 			Help:      "Histogram of time spent in RUNNING phase for request.",
 			Buckets:   common.RequestLatencyBucketsBoundaries,
 		},
@@ -159,7 +167,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.reqPrefillTime = prometheus.NewHistogramVec(
 		prometheus.HistogramOpts{
 			Subsystem: "",
-			Name:      "vllm:request_prefill_time_seconds",
+			Name:      prefillTimeMetricName,
 			Help:      "Histogram of time spent in PREFILL phase for request.",
 			Buckets:   common.RequestLatencyBucketsBoundaries,
 		},
@@ -174,7 +182,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.reqDecodeTime = prometheus.NewHistogramVec(
 		prometheus.HistogramOpts{
 			Subsystem: "",
-			Name:      "vllm:request_decode_time_seconds",
+			Name:      decodeTimeMetricName,
 			Help:      "Histogram of time spent in DECODE phase for request.",
 			Buckets:   common.RequestLatencyBucketsBoundaries,
 		},
```
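For context, here is a minimal sketch (not part of the commit) of what one of these histograms looks like when scraped: registering a HistogramVec under `e2eReqLatencyMetricName` with a `model_name` label and observing a single 1.2 s request produces cumulative `_bucket` lines in the exposition output. The bucket boundaries below are placeholders, not the repo's `common.RequestLatencyBucketsBoundaries`.

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"net/http/httptest"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// Stand-in for the simulator's e2eReqLatencyMetricName constant.
	const e2eReqLatencyMetricName = "vllm:e2e_request_latency_seconds"
	// Placeholder boundaries; the simulator uses common.RequestLatencyBucketsBoundaries.
	buckets := []float64{0.5, 1, 2, 5}

	reg := prometheus.NewRegistry()
	hist := prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Name:    e2eReqLatencyMetricName,
		Help:    "Histogram of end to end request latency in seconds.",
		Buckets: buckets,
	}, []string{"model_name"})
	reg.MustRegister(hist)

	// One simulated request that took 1.2s end to end.
	hist.WithLabelValues("my_model").Observe(1.2)

	// Scrape the registry the same way the tests scrape /metrics.
	srv := httptest.NewServer(promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
	defer srv.Close()
	resp, err := http.Get(srv.URL)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	body, _ := io.ReadAll(resp.Body)
	fmt.Print(string(body))
	// Buckets are cumulative, so the output contains lines such as:
	//   vllm:e2e_request_latency_seconds_bucket{model_name="my_model",le="1"} 0
	//   vllm:e2e_request_latency_seconds_bucket{model_name="my_model",le="2"} 1
	//   vllm:e2e_request_latency_seconds_bucket{model_name="my_model",le="+Inf"} 1
}
```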

pkg/llm-d-inference-sim/metrics_test.go

Lines changed: 116 additions & 1 deletion
```diff
@@ -19,7 +19,9 @@ package llmdinferencesim
 import (
 	"context"
 	"errors"
+	"fmt"
 	"io"
+	"math"
 	"net/http"
 	"os"
 	"reflect"
@@ -164,7 +166,7 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="100"} 1`))
 		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="200"} 1`))
 		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="500"} 1`))
-		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="100"} 1`))
+		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="1000"} 1`))
 		Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="+Inf"} 1`))
 		// request_params_max_tokens_bucket
 		Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="1"} 0`))
@@ -815,6 +817,93 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 			Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"+Inf\"} 1"))
 		})
 	})
+
+	Context("latency metrics", func() {
+		DescribeTable("should calculate all latency related metrics correctly for a single request",
+			func(testName string, doRemotePrefill bool, doRemoteDecode bool, kvcacheTransferLatency int, kvCacheTransferTimePerToken int,
+				ttft int, prefillTimePerToken int, interTokenLatency int) {
+				// Send a single request with a 4-token prompt in echo mode, so the number of output tokens is 4 too.
+				modelName := "my_model"
+				// Check that ttft and tpot are as defined in the simulator command line params.
+				ctx := context.TODO()
+				args := []string{"cmd", "--model", modelName, "--mode", common.ModeEcho,
+					"--kv-cache-transfer-latency", strconv.Itoa(kvcacheTransferLatency),
+					"--kv-cache-transfer-time-per-token", strconv.Itoa(kvCacheTransferTimePerToken),
+					"--time-to-first-token", strconv.Itoa(ttft),
+					"--prefill-time-per-token", strconv.Itoa(prefillTimePerToken),
+					"--inter-token-latency", strconv.Itoa(interTokenLatency),
+				}
+
+				client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil)
+				Expect(err).NotTo(HaveOccurred())
+
+				// TODO - pass isStreaming
+				openaiclient, params := getOpenAIClientAndChatParams(client, modelName, "1 2 3 4", false)
+				// TODO - how to test remote prefill/decode
+
+				var reqWg, metricsWg sync.WaitGroup
+				metricsWg.Add(1)
+				reqWg.Add(1)
+
+				// send a single request
+				go func() {
+					defer reqWg.Done()
+					defer GinkgoRecover()
+
+					_, err := openaiclient.Chat.Completions.New(ctx, params)
+					Expect(err).NotTo(HaveOccurred())
+				}()
+
+				// wait until request processing has finished, then send a /metrics request
+				reqWg.Wait()
+				time.Sleep(300 * time.Millisecond)
+				metricsResp, err := client.Get(metricsUrl)
+				Expect(err).NotTo(HaveOccurred())
+				Expect(metricsResp.StatusCode).To(Equal(http.StatusOK))
+
+				data, err := io.ReadAll(metricsResp.Body)
+				Expect(err).NotTo(HaveOccurred())
+				metrics := string(data)
+
+				numOfTokens := 4
+				var expectedPrefillTime float64
+				// TODO take into consideration remote prefill
+				if ttft > 0 {
+					// time-to-first-token overrides the prefill time calculated from the number of input tokens
+					expectedPrefillTime = float64(ttft) / 1000
+				} else {
+					expectedPrefillTime = float64(numOfTokens*prefillTimePerToken) / 1000
+				}
+				expectedDecodeTime := float64(interTokenLatency*(numOfTokens-1)) / 1000
+				expectedE2ELatency := expectedPrefillTime + expectedDecodeTime
+
+				prevBoundary := math.Inf(-1)
+
+				for _, bucketBoundary := range common.RequestLatencyBucketsBoundaries {
+					checkBucketBoundary(metrics, modelName, prefillTimeMetricName, bucketBoundary, prevBoundary, expectedPrefillTime)
+					checkBucketBoundary(metrics, modelName, decodeTimeMetricName, bucketBoundary, prevBoundary, expectedDecodeTime)
+					checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, bucketBoundary, prevBoundary, expectedE2ELatency)
+
+					prevBoundary = bucketBoundary
+				}
+				// check the last (+Inf) bucket
+				lastBoundary := common.RequestLatencyBucketsBoundaries[len(common.RequestLatencyBucketsBoundaries)-1]
+				checkBucketBoundary(metrics, modelName, prefillTimeMetricName, math.Inf(1), lastBoundary, expectedPrefillTime)
+				checkBucketBoundary(metrics, modelName, decodeTimeMetricName, math.Inf(1), lastBoundary, expectedDecodeTime)
+				checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, math.Inf(1), lastBoundary, expectedE2ELatency)
+			},
+			func(testName string, doRemotePrefill bool, doRemoteDecode bool, kvcacheTransferLatency int, kvCacheTransferTimePerToken int,
+				ttft int, prefillTimePerToken int, interTokenLatency int) string {
+				return fmt.Sprintf("%s\ndoRemotePrefill: %v, doRemoteDecode: %v, kvcacheTransferLatency: %d, kvCacheTransferTimePerToken: %d, ttft: %d, prefillTimePerToken: %d, interTokenLatency: %d",
+					testName, doRemotePrefill, doRemoteDecode, kvcacheTransferLatency, kvCacheTransferTimePerToken, ttft, prefillTimePerToken, interTokenLatency)
+			},
+			// pay attention: do not define times close to bucket boundaries, this can lead to test failures
+			Entry(nil, "constant prefill + inter token time", false, false, 0, 0, 900, 0, 100),
+			Entry(nil, "prefill per token + inter token time", false, false, 0, 0, 0, 100, 100),
+		)
+	})
 })
 
 // isLoraMetricPresent checks if a matching metric exists
```
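To make the expected-time arithmetic concrete, here is a worked example (not part of the commit) for the first table entry, "constant prefill + inter token time": with ttft=900 ms the per-token prefill path is bypassed, the 4 echo tokens leave 3 inter-token gaps, and the e2e latency is the sum of the two phases.

```go
package main

import "fmt"

func main() {
	// Worked example for the first Entry ("constant prefill + inter token time"):
	// ttft=900ms, prefillTimePerToken=0, interTokenLatency=100ms, 4 echo tokens.
	numOfTokens := 4
	ttft := 900              // ms; when > 0 it overrides the per-token prefill time
	interTokenLatency := 100 // ms

	expectedPrefillTime := float64(ttft) / 1000                             // 0.9s
	expectedDecodeTime := float64(interTokenLatency*(numOfTokens-1)) / 1000 // 3 gaps x 0.1s = 0.3s
	expectedE2ELatency := expectedPrefillTime + expectedDecodeTime          // 1.2s

	fmt.Println(expectedPrefillTime, expectedDecodeTime, expectedE2ELatency) // 0.9 0.3 1.2
}
```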
```diff
@@ -1022,3 +1111,29 @@ func TestBuild125Buckets(t *testing.T) {
 		})
 	}
 }
+
+func getFloatBucketMetricLine(model string, metric string, bucketBoundary float64, count int) string {
+	bucketBoundStr := "+Inf"
+	if bucketBoundary != math.Inf(1) {
+		bucketBoundStr = fmt.Sprintf("%g", bucketBoundary)
+	}
+	return fmt.Sprintf("%s_bucket{model_name=\"%s\",le=\"%s\"} %d", metric, model, bucketBoundStr, count)
+}
+
+func checkBucketBoundary(metrics string, modelName string, metricName string, bucketBoundary float64,
+	prevBoundary float64, expectedValue float64) {
+	if expectedValue > prevBoundary && bucketBoundary > expectedValue && (bucketBoundary-expectedValue) < 0.005 {
+		// The expected time is too close to the bucket boundary: in theory we expect 1 in this bucket,
+		// but we may get 0 because of additional processing time, and that situation is ok.
+		fmt.Printf("Expected value is too close to the boundary - skipping the check for bucket (%.4f - %.4f] and expected value %.4f\n",
+			prevBoundary, bucketBoundary, expectedValue)
+		return
+	}
+	expectedCount := 0
+	if bucketBoundary > expectedValue {
+		expectedCount = 1
+	}
+	Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(modelName, metricName, bucketBoundary, expectedCount)))
+}
```
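The helper relies on Prometheus histogram buckets being cumulative. A standalone sketch (not part of the commit) of the expected-count rule it asserts, using made-up boundaries in place of `common.RequestLatencyBucketsBoundaries`:

```go
package main

import (
	"fmt"
	"math"
)

func main() {
	// Placeholder boundaries; the real values are common.RequestLatencyBucketsBoundaries.
	boundaries := []float64{0.5, 1, 2, 5}
	expected := 1.2 // seconds observed for the single request

	// Histogram buckets are cumulative: every bucket whose upper boundary
	// exceeds the observation counts it, so for one request the expected
	// count is 0 below the observation and 1 at or above it.
	prev := math.Inf(-1)
	for _, le := range append(boundaries, math.Inf(1)) {
		count := 0
		if le > expected {
			count = 1
		}
		fmt.Printf("(%g, %g]: expected count %d\n", prev, le, count)
		prev = le
	}
}
```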
