From 89ed87cc769a5cf555fe0bd49316c99d60056810 Mon Sep 17 00:00:00 2001
From: Maya Barnea
Date: Thu, 23 Oct 2025 12:24:34 +0300
Subject: [PATCH 01/14] Add e2e request latency histogram to Prometheus
 metrics. Add a reportHistogramValue function to be used for reporting values
 in histogram metrics

Signed-off-by: Maya Barnea
---
 pkg/common/config.go                 |  7 ++++
 pkg/common/utils.go                  |  3 ++
 pkg/llm-d-inference-sim/metrics.go   | 57 +++++++++++++++++++---------
 pkg/llm-d-inference-sim/simulator.go | 10 +++++
 4 files changed, 59 insertions(+), 18 deletions(-)

diff --git a/pkg/common/config.go b/pkg/common/config.go
index ae8eec47..49825a48 100644
--- a/pkg/common/config.go
+++ b/pkg/common/config.go
@@ -253,6 +253,13 @@ type Metrics struct {
 	RequestParamsMaxTokens []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
 	// RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
 	RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`
+	// E2ERequestLatencyBucketValues is an array of values for the e2e request latency buckets;
+	// each value in this array is the value for the corresponding bucket.
+	// The array may contain fewer values than the number of buckets; missing trailing values are assumed to be 0.
+	// The bucket upper boundaries in seconds are:
+	// 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
+	// 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0, +Inf
+	E2ERequestLatencyBucketValues []int `yaml:"e2erl-buckets-values" json:"e2erl-buckets-values"`
 }
 
 type LorasMetrics struct {
diff --git a/pkg/common/utils.go b/pkg/common/utils.go
index d1f3cfe1..78a279b1 100644
--- a/pkg/common/utils.go
+++ b/pkg/common/utils.go
@@ -32,6 +32,9 @@ var TTFTBucketsBoundaries = []float64{0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08
 var TPOTBucketsBoundaries = []float64{0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0}
 
+var E2ERequestLatencyBucketsBoundaries = []float64{0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
+	20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0}
+
 // ValidateContextWindow checks if the request fits within the model's context window
 // Returns validation result, actual completion tokens, and total tokens
 func ValidateContextWindow(promptTokens int, maxCompletionTokens *int64, maxModelLen int) (bool, int64, int64) {
diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go
index 35108582..88f7bfcb 100644
--- a/pkg/llm-d-inference-sim/metrics.go
+++ b/pkg/llm-d-inference-sim/metrics.go
@@ -111,6 +111,21 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 		return err
 	}
 
+	s.metrics.e2eReqLatency = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:e2e_request_latency_seconds",
+			Help:      "Histogram of end to end request latency in seconds.",
+			Buckets:   common.E2ERequestLatencyBucketsBoundaries,
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+
+	if err := s.metrics.registry.Register(s.metrics.e2eReqLatency); err != nil {
+		s.logger.Error(err, "Prometheus end to end request latency histogram register failed")
+		return err
+	}
+
 	s.metrics.kvCacheUsagePercentage = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Subsystem: "",
@@ -215,6 +230,10 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() {
 	for reason, requestSuccessTotal := range s.config.FakeMetrics.RequestSuccessTotal {
s.metrics.requestSuccessTotal.WithLabelValues(modelName, reason).Add(float64(requestSuccessTotal)) } + + if s.config.FakeMetrics.E2ERequestLatencyBucketValues != nil { + s.initFakeHistogram(s.metrics.tpot, common.E2ERequestLatencyBucketsBoundaries, s.config.FakeMetrics.E2ERequestLatencyBucketValues) + } } s.metrics.runningRequests.WithLabelValues(modelName).Set(nRunningReqs) @@ -317,25 +336,14 @@ func (s *VllmSimulator) reportWaitingRequests() { } } -// reportTTFT sets information about time to first token -func (s *VllmSimulator) reportTTFT(ttftInSecs float64) { - if s.config.FakeMetrics != nil { - return - } - if s.metrics.ttft != nil { - s.metrics.ttft.WithLabelValues( - s.getDisplayedModelName(s.config.Model)).Observe(ttftInSecs) - } -} - -// reportTPOT sets information about time per output token -func (s *VllmSimulator) reportTPOT(tpotInSecs float64) { +// reportHistogramValue sets the given value in the given histogram +func (s *VllmSimulator) reportHistogramValue(hist *prometheus.HistogramVec, val float64) { if s.config.FakeMetrics != nil { return } - if s.metrics.tpot != nil { - s.metrics.tpot.WithLabelValues( - s.getDisplayedModelName(s.config.Model)).Observe(tpotInSecs) + if hist != nil { + hist.WithLabelValues( + s.getDisplayedModelName(s.config.Model)).Observe(val) } } @@ -359,6 +367,7 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) { go s.ttftUpdater(ctx) go s.tpotUpdater(ctx) go s.recordRequestUpdater(ctx) + go s.e2eReqLatencyUpdater(ctx) } // waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel @@ -406,7 +415,7 @@ func (s *VllmSimulator) ttftUpdater(ctx context.Context) { case <-ctx.Done(): return case value := <-s.metrics.ttftChan: - s.reportTTFT(value) + s.reportHistogramValue(s.metrics.ttft, value) } } } @@ -418,7 +427,19 @@ func (s *VllmSimulator) tpotUpdater(ctx context.Context) { case <-ctx.Done(): return case value := <-s.metrics.tpotChan: - s.reportTPOT(value) + s.reportHistogramValue(s.metrics.tpot, value) + } + } +} + +// tpotUpdater updates the time per output token metric by listening on the relevant channel +func (s *VllmSimulator) e2eReqLatencyUpdater(ctx context.Context) { + for { + select { + case <-ctx.Done(): + return + case value := <-s.metrics.e2eReqLatencyChan: + s.reportHistogramValue(s.metrics.e2eReqLatency, value) } } } diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index d10dff80..b3aeb9af 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -93,6 +93,8 @@ type metricsData struct { ttftChan chan float64 // tpotChan is a channel to update time per output token tpotChan chan float64 + // e2eReqLatencyChan is a channel to update request e2e latency + e2eReqLatencyChan chan float64 // kvCacheUsageChan is a channel to update kvCacheUsagePercentage kvCacheUsageChan chan float64 // registry is a Prometheus registry @@ -107,6 +109,8 @@ type metricsData struct { ttft *prometheus.HistogramVec // tpot is prometheus histogram for time per output token in seconds tpot *prometheus.HistogramVec + // e2eReqLatency is prometheus histogram of end to end request latency in seconds + e2eReqLatency *prometheus.HistogramVec // kvCacheUsagePercentage is prometheus gauge kvCacheUsagePercentage *prometheus.GaugeVec // requestPromptTokens is prometheus histogram for number of input (prompt) tokens in request @@ -271,6 +275,7 @@ func (s *VllmSimulator) initializeSim(ctx context.Context) error { s.metrics.kvCacheUsageChan = 
make(chan float64, maxNumberOfRequests) s.metrics.ttftChan = make(chan float64, maxNumberOfRequests) s.metrics.tpotChan = make(chan float64, maxNumberOfRequests) + s.metrics.e2eReqLatencyChan = make(chan float64, maxNumberOfRequests) s.metrics.requestSuccessChan = make(chan requestSuccessEvent, maxNumberOfRequests) s.newRequests = make(chan *openaiserverapi.CompletionReqCtx, maxNumberOfRequests) @@ -460,6 +465,11 @@ func (s *VllmSimulator) addRequestToQueue(reqCtx *openaiserverapi.CompletionReqC // handleCompletions general completion requests handler, support both text and chat completion APIs func (s *VllmSimulator) handleCompletions(ctx *fasthttp.RequestCtx, isChatCompletion bool) { + startTime := time.Now() + defer func() { + s.metrics.e2eReqLatencyChan <- time.Since(startTime).Seconds() + }() + // Check if we should inject a failure if shouldInjectFailure(s.config) { failure := getRandomFailure(s.config) From a38361e05ffb5ecf3e85c9885be0e16e2d745ce1 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Thu, 23 Oct 2025 14:19:37 +0300 Subject: [PATCH 02/14] Additional metrics - vllm:request_queue_time_seconds, vllm:request_inference_time_seconds, vllm:request_prefill_time_seconds, and vllm:request_decode_time_seconds Signed-off-by: Maya Barnea --- pkg/common/utils.go | 2 +- pkg/llm-d-inference-sim/metrics.go | 118 ++++++++++++++++++++++++++- pkg/llm-d-inference-sim/simulator.go | 53 +++++++++--- pkg/llm-d-inference-sim/streaming.go | 5 ++ pkg/llm-d-inference-sim/worker.go | 6 ++ 5 files changed, 169 insertions(+), 15 deletions(-) diff --git a/pkg/common/utils.go b/pkg/common/utils.go index 78a279b1..7050fc55 100644 --- a/pkg/common/utils.go +++ b/pkg/common/utils.go @@ -32,7 +32,7 @@ var TTFTBucketsBoundaries = []float64{0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08 var TPOTBucketsBoundaries = []float64{0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0} -var E2ERequestLatencyBucketsBoundaries = []float64{0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, +var RequestLatencyBucketsBoundaries = []float64{0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0} // ValidateContextWindow checks if the request fits within the model's context window diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go index 88f7bfcb..37175fb3 100644 --- a/pkg/llm-d-inference-sim/metrics.go +++ b/pkg/llm-d-inference-sim/metrics.go @@ -116,7 +116,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { Subsystem: "", Name: "vllm:e2e_request_latency_seconds", Help: "Histogram of end to end request latency in seconds.", - Buckets: common.E2ERequestLatencyBucketsBoundaries, + Buckets: common.RequestLatencyBucketsBoundaries, }, []string{vllmapi.PromLabelModelName}, ) @@ -126,6 +126,66 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { return err } + s.metrics.reqQueueTime = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: "", + Name: "vllm:request_queue_time_seconds", + Help: "Histogram of time spent in WAITING phase for request.", + Buckets: common.RequestLatencyBucketsBoundaries, + }, + []string{vllmapi.PromLabelModelName}, + ) + + if err := s.metrics.registry.Register(s.metrics.reqQueueTime); err != nil { + s.logger.Error(err, "Prometheus request queue time histogram register failed") + return err + } + + s.metrics.reqInferenceTime = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: "", + Name: 
"vllm:request_inference_time_seconds", + Help: "Histogram of time spent in RUNNING phase for request.", + Buckets: common.RequestLatencyBucketsBoundaries, + }, + []string{vllmapi.PromLabelModelName}, + ) + + if err := s.metrics.registry.Register(s.metrics.reqInferenceTime); err != nil { + s.logger.Error(err, "Prometheus request inerence time histogram register failed") + return err + } + + s.metrics.reqPrefillTime = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: "", + Name: "vllm:request_prefill_time_seconds", + Help: "Histogram of time spent in PREFILL phase for request.", + Buckets: common.RequestLatencyBucketsBoundaries, + }, + []string{vllmapi.PromLabelModelName}, + ) + + if err := s.metrics.registry.Register(s.metrics.reqPrefillTime); err != nil { + s.logger.Error(err, "Prometheus request prefill time histogram register failed") + return err + } + + s.metrics.reqDecodeTime = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: "", + Name: "vllm:request_queue_time_seconds", + Help: "Histogram of time spent in DECODE phase for request.", + Buckets: common.RequestLatencyBucketsBoundaries, + }, + []string{vllmapi.PromLabelModelName}, + ) + + if err := s.metrics.registry.Register(s.metrics.reqDecodeTime); err != nil { + s.logger.Error(err, "Prometheus request decode time histogram register failed") + return err + } + s.metrics.kvCacheUsagePercentage = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Subsystem: "", @@ -232,7 +292,7 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() { } if s.config.FakeMetrics.E2ERequestLatencyBucketValues != nil { - s.initFakeHistogram(s.metrics.tpot, common.E2ERequestLatencyBucketsBoundaries, s.config.FakeMetrics.E2ERequestLatencyBucketValues) + s.initFakeHistogram(s.metrics.tpot, common.RequestLatencyBucketsBoundaries, s.config.FakeMetrics.E2ERequestLatencyBucketValues) } } @@ -368,6 +428,10 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) { go s.tpotUpdater(ctx) go s.recordRequestUpdater(ctx) go s.e2eReqLatencyUpdater(ctx) + go s.reqQueueTimeUpdater(ctx) + go s.reqInferenceTimeUpdater(ctx) + go s.reqPrefillTimeUpdater(ctx) + go s.reqDecodeTimeUpdater(ctx) } // waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel @@ -432,7 +496,7 @@ func (s *VllmSimulator) tpotUpdater(ctx context.Context) { } } -// tpotUpdater updates the time per output token metric by listening on the relevant channel +// e2eReqLatencyUpdater updates the e2e request latency metric by listening on the relevant channel func (s *VllmSimulator) e2eReqLatencyUpdater(ctx context.Context) { for { select { @@ -444,6 +508,54 @@ func (s *VllmSimulator) e2eReqLatencyUpdater(ctx context.Context) { } } +// reqQueueTimeUpdater updates the request queue time metric by listening on the relevant channel +func (s *VllmSimulator) reqQueueTimeUpdater(ctx context.Context) { + for { + select { + case <-ctx.Done(): + return + case value := <-s.metrics.reqQueueTimeChan: + s.reportHistogramValue(s.metrics.reqQueueTime, value) + } + } +} + +// reqInferenceTimeUpdater updates the request inference time metric by listening on the relevant channel +func (s *VllmSimulator) reqInferenceTimeUpdater(ctx context.Context) { + for { + select { + case <-ctx.Done(): + return + case value := <-s.metrics.reqInferenceTimeChan: + s.reportHistogramValue(s.metrics.reqInferenceTime, value) + } + } +} + +// reqPrefillTimeUpdater updates the request prefill time metric by listening on the relevant channel +func (s *VllmSimulator) 
reqPrefillTimeUpdater(ctx context.Context) { + for { + select { + case <-ctx.Done(): + return + case value := <-s.metrics.reqPrefillTimeChan: + s.reportHistogramValue(s.metrics.reqPrefillTime, value) + } + } +} + +// reqDecodeTimeUpdater updates the request decode time metric by listening on the relevant channel +func (s *VllmSimulator) reqDecodeTimeUpdater(ctx context.Context) { + for { + select { + case <-ctx.Done(): + return + case value := <-s.metrics.reqDecodeTimeChan: + s.reportHistogramValue(s.metrics.reqDecodeTime, value) + } + } +} + // lorasUpdater updates the running loras metric by listening on the relevant channel // one function updates both waiting and running loras since they a part of the same prometheus gauge func (s *VllmSimulator) lorasUpdater(ctx context.Context) { diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index b3aeb9af..cd180219 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -95,6 +95,14 @@ type metricsData struct { tpotChan chan float64 // e2eReqLatencyChan is a channel to update request e2e latency e2eReqLatencyChan chan float64 + // reqQueueTimeChan is a channel to update request queue time + reqQueueTimeChan chan float64 + // reqInferenceTimeChan is a channel to update request inference time + reqInferenceTimeChan chan float64 + // reqPrefillTimeChan is a channel to update request prefill time + reqPrefillTimeChan chan float64 + // reqDecodeTimeChan is a channel to update request decode time + reqDecodeTimeChan chan float64 // kvCacheUsageChan is a channel to update kvCacheUsagePercentage kvCacheUsageChan chan float64 // registry is a Prometheus registry @@ -111,6 +119,14 @@ type metricsData struct { tpot *prometheus.HistogramVec // e2eReqLatency is prometheus histogram of end to end request latency in seconds e2eReqLatency *prometheus.HistogramVec + // reqQueueTime is prometheus histogram of request queue time in seconds + reqQueueTime *prometheus.HistogramVec + // reqInferenceTime is prometheus histogram of request inference time in seconds + reqInferenceTime *prometheus.HistogramVec + // reqPrefillTime is prometheus histogram of request prefill time in seconds + reqPrefillTime *prometheus.HistogramVec + // reqDecodeTime is prometheus histogram of request decode time in seconds + reqDecodeTime *prometheus.HistogramVec // kvCacheUsagePercentage is prometheus gauge kvCacheUsagePercentage *prometheus.GaugeVec // requestPromptTokens is prometheus histogram for number of input (prompt) tokens in request @@ -139,6 +155,11 @@ type requestCompleted struct { model string } +type waitingQueueItem struct { + reqCtx *openaiserverapi.CompletionReqCtx + enqueueTime time.Time +} + // VllmSimulator simulates vLLM server supporting OpenAI API type VllmSimulator struct { // logger is used for information and errors logging @@ -276,6 +297,10 @@ func (s *VllmSimulator) initializeSim(ctx context.Context) error { s.metrics.ttftChan = make(chan float64, maxNumberOfRequests) s.metrics.tpotChan = make(chan float64, maxNumberOfRequests) s.metrics.e2eReqLatencyChan = make(chan float64, maxNumberOfRequests) + s.metrics.reqQueueTimeChan = make(chan float64, maxNumberOfRequests) + s.metrics.reqInferenceTimeChan = make(chan float64, maxNumberOfRequests) + s.metrics.reqPrefillTimeChan = make(chan float64, maxNumberOfRequests) + s.metrics.reqDecodeTimeChan = make(chan float64, maxNumberOfRequests) s.metrics.requestSuccessChan = make(chan requestSuccessEvent, maxNumberOfRequests) s.newRequests = 
make(chan *openaiserverapi.CompletionReqCtx, maxNumberOfRequests) @@ -575,19 +600,22 @@ func (s *VllmSimulator) createCompletionResponse(isChatCompletion bool, respToke // from --served-model-name (for a base-model request) or the LoRA adapter name (for a LoRA request). // finishReason - a pointer to string that represents finish reason, can be nil, stop, length, or tools // usageData - usage (tokens statistics) for this response -func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, respTokens []string, toolCalls []openaiserverapi.ToolCall, - modelName string, finishReason string, usageData *openaiserverapi.Usage) { +func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, respTokens []string, + toolCalls []openaiserverapi.ToolCall, modelName string, finishReason string, usageData *openaiserverapi.Usage) { resp := s.createCompletionResponse(reqCtx.IsChatCompletion, respTokens, toolCalls, &finishReason, usageData, modelName, reqCtx.CompletionReq.IsDoRemoteDecode()) // calculate how long to wait before returning the response, time is based on number of tokens nCachedPromptTokens := reqCtx.CompletionReq.GetNumberOfCachedPromptTokens() + startPrefill := time.Now() ttft := s.getWaitTimeToFirstToken(usageData.PromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill()) time.Sleep(time.Duration(ttft) * time.Millisecond) // report ttft in seconds common.WriteToChannel(s.metrics.ttftChan, (float64(ttft) / 1000), s.logger, "metrics.ttftChan") + common.WriteToChannel(s.metrics.reqPrefillTimeChan, time.Since(startPrefill).Seconds(), s.logger, "metrics.reqPrefillTimeChan") + startDecode := time.Now() for range usageData.CompletionTokens - 1 { perTokenLatency := s.getInterTokenLatency() time.Sleep(time.Duration(perTokenLatency) * time.Millisecond) @@ -595,8 +623,9 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r // report tpot in seconds common.WriteToChannel(s.metrics.tpotChan, (float64(perTokenLatency) / 1000), s.logger, "metrics.tpotChan") } - s.sendCompletionResponse(reqCtx.HTTPReqCtx, resp) + s.metrics.reqDecodeTimeChan <- time.Since(startDecode).Seconds() + s.sendCompletionResponse(reqCtx.HTTPReqCtx, resp) s.responseSentCallback(modelName, reqCtx.IsChatCompletion, reqCtx.CompletionReq.GetRequestID()) } @@ -639,7 +668,7 @@ func (s *VllmSimulator) enqueue(req *openaiserverapi.CompletionReqCtx) error { if s.waitingQueue.Len() >= s.queueCapacity { return errors.New("waiting requests queue is full") } - s.waitingQueue.PushBack(req) + s.waitingQueue.PushBack(waitingQueueItem{req, time.Now()}) return nil } @@ -650,20 +679,22 @@ func (s *VllmSimulator) dequeue() *openaiserverapi.CompletionReqCtx { // Find first request for a loaded LoRA for elem := s.waitingQueue.Front(); elem != nil; elem = elem.Next() { - req, ok := elem.Value.(*openaiserverapi.CompletionReqCtx) - if ok && req != nil && s.loraIsLoaded(req.CompletionReq.GetModel()) { + item, ok := elem.Value.(waitingQueueItem) + if ok && item.reqCtx != nil && s.loraIsLoaded(item.reqCtx.CompletionReq.GetModel()) { s.waitingQueue.Remove(elem) - s.incrementLora(req.CompletionReq.GetModel()) - return req + s.incrementLora(item.reqCtx.CompletionReq.GetModel()) + s.metrics.reqQueueTimeChan <- time.Since(item.enqueueTime).Seconds() + return item.reqCtx } } // All the requests require a LoRA that is not loaded, check if we can load a LoRA for elem := s.waitingQueue.Front(); elem != nil; elem = elem.Next() { - req, ok := 
elem.Value.(*openaiserverapi.CompletionReqCtx) - if ok && req != nil && s.loadLora(req.CompletionReq.GetModel()) { + item, ok := elem.Value.(waitingQueueItem) + if ok && item.reqCtx != nil && s.loadLora(item.reqCtx.CompletionReq.GetModel()) { s.waitingQueue.Remove(elem) - return req + s.metrics.reqQueueTimeChan <- time.Since(item.enqueueTime).Seconds() + return item.reqCtx } } diff --git a/pkg/llm-d-inference-sim/streaming.go b/pkg/llm-d-inference-sim/streaming.go index 8e87af96..84320464 100644 --- a/pkg/llm-d-inference-sim/streaming.go +++ b/pkg/llm-d-inference-sim/streaming.go @@ -102,12 +102,15 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons // sendTokenChunks creates and sends response chunks func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, genTokens []string, tc *openaiserverapi.ToolCall, finishReason string) { + startPrefill := time.Now() // time to first token delay ttft := s.getWaitTimeToFirstToken(context.nPromptTokens, context.nCachedPromptTokens, context.doRemotePrefill) time.Sleep(time.Duration(ttft) * time.Millisecond) // report ttft in seconds common.WriteToChannel(s.metrics.ttftChan, (float64(ttft) / 1000), s.logger, "metrics.ttftChan") + common.WriteToChannel(s.metrics.reqPrefillTimeChan, time.Since(startPrefill).Seconds(), s.logger, "metrics.reqPrefillTimeChan") + startDecode := time.Now() for i, token := range genTokens { if i != 0 { interTokenLat := s.getInterTokenLatency() @@ -148,6 +151,8 @@ func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writ } } + s.metrics.reqDecodeTimeChan <- time.Since(startDecode).Seconds() + // send the last chunk if finish reason is stop var chunk openaiserverapi.CompletionRespChunk if finishReason == dataset.StopFinishReason { diff --git a/pkg/llm-d-inference-sim/worker.go b/pkg/llm-d-inference-sim/worker.go index b247a72b..e2a6e504 100644 --- a/pkg/llm-d-inference-sim/worker.go +++ b/pkg/llm-d-inference-sim/worker.go @@ -19,6 +19,7 @@ package llmdinferencesim import ( "context" + "time" "github.com/go-logr/logr" "github.com/llm-d/llm-d-inference-sim/pkg/common" @@ -59,6 +60,11 @@ type requestProcessor interface { } func (s *VllmSimulator) processRequest(reqCtx *openaiserverapi.CompletionReqCtx) { + start := time.Now() + defer func() { + s.metrics.reqInferenceTimeChan <- time.Since(start).Seconds() + }() + req := reqCtx.CompletionReq model := req.GetModel() displayModel := s.getDisplayedModelName(model) From 7d0e5f09a0c484d9c0dc9a480660e9c65e6806be Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Thu, 23 Oct 2025 14:26:42 +0300 Subject: [PATCH 03/14] typo in metric name Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/metrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go index 37175fb3..92de74ef 100644 --- a/pkg/llm-d-inference-sim/metrics.go +++ b/pkg/llm-d-inference-sim/metrics.go @@ -174,7 +174,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.reqDecodeTime = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", - Name: "vllm:request_queue_time_seconds", + Name: "vllm:request_decode_time_seconds", Help: "Histogram of time spent in DECODE phase for request.", Buckets: common.RequestLatencyBucketsBoundaries, }, From f0e8882e54f37b5c248aaf2cfa35aa8a16967786 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Sat, 25 Oct 2025 22:53:33 +0300 Subject: [PATCH 04/14] Initial tests for new metrics + create 
constant for part of metrics names Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/metrics.go | 18 +++- pkg/llm-d-inference-sim/metrics_test.go | 117 +++++++++++++++++++++++- 2 files changed, 129 insertions(+), 6 deletions(-) diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go index 92de74ef..4322b024 100644 --- a/pkg/llm-d-inference-sim/metrics.go +++ b/pkg/llm-d-inference-sim/metrics.go @@ -32,6 +32,14 @@ import ( vllmapi "github.com/llm-d/llm-d-inference-sim/pkg/vllm-api" ) +const ( + e2eReqLatencyMetricName = "vllm:e2e_request_latency_seconds" + reqQueueTimeMetricName = "vllm:request_queue_time_seconds" + reqInferenceTimeMetricName = "vllm:request_inference_time_seconds" + prefillTimeMetricName = "vllm:request_prefill_time_seconds" + decodeTimeMetricName = "vllm:request_decode_time_seconds" +) + // createAndRegisterPrometheus creates and registers prometheus metrics used by vLLM simulator // Metrics reported: // - lora_requests_info @@ -114,7 +122,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.e2eReqLatency = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", - Name: "vllm:e2e_request_latency_seconds", + Name: e2eReqLatencyMetricName, Help: "Histogram of end to end request latency in seconds.", Buckets: common.RequestLatencyBucketsBoundaries, }, @@ -129,7 +137,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.reqQueueTime = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", - Name: "vllm:request_queue_time_seconds", + Name: reqQueueTimeMetricName, Help: "Histogram of time spent in WAITING phase for request.", Buckets: common.RequestLatencyBucketsBoundaries, }, @@ -144,7 +152,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.reqInferenceTime = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", - Name: "vllm:request_inference_time_seconds", + Name: reqInferenceTimeMetricName, Help: "Histogram of time spent in RUNNING phase for request.", Buckets: common.RequestLatencyBucketsBoundaries, }, @@ -159,7 +167,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.reqPrefillTime = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", - Name: "vllm:request_prefill_time_seconds", + Name: prefillTimeMetricName, Help: "Histogram of time spent in PREFILL phase for request.", Buckets: common.RequestLatencyBucketsBoundaries, }, @@ -174,7 +182,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.reqDecodeTime = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", - Name: "vllm:request_decode_time_seconds", + Name: decodeTimeMetricName, Help: "Histogram of time spent in DECODE phase for request.", Buckets: common.RequestLatencyBucketsBoundaries, }, diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go index 52d3aecc..754f2026 100644 --- a/pkg/llm-d-inference-sim/metrics_test.go +++ b/pkg/llm-d-inference-sim/metrics_test.go @@ -19,7 +19,9 @@ package llmdinferencesim import ( "context" "errors" + "fmt" "io" + "math" "net/http" "os" "reflect" @@ -164,7 +166,7 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="100"} 1`)) Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="200"} 1`)) 
Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="500"} 1`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="100"} 1`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="1000"} 1`)) Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="+Inf"} 1`)) // request_params_max_tokens_bucket Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="1"} 0`)) @@ -815,6 +817,93 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"+Inf\"} 1")) }) }) + + Context("latency metrics", func() { + DescribeTable("should calculate all latency related metrics correctly for a single request", + func(testName string, doRemotePrefill bool, doRemoteDecode bool, kvcacheTransferLatency int, kvCacheTransferTimePerToken int, + ttft int, prefillTimePerToken int, interTokenLatency int) { + // Expect(true).To(BeFalse()) + // send a single request with a prompt of 5 token and echo mode, so output tokens number of 5 too + modelName := "my_model" + // Send one request, check that ttft and tpot are as defined in the simulator command line params + ctx := context.TODO() + args := []string{"cmd", "--model", modelName, "--mode", common.ModeEcho, + "--kv-cache-transfer-latency", strconv.Itoa(kvcacheTransferLatency), + "--kv-cache-transfer-time-per-token", strconv.Itoa(kvCacheTransferTimePerToken), + "--time-to-first-token", strconv.Itoa(ttft), + "--prefill-time-per-token", strconv.Itoa(prefillTimePerToken), + "--inter-token-latency", strconv.Itoa(interTokenLatency), + } + + client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + Expect(err).NotTo(HaveOccurred()) + + // TODO - pass isStreaming + openaiclient, params := getOpenAIClientAndChatParams(client, modelName, "1 2 3 4", false) + // TODO - how to test remote prefill/decode + + var reqWg, metricsWg sync.WaitGroup + metricsWg.Add(1) + reqWg.Add(1) + + // send a single request + go func() { + defer reqWg.Done() + defer GinkgoRecover() + + _, err := openaiclient.Chat.Completions.New(ctx, params) + Expect(err).NotTo(HaveOccurred()) + }() + + // wait untill request processing was finished, send /mertics request + reqWg.Wait() + time.Sleep(300 * time.Millisecond) + metricsResp, err := client.Get(metricsUrl) + Expect(err).NotTo(HaveOccurred()) + Expect(metricsResp.StatusCode).To(Equal(http.StatusOK)) + + data, err := io.ReadAll(metricsResp.Body) + Expect(err).NotTo(HaveOccurred()) + metrics := string(data) + + numOfTokens := 4 + var expectedPrefillTime float64 + // TODO take into consideration remote prefill + if ttft > 0 { + // time-to-first-token overwrites calculation of prefill time based on number of input tokens + expectedPrefillTime = float64(ttft) / 1000 + + } else { + expectedPrefillTime = float64(numOfTokens*prefillTimePerToken) / 1000 + } + expectedDecodeTime := float64(interTokenLatency*(numOfTokens-1)) / 1000 + expectedE2ELatency := expectedPrefillTime + expectedDecodeTime + + prevBoundary := math.Inf(-1) + + for _, bucketBoudary := range common.RequestLatencyBucketsBoundaries { + checkBucketBoundary(metrics, modelName, prefillTimeMetricName, bucketBoudary, prevBoundary, expectedPrefillTime) + checkBucketBoundary(metrics, modelName, decodeTimeMetricName, bucketBoudary, prevBoundary, expectedDecodeTime) + 
checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, bucketBoudary, prevBoundary, expectedE2ELatency) + + prevBoundary = bucketBoudary + } + // check the last bucket + lastBoundary := common.RequestLatencyBucketsBoundaries[len(common.RequestLatencyBucketsBoundaries)-1] + checkBucketBoundary(metrics, modelName, prefillTimeMetricName, math.Inf(1), lastBoundary, expectedPrefillTime) + checkBucketBoundary(metrics, modelName, decodeTimeMetricName, math.Inf(1), lastBoundary, expectedDecodeTime) + checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, math.Inf(1), lastBoundary, expectedE2ELatency) + }, + func(testName string, doRemotePrefill bool, doRemoteDecode bool, kvcacheTransferLatency int, kvCacheTransferTimePerToken int, + ttft int, prefillTimePerToken int, interTokenLatency int) string { + return fmt.Sprintf("%s\ndoRemotePrefill: %v, doRemoteDecode: %v, kvcacheTransferLatency: %d, kvCacheTransferTimePerToken: %d, ttft: %d, prefillTimePerToken: %d, interTokenLatency: %d", + testName, doRemotePrefill, doRemoteDecode, kvcacheTransferLatency, kvCacheTransferTimePerToken, ttft, prefillTimePerToken, interTokenLatency) + }, + // pay attention: do not define times close to bucket boundaries, this can lead to test failure + Entry(nil, "constant prefil + inter token time", false, false, 0, 0, 900, 0, 100), + Entry(nil, "prefill per token + inter token time", false, false, 0, 0, 0, 100, 100), + ) + }) }) // isLoraMetricPresent checks if a matching metric exists @@ -1022,3 +1111,29 @@ func TestBuild125Buckets(t *testing.T) { }) } } + +func getFloatBucketMetricLine(model string, metric string, bucketBoundary float64, count int) string { + buckerBoundStr := "+Inf" + if bucketBoundary != math.Inf(1) { + buckerBoundStr = fmt.Sprintf("%g", bucketBoundary) + } + return fmt.Sprintf("%s_bucket{model_name=\"%s\",le=\"%s\"} %d", metric, model, buckerBoundStr, count) +} + +func checkBucketBoundary(metrics string, modelName string, metricName string, bucketBoudary float64, + prevBoundary float64, expectedValue float64) { + if expectedValue > prevBoundary && bucketBoudary > expectedValue && (bucketBoudary-expectedValue) < 0.005 { + // expected time is too close to the bucket boudary + // it's possiblt that in theory we expect 1 in this bucket but will get 0 and this situation is ok + // since there is some additional calculation time + fmt.Printf("Expected value is too close to the boundary - skip test for this bucket (%.4f - %.4f] and expected value %.4f\n", + prevBoundary, bucketBoudary, expectedValue) + return + } + expectedCount := 0 + if bucketBoudary > expectedValue { + expectedCount = 1 + } + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(modelName, metricName, bucketBoudary, expectedCount))) + +} From 59bb8dd50da55b0634fc21ff9a31866b8d44f555 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Mon, 27 Oct 2025 11:20:58 +0200 Subject: [PATCH 05/14] Fix bug in metrics test + add latency test for streaming mode Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/metrics_test.go | 158 ++++++++++++------------ 1 file changed, 79 insertions(+), 79 deletions(-) diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go index 754f2026..0e9df371 100644 --- a/pkg/llm-d-inference-sim/metrics_test.go +++ b/pkg/llm-d-inference-sim/metrics_test.go @@ -820,88 +820,26 @@ var _ = Describe("Simulator metrics", Ordered, func() { Context("latency metrics", func() { DescribeTable("should calculate all latency related metrics correctly for a single 
request", - func(testName string, doRemotePrefill bool, doRemoteDecode bool, kvcacheTransferLatency int, kvCacheTransferTimePerToken int, - ttft int, prefillTimePerToken int, interTokenLatency int) { - // Expect(true).To(BeFalse()) - // send a single request with a prompt of 5 token and echo mode, so output tokens number of 5 too + func(testNamePrefix string, ttft int, prefillTimePerToken int, interTokenLatency int) { + // send a single request with a prompt of 4 tokens and echo mode, so output tokens number of 4 too modelName := "my_model" - // Send one request, check that ttft and tpot are as defined in the simulator command line params - ctx := context.TODO() - args := []string{"cmd", "--model", modelName, "--mode", common.ModeEcho, - "--kv-cache-transfer-latency", strconv.Itoa(kvcacheTransferLatency), - "--kv-cache-transfer-time-per-token", strconv.Itoa(kvCacheTransferTimePerToken), - "--time-to-first-token", strconv.Itoa(ttft), - "--prefill-time-per-token", strconv.Itoa(prefillTimePerToken), - "--inter-token-latency", strconv.Itoa(interTokenLatency), - } - - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) - Expect(err).NotTo(HaveOccurred()) - - // TODO - pass isStreaming - openaiclient, params := getOpenAIClientAndChatParams(client, modelName, "1 2 3 4", false) - // TODO - how to test remote prefill/decode - - var reqWg, metricsWg sync.WaitGroup - metricsWg.Add(1) - reqWg.Add(1) - - // send a single request - go func() { - defer reqWg.Done() - defer GinkgoRecover() - - _, err := openaiclient.Chat.Completions.New(ctx, params) - Expect(err).NotTo(HaveOccurred()) - }() + prompt := "1 2 3 4" - // wait untill request processing was finished, send /mertics request - reqWg.Wait() - time.Sleep(300 * time.Millisecond) - metricsResp, err := client.Get(metricsUrl) - Expect(err).NotTo(HaveOccurred()) - Expect(metricsResp.StatusCode).To(Equal(http.StatusOK)) + client := sendRequest(modelName, prompt, false, ttft, prefillTimePerToken, interTokenLatency) + checkLatencyMertics(client, modelName, prompt, ttft, prefillTimePerToken, interTokenLatency) - data, err := io.ReadAll(metricsResp.Body) - Expect(err).NotTo(HaveOccurred()) - metrics := string(data) - - numOfTokens := 4 - var expectedPrefillTime float64 - // TODO take into consideration remote prefill - if ttft > 0 { - // time-to-first-token overwrites calculation of prefill time based on number of input tokens - expectedPrefillTime = float64(ttft) / 1000 - - } else { - expectedPrefillTime = float64(numOfTokens*prefillTimePerToken) / 1000 - } - expectedDecodeTime := float64(interTokenLatency*(numOfTokens-1)) / 1000 - expectedE2ELatency := expectedPrefillTime + expectedDecodeTime - - prevBoundary := math.Inf(-1) - - for _, bucketBoudary := range common.RequestLatencyBucketsBoundaries { - checkBucketBoundary(metrics, modelName, prefillTimeMetricName, bucketBoudary, prevBoundary, expectedPrefillTime) - checkBucketBoundary(metrics, modelName, decodeTimeMetricName, bucketBoudary, prevBoundary, expectedDecodeTime) - checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, bucketBoudary, prevBoundary, expectedE2ELatency) - - prevBoundary = bucketBoudary - } - // check the last bucket - lastBoundary := common.RequestLatencyBucketsBoundaries[len(common.RequestLatencyBucketsBoundaries)-1] - checkBucketBoundary(metrics, modelName, prefillTimeMetricName, math.Inf(1), lastBoundary, expectedPrefillTime) - checkBucketBoundary(metrics, modelName, decodeTimeMetricName, math.Inf(1), lastBoundary, expectedDecodeTime) - 
checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, math.Inf(1), lastBoundary, expectedE2ELatency) + // same in streaming mode + client = sendRequest(modelName, prompt, true, ttft, prefillTimePerToken, interTokenLatency) + checkLatencyMertics(client, modelName, prompt, ttft, prefillTimePerToken, interTokenLatency) }, - func(testName string, doRemotePrefill bool, doRemoteDecode bool, kvcacheTransferLatency int, kvCacheTransferTimePerToken int, - ttft int, prefillTimePerToken int, interTokenLatency int) string { - return fmt.Sprintf("%s\ndoRemotePrefill: %v, doRemoteDecode: %v, kvcacheTransferLatency: %d, kvCacheTransferTimePerToken: %d, ttft: %d, prefillTimePerToken: %d, interTokenLatency: %d", - testName, doRemotePrefill, doRemoteDecode, kvcacheTransferLatency, kvCacheTransferTimePerToken, ttft, prefillTimePerToken, interTokenLatency) + func(testNamePrefix string, ttft int, prefillTimePerToken int, interTokenLatency int) string { + return fmt.Sprintf("%s\nttft: %d, prefillTimePerToken: %d, interTokenLatency: %d", testNamePrefix, ttft, prefillTimePerToken, interTokenLatency) }, - // pay attention: do not define times close to bucket boundaries, this can lead to test failure - Entry(nil, "constant prefil + inter token time", false, false, 0, 0, 900, 0, 100), - Entry(nil, "prefill per token + inter token time", false, false, 0, 0, 0, 100, 100), + // Params order: testName, ttft, prefillTimePerToken, interTokenLatency + Entry(nil, "constant prefill + inter token time", 0, 0, 100), + Entry(nil, "constant prefill + inter token time", 900, 0, 100), + Entry(nil, "constant prefill + inter token time", 1000, 0, 100), + Entry(nil, "prefill per token + inter token time", 0, 100, 100), ) }) }) @@ -1122,8 +1060,8 @@ func getFloatBucketMetricLine(model string, metric string, bucketBoundary float6 func checkBucketBoundary(metrics string, modelName string, metricName string, bucketBoudary float64, prevBoundary float64, expectedValue float64) { - if expectedValue > prevBoundary && bucketBoudary > expectedValue && (bucketBoudary-expectedValue) < 0.005 { - // expected time is too close to the bucket boudary + if expectedValue > prevBoundary && bucketBoudary >= expectedValue && (bucketBoudary-expectedValue) < 0.005 { + // expected time is too close to the bucket's boudary // it's possiblt that in theory we expect 1 in this bucket but will get 0 and this situation is ok // since there is some additional calculation time fmt.Printf("Expected value is too close to the boundary - skip test for this bucket (%.4f - %.4f] and expected value %.4f\n", @@ -1135,5 +1073,67 @@ func checkBucketBoundary(metrics string, modelName string, metricName string, bu expectedCount = 1 } Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(modelName, metricName, bucketBoudary, expectedCount))) +} + +// send a single request with the given prompt and echo mode +func sendRequest(modelName string, prompt string, isStreaming bool, ttft int, prefillTimePerToken int, interTokenLatency int) *http.Client { + ctx := context.TODO() + args := []string{"cmd", "--model", modelName, "--mode", common.ModeEcho, + // "--kv-cache-transfer-latency", strconv.Itoa(kvcacheTransferLatency), + // "--kv-cache-transfer-time-per-token", strconv.Itoa(kvCacheTransferTimePerToken), + "--time-to-first-token", strconv.Itoa(ttft), + "--prefill-time-per-token", strconv.Itoa(prefillTimePerToken), + "--inter-token-latency", strconv.Itoa(interTokenLatency), + } + + client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + 
Expect(err).NotTo(HaveOccurred())
+
+	openaiclient, params := getOpenAIClientAndChatParams(client, modelName, prompt, isStreaming)
+
+	// send a single request in a serial way
+	_, err = openaiclient.Chat.Completions.New(ctx, params)
+	Expect(err).NotTo(HaveOccurred())
+
+	return client
+}
+
+func checkLatencyMertics(client *http.Client, modelName string, prompt string, ttft int, prefillTimePerToken int, interTokenLatency int) {
+	// wait a little bit and check metrics
+	time.Sleep(300 * time.Millisecond)
+	metricsResp, err := client.Get(metricsUrl)
+	Expect(err).NotTo(HaveOccurred())
+	Expect(metricsResp.StatusCode).To(Equal(http.StatusOK))
+
+	data, err := io.ReadAll(metricsResp.Body)
+	Expect(err).NotTo(HaveOccurred())
+	metrics := string(data)
+
+	numOfTokens := len(common.Tokenize(prompt))
+	var expectedPrefillTime float64
+	// TODO take into consideration remote prefill
+	if ttft > 0 {
+		// time-to-first-token overwrites calculation of prefill time based on number of input tokens
+		expectedPrefillTime = float64(ttft) / 1000
+
+	} else {
+		expectedPrefillTime = float64(numOfTokens*prefillTimePerToken) / 1000
+	}
+	expectedDecodeTime := float64(interTokenLatency*(numOfTokens-1)) / 1000
+	expectedE2ELatency := expectedPrefillTime + expectedDecodeTime
+
+	prevBoundary := math.Inf(-1)
+
+	for _, bucketBoudary := range common.RequestLatencyBucketsBoundaries {
+		checkBucketBoundary(metrics, modelName, prefillTimeMetricName, bucketBoudary, prevBoundary, expectedPrefillTime)
+		checkBucketBoundary(metrics, modelName, decodeTimeMetricName, bucketBoudary, prevBoundary, expectedDecodeTime)
+		checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, bucketBoudary, prevBoundary, expectedE2ELatency)
+
+		prevBoundary = bucketBoudary
+	}
+	// check the last bucket
+	lastBoundary := common.RequestLatencyBucketsBoundaries[len(common.RequestLatencyBucketsBoundaries)-1]
+	checkBucketBoundary(metrics, modelName, prefillTimeMetricName, math.Inf(1), lastBoundary, expectedPrefillTime)
+	checkBucketBoundary(metrics, modelName, decodeTimeMetricName, math.Inf(1), lastBoundary, expectedDecodeTime)
+	checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, math.Inf(1), lastBoundary, expectedE2ELatency)
+}

From 99bdec45415884e79bc17f94f2a1585446180d65 Mon Sep 17 00:00:00 2001
From: Maya Barnea
Date: Mon, 27 Oct 2025 21:27:41 +0200
Subject: [PATCH 06/14] Move common simulator test helper functions to
 test_utils.go, use the same model name in all tests, refactor the server
 start functions

Signed-off-by: Maya Barnea
---
 pkg/llm-d-inference-sim/failures_test.go  |  46 +-
 pkg/llm-d-inference-sim/lora_test.go      |  12 +-
 pkg/llm-d-inference-sim/metrics_test.go   | 680 +++++++---------------
 pkg/llm-d-inference-sim/seed_test.go      |   7 +-
 pkg/llm-d-inference-sim/server_test.go    |  20 +-
 pkg/llm-d-inference-sim/simulator_test.go | 183 +-----
 pkg/llm-d-inference-sim/test_utils.go     | 427 ++++++++++++++
 pkg/llm-d-inference-sim/tools_test.go     |  34 +-
 pkg/llm-d-inference-sim/worker_test.go    |  36 +-
 9 files changed, 751 insertions(+), 694 deletions(-)
 create mode 100644 pkg/llm-d-inference-sim/test_utils.go

diff --git a/pkg/llm-d-inference-sim/failures_test.go b/pkg/llm-d-inference-sim/failures_test.go
index da8f8576..1459eed5 100644
--- a/pkg/llm-d-inference-sim/failures_test.go
+++ b/pkg/llm-d-inference-sim/failures_test.go
@@ -126,15 +126,15 @@ var _ = Describe("Failures", func() {
 		BeforeEach(func() {
 			ctx = context.Background()
 			var err error
-			client, err = startServerWithArgs(ctx, "", []string{
-				"cmd", "--model", model,
+			client,
err = startServerWithArgs(ctx, []string{ + "cmd", "--model", testModel, "--failure-injection-rate", "100", - }, nil) + }) Expect(err).ToNot(HaveOccurred()) }) It("should always return an error response for chat completions", func() { - openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) _, err := openaiClient.Chat.Completions.New(ctx, params) Expect(err).To(HaveOccurred()) @@ -147,7 +147,7 @@ var _ = Describe("Failures", func() { }) It("should always return an error response for text completions", func() { - openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) _, err := openaiClient.Chat.Completions.New(ctx, params) Expect(err).To(HaveOccurred()) @@ -164,16 +164,16 @@ var _ = Describe("Failures", func() { BeforeEach(func() { ctx = context.Background() var err error - client, err = startServerWithArgs(ctx, "", []string{ - "cmd", "--model", model, + client, err = startServerWithArgs(ctx, []string{ + "cmd", "--model", testModel, "--failure-injection-rate", "100", "--failure-types", common.FailureTypeRateLimit, - }, nil) + }) Expect(err).ToNot(HaveOccurred()) }) It("should return only rate limit errors", func() { - openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) _, err := openaiClient.Chat.Completions.New(ctx, params) Expect(err).To(HaveOccurred()) @@ -182,7 +182,7 @@ var _ = Describe("Failures", func() { Expect(ok).To(BeTrue()) Expect(openaiError.StatusCode).To(Equal(429)) Expect(openaiError.Type).To(Equal(openaiserverapi.ErrorCodeToType(429))) - Expect(strings.Contains(openaiError.Message, model)).To(BeTrue()) + Expect(strings.Contains(openaiError.Message, testModel)).To(BeTrue()) }) }) @@ -190,16 +190,16 @@ var _ = Describe("Failures", func() { BeforeEach(func() { ctx = context.Background() var err error - client, err = startServerWithArgs(ctx, "", []string{ - "cmd", "--model", model, + client, err = startServerWithArgs(ctx, []string{ + "cmd", "--model", testModel, "--failure-injection-rate", "100", "--failure-types", common.FailureTypeInvalidAPIKey, common.FailureTypeServerError, - }, nil) + }) Expect(err).ToNot(HaveOccurred()) }) It("should return only specified error types", func() { - openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) // Make multiple requests to verify we get the expected error types for i := 0; i < 10; i++ { @@ -222,20 +222,20 @@ var _ = Describe("Failures", func() { BeforeEach(func() { ctx = context.Background() var err error - client, err = startServerWithArgs(ctx, "", []string{ - "cmd", "--model", model, + client, err = startServerWithArgs(ctx, []string{ + "cmd", "--model", testModel, "--failure-injection-rate", "0", - }, nil) + }) Expect(err).ToNot(HaveOccurred()) }) It("should never return errors and behave like random mode", func() { - openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) resp, err := openaiClient.Chat.Completions.New(ctx, params) Expect(err).ToNot(HaveOccurred()) 
Expect(resp.Choices).To(HaveLen(1)) Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty()) - Expect(resp.Model).To(Equal(model)) + Expect(resp.Model).To(Equal(testModel)) }) }) @@ -243,14 +243,14 @@ var _ = Describe("Failures", func() { DescribeTable("should return correct error for each failure type", func(failureType string, expectedStatusCode int, expectedErrorType string) { ctx := context.Background() - client, err := startServerWithArgs(ctx, "", []string{ - "cmd", "--model", model, + client, err := startServerWithArgs(ctx, []string{ + "cmd", "--model", testModel, "--failure-injection-rate", "100", "--failure-types", failureType, - }, nil) + }) Expect(err).ToNot(HaveOccurred()) - openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) _, err = openaiClient.Chat.Completions.New(ctx, params) Expect(err).To(HaveOccurred()) diff --git a/pkg/llm-d-inference-sim/lora_test.go b/pkg/llm-d-inference-sim/lora_test.go index 837a36fc..2bcd63c0 100644 --- a/pkg/llm-d-inference-sim/lora_test.go +++ b/pkg/llm-d-inference-sim/lora_test.go @@ -34,14 +34,14 @@ var _ = Describe("LoRAs", func() { Context("LoRAs config and load", func() { It("Should config, load and load LoRAs correctly", func() { ctx := context.TODO() - client, err := startServerWithArgs(ctx, "", - []string{"cmd", "--model", model, "--mode", common.ModeEcho, + client, err := startServerWithArgs(ctx, + []string{"cmd", "--model", testModel, "--mode", common.ModeEcho, "--lora-modules", "{\"name\":\"lora3\",\"path\":\"/path/to/lora3\"}", - "{\"name\":\"lora4\",\"path\":\"/path/to/lora4\"}"}, nil) + "{\"name\":\"lora4\",\"path\":\"/path/to/lora4\"}"}) Expect(err).NotTo(HaveOccurred()) // Request to lora3 - openaiclient, params := getOpenAIClientAndChatParams(client, "lora3", userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, "lora3", testUserMessage, false) resp, err := openaiclient.Chat.Completions.New(ctx, params) Expect(err).ToNot(HaveOccurred()) @@ -49,7 +49,7 @@ var _ = Describe("LoRAs", func() { Expect(string(resp.Object)).To(Equal(chatCompletionObject)) msg := resp.Choices[0].Message.Content - Expect(msg).Should(Equal(userMessage)) + Expect(msg).Should(Equal(testUserMessage)) // Unknown model, should return 404 params.Model = "lora1" @@ -88,7 +88,7 @@ var _ = Describe("LoRAs", func() { Expect(string(resp.Object)).To(Equal(chatCompletionObject)) msg = resp.Choices[0].Message.Content - Expect(msg).Should(Equal(userMessage)) + Expect(msg).Should(Equal(testUserMessage)) // Unload lora3 payload = map[string]string{ diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go index 0e9df371..4cc1b948 100644 --- a/pkg/llm-d-inference-sim/metrics_test.go +++ b/pkg/llm-d-inference-sim/metrics_test.go @@ -18,19 +18,12 @@ package llmdinferencesim import ( "context" - "errors" "fmt" "io" - "math" "net/http" "os" - "reflect" - "regexp" - "sort" - "strconv" "strings" "sync" - "testing" "time" "github.com/llm-d/llm-d-inference-sim/pkg/common" @@ -41,8 +34,6 @@ import ( ) const ( - metricsUrl = "http://localhost/metrics" - lora1 = "lora1" lora2 = "lora2" ) @@ -53,51 +44,51 @@ var lora2Arr = []string{lora2} var paramsLora1 openai.ChatCompletionNewParams = openai.ChatCompletionNewParams{ Messages: []openai.ChatCompletionMessageParamUnion{ - openai.UserMessage(userMessage), + openai.UserMessage(testUserMessage), }, Model: "lora1", } var paramsLora2 
openai.ChatCompletionNewParams = openai.ChatCompletionNewParams{ Messages: []openai.ChatCompletionMessageParamUnion{ - openai.UserMessage(userMessage), + openai.UserMessage(testUserMessage), }, Model: "lora2", } var paramsLora3 openai.ChatCompletionNewParams = openai.ChatCompletionNewParams{ Messages: []openai.ChatCompletionMessageParamUnion{ - openai.UserMessage(userMessage), + openai.UserMessage(testUserMessage), }, Model: "lora3", } var paramsLora4 openai.ChatCompletionNewParams = openai.ChatCompletionNewParams{ Messages: []openai.ChatCompletionMessageParamUnion{ - openai.UserMessage(userMessage), + openai.UserMessage(testUserMessage), }, Model: "lora4", } var paramsLora5 openai.ChatCompletionNewParams = openai.ChatCompletionNewParams{ Messages: []openai.ChatCompletionMessageParamUnion{ - openai.UserMessage(userMessage), + openai.UserMessage(testUserMessage), }, Model: "lora5", } var _ = Describe("Simulator metrics", Ordered, func() { - It("Should send correct running and waiting requests metrics", func() { + It("should send correct running and waiting requests metrics", func() { // Three requests, only two can run in parallel, we expect // two running requests and one waiting request in the metrics ctx := context.TODO() - args := []string{"cmd", "--model", modelName, "--mode", common.ModeRandom, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--time-to-first-token", "3000", "--max-num-seqs", "2"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, modelName, userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) for range 3 { go func() { @@ -128,7 +119,7 @@ var _ = Describe("Simulator metrics", Ordered, func() { args := []string{"cmd", "--model", modelName, "--mode", common.ModeRandom, "--time-to-first-token", "100", "--max-num-seqs", "4"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient( @@ -192,12 +183,12 @@ var _ = Describe("Simulator metrics", Ordered, func() { It("Should send correct lora metrics", func() { ctx := context.TODO() - args := []string{"cmd", "--model", model, "--mode", common.ModeRandom, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--time-to-first-token", "3000", "--lora-modules", "{\"name\":\"lora1\",\"path\":\"/path/to/lora1\"}", "{\"name\":\"lora2\",\"path\":\"/path/to/lora2\"}"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient( @@ -237,12 +228,12 @@ var _ = Describe("Simulator metrics", Ordered, func() { It("Should send correct lora metrics for parallel requests with delay", func() { ctx := context.TODO() - args := []string{"cmd", "--model", model, "--mode", common.ModeRandom, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--time-to-first-token", "3000", "--lora-modules", "{\"name\":\"lora1\",\"path\":\"/path/to/lora1\"}", "{\"name\":\"lora2\",\"path\":\"/path/to/lora2\"}"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient( @@ -312,12 +303,12 
@@ var _ = Describe("Simulator metrics", Ordered, func() { It("Should send correct lora metrics for parallel requests without delay", func() { ctx := context.TODO() - args := []string{"cmd", "--model", model, "--mode", common.ModeRandom, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--time-to-first-token", "3000", "--lora-modules", "{\"name\":\"lora1\",\"path\":\"/path/to/lora1\"}", "{\"name\":\"lora2\",\"path\":\"/path/to/lora2\"}"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient( @@ -392,17 +383,16 @@ var _ = Describe("Simulator metrics", Ordered, func() { }) It("Should send correct ttft and tpot metrics", func() { - modelName := "my_model" // Send one request, check that ttft and tpot are as defined in the simulator command line params ctx := context.TODO() // use mode echo to be sure that response is more than one token - this makes sure that tpot is reported to prometheus - args := []string{"cmd", "--model", modelName, "--mode", common.ModeEcho, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeEcho, "--time-to-first-token", "200", "--inter-token-latency", "100"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, modelName, userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) var reqWg, metricsWg sync.WaitGroup metricsWg.Add(1) @@ -430,83 +420,83 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(err).NotTo(HaveOccurred()) metrics := string(data) // ttft - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.001\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.005\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.01\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.02\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.04\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.06\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.08\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.1\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.25\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.5\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.75\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"1\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"2.5\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"5\"} 1")) - 
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"7.5\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"10\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"20\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"40\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"80\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"160\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"640\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"2560\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"+Inf\"} 1")) - // check tpot only is it exists in metrics, when a single - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.01\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.025\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.05\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.075\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.001\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.005\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.01\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.02\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.04\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.06\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.08\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.1\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.25\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.5\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.75\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"1\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"2.5\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"5\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"7.5\"} 1")) + 
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"10\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"20\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"40\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"80\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"160\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"640\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"2560\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"+Inf\"} 1")) + // tpot + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.01\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.025\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.05\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.075\"} 0")) metricsLines := strings.Split(metrics, "\n") // the following values should be greater than 0, we don't know the exact value since it depends on the random response length - count := findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.1\"}") + count := findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.1\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.15\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.15\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.2\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.2\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.3\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.3\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.4\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.4\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.5\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.5\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, 
"vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.75\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.75\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"1\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"1\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"2.5\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"2.5\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"5\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"5\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"7.5\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"7.5\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"10\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"10\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"20\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"20\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"40\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"40\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"80\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"80\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"+Inf\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"+Inf\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) }() @@ -528,7 +518,7 @@ var _ = Describe("Simulator metrics", Ordered, func() { "--enable-kvcache", "true", "--kv-cache-size", "16", "--block-size", "8", "--time-to-first-token", "5000", "--tokenizers-cache-dir", tmpDir} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient( @@ -605,7 +595,7 @@ var _ = Describe("Simulator metrics", Ordered, func() { "--enable-kvcache", "true", 
"--kv-cache-size", "16", "--block-size", "8", "--time-to-first-token", "5000", "--tokenizers-cache-dir", tmpDir, "--max-num-seqs", "2"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient( @@ -670,7 +660,7 @@ var _ = Describe("Simulator metrics", Ordered, func() { Context("fake metrics", func() { It("Should respond with fake metrics to /metrics", func() { ctx := context.TODO() - args := []string{"cmd", "--model", model, "--mode", common.ModeRandom, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--fake-metrics", `{` + `"running-requests":10,` + @@ -702,7 +692,7 @@ var _ = Describe("Simulator metrics", Ordered, func() { `}`, } - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) resp, err := client.Get(metricsUrl) @@ -712,76 +702,76 @@ var _ = Describe("Simulator metrics", Ordered, func() { data, err := io.ReadAll(resp.Body) Expect(err).NotTo(HaveOccurred()) metrics := string(data) - Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"my_model\"} 10")) - Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"my_model\"} 30")) - Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"my_model\"} 0.4")) + Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"testmodel\"} 10")) + Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"testmodel\"} 30")) + Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"testmodel\"} 0.4")) Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora2\",waiting_lora_adapters=\"lora3\"} 1.257894567e+09")) Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora3\",waiting_lora_adapters=\"\"} 1.257894569e+09")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.001\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.005\"} 3")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.01\"} 6")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.02\"} 6")) - - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.01\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.025\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.05\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.075\"} 3")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.1\"} 6")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.15\"} 6")) - - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="1"} 10`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="2"} 30`)) - 
Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="5"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="10"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="20"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="50"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="100"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="200"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="500"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="1000"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="+Inf"} 60`)) - - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="1"} 10`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="2"} 30`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="5"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="10"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="20"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="50"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="100"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="200"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="500"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="1000"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="+Inf"} 60`)) - - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="1"} 10`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="2"} 30`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="5"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="10"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="20"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="50"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="100"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="200"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="500"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="1000"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="+Inf"} 60`)) - - 
Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="length",model_name="my_model"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="remote_decode",model_name="my_model"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="stop",model_name="my_model"} 20`)) - Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="tool_calls",model_name="my_model"} 0`)) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.001\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.005\"} 3")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.01\"} 6")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.02\"} 6")) + + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.01\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.025\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.05\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.075\"} 3")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.1\"} 6")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.15\"} 6")) + + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="1"} 10`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="2"} 30`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="5"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="10"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="20"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="50"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="100"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="200"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="500"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="1000"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="+Inf"} 60`)) + + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="1"} 10`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="2"} 30`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="5"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="10"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="20"} 60`)) + 
Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="50"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="100"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="200"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="500"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="1000"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="+Inf"} 60`)) + + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="1"} 10`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="2"} 30`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="5"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="10"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="20"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="50"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="100"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="200"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="500"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="1000"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="+Inf"} 60`)) + + Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="length",model_name="testmodel"} 0`)) + Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="remote_decode",model_name="testmodel"} 0`)) + Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="stop",model_name="testmodel"} 20`)) + Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="tool_calls",model_name="testmodel"} 0`)) }) }) Context("fake ttft metrics", func() { It("Should respond with fake ttft metrics to /metrics", func() { ctx := context.TODO() - args := []string{"cmd", "--model", model, "--mode", common.ModeRandom, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--fake-metrics", "{\"ttft-buckets-values\":[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]}", } - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) resp, err := client.Get(metricsUrl) @@ -792,45 +782,44 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(err).NotTo(HaveOccurred()) metrics := string(data) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.001\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.005\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.01\"} 0")) - 
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.02\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.04\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.06\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.08\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.1\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.25\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.5\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.75\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"1\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"2.5\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"5\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"7.5\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"10\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"20\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"40\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"80\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"160\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"640\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"2560\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"+Inf\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.001\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.005\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.01\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.02\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.04\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.06\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.08\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.1\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.25\"} 0")) + 
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.5\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.75\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"1\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"2.5\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"5\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"7.5\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"10\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"20\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"40\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"80\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"160\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"640\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"2560\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"+Inf\"} 1")) }) }) Context("latency metrics", func() { + numOfTokens := len(common.Tokenize(testUserMessage)) + DescribeTable("should calculate all latency related metrics correctly for a single request", func(testNamePrefix string, ttft int, prefillTimePerToken int, interTokenLatency int) { // send a single request with a prompt of 4 tokens and echo mode, so output tokens number of 4 too - modelName := "my_model" - prompt := "1 2 3 4" - - client := sendRequest(modelName, prompt, false, ttft, prefillTimePerToken, interTokenLatency) - checkLatencyMertics(client, modelName, prompt, ttft, prefillTimePerToken, interTokenLatency) + client := startServerAndSendRequest(testModel, testUserMessage, false, ttft, prefillTimePerToken, interTokenLatency) + checkLatencyMertics(client, testModel, numOfTokens, numOfTokens, ttft, prefillTimePerToken, interTokenLatency) - // same in streaming mode - client = sendRequest(modelName, prompt, true, ttft, prefillTimePerToken, interTokenLatency) - checkLatencyMertics(client, modelName, prompt, ttft, prefillTimePerToken, interTokenLatency) + // same in streaming modeq + client = startServerAndSendRequest(testModel, testUserMessage, true, ttft, prefillTimePerToken, interTokenLatency) + checkLatencyMertics(client, testModel, numOfTokens, numOfTokens, ttft, prefillTimePerToken, interTokenLatency) }, func(testNamePrefix string, ttft int, prefillTimePerToken int, interTokenLatency int) string { return fmt.Sprintf("%s\nttft: %d, prefillTimePerToken: %d, interTokenLatency: %d", testNamePrefix, ttft, prefillTimePerToken, interTokenLatency) @@ -844,296 +833,77 @@ var _ = Describe("Simulator metrics", Ordered, func() { }) }) -// isLoraMetricPresent checks if a matching metric exists -// metrics: the list of metrics -// running: list of loras in running_lora_adapters, the order does not matter -// waiting: list of loras in 
waiting_lora_adapters, the order does not matter
-func isLoraMetricPresent(metrics []string, running, waiting []string) bool {
-  return findLoraMetric(metrics, running, waiting) != ""
-}
-
-// getLoraTimestamp returns timestamp or nil, error
-func getLoraTimestamp(metrics []string, running, waiting []string) (*float64, error) {
-  metric := findLoraMetric(metrics, running, waiting)
-  if metric == "" {
-    return nil, nil // not found
-  }
-  return extractTimestamp(metric)
-}
-
-func extractTimestamp(metric string) (*float64, error) {
-  // Extract timestamp: last part after space
-  parts := strings.Split(metric, " ")
-  if len(parts) < 2 {
-    return nil, errors.New("invalid metric format")
-  }
-  timestampStr := parts[len(parts)-1]
-  timestamp, err := strconv.ParseFloat(timestampStr, 64)
-  Expect(err).NotTo(HaveOccurred())
-
-  return &timestamp, nil
-}
-
-func getLoraValidTimestamp(metrics []string, running, waiting []string) float64 {
-  timestamp, err := getLoraTimestamp(metrics, running, waiting)
-  Expect(err).NotTo(HaveOccurred())
-  Expect(timestamp).ToNot(BeNil())
-  return *timestamp
-}
-
-func getLastLoraMetrics(metrics []string) ([]string, error) {
-  lastTimestamp := float64(0)
-  var lastMetrics []string
-  for _, metric := range metrics {
-    if strings.HasPrefix(metric, "vllm:lora_requests_info") {
-      timestamp, err := extractTimestamp(metric)
-      if err != nil {
-        return nil, err
-      }
-      if lastTimestamp > *timestamp {
-        continue
-      }
-      lastTimestamp = *timestamp
-      if lastTimestamp < *timestamp {
-        lastMetrics = make([]string, 0)
-      }
-      lastMetrics = append(lastMetrics, metric)
+var _ = Describe("build125Buckets", Ordered, func() {
+  It("should create valid 125 buckets", func() {
+    // tests the build125Buckets function with various inputs.
+    tests := []struct {
+      name     string
+      maxValue int
+      want     []float64
+    }{
+      {
+        name:     "max_value zero",
+        maxValue: 0,
+        want:     []float64{}, // no bucket <= 0
+      },
+      {
+        name:     "max_value one",
+        maxValue: 1,
+        want:     []float64{1},
+      },
+      {
+        name:     "max_value five",
+        maxValue: 5,
+        want:     []float64{1, 2, 5},
+      },
+      {
+        name:     "max_value ten",
+        maxValue: 10,
+        want:     []float64{1, 2, 5, 10},
+      },
+      {
+        name:     "max_value 100",
+        maxValue: 100,
+        want:     []float64{1, 2, 5, 10, 20, 50, 100},
+      },
+      {
+        name:     "max_value 999",
+        maxValue: 999,
+        want:     []float64{1, 2, 5, 10, 20, 50, 100, 200, 500},
+      },
+      {
+        name:     "max_value 1024",
+        maxValue: 1024,
+        want:     []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000},
+      },
+      {
+        name:     "max_value 4096",
+        maxValue: 4096,
+        want:     []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000},
+      },
+      {
+        name:     "max_value 32768",
+        maxValue: 32768,
+        want:     []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000},
+      },
+      {
+        name:     "max_value just below power of 10",
+        maxValue: 99,
+        want:     []float64{1, 2, 5, 10, 20, 50},
+      },
+      {
+        name:     "max_value negative",
+        maxValue: -1,
+        want:     []float64{}, // no positive bucket <= -1
+      },
+    }
-    }
-  }
-  return lastMetrics, nil
-}
-
-// findLoraMetric finds the relevant metric by comparing with the given loras sets (ignoring order)
-// metrics: lines of metrics
-// running: list of running loras to find
-// waiting: list of waiting loras to find
-// Looks for a line with the given running and waiting loras sets, the comparison is order agnostic.
-// Return metric should match in both running and waiting sets.
-// E.g.
for input running=["l1", "l2", "l3"] and waiting=[] will return metric -// with running_lora_adapters=["l3", "l1", "l2"] and waiting_lora_adapters=[] -func findLoraMetric(metrics []string, running, waiting []string) string { - // sort input arrays before compare, create string of all values, separated by comma - sort.Strings(running) - sort.Strings(waiting) - runStr := strings.Join(running, ",") - waitStr := strings.Join(waiting, ",") - - // regex to extract lora metrics and values - re := regexp.MustCompile(`vllm:lora_requests_info\{.*running_lora_adapters="([^"]*)".*waiting_lora_adapters="([^"]*)".*\}\s+([0-9.e\+\-]+)`) - for _, metric := range metrics { - matches := re.FindStringSubmatch(metric) - if len(matches) == 4 { - // this line contains loraInfo metric, check running and waiting loras lists - // split and sort metric's running and waiting loras lists for the comparison - metricRun := splitString(matches[1]) - metricWait := splitString(matches[2]) - sort.Strings(metricRun) - sort.Strings(metricWait) - // if both lists are the same - return the metric - if strings.Join(metricRun, ",") == runStr && strings.Join(metricWait, ",") == waitStr { - return metric - } - } // if the metric is not in the required format - skip it - } - - // required metric was not found - return "" -} - -// splits the given string to array of strings with separator = "," -func splitString(str string) []string { - if str == "" { - return []string{} - } - return strings.Split(str, ",") -} - -func findMetric(metrics []string, metricPrefix string) string { - // regex to extract metrics and values - for _, metric := range metrics { - if strings.Contains(metric, metricPrefix) { - arr := strings.Split(metric, " ") - if len(arr) == 2 { - return arr[1] - } - break + for _, test := range tests { + got := build125Buckets(test.maxValue) + Expect(got).To(Equal(test.want)) + // if !reflect.DeepEqual(got, test.want) { + // t.Errorf("build125Buckets(%d) = %v, want %v", tt.maxValue, got, tt.want) + // } } - } - // required metric was not found - return "" -} - -func findIntMetric(metrics []string, metricPrefix string) *int { - valueStr := findMetric(metrics, metricPrefix) - if valueStr == "" { - return nil - } - - val, err := strconv.Atoi(valueStr) - if err != nil { - return nil - } - return &val -} - -// TestBuild125Buckets tests the build125Buckets function with various inputs. 
-func TestBuild125Buckets(t *testing.T) { - tests := []struct { - name string - maxValue int - want []float64 - }{ - { - name: "max_value zero", - maxValue: 0, - want: []float64{}, // no bucket <= 0 - }, - { - name: "max_value one", - maxValue: 1, - want: []float64{1}, - }, - { - name: "max_value five", - maxValue: 5, - want: []float64{1, 2, 5}, - }, - { - name: "max_value ten", - maxValue: 10, - want: []float64{1, 2, 5, 10}, - }, - { - name: "max_value 100", - maxValue: 100, - want: []float64{1, 2, 5, 10, 20, 50, 100}, - }, - { - name: "max_value 999", - maxValue: 999, - want: []float64{1, 2, 5, 10, 20, 50, 100, 200, 500}, - }, - { - name: "max_value 1024", - maxValue: 1024, - want: []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000}, - }, - { - name: "max_value 4096", - maxValue: 4096, - want: []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000}, - }, - { - name: "max_value 32768", - maxValue: 32768, - want: []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000}, - }, - { - name: "max_value just below power of 10", - maxValue: 99, - want: []float64{1, 2, 5, 10, 20, 50}, - }, - { - name: "max_value negative", - maxValue: -1, - want: []float64{}, // no positive bucket <= -1 - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got := build125Buckets(tt.maxValue) - if !reflect.DeepEqual(got, tt.want) { - t.Errorf("build125Buckets(%d) = %v, want %v", tt.maxValue, got, tt.want) - } - }) - } -} - -func getFloatBucketMetricLine(model string, metric string, bucketBoundary float64, count int) string { - buckerBoundStr := "+Inf" - if bucketBoundary != math.Inf(1) { - buckerBoundStr = fmt.Sprintf("%g", bucketBoundary) - } - return fmt.Sprintf("%s_bucket{model_name=\"%s\",le=\"%s\"} %d", metric, model, buckerBoundStr, count) -} - -func checkBucketBoundary(metrics string, modelName string, metricName string, bucketBoudary float64, - prevBoundary float64, expectedValue float64) { - if expectedValue > prevBoundary && bucketBoudary >= expectedValue && (bucketBoudary-expectedValue) < 0.005 { - // expected time is too close to the bucket's boudary - // it's possiblt that in theory we expect 1 in this bucket but will get 0 and this situation is ok - // since there is some additional calculation time - fmt.Printf("Expected value is too close to the boundary - skip test for this bucket (%.4f - %.4f] and expected value %.4f\n", - prevBoundary, bucketBoudary, expectedValue) - return - } - expectedCount := 0 - if bucketBoudary > expectedValue { - expectedCount = 1 - } - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(modelName, metricName, bucketBoudary, expectedCount))) -} - -// send a single request with the given prompt and echo mode -func sendRequest(modelName string, prompt string, isStreaming bool, ttft int, prefillTimePerToken int, interTokenLatency int) *http.Client { - ctx := context.TODO() - args := []string{"cmd", "--model", modelName, "--mode", common.ModeEcho, - // "--kv-cache-transfer-latency", strconv.Itoa(kvcacheTransferLatency), - // "--kv-cache-transfer-time-per-token", strconv.Itoa(kvCacheTransferTimePerToken), - "--time-to-first-token", strconv.Itoa(ttft), - "--prefill-time-per-token", strconv.Itoa(prefillTimePerToken), - "--inter-token-latency", strconv.Itoa(interTokenLatency), - } - - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) - Expect(err).NotTo(HaveOccurred()) - - openaiclient, params := getOpenAIClientAndChatParams(client, modelName, prompt, isStreaming) - - // send a single request in a serial 
way - _, err = openaiclient.Chat.Completions.New(ctx, params) - Expect(err).NotTo(HaveOccurred()) - - return client -} - -func checkLatencyMertics(client *http.Client, modelName string, prompt string, ttft int, prefillTimePerToken int, interTokenLatency int) { - // wait a little bit and check metrics - time.Sleep(300 * time.Millisecond) - metricsResp, err := client.Get(metricsUrl) - Expect(err).NotTo(HaveOccurred()) - Expect(metricsResp.StatusCode).To(Equal(http.StatusOK)) - - data, err := io.ReadAll(metricsResp.Body) - Expect(err).NotTo(HaveOccurred()) - metrics := string(data) - - numOfTokens := len(common.Tokenize(prompt)) - var expectedPrefillTime float64 - // TODO take into consideration remote prefill - if ttft > 0 { - // time-to-first-token overwrites calculation of prefill time based on number of input tokens - expectedPrefillTime = float64(ttft) / 1000 - - } else { - expectedPrefillTime = float64(numOfTokens*prefillTimePerToken) / 1000 - } - expectedDecodeTime := float64(interTokenLatency*(numOfTokens-1)) / 1000 - expectedE2ELatency := expectedPrefillTime + expectedDecodeTime - - prevBoundary := math.Inf(-1) - - for _, bucketBoudary := range common.RequestLatencyBucketsBoundaries { - checkBucketBoundary(metrics, modelName, prefillTimeMetricName, bucketBoudary, prevBoundary, expectedPrefillTime) - checkBucketBoundary(metrics, modelName, decodeTimeMetricName, bucketBoudary, prevBoundary, expectedDecodeTime) - checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, bucketBoudary, prevBoundary, expectedE2ELatency) - - prevBoundary = bucketBoudary - } - // check the last bucket - lastBoundary := common.RequestLatencyBucketsBoundaries[len(common.RequestLatencyBucketsBoundaries)-1] - checkBucketBoundary(metrics, modelName, prefillTimeMetricName, math.Inf(1), lastBoundary, expectedPrefillTime) - checkBucketBoundary(metrics, modelName, decodeTimeMetricName, math.Inf(1), lastBoundary, expectedDecodeTime) - checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, math.Inf(1), lastBoundary, expectedE2ELatency) -} + }) +}) diff --git a/pkg/llm-d-inference-sim/seed_test.go b/pkg/llm-d-inference-sim/seed_test.go index 4e10f1f8..210419e1 100644 --- a/pkg/llm-d-inference-sim/seed_test.go +++ b/pkg/llm-d-inference-sim/seed_test.go @@ -31,11 +31,10 @@ var _ = Describe("Simulator with seed", func() { // use a function so that httpClient is captured when running func() { ctx := context.TODO() - client, err := startServerWithArgs(ctx, common.ModeRandom, - []string{"cmd", "--model", model, "--mode", common.ModeRandom, "--seed", "100"}, nil) + client, err := startServerWithArgs(ctx, []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--seed", "100"}) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClentAndCompletionParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClentAndCompletionParams(client, testModel, testUserMessage, false) params.MaxTokens = openai.Int(10) resp, err := openaiclient.Completions.New(ctx, params) Expect(err).NotTo(HaveOccurred()) @@ -67,7 +66,7 @@ var _ = Describe("Simulator with seed", func() { client, err := startServer(ctx, common.ModeRandom) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClentAndCompletionParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClentAndCompletionParams(client, testModel, testUserMessage, false) resp, err := openaiclient.Completions.New(ctx, params) Expect(err).NotTo(HaveOccurred()) Expect(resp.Choices).ShouldNot(BeEmpty()) 
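As a rough orientation only: the metrics_test.go hunk above converts TestBuild125Buckets into a Ginkgo table test but does not show the helper itself. A 1-2-5 bucket builder consistent with the expectations asserted in that table could look like the sketch below; the simulator's actual build125Buckets may differ, and the name build125BucketsSketch (and the placeholder package name) are used here to make clear this is an illustration, not the patched code.

    // Sketch only, assuming the behavior pinned down by the table test above.
    package sketch

    import "math"

    // build125BucketsSketch returns the 1, 2, 5, 10, 20, 50, ... series of
    // bucket boundaries that are less than or equal to maxValue.
    func build125BucketsSketch(maxValue int) []float64 {
        buckets := []float64{}
        for exp := 0; ; exp++ {
            for _, mantissa := range []float64{1, 2, 5} {
                boundary := mantissa * math.Pow10(exp)
                if boundary > float64(maxValue) {
                    // e.g. maxValue=99 stops after 50, maxValue=1024 stops after 1000
                    return buckets
                }
                buckets = append(buckets, boundary)
            }
        }
    }

For instance, build125BucketsSketch(99) yields [1 2 5 10 20 50] and build125BucketsSketch(1024) yields [1 2 5 10 20 50 100 200 500 1000], matching the "just below power of 10" and "1024" cases in the table.
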
diff --git a/pkg/llm-d-inference-sim/server_test.go b/pkg/llm-d-inference-sim/server_test.go index 1f610562..0f648681 100644 --- a/pkg/llm-d-inference-sim/server_test.go +++ b/pkg/llm-d-inference-sim/server_test.go @@ -63,7 +63,7 @@ var _ = Describe("Server", func() { ctx := context.TODO() args := []string{"cmd", "--model", qwenModelName, "--mode", common.ModeRandom, "--tokenizers-cache-dir", tmpDir, "--max-model-len", "2048"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) reqBody := `{ @@ -92,7 +92,7 @@ var _ = Describe("Server", func() { ctx := context.TODO() args := []string{"cmd", "--model", qwenModelName, "--mode", common.ModeRandom, "--tokenizers-cache-dir", tmpDir, "--max-model-len", "2048"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) reqBody := `{ @@ -129,7 +129,7 @@ var _ = Describe("Server", func() { os.Args = oldArgs }() - os.Args = []string{"cmd", "--model", model, "--ssl-certfile", certFile, "--ssl-keyfile", keyFile} + os.Args = []string{"cmd", "--model", testModel, "--ssl-certfile", certFile, "--ssl-keyfile", keyFile} config, err := common.ParseCommandParamsAndLoadConfig() Expect(err).NotTo(HaveOccurred()) Expect(config.SSLEnabled()).To(BeTrue()) @@ -143,7 +143,7 @@ var _ = Describe("Server", func() { os.Args = oldArgs }() - os.Args = []string{"cmd", "--model", model, "--self-signed-certs"} + os.Args = []string{"cmd", "--model", testModel, "--self-signed-certs"} config, err := common.ParseCommandParamsAndLoadConfig() Expect(err).NotTo(HaveOccurred()) Expect(config.SSLEnabled()).To(BeTrue()) @@ -168,7 +168,7 @@ var _ = Describe("Server", func() { certFile, _, err := GenerateTempCerts(tempDir) Expect(err).NotTo(HaveOccurred()) - os.Args = []string{"cmd", "--model", model, "--ssl-certfile", certFile} + os.Args = []string{"cmd", "--model", testModel, "--ssl-certfile", certFile} _, err = common.ParseCommandParamsAndLoadConfig() Expect(err).To(HaveOccurred()) Expect(err.Error()).To(ContainSubstring("both ssl-certfile and ssl-keyfile must be provided together")) @@ -176,7 +176,7 @@ var _ = Describe("Server", func() { _, keyFile, err := GenerateTempCerts(tempDir) Expect(err).NotTo(HaveOccurred()) - os.Args = []string{"cmd", "--model", model, "--ssl-keyfile", keyFile} + os.Args = []string{"cmd", "--model", testModel, "--ssl-keyfile", keyFile} _, err = common.ParseCommandParamsAndLoadConfig() Expect(err).To(HaveOccurred()) Expect(err.Error()).To(ContainSubstring("both ssl-certfile and ssl-keyfile must be provided together")) @@ -187,9 +187,9 @@ var _ = Describe("Server", func() { certFile, keyFile, err := GenerateTempCerts(tempDir) Expect(err).NotTo(HaveOccurred()) - args := []string{"cmd", "--model", model, "--mode", common.ModeRandom, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--ssl-certfile", certFile, "--ssl-keyfile", keyFile} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) resp, err := client.Get("https://localhost/health") @@ -198,8 +198,8 @@ var _ = Describe("Server", func() { }) It("Should start HTTPS server with self-signed certificates", func(ctx SpecContext) { - args := []string{"cmd", "--model", model, "--mode", common.ModeRandom, "--self-signed-certs"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, 
nil) + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--self-signed-certs"} + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) resp, err := client.Get("https://localhost/health") diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index a461ff01..010b82ad 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -18,13 +18,10 @@ package llmdinferencesim import ( "context" - "crypto/tls" "errors" "fmt" "io" - "net" "net/http" - "os" "strings" "github.com/llm-d/llm-d-inference-sim/pkg/common" @@ -34,89 +31,10 @@ import ( "github.com/openai/openai-go/v3" "github.com/openai/openai-go/v3/option" "github.com/openai/openai-go/v3/packages/param" - "github.com/valyala/fasthttp/fasthttputil" - "k8s.io/klog/v2" ) -const model = "my_model" -const qwenModelName = "Qwen/Qwen2-0.5B" -const baseURL = "http://localhost/v1" -const userMessage = "This is a test." const invalidMaxTokensErrMsg = "Max completion tokens and max tokens should be positive" -var userMsgTokens int64 - -func startServer(ctx context.Context, mode string) (*http.Client, error) { - return startServerWithArgs(ctx, mode, nil, nil) -} - -func startServerWithArgs(ctx context.Context, mode string, args []string, envs map[string]string) (*http.Client, error) { - oldArgs := os.Args - defer func() { - os.Args = oldArgs - }() - - if args != nil { - os.Args = args - } else { - os.Args = []string{"cmd", "--model", model, "--mode", mode} - } - - if envs != nil { - for k, v := range envs { - err := os.Setenv(k, v) - Expect(err).NotTo(HaveOccurred()) - } - - defer func() { - for k := range envs { - err := os.Unsetenv(k) - Expect(err).NotTo(HaveOccurred()) - } - }() - } - - logger := klog.Background() - - s, err := New(logger) - if err != nil { - return nil, err - } - config, err := common.ParseCommandParamsAndLoadConfig() - if err != nil { - return nil, err - } - s.config = config - - // calculate number of tokens for user message, - // must be activated after parseCommandParamsAndLoadConfig since it initializes the random engine - userMsgTokens = int64(len(common.Tokenize(userMessage))) - - if err := s.initializeSim(ctx); err != nil { - return nil, err - } - - listener := fasthttputil.NewInmemoryListener() - - // start the http server - go func() { - if err := s.startServer(ctx, listener); err != nil { - logger.Error(err, "error starting server") - } - }() - - return &http.Client{ - Transport: &http.Transport{ - DialContext: func(_ context.Context, _, _ string) (net.Conn, error) { - return listener.Dial() - }, - TLSClientConfig: &tls.Config{ - InsecureSkipVerify: true, - }, - }, - }, nil -} - var _ = Describe("Simulator", func() { DescribeTable("chat completions streaming", @@ -125,7 +43,7 @@ var _ = Describe("Simulator", func() { client, err := startServer(ctx, mode) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, true) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, true) stream := openaiclient.Chat.Completions.NewStreaming(ctx, params) defer func() { err := stream.Close() @@ -161,7 +79,7 @@ var _ = Describe("Simulator", func() { Expect(dataset.IsValidText(msg)).To(BeTrue()) } else { // in case of echo mode check that the text is returned as-is - Expect(msg).Should(Equal(userMessage)) + Expect(msg).Should(Equal(testUserMessage)) } Expect(role).Should(Equal("assistant")) }, @@ -178,7 
+96,7 @@ var _ = Describe("Simulator", func() { client, err := startServer(ctx, mode) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClentAndCompletionParams(client, model, userMessage, true) + openaiclient, params := getOpenAIClentAndCompletionParams(client, testModel, testUserMessage, true) stream := openaiclient.Completions.NewStreaming(ctx, params) defer func() { err := stream.Close() @@ -210,7 +128,7 @@ var _ = Describe("Simulator", func() { Expect(dataset.IsValidText(text)).To(BeTrue()) } else { // in case of echo mode check that the text is returned as-is - Expect(text).Should(Equal(userMessage)) + Expect(text).Should(Equal(testUserMessage)) } }, func(mode string) string { @@ -226,7 +144,7 @@ var _ = Describe("Simulator", func() { client, err := startServer(ctx, mode) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) numTokens := 0 // if maxTokens and maxCompletionTokens are passsed // maxCompletionTokens is used @@ -271,7 +189,7 @@ var _ = Describe("Simulator", func() { Expect(dataset.IsValidText(msg)).To(BeTrue()) } else { // in case of echo mode check that the text is returned as-is - Expect(msg).Should(Equal(userMessage)) + Expect(msg).Should(Equal(testUserMessage)) } } }, @@ -303,7 +221,7 @@ var _ = Describe("Simulator", func() { client, err := startServer(ctx, mode) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClentAndCompletionParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClentAndCompletionParams(client, testModel, testUserMessage, false) numTokens := 0 if maxTokens != 0 { params.MaxTokens = param.NewOpt(int64(maxTokens)) @@ -342,7 +260,7 @@ var _ = Describe("Simulator", func() { Expect(dataset.IsValidText(text)).To(BeTrue()) } else { // in case of echo mode check that the text is returned as-is - Expect(text).Should(Equal(userMessage)) + Expect(text).Should(Equal(testUserMessage)) } } }, @@ -433,10 +351,10 @@ var _ = Describe("Simulator", func() { podNameEnv: testPod, podNsEnv: testNamespace, } - client, err := startServerWithArgs(ctx, common.ModeRandom, nil, envs) + client, err := startServerWithEnv(ctx, common.ModeRandom, envs) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClentAndCompletionParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClentAndCompletionParams(client, testModel, testUserMessage, false) var httpResp *http.Response resp, err := openaiclient.Completions.New(ctx, params, option.WithResponseInto(&httpResp)) Expect(err).NotTo(HaveOccurred()) @@ -461,10 +379,10 @@ var _ = Describe("Simulator", func() { podNameEnv: testPod, podNsEnv: testNamespace, } - client, err := startServerWithArgs(ctx, common.ModeRandom, nil, envs) + client, err := startServerWithEnv(ctx, common.ModeRandom, envs) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClentAndCompletionParams(client, model, userMessage, true) + openaiclient, params := getOpenAIClentAndCompletionParams(client, testModel, testUserMessage, true) var httpResp *http.Response resp, err := openaiclient.Completions.New(ctx, params, option.WithResponseInto(&httpResp)) Expect(err).NotTo(HaveOccurred()) @@ -485,14 +403,14 @@ var _ = Describe("Simulator", func() { It("Should reject requests exceeding context window", func() { ctx := context.TODO() // Start server with max-model-len=10 - args := []string{"cmd", 
"--model", model, "--mode", common.ModeRandom, "--max-model-len", "10"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--max-model-len", "10"} + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) // Test with raw HTTP to verify the error response format reqBody := `{ "messages": [{"role": "user", "content": "This is a test message"}], - "model": "my_model", + "model": "testmodel", "max_tokens": 8 }` @@ -513,7 +431,7 @@ var _ = Describe("Simulator", func() { Expect(string(body)).To(ContainSubstring("BadRequestError")) // Also test with OpenAI client to ensure it gets an error - openaiclient, params := getOpenAIClientAndChatParams(client, model, "This is a test message", false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, "This is a test message", false) params.MaxTokens = openai.Int(8) _, err = openaiclient.Chat.Completions.New(ctx, params) @@ -526,11 +444,11 @@ var _ = Describe("Simulator", func() { It("Should accept requests within context window", func() { ctx := context.TODO() // Start server with max-model-len=50 - args := []string{"cmd", "--model", model, "--mode", common.ModeEcho, "--max-model-len", "50"} - client, err := startServerWithArgs(ctx, common.ModeEcho, args, nil) + args := []string{"cmd", "--model", testModel, "--mode", common.ModeEcho, "--max-model-len", "50"} + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, "Hello", false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, "Hello", false) params.MaxTokens = openai.Int(5) // Send a request within the context window @@ -538,20 +456,20 @@ var _ = Describe("Simulator", func() { Expect(err).NotTo(HaveOccurred()) Expect(resp.Choices).To(HaveLen(1)) - Expect(resp.Model).To(Equal(model)) + Expect(resp.Model).To(Equal(testModel)) }) It("Should handle text completion requests exceeding context window", func() { ctx := context.TODO() // Start server with max-model-len=10 - args := []string{"cmd", "--model", model, "--mode", common.ModeRandom, "--max-model-len", "10"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--max-model-len", "10"} + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) // Test with raw HTTP for text completion reqBody := `{ "prompt": "This is a long test prompt with many words", - "model": "my_model", + "model": "testmodel", "max_tokens": 5 }` @@ -571,58 +489,3 @@ var _ = Describe("Simulator", func() { }) }) }) - -func sendSimpleChatRequest(envs map[string]string, streaming bool) *http.Response { - ctx := context.TODO() - - client, err := startServerWithArgs(ctx, common.ModeRandom, nil, envs) - Expect(err).NotTo(HaveOccurred()) - - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, streaming) - var httpResp *http.Response - resp, err := openaiclient.Chat.Completions.New(ctx, params, option.WithResponseInto(&httpResp)) - Expect(err).NotTo(HaveOccurred()) - Expect(resp).NotTo(BeNil()) - - Expect(resp.Choices).ShouldNot(BeEmpty()) - Expect(string(resp.Object)).To(Equal(chatCompletionObject)) - - return httpResp -} - -func getOpenAIClientAndChatParams(client option.HTTPClient, model string, message string, - streaming bool) (openai.Client, openai.ChatCompletionNewParams) { - 
openaiclient := openai.NewClient( - option.WithBaseURL(baseURL), - option.WithHTTPClient(client)) - - params := openai.ChatCompletionNewParams{ - Messages: []openai.ChatCompletionMessageParamUnion{ - openai.UserMessage(message), - }, - Model: model, - } - if streaming { - params.StreamOptions = openai.ChatCompletionStreamOptionsParam{IncludeUsage: param.NewOpt(true)} - } - return openaiclient, params -} - -// nolint -func getOpenAIClentAndCompletionParams(client option.HTTPClient, model string, message string, - streaming bool) (openai.Client, openai.CompletionNewParams) { - openaiclient := openai.NewClient( - option.WithBaseURL(baseURL), - option.WithHTTPClient(client)) - - params := openai.CompletionNewParams{ - Prompt: openai.CompletionNewParamsPromptUnion{ - OfString: openai.String(message), - }, - Model: openai.CompletionNewParamsModel(model), - } - if streaming { - params.StreamOptions = openai.ChatCompletionStreamOptionsParam{IncludeUsage: param.NewOpt(true)} - } - return openaiclient, params -} diff --git a/pkg/llm-d-inference-sim/test_utils.go b/pkg/llm-d-inference-sim/test_utils.go new file mode 100644 index 00000000..1919ec1c --- /dev/null +++ b/pkg/llm-d-inference-sim/test_utils.go @@ -0,0 +1,427 @@ +package llmdinferencesim + +import ( + "context" + "crypto/tls" + "errors" + "fmt" + "io" + "math" + "net" + "net/http" + "os" + "regexp" + "sort" + "strconv" + "strings" + "time" + + "github.com/llm-d/llm-d-inference-sim/pkg/common" + "github.com/openai/openai-go/v3" + "github.com/openai/openai-go/v3/option" + "github.com/openai/openai-go/v3/packages/param" + "github.com/valyala/fasthttp/fasthttputil" + "k8s.io/klog/v2" + + "github.com/onsi/gomega" +) + +const ( + qwenModelName = "Qwen/Qwen2-0.5B" + baseURL = "http://localhost/v1" + testModel = "testmodel" + testUserMessage = "This is a test." 
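+ // metricsUrl is the endpoint used by the tests to read the simulator's Prometheus metrics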
+ metricsUrl = "http://localhost/metrics" +) + +var userMsgTokens int64 + +// Starts server in the given mode, no additional arguments or environment variables +func startServer(ctx context.Context, mode string) (*http.Client, error) { + return startServerWithArgsAndEnv(ctx, mode, nil, nil) +} + +// Starts server in the given mode and environment variables +func startServerWithEnv(ctx context.Context, mode string, envs map[string]string) (*http.Client, error) { + return startServerWithArgsAndEnv(ctx, mode, nil, envs) +} + +// Starts server according the given arguments +// if args are defined - mode defined in args will override the mode defined by the mode parameter +func startServerWithArgs(ctx context.Context, args []string) (*http.Client, error) { + return startServerWithArgsAndEnv(ctx, "", args, nil) +} + +// Starts server according the given parmaters: mode, arguments and environment +// if args are defined - the mode parameter is discarded, value from args is used +func startServerWithArgsAndEnv(ctx context.Context, mode string, args []string, envs map[string]string) (*http.Client, error) { + oldArgs := os.Args + defer func() { + os.Args = oldArgs + }() + + if args != nil { + os.Args = args + } else { + os.Args = []string{"cmd", "--model", testModel, "--mode", mode} + } + + if envs != nil { + for k, v := range envs { + err := os.Setenv(k, v) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + defer func() { + for k := range envs { + err := os.Unsetenv(k) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + } + + logger := klog.Background() + + s, err := New(logger) + if err != nil { + return nil, err + } + config, err := common.ParseCommandParamsAndLoadConfig() + if err != nil { + return nil, err + } + s.config = config + + // calculate number of tokens for user message, + // must be activated after parseCommandParamsAndLoadConfig since it initializes the random engine + userMsgTokens = int64(len(common.Tokenize(testUserMessage))) + + if err := s.initializeSim(ctx); err != nil { + return nil, err + } + + listener := fasthttputil.NewInmemoryListener() + + // start the http server + go func() { + if err := s.startServer(ctx, listener); err != nil { + logger.Error(err, "error starting server") + } + }() + + return &http.Client{ + Transport: &http.Transport{ + DialContext: func(_ context.Context, _, _ string) (net.Conn, error) { + return listener.Dial() + }, + TLSClientConfig: &tls.Config{ + InsecureSkipVerify: true, + }, + }, + }, nil +} + +// startServerAndSendRequest - starts server configured according the given latency parameters in echo mode, +// sends a single request with the given prompt +func startServerAndSendRequest(modelName string, prompt string, isStreaming bool, ttft int, prefillTimePerToken int, interTokenLatency int) *http.Client { + ctx := context.TODO() + args := []string{"cmd", "--model", modelName, "--mode", common.ModeEcho, + // "--kv-cache-transfer-latency", strconv.Itoa(kvcacheTransferLatency), + // "--kv-cache-transfer-time-per-token", strconv.Itoa(kvCacheTransferTimePerToken), + "--time-to-first-token", strconv.Itoa(ttft), + "--prefill-time-per-token", strconv.Itoa(prefillTimePerToken), + "--inter-token-latency", strconv.Itoa(interTokenLatency), + } + + client, err := startServerWithArgs(ctx, args) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + openaiclient, params := getOpenAIClientAndChatParams(client, modelName, prompt, isStreaming) + + // send a single request in a serial way + _, err = openaiclient.Chat.Completions.New(ctx, params) + 
gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + return client +} + +// sendSimpleChatRequest starts server using the given environment variables and sends one chat completions request +func sendSimpleChatRequest(envs map[string]string, streaming bool) *http.Response { + ctx := context.TODO() + + client, err := startServerWithEnv(ctx, common.ModeRandom, envs) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, streaming) + var httpResp *http.Response + resp, err := openaiclient.Chat.Completions.New(ctx, params, option.WithResponseInto(&httpResp)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(resp).NotTo(gomega.BeNil()) + + gomega.Expect(resp.Choices).ShouldNot(gomega.BeEmpty()) + gomega.Expect(string(resp.Object)).To(gomega.Equal(chatCompletionObject)) + + return httpResp +} + +// getOpenAIClientAndChatParams - creates an openai client and params for /chat/completions call based on the given parameters +func getOpenAIClientAndChatParams(client option.HTTPClient, model string, message string, + streaming bool) (openai.Client, openai.ChatCompletionNewParams) { + openaiclient := openai.NewClient( + option.WithBaseURL(baseURL), + option.WithHTTPClient(client)) + + params := openai.ChatCompletionNewParams{ + Messages: []openai.ChatCompletionMessageParamUnion{ + openai.UserMessage(message), + }, + Model: model, + } + if streaming { + params.StreamOptions = openai.ChatCompletionStreamOptionsParam{IncludeUsage: param.NewOpt(true)} + } + return openaiclient, params +} + +// nolint +// getOpenAIClentAndCompletionParams - creates an openai client and params for /completions call based on the given parameters +func getOpenAIClentAndCompletionParams(client option.HTTPClient, model string, message string, + streaming bool) (openai.Client, openai.CompletionNewParams) { + openaiclient := openai.NewClient( + option.WithBaseURL(baseURL), + option.WithHTTPClient(client)) + + params := openai.CompletionNewParams{ + Prompt: openai.CompletionNewParamsPromptUnion{ + OfString: openai.String(message), + }, + Model: openai.CompletionNewParamsModel(model), + } + if streaming { + params.StreamOptions = openai.ChatCompletionStreamOptionsParam{IncludeUsage: param.NewOpt(true)} + } + return openaiclient, params +} + +// isLoraMetricPresent checks if a matching metric exists +// metrics: the list of metrics +// running: list of loras in running_lora_adapters, the order does not matter +// waiting: list of loras in waiting_lora_adapters, the order does not matter +func isLoraMetricPresent(metrics []string, running, waiting []string) bool { + return findLoraMetric(metrics, running, waiting) != "" +} + +// getLoraTimestamp returns timestamp or nil, error +func getLoraTimestamp(metrics []string, running, waiting []string) (*float64, error) { + metric := findLoraMetric(metrics, running, waiting) + if metric == "" { + return nil, nil // not found + } + return extractTimestamp(metric) +} + +// extractTimestamp gets timestamp from the given metric +func extractTimestamp(metric string) (*float64, error) { + // Extract timestamp: last part after space + parts := strings.Split(metric, " ") + if len(parts) < 2 { + return nil, errors.New("invalid metric format") + } + timestampStr := parts[len(parts)-1] + timestamp, err := strconv.ParseFloat(timestampStr, 64) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + return &timestamp, nil +} + +func getLoraValidTimestamp(metrics []string, running, waiting []string) float64 { + 
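// fails the test if the matching lora metric is missing or its timestamp cannot be parsed +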
timestamp, err := getLoraTimestamp(metrics, running, waiting) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(timestamp).ToNot(gomega.BeNil()) + return *timestamp +} + +func getLastLoraMetrics(metrics []string) ([]string, error) { + lastTimestamp := float64(0) + var lastMetrics []string + for _, metric := range metrics { + if strings.HasPrefix(metric, "vllm:lora_requests_info") { + timestamp, err := extractTimestamp(metric) + if err != nil { + return nil, err + } + if lastTimestamp > *timestamp { + continue + } + if lastTimestamp < *timestamp { + // a newer timestamp was found - drop the metrics collected for older timestamps + lastMetrics = make([]string, 0) + } + lastTimestamp = *timestamp + lastMetrics = append(lastMetrics, metric) + } + } + return lastMetrics, nil +} + +// findLoraMetric finds the relevant metric by comparing with the given loras sets (ignoring order) +// metrics: lines of metrics +// running: list of running loras to find +// waiting: list of waiting loras to find +// Looks for a line with the given running and waiting loras sets, the comparison is order agnostic. +// The returned metric must match both the running and the waiting sets. +// E.g. for input running=["l1", "l2", "l3"] and waiting=[], a metric +// with running_lora_adapters=["l3", "l1", "l2"] and waiting_lora_adapters=[] is returned +func findLoraMetric(metrics []string, running, waiting []string) string { + // sort input arrays before comparing, create a string of all values separated by commas + sort.Strings(running) + sort.Strings(waiting) + runStr := strings.Join(running, ",") + waitStr := strings.Join(waiting, ",") + + // regex to extract lora metrics and values + re := regexp.MustCompile(`vllm:lora_requests_info\{.*running_lora_adapters="([^"]*)".*waiting_lora_adapters="([^"]*)".*\}\s+([0-9.e\+\-]+)`) + for _, metric := range metrics { + matches := re.FindStringSubmatch(metric) + if len(matches) == 4 { + // this line contains loraInfo metric, check running and waiting loras lists + // split and sort metric's running and waiting loras lists for the comparison + metricRun := splitString(matches[1]) + metricWait := splitString(matches[2]) + sort.Strings(metricRun) + sort.Strings(metricWait) + // if both lists are the same - return the metric + if strings.Join(metricRun, ",") == runStr && strings.Join(metricWait, ",") == waitStr { + return metric + } + } // if the metric is not in the required format - skip it + } + + // required metric was not found + return "" +} + +// splits the given string into an array of strings with separator "," +func splitString(str string) []string { + if str == "" { + return []string{} + } + return strings.Split(str, ",") +} + +// findMetric returns the value for the first metric with the given prefix or an empty string if not found +func findMetric(metrics []string, metricPrefix string) string { + // look for the first metric line that contains the given prefix + for _, metric := range metrics { + if strings.Contains(metric, metricPrefix) { + arr := strings.Split(metric, " ") + if len(arr) == 2 { + return arr[1] + } + break + } + } + // required metric was not found + return "" +} + +// findIntMetric returns the value for the first metric with the given prefix as an int, or nil if not found +func findIntMetric(metrics []string, metricPrefix string) *int { + valueStr := findMetric(metrics, metricPrefix) + if valueStr == "" { + return nil + } + + val, err := strconv.Atoi(valueStr) + if err != nil { + return nil + } + return &val +} + +// getFloatBucketMetricLine builds a string which defines a bucket metric line for the given parameters +// model the model name +// metric the metric 
name +// bucketBoundary the upper bucket boundary, Inf(1) defines the last bucket +// count bucket samples count +func getFloatBucketMetricLine(model string, metric string, bucketBoundary float64, count int) string { + buckerBoundStr := "+Inf" + if bucketBoundary != math.Inf(1) { + buckerBoundStr = fmt.Sprintf("%g", bucketBoundary) + } + return fmt.Sprintf("%s_bucket{model_name=\"%s\",le=\"%s\"} %d", metric, model, buckerBoundStr, count) +} + +// checkBucketBoundary checks that the given bucket's samples count is valid according to the given parameters +// The scenario is a single request, so bucket counts can only be 0 or 1. +// Buckets lower than the expected value should have count 0, other buckets - count 1. +// Important note: since metrics represent real timing, the measured value can be a little higher than the expected one, +// which is based on the pure latency calculations. If the expected value is equal or very close to the +// upper boundary, we can get either count (0 or 1), so in this case we don't check this bucket +// metrics the full metrics response +// modelName the model name +// metricName the specific metric name +// bucketBoudary the upper boundary of the required bucket +// prevBoundary the upper boundary of the previous bucket +// expectedValue expected value in the histogram +func checkBucketBoundary(metrics string, modelName string, metricName string, bucketBoudary float64, + prevBoundary float64, expectedValue float64) { + if expectedValue > prevBoundary && bucketBoudary >= expectedValue && (bucketBoudary-expectedValue) < 0.005 { + // the expected time is too close to the bucket's boundary + // it's possible that in theory we expect 1 in this bucket but will get 0, and this situation is ok + // since there is some additional calculation time + fmt.Printf("Expected value is too close to the boundary - skip test for this bucket (%.4f - %.4f] and expected value %.4f\n", + prevBoundary, bucketBoudary, expectedValue) + return + } + expectedCount := 0 + if bucketBoudary > expectedValue { + expectedCount = 1 + } + gomega.Expect(metrics).To(gomega.ContainSubstring(getFloatBucketMetricLine(modelName, metricName, bucketBoudary, expectedCount))) +} + +// checkLatencyMertics sends a /metrics request and checks that latency related values are valid +// client the http client to be used for sending the request +// modelName the model name +// numOfInputTokens number of tokens in the input of the completion request we want to validate +// numOfOutputTokens number of tokens in the output of the completion request we want to validate +// ttft time to first token parameter +// prefillTimePerToken prefill time per input token +// interTokenLatency processing time per output token +func checkLatencyMertics(client *http.Client, modelName string, numOfInputTokens int, numOfOutputTokens int, ttft int, prefillTimePerToken int, interTokenLatency int) { + // wait a little bit and check metrics + time.Sleep(300 * time.Millisecond) + metricsResp, err := client.Get(metricsUrl) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(metricsResp.StatusCode).To(gomega.Equal(http.StatusOK)) + + data, err := io.ReadAll(metricsResp.Body) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + metrics := string(data) + + var expectedPrefillTime float64 + // TODO take into consideration remote prefill + if ttft > 0 { + // time-to-first-token overrides the prefill time calculation based on the number of input tokens + expectedPrefillTime = float64(ttft) / 1000 + + } else { + expectedPrefillTime = float64(numOfInputTokens*prefillTimePerToken) / 1000 + } + expectedDecodeTime := float64(interTokenLatency*(numOfOutputTokens-1)) / 
1000 + expectedE2ELatency := expectedPrefillTime + expectedDecodeTime + + prevBoundary := math.Inf(-1) + + for _, bucketBoudary := range common.RequestLatencyBucketsBoundaries { + checkBucketBoundary(metrics, modelName, prefillTimeMetricName, bucketBoudary, prevBoundary, expectedPrefillTime) + checkBucketBoundary(metrics, modelName, decodeTimeMetricName, bucketBoudary, prevBoundary, expectedDecodeTime) + checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, bucketBoudary, prevBoundary, expectedE2ELatency) + + prevBoundary = bucketBoudary + } + // check the last bucket + lastBoundary := common.RequestLatencyBucketsBoundaries[len(common.RequestLatencyBucketsBoundaries)-1] + checkBucketBoundary(metrics, modelName, prefillTimeMetricName, math.Inf(1), lastBoundary, expectedPrefillTime) + checkBucketBoundary(metrics, modelName, decodeTimeMetricName, math.Inf(1), lastBoundary, expectedDecodeTime) + checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, math.Inf(1), lastBoundary, expectedE2ELatency) +} diff --git a/pkg/llm-d-inference-sim/tools_test.go b/pkg/llm-d-inference-sim/tools_test.go index c504c5d5..431d742e 100644 --- a/pkg/llm-d-inference-sim/tools_test.go +++ b/pkg/llm-d-inference-sim/tools_test.go @@ -377,7 +377,7 @@ var _ = Describe("Simulator for request with tools", func() { client, err := startServer(ctx, mode) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, true) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, true) params.ToolChoice = openai.ChatCompletionToolChoiceOptionUnionParam{OfAuto: param.NewOpt("required")} params.Tools = tools @@ -459,7 +459,7 @@ var _ = Describe("Simulator for request with tools", func() { client, err := startServer(ctx, mode) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) params.ToolChoice = openai.ChatCompletionToolChoiceOptionUnionParam{OfAuto: param.NewOpt("required")} params.Tools = tools @@ -569,8 +569,8 @@ var _ = Describe("Simulator for request with tools", func() { for _, invalidTool := range invalidTools { params := openai.ChatCompletionNewParams{ - Messages: []openai.ChatCompletionMessageParamUnion{openai.UserMessage(userMessage)}, - Model: model, + Messages: []openai.ChatCompletionMessageParamUnion{openai.UserMessage(testUserMessage)}, + Model: testModel, ToolChoice: openai.ChatCompletionToolChoiceOptionUnionParam{OfAuto: param.NewOpt("required")}, Tools: invalidTool, } @@ -588,16 +588,16 @@ var _ = Describe("Simulator for request with tools", func() { DescribeTable("array parameter, no streaming", func(mode string, minLength int, maxLength int, min float64, max float64) { ctx := context.TODO() - serverArgs := []string{"cmd", "--model", model, "--mode", mode, + serverArgs := []string{"cmd", "--model", testModel, "--mode", mode, "--min-tool-call-array-param-length", strconv.Itoa(minLength), "--max-tool-call-array-param-length", strconv.Itoa(maxLength), "--min-tool-call-number-param", fmt.Sprint(min), "--max-tool-call-number-param", fmt.Sprint(max), } - client, err := startServerWithArgs(ctx, common.ModeEcho, serverArgs, nil) + client, err := startServerWithArgs(ctx, serverArgs) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiclient, params := 
getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) params.ToolChoice = openai.ChatCompletionToolChoiceOptionUnionParam{OfAuto: param.NewOpt("required")} params.Tools = toolWithArray @@ -646,7 +646,7 @@ var _ = Describe("Simulator for request with tools", func() { client, err := startServer(ctx, mode) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) params.ToolChoice = openai.ChatCompletionToolChoiceOptionUnionParam{OfAuto: param.NewOpt("required")} params.Tools = toolWith3DArray @@ -699,7 +699,7 @@ var _ = Describe("Simulator for request with tools", func() { client, err := startServer(ctx, mode) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) params.ToolChoice = openai.ChatCompletionToolChoiceOptionUnionParam{OfAuto: param.NewOpt("required")} params.Tools = toolWithWrongMinMax @@ -718,7 +718,7 @@ var _ = Describe("Simulator for request with tools", func() { client, err := startServer(ctx, mode) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) params.ToolChoice = openai.ChatCompletionToolChoiceOptionUnionParam{OfAuto: param.NewOpt("required")} params.Tools = toolWithObjects @@ -773,7 +773,7 @@ var _ = Describe("Simulator for request with tools", func() { client, err := startServer(ctx, mode) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) params.ToolChoice = openai.ChatCompletionToolChoiceOptionUnionParam{OfAuto: param.NewOpt("required")} params.Tools = toolWithObjectAndArray @@ -818,13 +818,13 @@ var _ = Describe("Simulator for request with tools", func() { DescribeTable("tool with not required params", func(probability int, numberOfParams int) { ctx := context.TODO() - serverArgs := []string{"cmd", "--model", model, "--mode", common.ModeRandom, + serverArgs := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--tool-call-not-required-param-probability", strconv.Itoa(probability), } - client, err := startServerWithArgs(ctx, common.ModeEcho, serverArgs, nil) + client, err := startServerWithArgs(ctx, serverArgs) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) params.ToolChoice = openai.ChatCompletionToolChoiceOptionUnionParam{OfAuto: param.NewOpt("required")} params.Tools = toolWithoutRequiredParams @@ -854,15 +854,15 @@ var _ = Describe("Simulator for request with tools", func() { DescribeTable("tool with object with not required params", func(probability int, numberOfParams int, min int, max int) { ctx := context.TODO() - serverArgs := []string{"cmd", "--model", model, "--mode", common.ModeRandom, + serverArgs := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--object-tool-call-not-required-field-probability", strconv.Itoa(probability), "--min-tool-call-integer-param", 
strconv.Itoa(min), "--max-tool-call-integer-param", strconv.Itoa(max), } - client, err := startServerWithArgs(ctx, common.ModeEcho, serverArgs, nil) + client, err := startServerWithArgs(ctx, serverArgs) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) params.ToolChoice = openai.ChatCompletionToolChoiceOptionUnionParam{OfAuto: param.NewOpt("required")} params.Tools = toolWithObjectWithoutRequiredParams diff --git a/pkg/llm-d-inference-sim/worker_test.go b/pkg/llm-d-inference-sim/worker_test.go index bec8bcb6..21181842 100644 --- a/pkg/llm-d-inference-sim/worker_test.go +++ b/pkg/llm-d-inference-sim/worker_test.go @@ -33,18 +33,16 @@ import ( "github.com/openai/openai-go/v3/option" ) -const modelName = "testmodel" - var _ = Describe("Simulator requests scheduling", Ordered, func() { Context("Requests for already loaded loras should be handled first", func() { DescribeTable("Should process in correct order simultaneous requests to two loras", func(maxNumSeq string) { ctx := context.TODO() - args := []string{"cmd", "--model", model, "--mode", common.ModeEcho, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeEcho, "--time-to-first-token", "500", "--max-num-seqs", maxNumSeq, "--lora-modules", "{\"name\":\"lora1\",\"path\":\"/path/to/lora1\"}", "{\"name\":\"lora2\",\"path\":\"/path/to/lora2\"}"} - client, err := startServerWithArgs(ctx, common.ModeEcho, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient(option.WithBaseURL(baseURL), option.WithHTTPClient(client)) @@ -86,13 +84,13 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { DescribeTable("Should process in correct order delayed requests to two loras", func(maxNumSeq string, maxLoras string, checkOrder func([]int)) { ctx := context.TODO() - args := []string{"cmd", "--model", model, "--mode", common.ModeEcho, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeEcho, "--time-to-first-token", "1000", "--max-num-seqs", maxNumSeq, "--max-loras", maxLoras, "--lora-modules", "{\"name\":\"lora1\",\"path\":\"/path/to/lora1\"}", "{\"name\":\"lora2\",\"path\":\"/path/to/lora2\"}"} - client, err := startServerWithArgs(ctx, common.ModeEcho, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient(option.WithBaseURL(baseURL), @@ -127,7 +125,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { It("Should keep the order of requests with one worker", func() { ctx := context.TODO() - args := []string{"cmd", "--model", model, "--mode", common.ModeEcho, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeEcho, "--time-to-first-token", "500", "--max-num-seqs", "1", "--max-loras", "1", "--lora-modules", @@ -136,7 +134,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { "{\"name\":\"lora4\",\"path\":\"/path/to/lora4\"}", "{\"name\":\"lora2\",\"path\":\"/path/to/lora2\"}"} - client, err := startServerWithArgs(ctx, common.ModeEcho, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient(option.WithBaseURL(baseURL), @@ -167,7 +165,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { It("Should keep the order of requests with two workers", func() { ctx := 
context.TODO() - args := []string{"cmd", "--model", model, "--mode", common.ModeEcho, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeEcho, "--time-to-first-token", "500", "--max-num-seqs", "2", "--max-loras", "1", "--lora-modules", @@ -176,7 +174,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { "{\"name\":\"lora4\",\"path\":\"/path/to/lora4\"}", "{\"name\":\"lora2\",\"path\":\"/path/to/lora2\"}"} - client, err := startServerWithArgs(ctx, common.ModeEcho, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient(option.WithBaseURL(baseURL), @@ -207,7 +205,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { DescribeTable("Should keep the order of requests with multiple workers and loras", func(maxNumSeq string, maxLoras string, checkOrder func([]int)) { ctx := context.TODO() - args := []string{"cmd", "--model", model, "--mode", common.ModeEcho, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeEcho, "--time-to-first-token", "1000", "--max-num-seqs", maxNumSeq, "--max-loras", maxLoras, "--lora-modules", @@ -217,7 +215,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { "{\"name\":\"lora5\",\"path\":\"/path/to/lora5\"}", "{\"name\":\"lora2\",\"path\":\"/path/to/lora2\"}"} - client, err := startServerWithArgs(ctx, common.ModeEcho, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient(option.WithBaseURL(baseURL), @@ -258,7 +256,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { Context("Stress", func() { It("Should work correctly with many simultaneous requests", func() { ctx := context.TODO() - args := []string{"cmd", "--model", modelName, "--mode", common.ModeRandom, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--time-to-first-token", "3000", "--max-num-seqs", "12", "--max-loras", "2", "--lora-modules", "{\"name\":\"lora0\",\"path\":\"/path/to/lora0\"}", @@ -268,7 +266,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { "{\"name\":\"lora4\",\"path\":\"/path/to/lora4\"}", } - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient( @@ -282,7 +280,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { defer GinkgoRecover() params := openai.ChatCompletionNewParams{ Messages: []openai.ChatCompletionMessageParamUnion{ - openai.UserMessage(userMessage), + openai.UserMessage(testUserMessage), }, Model: fmt.Sprintf("lora%d", i%5), } @@ -331,7 +329,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { runningMetric := "vllm:num_requests_running{model_name=\"testmodel\"}" waitingMetric := "vllm:num_requests_waiting{model_name=\"testmodel\"}" ctx := context.TODO() - args := []string{"cmd", "--model", modelName, "--mode", common.ModeRandom, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--time-to-first-token", "2000", "--time-to-first-token-std-dev", "600", "--max-num-seqs", "1000", "--max-loras", "2", "--max-waiting-queue-length", "1500", "--lora-modules", @@ -339,7 +337,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { "{\"name\":\"lora1\",\"path\":\"/path/to/lora1\"}", } - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := 
startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient( @@ -353,7 +351,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { defer GinkgoRecover() params := openai.ChatCompletionNewParams{ Messages: []openai.ChatCompletionMessageParamUnion{ - openai.UserMessage(userMessage), + openai.UserMessage(testUserMessage), }, Model: fmt.Sprintf("lora%d", i%2), } @@ -392,7 +390,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { defer GinkgoRecover() params := openai.ChatCompletionNewParams{ Messages: []openai.ChatCompletionMessageParamUnion{ - openai.UserMessage(userMessage), + openai.UserMessage(testUserMessage), }, Model: fmt.Sprintf("lora%d", i%2), } From 031e4619c95a3e8be3b11ffee2faf47611e16c10 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Mon, 27 Oct 2025 22:48:33 +0200 Subject: [PATCH 07/14] Add test for vllm:request_queue_time_seconds and vllm:request_inference_time_seconds Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/metrics_test.go | 53 ++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go index 4cc1b948..8d2cee97 100644 --- a/pkg/llm-d-inference-sim/metrics_test.go +++ b/pkg/llm-d-inference-sim/metrics_test.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "io" + "math" "net/http" "os" "strings" @@ -808,7 +809,7 @@ var _ = Describe("Simulator metrics", Ordered, func() { }) }) - Context("latency metrics", func() { + Context("single request latency metrics", func() { numOfTokens := len(common.Tokenize(testUserMessage)) DescribeTable("should calculate all latency related metrics correctly for a single request", @@ -831,6 +832,56 @@ var _ = Describe("Simulator metrics", Ordered, func() { Entry(nil, "prefill per token + inter token time", 0, 100, 100), ) }) + + Context("multiple requests latency metrics", func() { + It("should calculate waiting and inference time correctly", func() { + ctx := context.TODO() + args := []string{"cmd", "--model", testModel, "--mode", common.ModeEcho, + "--time-to-first-token", "1000", "--max-num-seqs", "1", + } + + client, err := startServerWithArgs(ctx, args) + Expect(err).NotTo(HaveOccurred()) + + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) + + var reqWg sync.WaitGroup + reqWg.Add(2) + + // send two requests + for range 2 { + go func() { + defer reqWg.Done() + defer GinkgoRecover() + + _, err := openaiclient.Chat.Completions.New(ctx, params) + Expect(err).NotTo(HaveOccurred()) + }() + } + + reqWg.Wait() + time.Sleep(300 * time.Millisecond) + metricsResp, err := client.Get(metricsUrl) + Expect(err).NotTo(HaveOccurred()) + Expect(metricsResp.StatusCode).To(Equal(http.StatusOK)) + + data, err := io.ReadAll(metricsResp.Body) + Expect(err).NotTo(HaveOccurred()) + metrics := string(data) + + for _, boundary := range common.RequestLatencyBucketsBoundaries { + if boundary < 1.5 { + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, reqInferenceTimeMetricName, boundary, 0))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, reqQueueTimeMetricName, boundary, 0))) + } else { + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, reqInferenceTimeMetricName, boundary, 2))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, reqQueueTimeMetricName, boundary, 1))) + } + } + 
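// cumulative +Inf buckets: inference time was recorded for both requests, queue time only for the request that had to wait +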
Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, reqInferenceTimeMetricName, math.Inf(1), 2))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, reqQueueTimeMetricName, math.Inf(1), 1))) + }) + }) }) var _ = Describe("build125Buckets", Ordered, func() { From e16f9f9fbdc352f05477e13cdf4d42f30e75db40 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Tue, 28 Oct 2025 10:18:19 +0200 Subject: [PATCH 08/14] Define constant for metrics names, use helper functions in metrics test for histogram buckets validation Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/metrics.go | 15 +- pkg/llm-d-inference-sim/metrics_test.go | 250 ++++++++---------------- pkg/llm-d-inference-sim/test_utils.go | 7 +- 3 files changed, 94 insertions(+), 178 deletions(-) diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go index 4322b024..725b3c6c 100644 --- a/pkg/llm-d-inference-sim/metrics.go +++ b/pkg/llm-d-inference-sim/metrics.go @@ -38,6 +38,11 @@ const ( reqInferenceTimeMetricName = "vllm:request_inference_time_seconds" prefillTimeMetricName = "vllm:request_prefill_time_seconds" decodeTimeMetricName = "vllm:request_decode_time_seconds" + ttftMetricName = "vllm:time_to_first_token_seconds" + tpotMetricName = "vllm:time_per_output_token_seconds" + generationTokensMetricName = "vllm:request_generation_tokens" + paramMaxTokensMetricName = "vllm:request_params_max_tokens" + promptTokensMetricName = "vllm:request_prompt_tokens" ) // createAndRegisterPrometheus creates and registers prometheus metrics used by vLLM simulator @@ -92,7 +97,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.ttft = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", - Name: "vllm:time_to_first_token_seconds", + Name: ttftMetricName, Help: "Histogram of time to first token in seconds.", Buckets: common.TTFTBucketsBoundaries, }, @@ -107,7 +112,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.tpot = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", - Name: "vllm:time_per_output_token_seconds", + Name: tpotMetricName, Help: "Histogram of time per output token in seconds.", Buckets: common.TPOTBucketsBoundaries, }, @@ -211,7 +216,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.requestPromptTokens = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", - Name: "vllm:request_prompt_tokens", + Name: promptTokensMetricName, Help: "Number of prefill tokens processed.", Buckets: build125Buckets(s.config.MaxModelLen), }, @@ -225,7 +230,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.requestGenerationTokens = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", - Name: "vllm:request_generation_tokens", + Name: generationTokensMetricName, Help: "Number of generation tokens processed.", Buckets: build125Buckets(s.config.MaxModelLen), }, @@ -239,7 +244,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.requestParamsMaxTokens = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", - Name: "vllm:request_params_max_tokens", + Name: paramMaxTokensMetricName, Help: "Histogram of the max_tokens request parameter.", Buckets: build125Buckets(s.config.MaxModelLen), }, diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go index 8d2cee97..b55068f8 100644 --- a/pkg/llm-d-inference-sim/metrics_test.go +++ 
b/pkg/llm-d-inference-sim/metrics_test.go @@ -148,30 +148,21 @@ var _ = Describe("Simulator metrics", Ordered, func() { data, err := io.ReadAll(metricsResp.Body) Expect(err).NotTo(HaveOccurred()) metrics := string(data) - // request_prompt_tokens_bucket - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="1"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="2"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="5"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="10"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="20"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="50"} 1`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="100"} 1`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="200"} 1`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="500"} 1`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="1000"} 1`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="+Inf"} 1`)) - // request_params_max_tokens_bucket - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="1"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="2"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="5"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="10"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="20"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="50"} 1`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="100"} 1`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="200"} 1`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="500"} 1`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="1000"} 1`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="+Inf"} 1`)) + // request_prompt_tokens_bucket and request_params_max_tokens_bucket + buckets := build125Buckets(1024) + + for _, boundary := range buckets { + if boundary <= 20 { + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, boundary, 0))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, boundary, 0))) + } else { + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, boundary, 1))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, boundary, 1))) + } + } + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, math.Inf(1), 1))) + 
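// the +Inf bucket of a histogram always equals the total number of observed samples +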
Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, math.Inf(1), 1))) + // request_generation_tokens // We do not verify the distribution of the number of tokens generated per request, // as the number of generated tokens is unpredictable in this test. @@ -420,84 +411,35 @@ var _ = Describe("Simulator metrics", Ordered, func() { data, err := io.ReadAll(metricsResp.Body) Expect(err).NotTo(HaveOccurred()) metrics := string(data) + // ttft - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.001\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.005\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.01\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.02\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.04\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.06\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.08\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.1\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.25\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.5\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.75\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"1\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"2.5\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"5\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"7.5\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"10\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"20\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"40\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"80\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"160\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"640\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"2560\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"+Inf\"} 1")) - // tpot - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.01\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.025\"} 0")) - 
Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.05\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.075\"} 0")) + for _, boundary := range common.TTFTBucketsBoundaries { + if boundary <= 0.1 { + // buckets up to 0.1 should be empty + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, ttftMetricName, boundary, 0))) + } else { + // buckets higher than 0.1 should contain a single sample + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, ttftMetricName, boundary, 1))) + } + } + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, ttftMetricName, math.Inf(1), 1))) + // tpot metricsLines := strings.Split(metrics, "\n") - // the following values should be greater than 0, we don't know the exact value since it depends on the random response length - count := findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.1\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.15\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.2\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.3\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.4\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.5\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.75\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"1\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"2.5\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"5\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"7.5\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"10\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"20\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, 
"vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"40\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"80\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"+Inf\"}") + var count *int + + for _, boundary := range common.TPOTBucketsBoundaries { + if boundary <= 0.075 { + // ensure that values for buckets up to 0.075 have count 0 + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, tpotMetricName, boundary, 0))) + } else { + // buckets higher than 0.75 should be greater than 0, we don't know the exact value since it depends on the random response length + count = findIntMetric(metricsLines, getFloatBucketMetricPrefix(testModel, tpotMetricName, 0.1)) + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + } + } + count = findIntMetric(metricsLines, getFloatBucketMetricPrefix(testModel, tpotMetricName, math.Inf(1))) Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) }() @@ -709,53 +651,39 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora2\",waiting_lora_adapters=\"lora3\"} 1.257894567e+09")) Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora3\",waiting_lora_adapters=\"\"} 1.257894569e+09")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.001\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.005\"} 3")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.01\"} 6")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.02\"} 6")) - - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.01\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.025\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.05\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.075\"} 3")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.1\"} 6")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.15\"} 6")) - - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="1"} 10`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="2"} 30`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="5"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="10"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="20"} 60`)) - 
Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="50"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="100"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="200"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="500"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="1000"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="+Inf"} 60`)) - - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="1"} 10`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="2"} 30`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="5"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="10"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="20"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="50"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="100"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="200"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="500"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="1000"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="+Inf"} 60`)) - - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="1"} 10`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="2"} 30`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="5"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="10"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="20"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="50"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="100"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="200"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="500"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="1000"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="+Inf"} 60`)) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, ttftMetricName, 0.001, 1))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, ttftMetricName, 0.005, 3))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, ttftMetricName, 0.01, 6))) + 
Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, ttftMetricName, 0.02, 6))) + + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, tpotMetricName, 0.01, 0))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, tpotMetricName, 0.025, 0))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, tpotMetricName, 0.05, 1))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, tpotMetricName, 0.075, 3))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, tpotMetricName, 0.1, 6))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, tpotMetricName, 0.15, 6))) + + buckets := build125Buckets(1024) + + for _, boudary := range buckets { + switch boudary { + case 1.0: + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, 1, 10))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, 1, 10))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, 1, 10))) + case 2.0: + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, 2, 30))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, 2, 30))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, 2, 30))) + default: + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, boudary, 60))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, boudary, 60))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, boudary, 60))) + } + } + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, math.Inf(1), 60))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, math.Inf(1), 60))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, math.Inf(1), 60))) Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="length",model_name="testmodel"} 0`)) Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="remote_decode",model_name="testmodel"} 0`)) @@ -783,29 +711,10 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(err).NotTo(HaveOccurred()) metrics := string(data) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.001\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.005\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.01\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.02\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.04\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.06\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.08\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.1\"} 0")) - 
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.25\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.5\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.75\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"1\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"2.5\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"5\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"7.5\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"10\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"20\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"40\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"80\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"160\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"640\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"2560\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"+Inf\"} 1")) + for _, boundary := range common.TTFTBucketsBoundaries { + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, ttftMetricName, boundary, 0))) + } + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, ttftMetricName, math.Inf(1), 1))) }) }) @@ -952,9 +861,6 @@ var _ = Describe("build125Buckets", Ordered, func() { for _, test := range tests { got := build125Buckets(test.maxValue) Expect(got).To(Equal(test.want)) - // if !reflect.DeepEqual(got, test.want) { - // t.Errorf("build125Buckets(%d) = %v, want %v", tt.maxValue, got, tt.want) - // } } }) }) diff --git a/pkg/llm-d-inference-sim/test_utils.go b/pkg/llm-d-inference-sim/test_utils.go index 1919ec1c..c123ea85 100644 --- a/pkg/llm-d-inference-sim/test_utils.go +++ b/pkg/llm-d-inference-sim/test_utils.go @@ -344,11 +344,16 @@ func findIntMetric(metrics []string, metricPrefix string) *int { // bucketBoundary the upper bucket boundary, Inf(1) defines the last bucket // count bucket samples count func getFloatBucketMetricLine(model string, metric string, bucketBoundary float64, count int) string { + return fmt.Sprintf("%s %d", getFloatBucketMetricPrefix(model, metric, bucketBoundary), count) +} + +// same as getFloatBucketMetricLine but without the value part +func getFloatBucketMetricPrefix(model string, metric string, bucketBoundary float64) string { buckerBoundStr := "+Inf" if bucketBoundary != math.Inf(1) { buckerBoundStr = fmt.Sprintf("%g", bucketBoundary) } - return fmt.Sprintf("%s_bucket{model_name=\"%s\",le=\"%s\"} %d", metric, model, buckerBoundStr, count) + return fmt.Sprintf("%s_bucket{model_name=\"%s\",le=\"%s\"}", metric, model, buckerBoundStr) } // checkBucketBoundary checks that the given bucket's samples count is 
valid according the given parameters From 62e4c6aa27a0456d7c41ff8f2ee6c8f44d58ddb7 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Tue, 28 Oct 2025 12:09:14 +0200 Subject: [PATCH 09/14] - Add full list of supported metrics to readme - Create constants for all metrics - Define all latency related fake metrics in config - Add validation for new fake metrics in config Signed-off-by: Maya Barnea --- README.md | 13 +++++- pkg/common/config.go | 59 +++++++++++++++++++++---- pkg/llm-d-inference-sim/metrics.go | 15 ++++--- pkg/llm-d-inference-sim/metrics_test.go | 30 ++++++------- pkg/llm-d-inference-sim/test_utils.go | 10 ++++- pkg/llm-d-inference-sim/worker_test.go | 8 ++-- pkg/vllm-api/vllm-models.go | 3 -- 7 files changed, 100 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index c7c88c98..0853bd60 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,18 @@ In addition, it supports a subset of vLLM's Prometheus metrics. These metrics ar | vllm:lora_requests_info | Running stats on LoRA requests | | vllm:num_requests_running | Number of requests currently running on GPU | | vllm:num_requests_waiting | Prometheus metric for the number of queued requests | - +| vllm:e2e_request_latency_seconds | Histogram of end to end request latency in seconds | +| vllm:request_inference_time_seconds | Histogram of time spent in RUNNING phase for request | +| vllm:request_queue_time_seconds | Histogram of time spent in WAITING phase for request | +| vllm:request_prefill_time_seconds | Histogram of time spent in PREFILL phase for request | +| vllm:request_decode_time_seconds | Histogram of time spent in DECODE phase for request | +| vllm:time_to_first_token_seconds | Histogram of time to first token in seconds | +| vllm:time_per_output_token_seconds | Histogram of time per output token in seconds | +| vllm:request_generation_tokens | Number of generation tokens processed | +| vllm:request_params_max_tokens | Histogram of the max_tokens request parameter | +| vllm:request_prompt_tokens | Number of prefill tokens processed | +| vllm:request_success_total | Count of successfully processed requests | + The simulated inference has no connection with the model and LoRA adapters specified in the command line parameters or via the /v1/load_lora_adapter HTTP REST endpoint. The /v1/models endpoint returns simulated results based on those same command line parameters and those loaded via the /v1/load_lora_adapter HTTP REST endpoint. The simulator supports two modes of operation: diff --git a/pkg/common/config.go b/pkg/common/config.go index 49825a48..bc82087b 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ -232,16 +232,17 @@ type Metrics struct { WaitingRequests int64 `yaml:"waiting-requests" json:"waiting-requests"` // KVCacheUsagePercentage is the fraction of KV-cache blocks currently in use (from 0 to 1) KVCacheUsagePercentage float32 `yaml:"kv-cache-usage" json:"kv-cache-usage"` - // TTFTBuckets is an array of values for time-to-first-token buckets, - // each value in this array is a value for the corresponding bucket. + + // Histogram metrics - defined by array of values. + // Each value in this array is a value for the corresponding bucket. // Array may contain less values than number of buckets, all trailing missing values assumed as 0. + + // TTFTBuckets is an array of values for time-to-first-token buckets. 
 	// Buckets upper boundaries in seconds are:
 	// 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
 	// 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, 2560.0, +Inf
 	TTFTBucketValues []int `yaml:"ttft-buckets-values" json:"ttft-buckets-values"`
-	// TPOTBuckets is an array of values for time-per-output-token buckets,
-	// each value in this array is a value for the corresponding bucket.
-	// Array may contain less values than number of buckets, all trailing missing values assumed as 0.
+	// TPOTBuckets is an array of values for time-per-output-token buckets.
 	// Buckets upper boundaries in seconds are:
 	// 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
 	// 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, +Inf
@@ -253,13 +254,21 @@ type Metrics struct {
 	RequestParamsMaxTokens []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
 	// RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
 	RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`
-	// E2ERequestLatencyBucketValues is an array of values for e2e request latency buckets,
-	// each value in this array is a value for the corresponding bucket.
-	// Array may contain less values than number of buckets, all trailing missing values assumed as 0.
-	// Buckets upper boundaries in seconds are:
+
+	// Latency histograms - all have the same bucket upper boundaries in seconds:
 	// 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
 	// 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0, +Inf
+
+	// E2ERequestLatencyBucketValues is an array of values for e2e request latency buckets.
 	E2ERequestLatencyBucketValues []int `yaml:"e2erl-buckets-values" json:"e2erl-buckets-values"`
+	// ReqQueueTimeBucketValues is an array of values for request queue time buckets.
+	ReqQueueTimeBucketValues []int `yaml:"queue-time-buckets-values" json:"queue-time-buckets-values"`
+	// ReqInfTimeBucketValues is an array of values for request inference time buckets.
+	ReqInfTimeBucketValues []int `yaml:"inf-time-buckets-values" json:"inf-time-buckets-values"`
+	// ReqPrefillTimeBucketValues is an array of values for request prefill time buckets.
+	ReqPrefillTimeBucketValues []int `yaml:"prefill-time-buckets-values" json:"prefill-time-buckets-values"`
+	// ReqDecodeTimeBucketValues is an array of values for request decode time buckets.
+	ReqDecodeTimeBucketValues []int `yaml:"decode-time-buckets-values" json:"decode-time-buckets-values"`
 }
 
 type LorasMetrics struct {
@@ -595,6 +604,32 @@ func (c *Configuration) validate() error {
 				return errors.New("fake metrics request-params-max-tokens cannot contain negative values")
 			}
 		}
+
+		for _, v := range c.FakeMetrics.E2ERequestLatencyBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics e2erl-buckets-values cannot contain negative values")
+			}
+		}
+		for _, v := range c.FakeMetrics.ReqQueueTimeBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics queue-time-buckets-values cannot contain negative values")
+			}
+		}
+		for _, v := range c.FakeMetrics.ReqInfTimeBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics inf-time-buckets-values cannot contain negative values")
+			}
+		}
+		for _, v := range c.FakeMetrics.ReqPrefillTimeBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics prefill-time-buckets-values cannot contain negative values")
+			}
+		}
+		for _, v := range c.FakeMetrics.ReqDecodeTimeBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics decode-time-buckets-values cannot contain negative values")
+			}
+		}
 	}
 
 	if c.DPSize < 1 || c.DPSize > 8 {
diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go
index 725b3c6c..91c3dc25 100644
--- a/pkg/llm-d-inference-sim/metrics.go
+++ b/pkg/llm-d-inference-sim/metrics.go
@@ -43,6 +43,11 @@ const (
 	generationTokensMetricName = "vllm:request_generation_tokens"
 	paramMaxTokensMetricName   = "vllm:request_params_max_tokens"
 	promptTokensMetricName     = "vllm:request_prompt_tokens"
+	successTotalMetricName     = "vllm:request_success_total"
+	loraRequestsMetricName     = "vllm:lora_requests_info"
+	reqRunningMetricName       = "vllm:num_requests_running"
+	reqWaitingMetricName       = "vllm:num_requests_waiting"
+	gpuCacheUsageMetricName    = "vllm:gpu_cache_usage_perc"
 )
 
 // createAndRegisterPrometheus creates and registers prometheus metrics used by vLLM simulator
@@ -54,7 +59,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.loraInfo = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Subsystem: "",
-			Name:      "vllm:lora_requests_info",
+			Name:      loraRequestsMetricName,
 			Help:      "Running stats on lora requests.",
 		},
 		[]string{vllmapi.PromLabelMaxLora, vllmapi.PromLabelRunningLoraAdapters, vllmapi.PromLabelWaitingLoraAdapters},
@@ -68,7 +73,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.runningRequests = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Subsystem: "",
-			Name:      "vllm:num_requests_running",
+			Name:      reqRunningMetricName,
 			Help:      "Number of requests currently running on GPU.",
 		},
 		[]string{vllmapi.PromLabelModelName},
@@ -83,7 +88,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.waitingRequests = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Subsystem: "",
-			Name:      "vllm:num_requests_waiting",
+			Name:      reqWaitingMetricName,
 			Help:      "Prometheus metric for the number of queued requests.",
 		},
 		[]string{vllmapi.PromLabelModelName},
@@ -202,7 +207,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.kvCacheUsagePercentage = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Subsystem: "",
-			Name:      "vllm:gpu_cache_usage_perc",
+			Name:      gpuCacheUsageMetricName,
 			Help:      "Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).",
 		},
[]string{vllmapi.PromLabelModelName}, @@ -258,7 +263,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.requestSuccessTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Subsystem: "", - Name: "vllm:request_success_total", + Name: successTotalMetricName, Help: "Count of successfully processed requests.", }, []string{vllmapi.PromLabelModelName, vllmapi.PromLabelFinishReason}, diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go index b55068f8..a8f2f3ee 100644 --- a/pkg/llm-d-inference-sim/metrics_test.go +++ b/pkg/llm-d-inference-sim/metrics_test.go @@ -107,8 +107,8 @@ var _ = Describe("Simulator metrics", Ordered, func() { data, err := io.ReadAll(metricsResp.Body) Expect(err).NotTo(HaveOccurred()) metrics := string(data) - Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"testmodel\"} 2")) - Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"testmodel\"} 1")) + Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqRunningMetricName, 2))) + Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqWaitingMetricName, 1))) }) It("Should record correct prompt and generation token counts", func() { @@ -168,7 +168,7 @@ var _ = Describe("Simulator metrics", Ordered, func() { // as the number of generated tokens is unpredictable in this test. // Therefore, we only verify the number of requests and the total number of generated tokens, // and skip the bucket distribution. - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_count{model_name="testmodel"} 1`)) + Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, generationTokensMetricName+"_count", 1))) // request_success_total Expect(metrics).To(MatchRegexp(`vllm:request_success_total{finish_reason="(stop|length)",model_name="testmodel"} 1`)) }) @@ -512,9 +512,9 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(err).NotTo(HaveOccurred()) metrics := string(data) // Expect three running requests and two blocks in the kv cache - usage 2/16=0.125 - Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"Qwen/Qwen2-0.5B\"} 3")) - Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"Qwen/Qwen2-0.5B\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"Qwen/Qwen2-0.5B\"} 0.125")) + Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqRunningMetricName, 3))) + Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqWaitingMetricName, 0))) + Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, gpuCacheUsageMetricName, 0.125))) time.Sleep(4 * time.Second) metricsResp, err = client.Get(metricsUrl) @@ -525,9 +525,9 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(err).NotTo(HaveOccurred()) metrics = string(data) // The requests finished running, expect 0 usage - Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"Qwen/Qwen2-0.5B\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"Qwen/Qwen2-0.5B\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"Qwen/Qwen2-0.5B\"} 0")) + Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqRunningMetricName, 0))) + Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqWaitingMetricName, 0))) + 
Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, gpuCacheUsageMetricName, 0))) }() wg.Wait() }) @@ -592,9 +592,9 @@ var _ = Describe("Simulator metrics", Ordered, func() { // The requests were sent with 500 millisecond intervals, and the first two should be still running. // The third is waiting, and is still not in the kv-cache. // We expect one block in the kv-cache, usage 1/16=0.0625. - Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"Qwen/Qwen2-0.5B\"} 2")) - Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"Qwen/Qwen2-0.5B\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"Qwen/Qwen2-0.5B\"} 0.0625")) + Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqRunningMetricName, 2))) + Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqWaitingMetricName, 1))) + Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, gpuCacheUsageMetricName, 0.0625))) }() wg.Wait() }) @@ -645,9 +645,9 @@ var _ = Describe("Simulator metrics", Ordered, func() { data, err := io.ReadAll(resp.Body) Expect(err).NotTo(HaveOccurred()) metrics := string(data) - Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"testmodel\"} 10")) - Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"testmodel\"} 30")) - Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"testmodel\"} 0.4")) + Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqRunningMetricName, 10))) + Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqWaitingMetricName, 30))) + Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, gpuCacheUsageMetricName, 0.4))) Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora2\",waiting_lora_adapters=\"lora3\"} 1.257894567e+09")) Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora3\",waiting_lora_adapters=\"\"} 1.257894569e+09")) diff --git a/pkg/llm-d-inference-sim/test_utils.go b/pkg/llm-d-inference-sim/test_utils.go index c123ea85..5368e770 100644 --- a/pkg/llm-d-inference-sim/test_utils.go +++ b/pkg/llm-d-inference-sim/test_utils.go @@ -245,7 +245,7 @@ func getLastLoraMetrics(metrics []string) ([]string, error) { lastTimestamp := float64(0) var lastMetrics []string for _, metric := range metrics { - if strings.HasPrefix(metric, "vllm:lora_requests_info") { + if strings.HasPrefix(metric, loraRequestsMetricName) { timestamp, err := extractTimestamp(metric) if err != nil { return nil, err @@ -347,6 +347,14 @@ func getFloatBucketMetricLine(model string, metric string, bucketBoundary float6 return fmt.Sprintf("%s %d", getFloatBucketMetricPrefix(model, metric, bucketBoundary), count) } +func getCountMetricPrefix(model string, metric string) string { + return fmt.Sprintf("%s{model_name=\"%s\"}", metric, model) +} + +func getCountMetricLine(model string, metric string, count float64) string { + return fmt.Sprintf("%s %g", getCountMetricPrefix(model, metric), count) +} + // same as getFloatBucketMetricLine but without the value part func getFloatBucketMetricPrefix(model string, metric string, bucketBoundary float64) string { buckerBoundStr := "+Inf" diff --git a/pkg/llm-d-inference-sim/worker_test.go b/pkg/llm-d-inference-sim/worker_test.go index 21181842..ce5ee076 100644 --- a/pkg/llm-d-inference-sim/worker_test.go +++ 
b/pkg/llm-d-inference-sim/worker_test.go @@ -300,8 +300,8 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { // max-num-seqs is 12, so number of running requests should be 12 // and the number of waiting requests 1000-12=988 - Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"testmodel\"} 12")) - Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"testmodel\"} 988")) + Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqRunningMetricName, 12))) + Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqWaitingMetricName, 988))) // max-loras is 2, so the last lora metric should be: // running: two loras (doesn't matter which two) @@ -326,8 +326,8 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { }) It("Should work correctly with many simultaneous requests with many workers", func() { - runningMetric := "vllm:num_requests_running{model_name=\"testmodel\"}" - waitingMetric := "vllm:num_requests_waiting{model_name=\"testmodel\"}" + runningMetric := getCountMetricPrefix(testModel, reqRunningMetricName) + waitingMetric := getCountMetricPrefix(testModel, reqWaitingMetricName) ctx := context.TODO() args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--time-to-first-token", "2000", "--time-to-first-token-std-dev", "600", diff --git a/pkg/vllm-api/vllm-models.go b/pkg/vllm-api/vllm-models.go index 6c83af69..333a8284 100644 --- a/pkg/vllm-api/vllm-models.go +++ b/pkg/vllm-api/vllm-models.go @@ -26,9 +26,6 @@ const ( PromLabelMaxLora = "max_lora" PromLabelModelName = "model_name" PromLabelFinishReason = "finish_reason" - - VllmLoraRequestInfo = "vllm:lora_requests_info" - VllmNumRequestsRunning = "vllm:num_requests_running" ) // modelInfo defines data about model returned by /models API From b201e1ee4b1436315bc06945be322871055a3962 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Tue, 28 Oct 2025 12:10:16 +0200 Subject: [PATCH 10/14] add license to test_utils.go Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/test_utils.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pkg/llm-d-inference-sim/test_utils.go b/pkg/llm-d-inference-sim/test_utils.go index 5368e770..516f2f7f 100644 --- a/pkg/llm-d-inference-sim/test_utils.go +++ b/pkg/llm-d-inference-sim/test_utils.go @@ -1,3 +1,18 @@ +/* +Copyright 2025 The llm-d-inference-sim Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ package llmdinferencesim import ( From 3e85d382c9c4645c327c5af2dfbcd5bf30745b5a Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Tue, 28 Oct 2025 12:31:53 +0200 Subject: [PATCH 11/14] Set fake latency metrics if defined in configuration, added tests for latency fake metrics Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/metrics.go | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go index 91c3dc25..787d452c 100644 --- a/pkg/llm-d-inference-sim/metrics.go +++ b/pkg/llm-d-inference-sim/metrics.go @@ -170,7 +170,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { ) if err := s.metrics.registry.Register(s.metrics.reqInferenceTime); err != nil { - s.logger.Error(err, "Prometheus request inerence time histogram register failed") + s.logger.Error(err, "Prometheus request inference time histogram register failed") return err } @@ -310,7 +310,23 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() { } if s.config.FakeMetrics.E2ERequestLatencyBucketValues != nil { - s.initFakeHistogram(s.metrics.tpot, common.RequestLatencyBucketsBoundaries, s.config.FakeMetrics.E2ERequestLatencyBucketValues) + s.initFakeHistogram(s.metrics.e2eReqLatency, common.RequestLatencyBucketsBoundaries, s.config.FakeMetrics.E2ERequestLatencyBucketValues) + } + + if s.config.FakeMetrics.ReqQueueTimeBucketValues != nil { + s.initFakeHistogram(s.metrics.reqQueueTime, common.RequestLatencyBucketsBoundaries, s.config.FakeMetrics.ReqQueueTimeBucketValues) + } + + if s.config.FakeMetrics.ReqInfTimeBucketValues != nil { + s.initFakeHistogram(s.metrics.reqInferenceTime, common.RequestLatencyBucketsBoundaries, s.config.FakeMetrics.ReqInfTimeBucketValues) + } + + if s.config.FakeMetrics.ReqPrefillTimeBucketValues != nil { + s.initFakeHistogram(s.metrics.reqPrefillTime, common.RequestLatencyBucketsBoundaries, s.config.FakeMetrics.ReqPrefillTimeBucketValues) + } + + if s.config.FakeMetrics.ReqDecodeTimeBucketValues != nil { + s.initFakeHistogram(s.metrics.reqDecodeTime, common.RequestLatencyBucketsBoundaries, s.config.FakeMetrics.ReqDecodeTimeBucketValues) } } From 94be0aaac4e0d454aaad416d9ff2cbd4d71bb177 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Tue, 28 Oct 2025 12:40:49 +0200 Subject: [PATCH 12/14] add fake latency metrics test Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/metrics_test.go | 80 ++++++++++++++++++++----- 1 file changed, 66 insertions(+), 14 deletions(-) diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go index a8f2f3ee..e64b7323 100644 --- a/pkg/llm-d-inference-sim/metrics_test.go +++ b/pkg/llm-d-inference-sim/metrics_test.go @@ -664,26 +664,26 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, tpotMetricName, 0.15, 6))) buckets := build125Buckets(1024) + var expectedCount int - for _, boudary := range buckets { - switch boudary { + for _, boundary := range buckets { + switch boundary { case 1.0: - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, 1, 10))) - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, 1, 10))) - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, 1, 10))) + expectedCount = 10 case 2.0: - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, 2, 
30))) - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, 2, 30))) - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, 2, 30))) + expectedCount = 30 default: - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, boudary, 60))) - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, boudary, 60))) - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, boudary, 60))) + expectedCount = 60 } + + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, boundary, expectedCount))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, boundary, expectedCount))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, boundary, expectedCount))) + } - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, math.Inf(1), 60))) - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, math.Inf(1), 60))) - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, math.Inf(1), 60))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, math.Inf(1), expectedCount))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, math.Inf(1), expectedCount))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, math.Inf(1), expectedCount))) Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="length",model_name="testmodel"} 0`)) Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="remote_decode",model_name="testmodel"} 0`)) @@ -718,6 +718,58 @@ var _ = Describe("Simulator metrics", Ordered, func() { }) }) + Context("fake latency metrics", func() { + It("should respond with valid fake latency metrics to /metrics", func() { + ctx := context.TODO() + args := []string{"cmd", "--model", testModel, "--mode", common.ModeEcho, + "--fake-metrics", + `{` + + `"e2erl-buckets-values":[0, 1, 2],` + + `"queue-time-buckets-values":[0, 1, 2],` + + `"inf-time-buckets-values":[0, 1, 2],` + + `"prefill-time-buckets-values":[0, 1, 2],` + + `"decode-time-buckets-values":[0, 1, 2]` + + `}`, + } + + client, err := startServerWithArgs(ctx, args) + Expect(err).NotTo(HaveOccurred()) + + resp, err := client.Get(metricsUrl) + Expect(err).NotTo(HaveOccurred()) + Expect(resp.StatusCode).To(Equal(http.StatusOK)) + + data, err := io.ReadAll(resp.Body) + Expect(err).NotTo(HaveOccurred()) + metrics := string(data) + + // buckets counts should be 0, 1, 3, 3, 3, ... 
+ var expectedCount int + + for i, boundary := range common.RequestLatencyBucketsBoundaries { + switch i { + case 0: + expectedCount = 0 + case 1: + expectedCount = 1 + default: + expectedCount = 3 + } + + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, e2eReqLatencyMetricName, boundary, expectedCount))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, reqInferenceTimeMetricName, boundary, expectedCount))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, reqQueueTimeMetricName, boundary, expectedCount))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, prefillTimeMetricName, boundary, expectedCount))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, decodeTimeMetricName, boundary, expectedCount))) + } + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, e2eReqLatencyMetricName, math.Inf(1), 3))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, reqInferenceTimeMetricName, math.Inf(1), 3))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, reqQueueTimeMetricName, math.Inf(1), 3))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, prefillTimeMetricName, math.Inf(1), 3))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, decodeTimeMetricName, math.Inf(1), 3))) + }) + }) + Context("single request latency metrics", func() { numOfTokens := len(common.Tokenize(testUserMessage)) From 9e6f8c1e54e154ea35a37de9cb4b63024fe18d64 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Tue, 28 Oct 2025 12:50:26 +0200 Subject: [PATCH 13/14] fix sending latency metrics, use WriteToChannel function Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/simulator.go | 8 ++++---- pkg/llm-d-inference-sim/streaming.go | 2 +- pkg/llm-d-inference-sim/worker.go | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index cd180219..7f9bc249 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -492,7 +492,7 @@ func (s *VllmSimulator) addRequestToQueue(reqCtx *openaiserverapi.CompletionReqC func (s *VllmSimulator) handleCompletions(ctx *fasthttp.RequestCtx, isChatCompletion bool) { startTime := time.Now() defer func() { - s.metrics.e2eReqLatencyChan <- time.Since(startTime).Seconds() + common.WriteToChannel(s.metrics.e2eReqLatencyChan, time.Since(startTime).Seconds(), s.logger, "metrics.e2eReqLatencyChan") }() // Check if we should inject a failure @@ -623,7 +623,7 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r // report tpot in seconds common.WriteToChannel(s.metrics.tpotChan, (float64(perTokenLatency) / 1000), s.logger, "metrics.tpotChan") } - s.metrics.reqDecodeTimeChan <- time.Since(startDecode).Seconds() + common.WriteToChannel(s.metrics.reqDecodeTimeChan, time.Since(startDecode).Seconds(), s.logger, "metrics.reqDecodeTimeChan") s.sendCompletionResponse(reqCtx.HTTPReqCtx, resp) s.responseSentCallback(modelName, reqCtx.IsChatCompletion, reqCtx.CompletionReq.GetRequestID()) @@ -683,7 +683,7 @@ func (s *VllmSimulator) dequeue() *openaiserverapi.CompletionReqCtx { if ok && item.reqCtx != nil && s.loraIsLoaded(item.reqCtx.CompletionReq.GetModel()) { s.waitingQueue.Remove(elem) s.incrementLora(item.reqCtx.CompletionReq.GetModel()) - s.metrics.reqQueueTimeChan <- time.Since(item.enqueueTime).Seconds() + 
common.WriteToChannel(s.metrics.reqQueueTimeChan, time.Since(item.enqueueTime).Seconds(), s.logger, "metrics.reqQueueTimeChan") return item.reqCtx } } @@ -693,7 +693,7 @@ func (s *VllmSimulator) dequeue() *openaiserverapi.CompletionReqCtx { item, ok := elem.Value.(waitingQueueItem) if ok && item.reqCtx != nil && s.loadLora(item.reqCtx.CompletionReq.GetModel()) { s.waitingQueue.Remove(elem) - s.metrics.reqQueueTimeChan <- time.Since(item.enqueueTime).Seconds() + common.WriteToChannel(s.metrics.reqQueueTimeChan, time.Since(item.enqueueTime).Seconds(), s.logger, "metrics.reqQueueTimeChan") return item.reqCtx } } diff --git a/pkg/llm-d-inference-sim/streaming.go b/pkg/llm-d-inference-sim/streaming.go index 84320464..6be9a43e 100644 --- a/pkg/llm-d-inference-sim/streaming.go +++ b/pkg/llm-d-inference-sim/streaming.go @@ -151,7 +151,7 @@ func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writ } } - s.metrics.reqDecodeTimeChan <- time.Since(startDecode).Seconds() + common.WriteToChannel(s.metrics.reqDecodeTimeChan, time.Since(startDecode).Seconds(), s.logger, "metrics.reqDecodeTimeChan") // send the last chunk if finish reason is stop var chunk openaiserverapi.CompletionRespChunk diff --git a/pkg/llm-d-inference-sim/worker.go b/pkg/llm-d-inference-sim/worker.go index e2a6e504..674c283f 100644 --- a/pkg/llm-d-inference-sim/worker.go +++ b/pkg/llm-d-inference-sim/worker.go @@ -62,7 +62,7 @@ type requestProcessor interface { func (s *VllmSimulator) processRequest(reqCtx *openaiserverapi.CompletionReqCtx) { start := time.Now() defer func() { - s.metrics.reqInferenceTimeChan <- time.Since(start).Seconds() + common.WriteToChannel(s.metrics.reqInferenceTimeChan, time.Since(start).Seconds(), s.logger, "metrics.reqInferenceTimeChan") }() req := reqCtx.CompletionReq From 8d623a99311a1e3558517c4d4e3476acfc8b6a09 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Tue, 28 Oct 2025 12:57:04 +0200 Subject: [PATCH 14/14] fix merge Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/tools_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/llm-d-inference-sim/tools_test.go b/pkg/llm-d-inference-sim/tools_test.go index 431d742e..aa6f54c0 100644 --- a/pkg/llm-d-inference-sim/tools_test.go +++ b/pkg/llm-d-inference-sim/tools_test.go @@ -510,7 +510,7 @@ var _ = Describe("Simulator for request with tools", func() { client, err := startServer(ctx, mode) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) params.ToolChoice = openai.ToolChoiceOptionFunctionToolChoice(openai.ChatCompletionNamedToolChoiceFunctionParam{ Name: specificTool, })
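
Note on the fake histogram values used by patches 11 and 12: the helper initFakeHistogram is called for every latency histogram, but its body is not modified by this series, so it does not appear in any hunk above. The following is a minimal, hypothetical Go sketch of the idea only; the function name initFakeHistogramSketch, its signature, and the midpoint sampling strategy are illustrative assumptions, not the repository's actual implementation.

package sketch

import "github.com/prometheus/client_golang/prometheus"

// initFakeHistogramSketch pre-populates a histogram from per-bucket fake sample counts:
// for bucket i it observes a value that falls strictly inside that bucket, repeated
// values[i] times, so the cumulative bucket counters end up matching the configuration.
func initFakeHistogramSketch(hist *prometheus.HistogramVec, boundaries []float64, values []int, modelName string) {
	for i, count := range values {
		if i >= len(boundaries) {
			// Any trailing value would fall into the +Inf bucket; omitted in this sketch.
			break
		}
		lower := 0.0
		if i > 0 {
			lower = boundaries[i-1]
		}
		// The midpoint of (lower, upper] is guaranteed to land in bucket i.
		sample := (lower + boundaries[i]) / 2
		for j := 0; j < count; j++ {
			hist.WithLabelValues(modelName).Observe(sample)
		}
	}
}

With values of [0, 1, 2] and common.RequestLatencyBucketsBoundaries, a helper of this shape yields cumulative bucket counts of 0, 1, 3, 3, ..., which is what the fake latency metrics test added in patch 12 asserts.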