From 3e3605d530b9a8ae59676e936284291797dd129c Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Thu, 16 Oct 2025 16:35:39 +0300 Subject: [PATCH 1/5] Add vllm:time_per_output_token_seconds and vllm:time_to_first_token_seconds histogram metrics, including support in fake metrics, and update of readme Signed-off-by: Maya Barnea --- README.md | 6 +- manifests/config_with_fake.yaml | 2 + pkg/common/config.go | 34 ++++++++ pkg/common/config_test.go | 7 ++ pkg/common/utils.go | 6 ++ pkg/llm-d-inference-sim/metrics.go | 100 +++++++++++++++++++++++- pkg/llm-d-inference-sim/metrics_test.go | 14 +++- pkg/llm-d-inference-sim/simulator.go | 17 ++++ pkg/llm-d-inference-sim/streaming.go | 8 ++ 9 files changed, 189 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index fa4dfde2..0db50852 100644 --- a/README.md +++ b/README.md @@ -143,8 +143,10 @@ For more details see the 1 { return errors.New("fake metrics KV cache usage must be between 0 ans 1") } + if c.FakeMetrics.TTFTBucketValues != nil { + if len(c.FakeMetrics.TTFTBucketValues) > len(TTFTBucketsBoundaries)+1 { + return errors.New("fake time-to-first-token array is too long") + } + for v := range c.FakeMetrics.TTFTBucketValues { + if v < 0 { + return errors.New("time-to-first-token fake metrics should contain only non-negative values") + } + } + } + if c.FakeMetrics.TPOTBucketValues != nil { + if len(c.FakeMetrics.TPOTBucketValues) > len(TPOTBucketsBoundaries)+1 { + return errors.New("fake time-per-output-token array is too long") + } + for v := range c.FakeMetrics.TPOTBucketValues { + if v < 0 { + return errors.New("time-per-output-token fake metrics should contain only non-negative values") + } + } + } } if c.DPSize < 1 || c.DPSize > 8 { diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go index 1c0353ed..c6c65efc 100644 --- a/pkg/common/config_test.go +++ b/pkg/common/config_test.go @@ -203,6 +203,8 @@ var _ = Describe("Simulator configuration", func() { "{\"running\":\"lora1,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567}", "{\"running\":\"lora1,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}", }, + TTFTBucketValues: []int{10, 20, 30, 10}, + TPOTBucketValues: []int{0, 0, 10, 20, 30}, } test = testCase{ name: "config with fake metrics file", @@ -451,6 +453,11 @@ var _ = Describe("Simulator configuration", func() { args: []string{"cmd", "--time-factor-under-load", "-1", "--config", "../../manifests/config.yaml"}, }, + { + name: "invalid ttft", + args: []string{"cmd", "--ttft-buckets-values", "[1, 2, -10, 1]", + "--config", "../../manifests/config.yaml"}, + }, } for _, test := range invalidTests { diff --git a/pkg/common/utils.go b/pkg/common/utils.go index 20f0cca8..87370793 100644 --- a/pkg/common/utils.go +++ b/pkg/common/utils.go @@ -24,6 +24,12 @@ import ( "github.com/google/uuid" ) +var TTFTBucketsBoundaries = []float64{0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, + 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, + 2560.0} +var TPOTBucketsBoundaries = []float64{0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, + 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0} + // ValidateContextWindow checks if the request fits within the model's context window // Returns validation result, actual completion tokens, and total tokens func ValidateContextWindow(promptTokens int, maxCompletionTokens *int64, maxModelLen int) (bool, int64, int64) { diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go index e86e900f..bbaa76a4 100644 --- 
a/pkg/llm-d-inference-sim/metrics.go +++ b/pkg/llm-d-inference-sim/metrics.go @@ -27,6 +27,7 @@ import ( "github.com/prometheus/client_golang/prometheus" + "github.com/llm-d/llm-d-inference-sim/pkg/common" vllmapi "github.com/llm-d/llm-d-inference-sim/pkg/vllm-api" ) @@ -64,7 +65,6 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { return err } - // not supported for now, reports constant value s.waitingRequests = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Subsystem: "", @@ -79,7 +79,36 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { return err } - // not supported for now, reports constant value + s.ttft = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: "", + Name: "vllm:time_to_first_token_seconds", + Help: "Histogram of time to first token in seconds.", + Buckets: common.TTFTBucketsBoundaries, + }, + []string{vllmapi.PromLabelModelName}, + ) + + if err := s.registry.Register(s.ttft); err != nil { + s.logger.Error(err, "Prometheus time to first token histogram register failed") + return err + } + + s.tpot = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: "", + Name: "vllm:time_per_output_token_seconds", + Help: "Histogram of time per output token in seconds.", + Buckets: common.TPOTBucketsBoundaries, + }, + []string{vllmapi.PromLabelModelName}, + ) + + if err := s.registry.Register(s.tpot); err != nil { + s.logger.Error(err, "Prometheus time per output token histogram register failed") + return err + } + s.kvCacheUsagePercentage = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Subsystem: "", @@ -107,7 +136,26 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() { nRunningReqs = float64(s.config.FakeMetrics.RunningRequests) nWaitingReqs = float64(s.config.FakeMetrics.WaitingRequests) kvCacheUsage = float64(s.config.FakeMetrics.KVCacheUsagePercentage) + + if s.config.FakeMetrics.TTFTBucketValues != nil { + for i, bucketVal := range s.config.FakeMetrics.TTFTBucketValues { + for range bucketVal { + s.ttft.WithLabelValues(s.getDisplayedModelName(s.config.Model)). + Observe(common.TTFTBucketsBoundaries[i]) + } + } + } + + if s.config.FakeMetrics.TPOTBucketValues != nil { + for i, bucketVal := range s.config.FakeMetrics.TPOTBucketValues { + for range bucketVal { + s.tpot.WithLabelValues(s.getDisplayedModelName(s.config.Model)). 
+ Observe(common.TPOTBucketsBoundaries[i]) + } + } + } } + modelName := s.getDisplayedModelName(s.config.Model) s.runningRequests.WithLabelValues(modelName).Set(nRunningReqs) s.waitingRequests.WithLabelValues(modelName).Set(nWaitingReqs) @@ -181,6 +229,28 @@ func (s *VllmSimulator) reportWaitingRequests() { } } +// reportTTFT sets information about time to first token +func (s *VllmSimulator) reportTTFT(ttftInSecs float64) { + if s.config.FakeMetrics != nil { + return + } + if s.ttft != nil { + s.ttft.WithLabelValues( + s.getDisplayedModelName(s.config.Model)).Observe(ttftInSecs) + } +} + +// reportTTFT sets information about time per output token +func (s *VllmSimulator) reportTPOT(tpotInSecs float64) { + if s.config.FakeMetrics != nil { + return + } + if s.tpot != nil { + s.tpot.WithLabelValues( + s.getDisplayedModelName(s.config.Model)).Observe(tpotInSecs) + } +} + // reportKVCacheUsage sets information about kv cache usage func (s *VllmSimulator) reportKVCacheUsage(value float64) { if s.config.FakeMetrics != nil { @@ -198,6 +268,8 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) { go s.runningRequestsUpdater(ctx) go s.lorasUpdater(ctx) go s.kvCacheUsageUpdater(ctx) + go s.ttftUpdater(ctx) + go s.tpotUpdater(ctx) } // waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel @@ -238,6 +310,30 @@ func (s *VllmSimulator) kvCacheUsageUpdater(ctx context.Context) { } } +// ttftUpdater updates the time to first token metric by listening on the relevant channel +func (s *VllmSimulator) ttftUpdater(ctx context.Context) { + for { + select { + case <-ctx.Done(): + return + case value := <-s.ttftChan: + s.reportTTFT(value) + } + } +} + +// tpotUpdater updates the time per output token metric by listening on the relevant channel +func (s *VllmSimulator) tpotUpdater(ctx context.Context) { + for { + select { + case <-ctx.Done(): + return + case value := <-s.tpotChan: + s.reportTPOT(value) + } + } +} + // lorasUpdater updates the running loras metric by listening on the relevant channel // one function updates both waiting and running loras since they a part of the same prometheus gauge func (s *VllmSimulator) lorasUpdater(ctx context.Context) { diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go index 744f54e1..78b60f7a 100644 --- a/pkg/llm-d-inference-sim/metrics_test.go +++ b/pkg/llm-d-inference-sim/metrics_test.go @@ -464,7 +464,7 @@ var _ = Describe("Simulator metrics", Ordered, func() { ctx := context.TODO() args := []string{"cmd", "--model", model, "--mode", common.ModeRandom, "--fake-metrics", - "{\"running-requests\":10,\"waiting-requests\":30,\"kv-cache-usage\":0.4,\"loras\":[{\"running\":\"lora4,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567},{\"running\":\"lora4,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}]}", + "{\"running-requests\":10,\"waiting-requests\":30,\"kv-cache-usage\":0.4,\"loras\":[{\"running\":\"lora4,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567},{\"running\":\"lora4,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}],\"ttft-buckets-values\":[1, 2, 3],\"tpot-buckets-values\": [0, 0, 1, 2, 3]}", } client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) @@ -482,6 +482,18 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"my_model\"} 0.4")) 
Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora2\",waiting_lora_adapters=\"lora3\"} 1.257894567e+09")) Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora3\",waiting_lora_adapters=\"\"} 1.257894569e+09")) + + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.001\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.005\"} 3")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.01\"} 6")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.02\"} 6")) + + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.01\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.025\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.05\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.075\"} 3")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.1\"} 6")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.15\"} 6")) }) }) }) diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index e5d70ede..3851c517 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -92,6 +92,10 @@ type VllmSimulator struct { nWaitingReqs int64 // waitingReqChan is a channel to update nWaitingReqs waitingReqChan chan int64 + // ttftChan is a channel to update time to first token + ttftChan chan float64 + // tpotChan is a channel to update time per output token + tpotChan chan float64 // kvCacheUsageChan is a channel to update kvCacheUsagePercentage kvCacheUsageChan chan float64 // registry is a Prometheus registry @@ -102,6 +106,10 @@ type VllmSimulator struct { runningRequests *prometheus.GaugeVec // waitingRequests is prometheus gauge for number of queued requests waitingRequests *prometheus.GaugeVec + // ttft is prometheus histogram for time to first token in seconds + ttft *prometheus.HistogramVec + // tpot is prometheus histogram for time per output token in seconds + tpot *prometheus.HistogramVec // kvCacheUsagePercentage is prometheus gauge kvCacheUsagePercentage *prometheus.GaugeVec // channel for requeasts to be passed to workers @@ -136,6 +144,8 @@ func New(logger logr.Logger) (*VllmSimulator, error) { pod: os.Getenv(podNameEnv), runReqChan: make(chan int64, maxNumberOfRequests), waitingReqChan: make(chan int64, maxNumberOfRequests), + ttftChan: make(chan float64, maxNumberOfRequests), + tpotChan: make(chan float64, maxNumberOfRequests), lorasChan: make(chan loraUsage, maxNumberOfRequests), kvCacheUsageChan: make(chan float64, maxNumberOfRequests), }, nil @@ -497,9 +507,16 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r nCachedPromptTokens := reqCtx.CompletionReq.GetNumberOfCachedPromptTokens() ttft := s.getWaitTimeToFirstToken(usageData.PromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill()) time.Sleep(time.Duration(ttft) * time.Millisecond) + + // report ttft 
in seconds + s.ttftChan <- (float64(ttft) / 1000) + for range usageData.CompletionTokens - 1 { perTokenLatency := s.getInterTokenLatency() time.Sleep(time.Duration(perTokenLatency) * time.Millisecond) + + // report tpot in seconds + s.tpotChan <- float64(perTokenLatency) / 1000 } s.sendCompletionResponse(reqCtx.HTTPReqCtx, resp) diff --git a/pkg/llm-d-inference-sim/streaming.go b/pkg/llm-d-inference-sim/streaming.go index c64affc8..a208b63b 100644 --- a/pkg/llm-d-inference-sim/streaming.go +++ b/pkg/llm-d-inference-sim/streaming.go @@ -103,11 +103,19 @@ func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writ // time to first token delay ttft := s.getWaitTimeToFirstToken(context.nPromptTokens, context.nCachedPromptTokens, context.doRemotePrefill) time.Sleep(time.Duration(ttft) * time.Millisecond) + // report ttft in seconds + s.ttftChan <- (float64(ttft) / 1000) for i, token := range genTokens { if i != 0 { time.Sleep(time.Duration(s.getInterTokenLatency()) * time.Millisecond) } + + interTokenLat := s.getInterTokenLatency() + time.Sleep(time.Duration(interTokenLat) * time.Millisecond) + // report tpot in seconds + s.tpotChan <- float64(interTokenLat) / 1000 + var toolChunkInsert *openaiserverapi.ToolCall if tc != nil { toolChunkInsert = &openaiserverapi.ToolCall{ From 49d100075978042c7c31a4febbb96b6ddbfba367 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Sun, 19 Oct 2025 09:48:55 +0300 Subject: [PATCH 2/5] Add test for ttft kae metrics command line parameter with value for the last bucket Signed-off-by: Maya Barnea --- README.md | 4 +- pkg/llm-d-inference-sim/metrics.go | 33 +++++++++++------ pkg/llm-d-inference-sim/metrics_test.go | 49 +++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 0db50852..49e715d1 100644 --- a/README.md +++ b/README.md @@ -147,8 +147,8 @@ For more details see the + --fake-metrics '{"running-requests":10,"waiting-requests":30,"kv-cache-usage":0.4,"loras":[{"running":"lora4,lora2","waiting":"lora3","timestamp":1257894567},{"running":"lora4,lora3","waiting":"","timestamp":1257894569}]}' --- - `data-parallel-size`: number of ranks to run in Data Parallel deployment, from 1 to 8, default is 1. The ports will be assigned as follows: rank 0 will run on the configured `port`, rank 1 on `port`+1, etc. --- diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go index bbaa76a4..a6a5ec03 100644 --- a/pkg/llm-d-inference-sim/metrics.go +++ b/pkg/llm-d-inference-sim/metrics.go @@ -138,21 +138,11 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() { kvCacheUsage = float64(s.config.FakeMetrics.KVCacheUsagePercentage) if s.config.FakeMetrics.TTFTBucketValues != nil { - for i, bucketVal := range s.config.FakeMetrics.TTFTBucketValues { - for range bucketVal { - s.ttft.WithLabelValues(s.getDisplayedModelName(s.config.Model)). - Observe(common.TTFTBucketsBoundaries[i]) - } - } + s.initFakeHistogram(s.ttft, common.TTFTBucketsBoundaries, s.config.FakeMetrics.TTFTBucketValues) } if s.config.FakeMetrics.TPOTBucketValues != nil { - for i, bucketVal := range s.config.FakeMetrics.TPOTBucketValues { - for range bucketVal { - s.tpot.WithLabelValues(s.getDisplayedModelName(s.config.Model)). 
- Observe(common.TPOTBucketsBoundaries[i]) - } - } + s.initFakeHistogram(s.tpot, common.TPOTBucketsBoundaries, s.config.FakeMetrics.TPOTBucketValues) } } @@ -176,6 +166,25 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() { } } +func (s *VllmSimulator) initFakeHistogram(hist *prometheus.HistogramVec, bucketsBoundaries []float64, bucketValues []int) { + var valueToObserve float64 + numOfBuckets := len(bucketsBoundaries) + + for i, bucketVal := range bucketValues { + if i < numOfBuckets { + valueToObserve = bucketsBoundaries[i] + } else { + // this is last bucket - use number larger than the upper bound of the last bucket + valueToObserve = bucketsBoundaries[len(bucketsBoundaries)-1] + 1 + } + + for range bucketVal { + hist.WithLabelValues(s.getDisplayedModelName(s.config.Model)). + Observe(valueToObserve) + } + } +} + // reportLoras sets information about loaded LoRA adapters func (s *VllmSimulator) reportLoras() { if s.config.FakeMetrics != nil { diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go index 78b60f7a..46b35d38 100644 --- a/pkg/llm-d-inference-sim/metrics_test.go +++ b/pkg/llm-d-inference-sim/metrics_test.go @@ -19,6 +19,7 @@ package llmdinferencesim import ( "context" "errors" + "fmt" "io" "net/http" "os" @@ -496,6 +497,54 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.15\"} 6")) }) }) + + Context("fake ttft metrics", func() { + It("Should respond with fake ttft metrics to /metrics", func() { + ctx := context.TODO() + args := []string{"cmd", "--model", model, "--mode", common.ModeRandom, + "--fake-metrics", + "{\"ttft-buckets-values\":[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]}", + } + + client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + Expect(err).NotTo(HaveOccurred()) + + resp, err := client.Get(metricsUrl) + Expect(err).NotTo(HaveOccurred()) + Expect(resp.StatusCode).To(Equal(http.StatusOK)) + + data, err := io.ReadAll(resp.Body) + Expect(err).NotTo(HaveOccurred()) + metrics := string(data) + + fmt.Println("---MAYA---") + fmt.Println(metrics) + + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.001\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.005\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.01\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.02\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.04\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.06\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.08\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.1\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.25\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.5\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.75\"} 0")) + 
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"1\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"2.5\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"5\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"7.5\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"10\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"20\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"40\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"80\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"160\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"640\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"2560\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"+Inf\"} 1")) + }) + }) }) // isLoraMetricPresent checks if a matching metric exists From c43a64a85b6972a0a2208155b981925688a8236d Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Sun, 19 Oct 2025 11:08:53 +0300 Subject: [PATCH 3/5] move calculating model name from a loop Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/metrics.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go index a6a5ec03..53f2b826 100644 --- a/pkg/llm-d-inference-sim/metrics.go +++ b/pkg/llm-d-inference-sim/metrics.go @@ -169,6 +169,7 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() { func (s *VllmSimulator) initFakeHistogram(hist *prometheus.HistogramVec, bucketsBoundaries []float64, bucketValues []int) { var valueToObserve float64 numOfBuckets := len(bucketsBoundaries) + modelName := s.getDisplayedModelName(s.config.Model) for i, bucketVal := range bucketValues { if i < numOfBuckets { @@ -179,8 +180,7 @@ func (s *VllmSimulator) initFakeHistogram(hist *prometheus.HistogramVec, buckets } for range bucketVal { - hist.WithLabelValues(s.getDisplayedModelName(s.config.Model)). 
- Observe(valueToObserve) + hist.WithLabelValues(modelName).Observe(valueToObserve) } } } From 2310404c5ee6130cc58bb09bb127e8b509a2e117 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Sun, 19 Oct 2025 13:20:43 +0300 Subject: [PATCH 4/5] Changes according the PR review Signed-off-by: Maya Barnea --- README.md | 4 +- pkg/common/config.go | 12 +- pkg/common/config_test.go | 5 + pkg/common/utils.go | 1 + pkg/llm-d-inference-sim/metrics.go | 25 ++-- pkg/llm-d-inference-sim/metrics_test.go | 155 +++++++++++++++++++++++- pkg/llm-d-inference-sim/streaming.go | 10 +- 7 files changed, 186 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 49e715d1..c3f35c61 100644 --- a/README.md +++ b/README.md @@ -144,8 +144,8 @@ For more details see the ", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.15\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.2\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.3\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.4\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.5\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.75\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"1\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"2.5\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"5\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"7.5\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"10\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"20\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"40\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"80\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, 
"vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"+Inf\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + }() + + metricsWg.Wait() + }) + Context("kv cache metrics", func() { tmpDir := "./tests-tmp/" AfterAll(func() { @@ -517,9 +639,6 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(err).NotTo(HaveOccurred()) metrics := string(data) - fmt.Println("---MAYA---") - fmt.Println(metrics) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.001\"} 0")) Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.005\"} 0")) Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.01\"} 0")) @@ -624,3 +743,31 @@ func splitString(str string) []string { } return strings.Split(str, ",") } + +func findMetric(metrics []string, metricPrefix string) string { + // regex to extract metrics and values + for _, metric := range metrics { + if strings.Contains(metric, metricPrefix) { + arr := strings.Split(metric, " ") + if len(arr) == 2 { + return arr[1] + } + break + } + } + // required metric was not found + return "" +} + +func findIntMetric(metrics []string, metricPrefix string) *int { + valueStr := findMetric(metrics, metricPrefix) + if valueStr == "" { + return nil + } + + val, err := strconv.Atoi(valueStr) + if err != nil { + return nil + } + return &val +} diff --git a/pkg/llm-d-inference-sim/streaming.go b/pkg/llm-d-inference-sim/streaming.go index a208b63b..1bd2525d 100644 --- a/pkg/llm-d-inference-sim/streaming.go +++ b/pkg/llm-d-inference-sim/streaming.go @@ -108,14 +108,12 @@ func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writ for i, token := range genTokens { if i != 0 { - time.Sleep(time.Duration(s.getInterTokenLatency()) * time.Millisecond) + interTokenLat := s.getInterTokenLatency() + time.Sleep(time.Duration(interTokenLat) * time.Millisecond) + // report tpot in seconds + s.tpotChan <- float64(interTokenLat) / 1000 } - interTokenLat := s.getInterTokenLatency() - time.Sleep(time.Duration(interTokenLat) * time.Millisecond) - // report tpot in seconds - s.tpotChan <- float64(interTokenLat) / 1000 - var toolChunkInsert *openaiserverapi.ToolCall if tc != nil { toolChunkInsert = &openaiserverapi.ToolCall{ From 621e4f51501a2c3d51239a7ddaec3b5c84cf32a8 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Sun, 19 Oct 2025 14:32:14 +0300 Subject: [PATCH 5/5] according review comments Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/metrics.go | 4 ++-- pkg/llm-d-inference-sim/metrics_test.go | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go index 410289bd..45c340d3 100644 --- a/pkg/llm-d-inference-sim/metrics.go +++ b/pkg/llm-d-inference-sim/metrics.go @@ -167,10 +167,10 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() { } // initFakeHistogram initializes the given histogram values based on the input -// bucketsBoundaries - upper boudaries of all buckets except the last one. Actual number pf buckets is len(bucketsBoundaries)+1. +// bucketsBoundaries - upper boudaries of all buckets except the last one. Actual number of buckets is len(bucketsBoundaries)+1. // This includes the last bucket (last_boundary, +Inf]. // bucketsSamplesCount - array containing number of samples per bucket, starting from the first bucket. 
-// Trailing empty buckets are not included in this array, so it length could be <= len(bucketsBoundaries)+1 +// Trailing empty buckets are not included in this array, so its length can be <= len(bucketsBoundaries)+1 func (s *VllmSimulator) initFakeHistogram(hist *prometheus.HistogramVec, bucketsBoundaries []float64, bucketsSamplesCount []int) { var valueToObserve float64 numOfBoundaries := len(bucketsBoundaries) diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go index 313525db..f0f8bb58 100644 --- a/pkg/llm-d-inference-sim/metrics_test.go +++ b/pkg/llm-d-inference-sim/metrics_test.go @@ -374,7 +374,7 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.05\"} 0")) Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.075\"} 0")) - metricsLines := strings.Split(string(metrics), "\n") + metricsLines := strings.Split(metrics, "\n") // the following values should be greater than 0, we don't know the exact value since it depends on the random response length count := findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.1\"}") Expect(count).ToNot(BeNil())
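
---

Editor's note (not part of the patch series): the fake-metrics seeding in `initFakeHistogram` works by observing one representative value per bucket — the bucket's upper boundary for the first `len(bucketsBoundaries)` entries, and a value above the last boundary for a trailing entry, which therefore only shows up under `le="+Inf"`. Because Prometheus histogram buckets are cumulative, `ttft-buckets-values: [1, 2, 3]` yields `le="0.001"` = 1, `le="0.005"` = 3, `le="0.01"` = 6, exactly as asserted in metrics_test.go. The following standalone sketch illustrates that behaviour using only the public prometheus/client_golang API; the three-element boundary list, the fourth sample count, and the `my_model` label value are illustrative stand-ins, not code from the patch.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Toy stand-in for the first few TTFT boundaries; the real list is
	// TTFTBucketsBoundaries in pkg/common/utils.go.
	boundaries := []float64{0.001, 0.005, 0.01}
	// Same shape as "ttft-buckets-values": one sample count per bucket.
	// The fourth entry has no explicit boundary and lands in +Inf.
	counts := []int{1, 2, 3, 4}

	hist := prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Name:    "vllm:time_to_first_token_seconds",
		Help:    "Histogram of time to first token in seconds.",
		Buckets: boundaries,
	}, []string{"model_name"})

	reg := prometheus.NewRegistry()
	reg.MustRegister(hist)

	for i, n := range counts {
		// Observe a value that falls exactly into bucket i; anything above
		// the last boundary is only counted by +Inf / the total sample count.
		v := boundaries[len(boundaries)-1] + 1
		if i < len(boundaries) {
			v = boundaries[i]
		}
		for j := 0; j < n; j++ {
			hist.WithLabelValues("my_model").Observe(v)
		}
	}

	mfs, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range mfs {
		for _, m := range mf.GetMetric() {
			h := m.GetHistogram()
			for _, b := range h.GetBucket() {
				// Cumulative counts: le=0.001 -> 1, le=0.005 -> 3, le=0.01 -> 6
				fmt.Printf("le=%g count=%d\n", b.GetUpperBound(), b.GetCumulativeCount())
			}
			// Total sample count includes the implicit +Inf bucket: 10
			fmt.Printf("count=%d sum=%g\n", h.GetSampleCount(), h.GetSampleSum())
		}
	}
}
```

This is also why the dedicated last-bucket test, which passes zeros for every bounded bucket and a single trailing count, expects all bounded `vllm:time_to_first_token_seconds_bucket` series to read 0 while `le="+Inf"` reads 1.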