From 3e3605d530b9a8ae59676e936284291797dd129c Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Thu, 16 Oct 2025 16:35:39 +0300 Subject: [PATCH 1/5] Add vllm:time_per_output_token_seconds and vllm:time_to_first_token_seconds histogram metrics, including support in fake metrics, and update of readme Signed-off-by: Maya Barnea --- README.md | 6 +- manifests/config_with_fake.yaml | 2 + pkg/common/config.go | 34 ++++++++ pkg/common/config_test.go | 7 ++ pkg/common/utils.go | 6 ++ pkg/llm-d-inference-sim/metrics.go | 100 +++++++++++++++++++++++- pkg/llm-d-inference-sim/metrics_test.go | 14 +++- pkg/llm-d-inference-sim/simulator.go | 17 ++++ pkg/llm-d-inference-sim/streaming.go | 8 ++ 9 files changed, 189 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index fa4dfde2..0db50852 100644 --- a/README.md +++ b/README.md @@ -143,8 +143,10 @@ For more details see the 1 { return errors.New("fake metrics KV cache usage must be between 0 ans 1") } + if c.FakeMetrics.TTFTBucketValues != nil { + if len(c.FakeMetrics.TTFTBucketValues) > len(TTFTBucketsBoundaries)+1 { + return errors.New("fake time-to-first-token array is too long") + } + for v := range c.FakeMetrics.TTFTBucketValues { + if v < 0 { + return errors.New("time-to-first-token fake metrics should contain only non-negative values") + } + } + } + if c.FakeMetrics.TPOTBucketValues != nil { + if len(c.FakeMetrics.TPOTBucketValues) > len(TPOTBucketsBoundaries)+1 { + return errors.New("fake time-per-output-token array is too long") + } + for v := range c.FakeMetrics.TPOTBucketValues { + if v < 0 { + return errors.New("time-per-output-token fake metrics should contain only non-negative values") + } + } + } } if c.DPSize < 1 || c.DPSize > 8 { diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go index 1c0353ed..c6c65efc 100644 --- a/pkg/common/config_test.go +++ b/pkg/common/config_test.go @@ -203,6 +203,8 @@ var _ = Describe("Simulator configuration", func() { "{\"running\":\"lora1,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567}", "{\"running\":\"lora1,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}", }, + TTFTBucketValues: []int{10, 20, 30, 10}, + TPOTBucketValues: []int{0, 0, 10, 20, 30}, } test = testCase{ name: "config with fake metrics file", @@ -451,6 +453,11 @@ var _ = Describe("Simulator configuration", func() { args: []string{"cmd", "--time-factor-under-load", "-1", "--config", "../../manifests/config.yaml"}, }, + { + name: "invalid ttft", + args: []string{"cmd", "--ttft-buckets-values", "[1, 2, -10, 1]", + "--config", "../../manifests/config.yaml"}, + }, } for _, test := range invalidTests { diff --git a/pkg/common/utils.go b/pkg/common/utils.go index 20f0cca8..87370793 100644 --- a/pkg/common/utils.go +++ b/pkg/common/utils.go @@ -24,6 +24,12 @@ import ( "github.com/google/uuid" ) +var TTFTBucketsBoundaries = []float64{0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, + 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, + 2560.0} +var TPOTBucketsBoundaries = []float64{0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, + 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0} + // ValidateContextWindow checks if the request fits within the model's context window // Returns validation result, actual completion tokens, and total tokens func ValidateContextWindow(promptTokens int, maxCompletionTokens *int64, maxModelLen int) (bool, int64, int64) { diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go index e86e900f..bbaa76a4 100644 --- 
a/pkg/llm-d-inference-sim/metrics.go +++ b/pkg/llm-d-inference-sim/metrics.go @@ -27,6 +27,7 @@ import ( "github.com/prometheus/client_golang/prometheus" + "github.com/llm-d/llm-d-inference-sim/pkg/common" vllmapi "github.com/llm-d/llm-d-inference-sim/pkg/vllm-api" ) @@ -64,7 +65,6 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { return err } - // not supported for now, reports constant value s.waitingRequests = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Subsystem: "", @@ -79,7 +79,36 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { return err } - // not supported for now, reports constant value + s.ttft = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: "", + Name: "vllm:time_to_first_token_seconds", + Help: "Histogram of time to first token in seconds.", + Buckets: common.TTFTBucketsBoundaries, + }, + []string{vllmapi.PromLabelModelName}, + ) + + if err := s.registry.Register(s.ttft); err != nil { + s.logger.Error(err, "Prometheus time to first token histogram register failed") + return err + } + + s.tpot = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: "", + Name: "vllm:time_per_output_token_seconds", + Help: "Histogram of time per output token in seconds.", + Buckets: common.TPOTBucketsBoundaries, + }, + []string{vllmapi.PromLabelModelName}, + ) + + if err := s.registry.Register(s.tpot); err != nil { + s.logger.Error(err, "Prometheus time per output token histogram register failed") + return err + } + s.kvCacheUsagePercentage = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Subsystem: "", @@ -107,7 +136,26 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() { nRunningReqs = float64(s.config.FakeMetrics.RunningRequests) nWaitingReqs = float64(s.config.FakeMetrics.WaitingRequests) kvCacheUsage = float64(s.config.FakeMetrics.KVCacheUsagePercentage) + + if s.config.FakeMetrics.TTFTBucketValues != nil { + for i, bucketVal := range s.config.FakeMetrics.TTFTBucketValues { + for range bucketVal { + s.ttft.WithLabelValues(s.getDisplayedModelName(s.config.Model)). + Observe(common.TTFTBucketsBoundaries[i]) + } + } + } + + if s.config.FakeMetrics.TPOTBucketValues != nil { + for i, bucketVal := range s.config.FakeMetrics.TPOTBucketValues { + for range bucketVal { + s.tpot.WithLabelValues(s.getDisplayedModelName(s.config.Model)). 
+ Observe(common.TPOTBucketsBoundaries[i]) + } + } + } } + modelName := s.getDisplayedModelName(s.config.Model) s.runningRequests.WithLabelValues(modelName).Set(nRunningReqs) s.waitingRequests.WithLabelValues(modelName).Set(nWaitingReqs) @@ -181,6 +229,28 @@ func (s *VllmSimulator) reportWaitingRequests() { } } +// reportTTFT sets information about time to first token +func (s *VllmSimulator) reportTTFT(ttftInSecs float64) { + if s.config.FakeMetrics != nil { + return + } + if s.ttft != nil { + s.ttft.WithLabelValues( + s.getDisplayedModelName(s.config.Model)).Observe(ttftInSecs) + } +} + +// reportTTFT sets information about time per output token +func (s *VllmSimulator) reportTPOT(tpotInSecs float64) { + if s.config.FakeMetrics != nil { + return + } + if s.tpot != nil { + s.tpot.WithLabelValues( + s.getDisplayedModelName(s.config.Model)).Observe(tpotInSecs) + } +} + // reportKVCacheUsage sets information about kv cache usage func (s *VllmSimulator) reportKVCacheUsage(value float64) { if s.config.FakeMetrics != nil { @@ -198,6 +268,8 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) { go s.runningRequestsUpdater(ctx) go s.lorasUpdater(ctx) go s.kvCacheUsageUpdater(ctx) + go s.ttftUpdater(ctx) + go s.tpotUpdater(ctx) } // waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel @@ -238,6 +310,30 @@ func (s *VllmSimulator) kvCacheUsageUpdater(ctx context.Context) { } } +// ttftUpdater updates the time to first token metric by listening on the relevant channel +func (s *VllmSimulator) ttftUpdater(ctx context.Context) { + for { + select { + case <-ctx.Done(): + return + case value := <-s.ttftChan: + s.reportTTFT(value) + } + } +} + +// tpotUpdater updates the time per output token metric by listening on the relevant channel +func (s *VllmSimulator) tpotUpdater(ctx context.Context) { + for { + select { + case <-ctx.Done(): + return + case value := <-s.tpotChan: + s.reportTPOT(value) + } + } +} + // lorasUpdater updates the running loras metric by listening on the relevant channel // one function updates both waiting and running loras since they a part of the same prometheus gauge func (s *VllmSimulator) lorasUpdater(ctx context.Context) { diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go index 744f54e1..78b60f7a 100644 --- a/pkg/llm-d-inference-sim/metrics_test.go +++ b/pkg/llm-d-inference-sim/metrics_test.go @@ -464,7 +464,7 @@ var _ = Describe("Simulator metrics", Ordered, func() { ctx := context.TODO() args := []string{"cmd", "--model", model, "--mode", common.ModeRandom, "--fake-metrics", - "{\"running-requests\":10,\"waiting-requests\":30,\"kv-cache-usage\":0.4,\"loras\":[{\"running\":\"lora4,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567},{\"running\":\"lora4,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}]}", + "{\"running-requests\":10,\"waiting-requests\":30,\"kv-cache-usage\":0.4,\"loras\":[{\"running\":\"lora4,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567},{\"running\":\"lora4,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}],\"ttft-buckets-values\":[1, 2, 3],\"tpot-buckets-values\": [0, 0, 1, 2, 3]}", } client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) @@ -482,6 +482,18 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"my_model\"} 0.4")) 
Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora2\",waiting_lora_adapters=\"lora3\"} 1.257894567e+09")) Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora3\",waiting_lora_adapters=\"\"} 1.257894569e+09")) + + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.001\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.005\"} 3")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.01\"} 6")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.02\"} 6")) + + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.01\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.025\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.05\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.075\"} 3")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.1\"} 6")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.15\"} 6")) }) }) }) diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index e5d70ede..3851c517 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -92,6 +92,10 @@ type VllmSimulator struct { nWaitingReqs int64 // waitingReqChan is a channel to update nWaitingReqs waitingReqChan chan int64 + // ttftChan is a channel to update time to first token + ttftChan chan float64 + // tpotChan is a channel to update time per output token + tpotChan chan float64 // kvCacheUsageChan is a channel to update kvCacheUsagePercentage kvCacheUsageChan chan float64 // registry is a Prometheus registry @@ -102,6 +106,10 @@ type VllmSimulator struct { runningRequests *prometheus.GaugeVec // waitingRequests is prometheus gauge for number of queued requests waitingRequests *prometheus.GaugeVec + // ttft is prometheus histogram for time to first token in seconds + ttft *prometheus.HistogramVec + // tpot is prometheus histogram for time per output token in seconds + tpot *prometheus.HistogramVec // kvCacheUsagePercentage is prometheus gauge kvCacheUsagePercentage *prometheus.GaugeVec // channel for requeasts to be passed to workers @@ -136,6 +144,8 @@ func New(logger logr.Logger) (*VllmSimulator, error) { pod: os.Getenv(podNameEnv), runReqChan: make(chan int64, maxNumberOfRequests), waitingReqChan: make(chan int64, maxNumberOfRequests), + ttftChan: make(chan float64, maxNumberOfRequests), + tpotChan: make(chan float64, maxNumberOfRequests), lorasChan: make(chan loraUsage, maxNumberOfRequests), kvCacheUsageChan: make(chan float64, maxNumberOfRequests), }, nil @@ -497,9 +507,16 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r nCachedPromptTokens := reqCtx.CompletionReq.GetNumberOfCachedPromptTokens() ttft := s.getWaitTimeToFirstToken(usageData.PromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill()) time.Sleep(time.Duration(ttft) * time.Millisecond) + + // report ttft 
in seconds + s.ttftChan <- (float64(ttft) / 1000) + for range usageData.CompletionTokens - 1 { perTokenLatency := s.getInterTokenLatency() time.Sleep(time.Duration(perTokenLatency) * time.Millisecond) + + // report tpot in seconds + s.tpotChan <- float64(perTokenLatency) / 1000 } s.sendCompletionResponse(reqCtx.HTTPReqCtx, resp) diff --git a/pkg/llm-d-inference-sim/streaming.go b/pkg/llm-d-inference-sim/streaming.go index c64affc8..a208b63b 100644 --- a/pkg/llm-d-inference-sim/streaming.go +++ b/pkg/llm-d-inference-sim/streaming.go @@ -103,11 +103,19 @@ func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writ // time to first token delay ttft := s.getWaitTimeToFirstToken(context.nPromptTokens, context.nCachedPromptTokens, context.doRemotePrefill) time.Sleep(time.Duration(ttft) * time.Millisecond) + // report ttft in seconds + s.ttftChan <- (float64(ttft) / 1000) for i, token := range genTokens { if i != 0 { time.Sleep(time.Duration(s.getInterTokenLatency()) * time.Millisecond) } + + interTokenLat := s.getInterTokenLatency() + time.Sleep(time.Duration(interTokenLat) * time.Millisecond) + // report tpot in seconds + s.tpotChan <- float64(interTokenLat) / 1000 + var toolChunkInsert *openaiserverapi.ToolCall if tc != nil { toolChunkInsert = &openaiserverapi.ToolCall{ From 49d100075978042c7c31a4febbb96b6ddbfba367 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Sun, 19 Oct 2025 09:48:55 +0300 Subject: [PATCH 2/5] Add test for ttft kae metrics command line parameter with value for the last bucket Signed-off-by: Maya Barnea --- README.md | 4 +- pkg/llm-d-inference-sim/metrics.go | 33 +++++++++++------ pkg/llm-d-inference-sim/metrics_test.go | 49 +++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 0db50852..49e715d1 100644 --- a/README.md +++ b/README.md @@ -147,8 +147,8 @@ For more details see the + --fake-metrics '{"running-requests":10,"waiting-requests":30,"kv-cache-usage":0.4,"loras":[{"running":"lora4,lora2","waiting":"lora3","timestamp":1257894567},{"running":"lora4,lora3","waiting":"","timestamp":1257894569}]}' --- - `data-parallel-size`: number of ranks to run in Data Parallel deployment, from 1 to 8, default is 1. The ports will be assigned as follows: rank 0 will run on the configured `port`, rank 1 on `port`+1, etc. --- diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go index bbaa76a4..a6a5ec03 100644 --- a/pkg/llm-d-inference-sim/metrics.go +++ b/pkg/llm-d-inference-sim/metrics.go @@ -138,21 +138,11 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() { kvCacheUsage = float64(s.config.FakeMetrics.KVCacheUsagePercentage) if s.config.FakeMetrics.TTFTBucketValues != nil { - for i, bucketVal := range s.config.FakeMetrics.TTFTBucketValues { - for range bucketVal { - s.ttft.WithLabelValues(s.getDisplayedModelName(s.config.Model)). - Observe(common.TTFTBucketsBoundaries[i]) - } - } + s.initFakeHistogram(s.ttft, common.TTFTBucketsBoundaries, s.config.FakeMetrics.TTFTBucketValues) } if s.config.FakeMetrics.TPOTBucketValues != nil { - for i, bucketVal := range s.config.FakeMetrics.TPOTBucketValues { - for range bucketVal { - s.tpot.WithLabelValues(s.getDisplayedModelName(s.config.Model)). 
- Observe(common.TPOTBucketsBoundaries[i]) - } - } + s.initFakeHistogram(s.tpot, common.TPOTBucketsBoundaries, s.config.FakeMetrics.TPOTBucketValues) } } @@ -176,6 +166,25 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() { } } +func (s *VllmSimulator) initFakeHistogram(hist *prometheus.HistogramVec, bucketsBoundaries []float64, bucketValues []int) { + var valueToObserve float64 + numOfBuckets := len(bucketsBoundaries) + + for i, bucketVal := range bucketValues { + if i < numOfBuckets { + valueToObserve = bucketsBoundaries[i] + } else { + // this is last bucket - use number larger than the upper bound of the last bucket + valueToObserve = bucketsBoundaries[len(bucketsBoundaries)-1] + 1 + } + + for range bucketVal { + hist.WithLabelValues(s.getDisplayedModelName(s.config.Model)). + Observe(valueToObserve) + } + } +} + // reportLoras sets information about loaded LoRA adapters func (s *VllmSimulator) reportLoras() { if s.config.FakeMetrics != nil { diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go index 78b60f7a..46b35d38 100644 --- a/pkg/llm-d-inference-sim/metrics_test.go +++ b/pkg/llm-d-inference-sim/metrics_test.go @@ -19,6 +19,7 @@ package llmdinferencesim import ( "context" "errors" + "fmt" "io" "net/http" "os" @@ -496,6 +497,54 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.15\"} 6")) }) }) + + Context("fake ttft metrics", func() { + It("Should respond with fake ttft metrics to /metrics", func() { + ctx := context.TODO() + args := []string{"cmd", "--model", model, "--mode", common.ModeRandom, + "--fake-metrics", + "{\"ttft-buckets-values\":[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]}", + } + + client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + Expect(err).NotTo(HaveOccurred()) + + resp, err := client.Get(metricsUrl) + Expect(err).NotTo(HaveOccurred()) + Expect(resp.StatusCode).To(Equal(http.StatusOK)) + + data, err := io.ReadAll(resp.Body) + Expect(err).NotTo(HaveOccurred()) + metrics := string(data) + + fmt.Println("---MAYA---") + fmt.Println(metrics) + + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.001\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.005\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.01\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.02\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.04\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.06\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.08\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.1\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.25\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.5\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.75\"} 0")) + 
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"1\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"2.5\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"5\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"7.5\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"10\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"20\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"40\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"80\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"160\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"640\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"2560\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"+Inf\"} 1")) + }) + }) }) // isLoraMetricPresent checks if a matching metric exists From c43a64a85b6972a0a2208155b981925688a8236d Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Sun, 19 Oct 2025 11:08:53 +0300 Subject: [PATCH 3/5] move calculating model name from a loop Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/metrics.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go index a6a5ec03..53f2b826 100644 --- a/pkg/llm-d-inference-sim/metrics.go +++ b/pkg/llm-d-inference-sim/metrics.go @@ -169,6 +169,7 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() { func (s *VllmSimulator) initFakeHistogram(hist *prometheus.HistogramVec, bucketsBoundaries []float64, bucketValues []int) { var valueToObserve float64 numOfBuckets := len(bucketsBoundaries) + modelName := s.getDisplayedModelName(s.config.Model) for i, bucketVal := range bucketValues { if i < numOfBuckets { @@ -179,8 +180,7 @@ func (s *VllmSimulator) initFakeHistogram(hist *prometheus.HistogramVec, buckets } for range bucketVal { - hist.WithLabelValues(s.getDisplayedModelName(s.config.Model)). 
- Observe(valueToObserve) + hist.WithLabelValues(modelName).Observe(valueToObserve) } } } From 2310404c5ee6130cc58bb09bb127e8b509a2e117 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Sun, 19 Oct 2025 13:20:43 +0300 Subject: [PATCH 4/5] Changes according the PR review Signed-off-by: Maya Barnea --- README.md | 4 +- pkg/common/config.go | 12 +- pkg/common/config_test.go | 5 + pkg/common/utils.go | 1 + pkg/llm-d-inference-sim/metrics.go | 25 ++-- pkg/llm-d-inference-sim/metrics_test.go | 155 +++++++++++++++++++++++- pkg/llm-d-inference-sim/streaming.go | 10 +- 7 files changed, 186 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 49e715d1..c3f35c61 100644 --- a/README.md +++ b/README.md @@ -144,8 +144,8 @@ For more details see the ", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.15\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.2\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.3\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.4\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.5\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.75\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"1\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"2.5\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"5\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"7.5\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"10\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"20\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"40\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"80\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + count = findIntMetric(metricsLines, 
"vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"+Inf\"}") + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + }() + + metricsWg.Wait() + }) + Context("kv cache metrics", func() { tmpDir := "./tests-tmp/" AfterAll(func() { @@ -517,9 +639,6 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(err).NotTo(HaveOccurred()) metrics := string(data) - fmt.Println("---MAYA---") - fmt.Println(metrics) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.001\"} 0")) Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.005\"} 0")) Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.01\"} 0")) @@ -624,3 +743,31 @@ func splitString(str string) []string { } return strings.Split(str, ",") } + +func findMetric(metrics []string, metricPrefix string) string { + // regex to extract metrics and values + for _, metric := range metrics { + if strings.Contains(metric, metricPrefix) { + arr := strings.Split(metric, " ") + if len(arr) == 2 { + return arr[1] + } + break + } + } + // required metric was not found + return "" +} + +func findIntMetric(metrics []string, metricPrefix string) *int { + valueStr := findMetric(metrics, metricPrefix) + if valueStr == "" { + return nil + } + + val, err := strconv.Atoi(valueStr) + if err != nil { + return nil + } + return &val +} diff --git a/pkg/llm-d-inference-sim/streaming.go b/pkg/llm-d-inference-sim/streaming.go index a208b63b..1bd2525d 100644 --- a/pkg/llm-d-inference-sim/streaming.go +++ b/pkg/llm-d-inference-sim/streaming.go @@ -108,14 +108,12 @@ func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writ for i, token := range genTokens { if i != 0 { - time.Sleep(time.Duration(s.getInterTokenLatency()) * time.Millisecond) + interTokenLat := s.getInterTokenLatency() + time.Sleep(time.Duration(interTokenLat) * time.Millisecond) + // report tpot in seconds + s.tpotChan <- float64(interTokenLat) / 1000 } - interTokenLat := s.getInterTokenLatency() - time.Sleep(time.Duration(interTokenLat) * time.Millisecond) - // report tpot in seconds - s.tpotChan <- float64(interTokenLat) / 1000 - var toolChunkInsert *openaiserverapi.ToolCall if tc != nil { toolChunkInsert = &openaiserverapi.ToolCall{ From 621e4f51501a2c3d51239a7ddaec3b5c84cf32a8 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Sun, 19 Oct 2025 14:32:14 +0300 Subject: [PATCH 5/5] according review comments Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/metrics.go | 4 ++-- pkg/llm-d-inference-sim/metrics_test.go | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go index 410289bd..45c340d3 100644 --- a/pkg/llm-d-inference-sim/metrics.go +++ b/pkg/llm-d-inference-sim/metrics.go @@ -167,10 +167,10 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() { } // initFakeHistogram initializes the given histogram values based on the input -// bucketsBoundaries - upper boudaries of all buckets except the last one. Actual number pf buckets is len(bucketsBoundaries)+1. +// bucketsBoundaries - upper boudaries of all buckets except the last one. Actual number of buckets is len(bucketsBoundaries)+1. // This includes the last bucket (last_boundary, +Inf]. // bucketsSamplesCount - array containing number of samples per bucket, starting from the first bucket. 
-// Trailing empty buckets are not included in this array, so it length could be <= len(bucketsBoundaries)+1 +// Trailing empty buckets are not included in this array, so its length can be <= len(bucketsBoundaries)+1 func (s *VllmSimulator) initFakeHistogram(hist *prometheus.HistogramVec, bucketsBoundaries []float64, bucketsSamplesCount []int) { var valueToObserve float64 numOfBoundaries := len(bucketsBoundaries) diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go index 313525db..f0f8bb58 100644 --- a/pkg/llm-d-inference-sim/metrics_test.go +++ b/pkg/llm-d-inference-sim/metrics_test.go @@ -374,7 +374,7 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.05\"} 0")) Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.075\"} 0")) - metricsLines := strings.Split(string(metrics), "\n") + metricsLines := strings.Split(metrics, "\n") // the following values should be greater than 0, we don't know the exact value since it depends on the random response length count := findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.1\"}") Expect(count).ToNot(BeNil())
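
---

Editor's note (not part of the patch series): the fake-metrics seeding in `initFakeHistogram` works by observing one representative value per bucket — the bucket's upper boundary for the first `len(bucketsBoundaries)` entries, and a value above the last boundary for a trailing entry, which therefore only shows up under `le="+Inf"`. Because Prometheus histogram buckets are cumulative, `ttft-buckets-values: [1, 2, 3]` yields `le="0.001"` = 1, `le="0.005"` = 3, `le="0.01"` = 6, exactly as asserted in metrics_test.go. The following standalone sketch illustrates that behaviour using only the public prometheus/client_golang API; the three-element boundary list, the fourth sample count, and the `my_model` label value are illustrative stand-ins, not code from the patch.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Toy stand-in for the first few TTFT boundaries; the real list is
	// TTFTBucketsBoundaries in pkg/common/utils.go.
	boundaries := []float64{0.001, 0.005, 0.01}
	// Same shape as "ttft-buckets-values": one sample count per bucket.
	// The fourth entry has no explicit boundary and lands in +Inf.
	counts := []int{1, 2, 3, 4}

	hist := prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Name:    "vllm:time_to_first_token_seconds",
		Help:    "Histogram of time to first token in seconds.",
		Buckets: boundaries,
	}, []string{"model_name"})

	reg := prometheus.NewRegistry()
	reg.MustRegister(hist)

	for i, n := range counts {
		// Observe a value that falls exactly into bucket i; anything above
		// the last boundary is only counted by +Inf / the total sample count.
		v := boundaries[len(boundaries)-1] + 1
		if i < len(boundaries) {
			v = boundaries[i]
		}
		for j := 0; j < n; j++ {
			hist.WithLabelValues("my_model").Observe(v)
		}
	}

	mfs, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range mfs {
		for _, m := range mf.GetMetric() {
			h := m.GetHistogram()
			for _, b := range h.GetBucket() {
				// Cumulative counts: le=0.001 -> 1, le=0.005 -> 3, le=0.01 -> 6
				fmt.Printf("le=%g count=%d\n", b.GetUpperBound(), b.GetCumulativeCount())
			}
			// Total sample count includes the implicit +Inf bucket: 10
			fmt.Printf("count=%d sum=%g\n", h.GetSampleCount(), h.GetSampleSum())
		}
	}
}
```

This is also why the dedicated last-bucket test, which passes zeros for every bounded bucket and a single trailing count, expects all bounded `vllm:time_to_first_token_seconds_bucket` series to read 0 while `le="+Inf"` reads 1.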