From 89ed87cc769a5cf555fe0bd49316c99d60056810 Mon Sep 17 00:00:00 2001
From: Maya Barnea
Date: Thu, 23 Oct 2025 12:24:34 +0300
Subject: [PATCH 01/14] Add e2e request latency histogram to Prometheus
 metrics. Add a reportHistogramValue function to be used for reporting values
 in histogram metrics

Signed-off-by: Maya Barnea
---
 pkg/common/config.go                 |  7 ++++
 pkg/common/utils.go                  |  3 ++
 pkg/llm-d-inference-sim/metrics.go   | 57 +++++++++++++++++++---------
 pkg/llm-d-inference-sim/simulator.go | 10 +++++
 4 files changed, 59 insertions(+), 18 deletions(-)

diff --git a/pkg/common/config.go b/pkg/common/config.go
index ae8eec47..49825a48 100644
--- a/pkg/common/config.go
+++ b/pkg/common/config.go
@@ -253,6 +253,13 @@ type Metrics struct {
 	RequestParamsMaxTokens []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
 	// RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
 	RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`
+	// E2ERequestLatencyBucketValues is an array of values for the e2e request latency buckets;
+	// each value in this array is the value for the corresponding bucket.
+	// The array may contain fewer values than the number of buckets; missing trailing values are assumed to be 0.
+	// The bucket upper boundaries in seconds are:
+	// 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
+	// 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0, +Inf
+	E2ERequestLatencyBucketValues []int `yaml:"e2erl-buckets-values" json:"e2erl-buckets-values"`
 }
 
 type LorasMetrics struct {
diff --git a/pkg/common/utils.go b/pkg/common/utils.go
index d1f3cfe1..78a279b1 100644
--- a/pkg/common/utils.go
+++ b/pkg/common/utils.go
@@ -32,6 +32,9 @@ var TTFTBucketsBoundaries = []float64{0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08
 var TPOTBucketsBoundaries = []float64{0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0}
 
+var E2ERequestLatencyBucketsBoundaries = []float64{0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
+	20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0}
+
 // ValidateContextWindow checks if the request fits within the model's context window
 // Returns validation result, actual completion tokens, and total tokens
 func ValidateContextWindow(promptTokens int, maxCompletionTokens *int64, maxModelLen int) (bool, int64, int64) {
diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go
index 35108582..88f7bfcb 100644
--- a/pkg/llm-d-inference-sim/metrics.go
+++ b/pkg/llm-d-inference-sim/metrics.go
@@ -111,6 +111,21 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 		return err
 	}
 
+	s.metrics.e2eReqLatency = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:e2e_request_latency_seconds",
+			Help:      "Histogram of end to end request latency in seconds.",
+			Buckets:   common.E2ERequestLatencyBucketsBoundaries,
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+
+	if err := s.metrics.registry.Register(s.metrics.e2eReqLatency); err != nil {
+		s.logger.Error(err, "Prometheus end to end request latency histogram register failed")
+		return err
+	}
+
 	s.metrics.kvCacheUsagePercentage = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Subsystem: "",
@@ -215,6 +230,10 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() {
 	for reason, requestSuccessTotal := range s.config.FakeMetrics.RequestSuccessTotal {
s.metrics.requestSuccessTotal.WithLabelValues(modelName, reason).Add(float64(requestSuccessTotal)) } + + if s.config.FakeMetrics.E2ERequestLatencyBucketValues != nil { + s.initFakeHistogram(s.metrics.tpot, common.E2ERequestLatencyBucketsBoundaries, s.config.FakeMetrics.E2ERequestLatencyBucketValues) + } } s.metrics.runningRequests.WithLabelValues(modelName).Set(nRunningReqs) @@ -317,25 +336,14 @@ func (s *VllmSimulator) reportWaitingRequests() { } } -// reportTTFT sets information about time to first token -func (s *VllmSimulator) reportTTFT(ttftInSecs float64) { - if s.config.FakeMetrics != nil { - return - } - if s.metrics.ttft != nil { - s.metrics.ttft.WithLabelValues( - s.getDisplayedModelName(s.config.Model)).Observe(ttftInSecs) - } -} - -// reportTPOT sets information about time per output token -func (s *VllmSimulator) reportTPOT(tpotInSecs float64) { +// reportHistogramValue sets the given value in the given histogram +func (s *VllmSimulator) reportHistogramValue(hist *prometheus.HistogramVec, val float64) { if s.config.FakeMetrics != nil { return } - if s.metrics.tpot != nil { - s.metrics.tpot.WithLabelValues( - s.getDisplayedModelName(s.config.Model)).Observe(tpotInSecs) + if hist != nil { + hist.WithLabelValues( + s.getDisplayedModelName(s.config.Model)).Observe(val) } } @@ -359,6 +367,7 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) { go s.ttftUpdater(ctx) go s.tpotUpdater(ctx) go s.recordRequestUpdater(ctx) + go s.e2eReqLatencyUpdater(ctx) } // waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel @@ -406,7 +415,7 @@ func (s *VllmSimulator) ttftUpdater(ctx context.Context) { case <-ctx.Done(): return case value := <-s.metrics.ttftChan: - s.reportTTFT(value) + s.reportHistogramValue(s.metrics.ttft, value) } } } @@ -418,7 +427,19 @@ func (s *VllmSimulator) tpotUpdater(ctx context.Context) { case <-ctx.Done(): return case value := <-s.metrics.tpotChan: - s.reportTPOT(value) + s.reportHistogramValue(s.metrics.tpot, value) + } + } +} + +// tpotUpdater updates the time per output token metric by listening on the relevant channel +func (s *VllmSimulator) e2eReqLatencyUpdater(ctx context.Context) { + for { + select { + case <-ctx.Done(): + return + case value := <-s.metrics.e2eReqLatencyChan: + s.reportHistogramValue(s.metrics.e2eReqLatency, value) } } } diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index d10dff80..b3aeb9af 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -93,6 +93,8 @@ type metricsData struct { ttftChan chan float64 // tpotChan is a channel to update time per output token tpotChan chan float64 + // e2eReqLatencyChan is a channel to update request e2e latency + e2eReqLatencyChan chan float64 // kvCacheUsageChan is a channel to update kvCacheUsagePercentage kvCacheUsageChan chan float64 // registry is a Prometheus registry @@ -107,6 +109,8 @@ type metricsData struct { ttft *prometheus.HistogramVec // tpot is prometheus histogram for time per output token in seconds tpot *prometheus.HistogramVec + // e2eReqLatency is prometheus histogram of end to end request latency in seconds + e2eReqLatency *prometheus.HistogramVec // kvCacheUsagePercentage is prometheus gauge kvCacheUsagePercentage *prometheus.GaugeVec // requestPromptTokens is prometheus histogram for number of input (prompt) tokens in request @@ -271,6 +275,7 @@ func (s *VllmSimulator) initializeSim(ctx context.Context) error { s.metrics.kvCacheUsageChan = 
make(chan float64, maxNumberOfRequests) s.metrics.ttftChan = make(chan float64, maxNumberOfRequests) s.metrics.tpotChan = make(chan float64, maxNumberOfRequests) + s.metrics.e2eReqLatencyChan = make(chan float64, maxNumberOfRequests) s.metrics.requestSuccessChan = make(chan requestSuccessEvent, maxNumberOfRequests) s.newRequests = make(chan *openaiserverapi.CompletionReqCtx, maxNumberOfRequests) @@ -460,6 +465,11 @@ func (s *VllmSimulator) addRequestToQueue(reqCtx *openaiserverapi.CompletionReqC // handleCompletions general completion requests handler, support both text and chat completion APIs func (s *VllmSimulator) handleCompletions(ctx *fasthttp.RequestCtx, isChatCompletion bool) { + startTime := time.Now() + defer func() { + s.metrics.e2eReqLatencyChan <- time.Since(startTime).Seconds() + }() + // Check if we should inject a failure if shouldInjectFailure(s.config) { failure := getRandomFailure(s.config) From a38361e05ffb5ecf3e85c9885be0e16e2d745ce1 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Thu, 23 Oct 2025 14:19:37 +0300 Subject: [PATCH 02/14] Additional metrics - vllm:request_queue_time_seconds, vllm:request_inference_time_seconds, vllm:request_prefill_time_seconds, and vllm:request_decode_time_seconds Signed-off-by: Maya Barnea --- pkg/common/utils.go | 2 +- pkg/llm-d-inference-sim/metrics.go | 118 ++++++++++++++++++++++++++- pkg/llm-d-inference-sim/simulator.go | 53 +++++++++--- pkg/llm-d-inference-sim/streaming.go | 5 ++ pkg/llm-d-inference-sim/worker.go | 6 ++ 5 files changed, 169 insertions(+), 15 deletions(-) diff --git a/pkg/common/utils.go b/pkg/common/utils.go index 78a279b1..7050fc55 100644 --- a/pkg/common/utils.go +++ b/pkg/common/utils.go @@ -32,7 +32,7 @@ var TTFTBucketsBoundaries = []float64{0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08 var TPOTBucketsBoundaries = []float64{0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0} -var E2ERequestLatencyBucketsBoundaries = []float64{0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, +var RequestLatencyBucketsBoundaries = []float64{0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0} // ValidateContextWindow checks if the request fits within the model's context window diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go index 88f7bfcb..37175fb3 100644 --- a/pkg/llm-d-inference-sim/metrics.go +++ b/pkg/llm-d-inference-sim/metrics.go @@ -116,7 +116,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { Subsystem: "", Name: "vllm:e2e_request_latency_seconds", Help: "Histogram of end to end request latency in seconds.", - Buckets: common.E2ERequestLatencyBucketsBoundaries, + Buckets: common.RequestLatencyBucketsBoundaries, }, []string{vllmapi.PromLabelModelName}, ) @@ -126,6 +126,66 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { return err } + s.metrics.reqQueueTime = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: "", + Name: "vllm:request_queue_time_seconds", + Help: "Histogram of time spent in WAITING phase for request.", + Buckets: common.RequestLatencyBucketsBoundaries, + }, + []string{vllmapi.PromLabelModelName}, + ) + + if err := s.metrics.registry.Register(s.metrics.reqQueueTime); err != nil { + s.logger.Error(err, "Prometheus request queue time histogram register failed") + return err + } + + s.metrics.reqInferenceTime = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: "", + Name: 
"vllm:request_inference_time_seconds", + Help: "Histogram of time spent in RUNNING phase for request.", + Buckets: common.RequestLatencyBucketsBoundaries, + }, + []string{vllmapi.PromLabelModelName}, + ) + + if err := s.metrics.registry.Register(s.metrics.reqInferenceTime); err != nil { + s.logger.Error(err, "Prometheus request inerence time histogram register failed") + return err + } + + s.metrics.reqPrefillTime = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: "", + Name: "vllm:request_prefill_time_seconds", + Help: "Histogram of time spent in PREFILL phase for request.", + Buckets: common.RequestLatencyBucketsBoundaries, + }, + []string{vllmapi.PromLabelModelName}, + ) + + if err := s.metrics.registry.Register(s.metrics.reqPrefillTime); err != nil { + s.logger.Error(err, "Prometheus request prefill time histogram register failed") + return err + } + + s.metrics.reqDecodeTime = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: "", + Name: "vllm:request_queue_time_seconds", + Help: "Histogram of time spent in DECODE phase for request.", + Buckets: common.RequestLatencyBucketsBoundaries, + }, + []string{vllmapi.PromLabelModelName}, + ) + + if err := s.metrics.registry.Register(s.metrics.reqDecodeTime); err != nil { + s.logger.Error(err, "Prometheus request decode time histogram register failed") + return err + } + s.metrics.kvCacheUsagePercentage = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Subsystem: "", @@ -232,7 +292,7 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() { } if s.config.FakeMetrics.E2ERequestLatencyBucketValues != nil { - s.initFakeHistogram(s.metrics.tpot, common.E2ERequestLatencyBucketsBoundaries, s.config.FakeMetrics.E2ERequestLatencyBucketValues) + s.initFakeHistogram(s.metrics.tpot, common.RequestLatencyBucketsBoundaries, s.config.FakeMetrics.E2ERequestLatencyBucketValues) } } @@ -368,6 +428,10 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) { go s.tpotUpdater(ctx) go s.recordRequestUpdater(ctx) go s.e2eReqLatencyUpdater(ctx) + go s.reqQueueTimeUpdater(ctx) + go s.reqInferenceTimeUpdater(ctx) + go s.reqPrefillTimeUpdater(ctx) + go s.reqDecodeTimeUpdater(ctx) } // waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel @@ -432,7 +496,7 @@ func (s *VllmSimulator) tpotUpdater(ctx context.Context) { } } -// tpotUpdater updates the time per output token metric by listening on the relevant channel +// e2eReqLatencyUpdater updates the e2e request latency metric by listening on the relevant channel func (s *VllmSimulator) e2eReqLatencyUpdater(ctx context.Context) { for { select { @@ -444,6 +508,54 @@ func (s *VllmSimulator) e2eReqLatencyUpdater(ctx context.Context) { } } +// reqQueueTimeUpdater updates the request queue time metric by listening on the relevant channel +func (s *VllmSimulator) reqQueueTimeUpdater(ctx context.Context) { + for { + select { + case <-ctx.Done(): + return + case value := <-s.metrics.reqQueueTimeChan: + s.reportHistogramValue(s.metrics.reqQueueTime, value) + } + } +} + +// reqInferenceTimeUpdater updates the request inference time metric by listening on the relevant channel +func (s *VllmSimulator) reqInferenceTimeUpdater(ctx context.Context) { + for { + select { + case <-ctx.Done(): + return + case value := <-s.metrics.reqInferenceTimeChan: + s.reportHistogramValue(s.metrics.reqInferenceTime, value) + } + } +} + +// reqPrefillTimeUpdater updates the request prefill time metric by listening on the relevant channel +func (s *VllmSimulator) 
reqPrefillTimeUpdater(ctx context.Context) { + for { + select { + case <-ctx.Done(): + return + case value := <-s.metrics.reqPrefillTimeChan: + s.reportHistogramValue(s.metrics.reqPrefillTime, value) + } + } +} + +// reqDecodeTimeUpdater updates the request decode time metric by listening on the relevant channel +func (s *VllmSimulator) reqDecodeTimeUpdater(ctx context.Context) { + for { + select { + case <-ctx.Done(): + return + case value := <-s.metrics.reqDecodeTimeChan: + s.reportHistogramValue(s.metrics.reqDecodeTime, value) + } + } +} + // lorasUpdater updates the running loras metric by listening on the relevant channel // one function updates both waiting and running loras since they a part of the same prometheus gauge func (s *VllmSimulator) lorasUpdater(ctx context.Context) { diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index b3aeb9af..cd180219 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -95,6 +95,14 @@ type metricsData struct { tpotChan chan float64 // e2eReqLatencyChan is a channel to update request e2e latency e2eReqLatencyChan chan float64 + // reqQueueTimeChan is a channel to update request queue time + reqQueueTimeChan chan float64 + // reqInferenceTimeChan is a channel to update request inference time + reqInferenceTimeChan chan float64 + // reqPrefillTimeChan is a channel to update request prefill time + reqPrefillTimeChan chan float64 + // reqDecodeTimeChan is a channel to update request decode time + reqDecodeTimeChan chan float64 // kvCacheUsageChan is a channel to update kvCacheUsagePercentage kvCacheUsageChan chan float64 // registry is a Prometheus registry @@ -111,6 +119,14 @@ type metricsData struct { tpot *prometheus.HistogramVec // e2eReqLatency is prometheus histogram of end to end request latency in seconds e2eReqLatency *prometheus.HistogramVec + // reqQueueTime is prometheus histogram of request queue time in seconds + reqQueueTime *prometheus.HistogramVec + // reqInferenceTime is prometheus histogram of request inference time in seconds + reqInferenceTime *prometheus.HistogramVec + // reqPrefillTime is prometheus histogram of request prefill time in seconds + reqPrefillTime *prometheus.HistogramVec + // reqDecodeTime is prometheus histogram of request decode time in seconds + reqDecodeTime *prometheus.HistogramVec // kvCacheUsagePercentage is prometheus gauge kvCacheUsagePercentage *prometheus.GaugeVec // requestPromptTokens is prometheus histogram for number of input (prompt) tokens in request @@ -139,6 +155,11 @@ type requestCompleted struct { model string } +type waitingQueueItem struct { + reqCtx *openaiserverapi.CompletionReqCtx + enqueueTime time.Time +} + // VllmSimulator simulates vLLM server supporting OpenAI API type VllmSimulator struct { // logger is used for information and errors logging @@ -276,6 +297,10 @@ func (s *VllmSimulator) initializeSim(ctx context.Context) error { s.metrics.ttftChan = make(chan float64, maxNumberOfRequests) s.metrics.tpotChan = make(chan float64, maxNumberOfRequests) s.metrics.e2eReqLatencyChan = make(chan float64, maxNumberOfRequests) + s.metrics.reqQueueTimeChan = make(chan float64, maxNumberOfRequests) + s.metrics.reqInferenceTimeChan = make(chan float64, maxNumberOfRequests) + s.metrics.reqPrefillTimeChan = make(chan float64, maxNumberOfRequests) + s.metrics.reqDecodeTimeChan = make(chan float64, maxNumberOfRequests) s.metrics.requestSuccessChan = make(chan requestSuccessEvent, maxNumberOfRequests) s.newRequests = 
make(chan *openaiserverapi.CompletionReqCtx, maxNumberOfRequests) @@ -575,19 +600,22 @@ func (s *VllmSimulator) createCompletionResponse(isChatCompletion bool, respToke // from --served-model-name (for a base-model request) or the LoRA adapter name (for a LoRA request). // finishReason - a pointer to string that represents finish reason, can be nil, stop, length, or tools // usageData - usage (tokens statistics) for this response -func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, respTokens []string, toolCalls []openaiserverapi.ToolCall, - modelName string, finishReason string, usageData *openaiserverapi.Usage) { +func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, respTokens []string, + toolCalls []openaiserverapi.ToolCall, modelName string, finishReason string, usageData *openaiserverapi.Usage) { resp := s.createCompletionResponse(reqCtx.IsChatCompletion, respTokens, toolCalls, &finishReason, usageData, modelName, reqCtx.CompletionReq.IsDoRemoteDecode()) // calculate how long to wait before returning the response, time is based on number of tokens nCachedPromptTokens := reqCtx.CompletionReq.GetNumberOfCachedPromptTokens() + startPrefill := time.Now() ttft := s.getWaitTimeToFirstToken(usageData.PromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill()) time.Sleep(time.Duration(ttft) * time.Millisecond) // report ttft in seconds common.WriteToChannel(s.metrics.ttftChan, (float64(ttft) / 1000), s.logger, "metrics.ttftChan") + common.WriteToChannel(s.metrics.reqPrefillTimeChan, time.Since(startPrefill).Seconds(), s.logger, "metrics.reqPrefillTimeChan") + startDecode := time.Now() for range usageData.CompletionTokens - 1 { perTokenLatency := s.getInterTokenLatency() time.Sleep(time.Duration(perTokenLatency) * time.Millisecond) @@ -595,8 +623,9 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r // report tpot in seconds common.WriteToChannel(s.metrics.tpotChan, (float64(perTokenLatency) / 1000), s.logger, "metrics.tpotChan") } - s.sendCompletionResponse(reqCtx.HTTPReqCtx, resp) + s.metrics.reqDecodeTimeChan <- time.Since(startDecode).Seconds() + s.sendCompletionResponse(reqCtx.HTTPReqCtx, resp) s.responseSentCallback(modelName, reqCtx.IsChatCompletion, reqCtx.CompletionReq.GetRequestID()) } @@ -639,7 +668,7 @@ func (s *VllmSimulator) enqueue(req *openaiserverapi.CompletionReqCtx) error { if s.waitingQueue.Len() >= s.queueCapacity { return errors.New("waiting requests queue is full") } - s.waitingQueue.PushBack(req) + s.waitingQueue.PushBack(waitingQueueItem{req, time.Now()}) return nil } @@ -650,20 +679,22 @@ func (s *VllmSimulator) dequeue() *openaiserverapi.CompletionReqCtx { // Find first request for a loaded LoRA for elem := s.waitingQueue.Front(); elem != nil; elem = elem.Next() { - req, ok := elem.Value.(*openaiserverapi.CompletionReqCtx) - if ok && req != nil && s.loraIsLoaded(req.CompletionReq.GetModel()) { + item, ok := elem.Value.(waitingQueueItem) + if ok && item.reqCtx != nil && s.loraIsLoaded(item.reqCtx.CompletionReq.GetModel()) { s.waitingQueue.Remove(elem) - s.incrementLora(req.CompletionReq.GetModel()) - return req + s.incrementLora(item.reqCtx.CompletionReq.GetModel()) + s.metrics.reqQueueTimeChan <- time.Since(item.enqueueTime).Seconds() + return item.reqCtx } } // All the requests require a LoRA that is not loaded, check if we can load a LoRA for elem := s.waitingQueue.Front(); elem != nil; elem = elem.Next() { - req, ok := 
elem.Value.(*openaiserverapi.CompletionReqCtx) - if ok && req != nil && s.loadLora(req.CompletionReq.GetModel()) { + item, ok := elem.Value.(waitingQueueItem) + if ok && item.reqCtx != nil && s.loadLora(item.reqCtx.CompletionReq.GetModel()) { s.waitingQueue.Remove(elem) - return req + s.metrics.reqQueueTimeChan <- time.Since(item.enqueueTime).Seconds() + return item.reqCtx } } diff --git a/pkg/llm-d-inference-sim/streaming.go b/pkg/llm-d-inference-sim/streaming.go index 8e87af96..84320464 100644 --- a/pkg/llm-d-inference-sim/streaming.go +++ b/pkg/llm-d-inference-sim/streaming.go @@ -102,12 +102,15 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons // sendTokenChunks creates and sends response chunks func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, genTokens []string, tc *openaiserverapi.ToolCall, finishReason string) { + startPrefill := time.Now() // time to first token delay ttft := s.getWaitTimeToFirstToken(context.nPromptTokens, context.nCachedPromptTokens, context.doRemotePrefill) time.Sleep(time.Duration(ttft) * time.Millisecond) // report ttft in seconds common.WriteToChannel(s.metrics.ttftChan, (float64(ttft) / 1000), s.logger, "metrics.ttftChan") + common.WriteToChannel(s.metrics.reqPrefillTimeChan, time.Since(startPrefill).Seconds(), s.logger, "metrics.reqPrefillTimeChan") + startDecode := time.Now() for i, token := range genTokens { if i != 0 { interTokenLat := s.getInterTokenLatency() @@ -148,6 +151,8 @@ func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writ } } + s.metrics.reqDecodeTimeChan <- time.Since(startDecode).Seconds() + // send the last chunk if finish reason is stop var chunk openaiserverapi.CompletionRespChunk if finishReason == dataset.StopFinishReason { diff --git a/pkg/llm-d-inference-sim/worker.go b/pkg/llm-d-inference-sim/worker.go index b247a72b..e2a6e504 100644 --- a/pkg/llm-d-inference-sim/worker.go +++ b/pkg/llm-d-inference-sim/worker.go @@ -19,6 +19,7 @@ package llmdinferencesim import ( "context" + "time" "github.com/go-logr/logr" "github.com/llm-d/llm-d-inference-sim/pkg/common" @@ -59,6 +60,11 @@ type requestProcessor interface { } func (s *VllmSimulator) processRequest(reqCtx *openaiserverapi.CompletionReqCtx) { + start := time.Now() + defer func() { + s.metrics.reqInferenceTimeChan <- time.Since(start).Seconds() + }() + req := reqCtx.CompletionReq model := req.GetModel() displayModel := s.getDisplayedModelName(model) From 7d0e5f09a0c484d9c0dc9a480660e9c65e6806be Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Thu, 23 Oct 2025 14:26:42 +0300 Subject: [PATCH 03/14] typo in metric name Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/metrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go index 37175fb3..92de74ef 100644 --- a/pkg/llm-d-inference-sim/metrics.go +++ b/pkg/llm-d-inference-sim/metrics.go @@ -174,7 +174,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.reqDecodeTime = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", - Name: "vllm:request_queue_time_seconds", + Name: "vllm:request_decode_time_seconds", Help: "Histogram of time spent in DECODE phase for request.", Buckets: common.RequestLatencyBucketsBoundaries, }, From f0e8882e54f37b5c248aaf2cfa35aa8a16967786 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Sat, 25 Oct 2025 22:53:33 +0300 Subject: [PATCH 04/14] Initial tests for new metrics + create 
constant for part of metrics names Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/metrics.go | 18 +++- pkg/llm-d-inference-sim/metrics_test.go | 117 +++++++++++++++++++++++- 2 files changed, 129 insertions(+), 6 deletions(-) diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go index 92de74ef..4322b024 100644 --- a/pkg/llm-d-inference-sim/metrics.go +++ b/pkg/llm-d-inference-sim/metrics.go @@ -32,6 +32,14 @@ import ( vllmapi "github.com/llm-d/llm-d-inference-sim/pkg/vllm-api" ) +const ( + e2eReqLatencyMetricName = "vllm:e2e_request_latency_seconds" + reqQueueTimeMetricName = "vllm:request_queue_time_seconds" + reqInferenceTimeMetricName = "vllm:request_inference_time_seconds" + prefillTimeMetricName = "vllm:request_prefill_time_seconds" + decodeTimeMetricName = "vllm:request_decode_time_seconds" +) + // createAndRegisterPrometheus creates and registers prometheus metrics used by vLLM simulator // Metrics reported: // - lora_requests_info @@ -114,7 +122,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.e2eReqLatency = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", - Name: "vllm:e2e_request_latency_seconds", + Name: e2eReqLatencyMetricName, Help: "Histogram of end to end request latency in seconds.", Buckets: common.RequestLatencyBucketsBoundaries, }, @@ -129,7 +137,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.reqQueueTime = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", - Name: "vllm:request_queue_time_seconds", + Name: reqQueueTimeMetricName, Help: "Histogram of time spent in WAITING phase for request.", Buckets: common.RequestLatencyBucketsBoundaries, }, @@ -144,7 +152,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.reqInferenceTime = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", - Name: "vllm:request_inference_time_seconds", + Name: reqInferenceTimeMetricName, Help: "Histogram of time spent in RUNNING phase for request.", Buckets: common.RequestLatencyBucketsBoundaries, }, @@ -159,7 +167,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.reqPrefillTime = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", - Name: "vllm:request_prefill_time_seconds", + Name: prefillTimeMetricName, Help: "Histogram of time spent in PREFILL phase for request.", Buckets: common.RequestLatencyBucketsBoundaries, }, @@ -174,7 +182,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.reqDecodeTime = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", - Name: "vllm:request_decode_time_seconds", + Name: decodeTimeMetricName, Help: "Histogram of time spent in DECODE phase for request.", Buckets: common.RequestLatencyBucketsBoundaries, }, diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go index 52d3aecc..754f2026 100644 --- a/pkg/llm-d-inference-sim/metrics_test.go +++ b/pkg/llm-d-inference-sim/metrics_test.go @@ -19,7 +19,9 @@ package llmdinferencesim import ( "context" "errors" + "fmt" "io" + "math" "net/http" "os" "reflect" @@ -164,7 +166,7 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="100"} 1`)) Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="200"} 1`)) 
Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="500"} 1`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="100"} 1`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="1000"} 1`)) Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="+Inf"} 1`)) // request_params_max_tokens_bucket Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="1"} 0`)) @@ -815,6 +817,93 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"+Inf\"} 1")) }) }) + + Context("latency metrics", func() { + DescribeTable("should calculate all latency related metrics correctly for a single request", + func(testName string, doRemotePrefill bool, doRemoteDecode bool, kvcacheTransferLatency int, kvCacheTransferTimePerToken int, + ttft int, prefillTimePerToken int, interTokenLatency int) { + // Expect(true).To(BeFalse()) + // send a single request with a prompt of 5 token and echo mode, so output tokens number of 5 too + modelName := "my_model" + // Send one request, check that ttft and tpot are as defined in the simulator command line params + ctx := context.TODO() + args := []string{"cmd", "--model", modelName, "--mode", common.ModeEcho, + "--kv-cache-transfer-latency", strconv.Itoa(kvcacheTransferLatency), + "--kv-cache-transfer-time-per-token", strconv.Itoa(kvCacheTransferTimePerToken), + "--time-to-first-token", strconv.Itoa(ttft), + "--prefill-time-per-token", strconv.Itoa(prefillTimePerToken), + "--inter-token-latency", strconv.Itoa(interTokenLatency), + } + + client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + Expect(err).NotTo(HaveOccurred()) + + // TODO - pass isStreaming + openaiclient, params := getOpenAIClientAndChatParams(client, modelName, "1 2 3 4", false) + // TODO - how to test remote prefill/decode + + var reqWg, metricsWg sync.WaitGroup + metricsWg.Add(1) + reqWg.Add(1) + + // send a single request + go func() { + defer reqWg.Done() + defer GinkgoRecover() + + _, err := openaiclient.Chat.Completions.New(ctx, params) + Expect(err).NotTo(HaveOccurred()) + }() + + // wait untill request processing was finished, send /mertics request + reqWg.Wait() + time.Sleep(300 * time.Millisecond) + metricsResp, err := client.Get(metricsUrl) + Expect(err).NotTo(HaveOccurred()) + Expect(metricsResp.StatusCode).To(Equal(http.StatusOK)) + + data, err := io.ReadAll(metricsResp.Body) + Expect(err).NotTo(HaveOccurred()) + metrics := string(data) + + numOfTokens := 4 + var expectedPrefillTime float64 + // TODO take into consideration remote prefill + if ttft > 0 { + // time-to-first-token overwrites calculation of prefill time based on number of input tokens + expectedPrefillTime = float64(ttft) / 1000 + + } else { + expectedPrefillTime = float64(numOfTokens*prefillTimePerToken) / 1000 + } + expectedDecodeTime := float64(interTokenLatency*(numOfTokens-1)) / 1000 + expectedE2ELatency := expectedPrefillTime + expectedDecodeTime + + prevBoundary := math.Inf(-1) + + for _, bucketBoudary := range common.RequestLatencyBucketsBoundaries { + checkBucketBoundary(metrics, modelName, prefillTimeMetricName, bucketBoudary, prevBoundary, expectedPrefillTime) + checkBucketBoundary(metrics, modelName, decodeTimeMetricName, bucketBoudary, prevBoundary, expectedDecodeTime) + 
checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, bucketBoudary, prevBoundary, expectedE2ELatency) + + prevBoundary = bucketBoudary + } + // check the last bucket + lastBoundary := common.RequestLatencyBucketsBoundaries[len(common.RequestLatencyBucketsBoundaries)-1] + checkBucketBoundary(metrics, modelName, prefillTimeMetricName, math.Inf(1), lastBoundary, expectedPrefillTime) + checkBucketBoundary(metrics, modelName, decodeTimeMetricName, math.Inf(1), lastBoundary, expectedDecodeTime) + checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, math.Inf(1), lastBoundary, expectedE2ELatency) + }, + func(testName string, doRemotePrefill bool, doRemoteDecode bool, kvcacheTransferLatency int, kvCacheTransferTimePerToken int, + ttft int, prefillTimePerToken int, interTokenLatency int) string { + return fmt.Sprintf("%s\ndoRemotePrefill: %v, doRemoteDecode: %v, kvcacheTransferLatency: %d, kvCacheTransferTimePerToken: %d, ttft: %d, prefillTimePerToken: %d, interTokenLatency: %d", + testName, doRemotePrefill, doRemoteDecode, kvcacheTransferLatency, kvCacheTransferTimePerToken, ttft, prefillTimePerToken, interTokenLatency) + }, + // pay attention: do not define times close to bucket boundaries, this can lead to test failure + Entry(nil, "constant prefil + inter token time", false, false, 0, 0, 900, 0, 100), + Entry(nil, "prefill per token + inter token time", false, false, 0, 0, 0, 100, 100), + ) + }) }) // isLoraMetricPresent checks if a matching metric exists @@ -1022,3 +1111,29 @@ func TestBuild125Buckets(t *testing.T) { }) } } + +func getFloatBucketMetricLine(model string, metric string, bucketBoundary float64, count int) string { + buckerBoundStr := "+Inf" + if bucketBoundary != math.Inf(1) { + buckerBoundStr = fmt.Sprintf("%g", bucketBoundary) + } + return fmt.Sprintf("%s_bucket{model_name=\"%s\",le=\"%s\"} %d", metric, model, buckerBoundStr, count) +} + +func checkBucketBoundary(metrics string, modelName string, metricName string, bucketBoudary float64, + prevBoundary float64, expectedValue float64) { + if expectedValue > prevBoundary && bucketBoudary > expectedValue && (bucketBoudary-expectedValue) < 0.005 { + // expected time is too close to the bucket boudary + // it's possiblt that in theory we expect 1 in this bucket but will get 0 and this situation is ok + // since there is some additional calculation time + fmt.Printf("Expected value is too close to the boundary - skip test for this bucket (%.4f - %.4f] and expected value %.4f\n", + prevBoundary, bucketBoudary, expectedValue) + return + } + expectedCount := 0 + if bucketBoudary > expectedValue { + expectedCount = 1 + } + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(modelName, metricName, bucketBoudary, expectedCount))) + +} From 59bb8dd50da55b0634fc21ff9a31866b8d44f555 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Mon, 27 Oct 2025 11:20:58 +0200 Subject: [PATCH 05/14] Fix bug in metrics test + add latency test for streaming mode Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/metrics_test.go | 158 ++++++++++++------------ 1 file changed, 79 insertions(+), 79 deletions(-) diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go index 754f2026..0e9df371 100644 --- a/pkg/llm-d-inference-sim/metrics_test.go +++ b/pkg/llm-d-inference-sim/metrics_test.go @@ -820,88 +820,26 @@ var _ = Describe("Simulator metrics", Ordered, func() { Context("latency metrics", func() { DescribeTable("should calculate all latency related metrics correctly for a single 
request", - func(testName string, doRemotePrefill bool, doRemoteDecode bool, kvcacheTransferLatency int, kvCacheTransferTimePerToken int, - ttft int, prefillTimePerToken int, interTokenLatency int) { - // Expect(true).To(BeFalse()) - // send a single request with a prompt of 5 token and echo mode, so output tokens number of 5 too + func(testNamePrefix string, ttft int, prefillTimePerToken int, interTokenLatency int) { + // send a single request with a prompt of 4 tokens and echo mode, so output tokens number of 4 too modelName := "my_model" - // Send one request, check that ttft and tpot are as defined in the simulator command line params - ctx := context.TODO() - args := []string{"cmd", "--model", modelName, "--mode", common.ModeEcho, - "--kv-cache-transfer-latency", strconv.Itoa(kvcacheTransferLatency), - "--kv-cache-transfer-time-per-token", strconv.Itoa(kvCacheTransferTimePerToken), - "--time-to-first-token", strconv.Itoa(ttft), - "--prefill-time-per-token", strconv.Itoa(prefillTimePerToken), - "--inter-token-latency", strconv.Itoa(interTokenLatency), - } - - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) - Expect(err).NotTo(HaveOccurred()) - - // TODO - pass isStreaming - openaiclient, params := getOpenAIClientAndChatParams(client, modelName, "1 2 3 4", false) - // TODO - how to test remote prefill/decode - - var reqWg, metricsWg sync.WaitGroup - metricsWg.Add(1) - reqWg.Add(1) - - // send a single request - go func() { - defer reqWg.Done() - defer GinkgoRecover() - - _, err := openaiclient.Chat.Completions.New(ctx, params) - Expect(err).NotTo(HaveOccurred()) - }() + prompt := "1 2 3 4" - // wait untill request processing was finished, send /mertics request - reqWg.Wait() - time.Sleep(300 * time.Millisecond) - metricsResp, err := client.Get(metricsUrl) - Expect(err).NotTo(HaveOccurred()) - Expect(metricsResp.StatusCode).To(Equal(http.StatusOK)) + client := sendRequest(modelName, prompt, false, ttft, prefillTimePerToken, interTokenLatency) + checkLatencyMertics(client, modelName, prompt, ttft, prefillTimePerToken, interTokenLatency) - data, err := io.ReadAll(metricsResp.Body) - Expect(err).NotTo(HaveOccurred()) - metrics := string(data) - - numOfTokens := 4 - var expectedPrefillTime float64 - // TODO take into consideration remote prefill - if ttft > 0 { - // time-to-first-token overwrites calculation of prefill time based on number of input tokens - expectedPrefillTime = float64(ttft) / 1000 - - } else { - expectedPrefillTime = float64(numOfTokens*prefillTimePerToken) / 1000 - } - expectedDecodeTime := float64(interTokenLatency*(numOfTokens-1)) / 1000 - expectedE2ELatency := expectedPrefillTime + expectedDecodeTime - - prevBoundary := math.Inf(-1) - - for _, bucketBoudary := range common.RequestLatencyBucketsBoundaries { - checkBucketBoundary(metrics, modelName, prefillTimeMetricName, bucketBoudary, prevBoundary, expectedPrefillTime) - checkBucketBoundary(metrics, modelName, decodeTimeMetricName, bucketBoudary, prevBoundary, expectedDecodeTime) - checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, bucketBoudary, prevBoundary, expectedE2ELatency) - - prevBoundary = bucketBoudary - } - // check the last bucket - lastBoundary := common.RequestLatencyBucketsBoundaries[len(common.RequestLatencyBucketsBoundaries)-1] - checkBucketBoundary(metrics, modelName, prefillTimeMetricName, math.Inf(1), lastBoundary, expectedPrefillTime) - checkBucketBoundary(metrics, modelName, decodeTimeMetricName, math.Inf(1), lastBoundary, expectedDecodeTime) - 
checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, math.Inf(1), lastBoundary, expectedE2ELatency) + // same in streaming mode + client = sendRequest(modelName, prompt, true, ttft, prefillTimePerToken, interTokenLatency) + checkLatencyMertics(client, modelName, prompt, ttft, prefillTimePerToken, interTokenLatency) }, - func(testName string, doRemotePrefill bool, doRemoteDecode bool, kvcacheTransferLatency int, kvCacheTransferTimePerToken int, - ttft int, prefillTimePerToken int, interTokenLatency int) string { - return fmt.Sprintf("%s\ndoRemotePrefill: %v, doRemoteDecode: %v, kvcacheTransferLatency: %d, kvCacheTransferTimePerToken: %d, ttft: %d, prefillTimePerToken: %d, interTokenLatency: %d", - testName, doRemotePrefill, doRemoteDecode, kvcacheTransferLatency, kvCacheTransferTimePerToken, ttft, prefillTimePerToken, interTokenLatency) + func(testNamePrefix string, ttft int, prefillTimePerToken int, interTokenLatency int) string { + return fmt.Sprintf("%s\nttft: %d, prefillTimePerToken: %d, interTokenLatency: %d", testNamePrefix, ttft, prefillTimePerToken, interTokenLatency) }, - // pay attention: do not define times close to bucket boundaries, this can lead to test failure - Entry(nil, "constant prefil + inter token time", false, false, 0, 0, 900, 0, 100), - Entry(nil, "prefill per token + inter token time", false, false, 0, 0, 0, 100, 100), + // Params order: testName, ttft, prefillTimePerToken, interTokenLatency + Entry(nil, "constant prefill + inter token time", 0, 0, 100), + Entry(nil, "constant prefill + inter token time", 900, 0, 100), + Entry(nil, "constant prefill + inter token time", 1000, 0, 100), + Entry(nil, "prefill per token + inter token time", 0, 100, 100), ) }) }) @@ -1122,8 +1060,8 @@ func getFloatBucketMetricLine(model string, metric string, bucketBoundary float6 func checkBucketBoundary(metrics string, modelName string, metricName string, bucketBoudary float64, prevBoundary float64, expectedValue float64) { - if expectedValue > prevBoundary && bucketBoudary > expectedValue && (bucketBoudary-expectedValue) < 0.005 { - // expected time is too close to the bucket boudary + if expectedValue > prevBoundary && bucketBoudary >= expectedValue && (bucketBoudary-expectedValue) < 0.005 { + // expected time is too close to the bucket's boudary // it's possiblt that in theory we expect 1 in this bucket but will get 0 and this situation is ok // since there is some additional calculation time fmt.Printf("Expected value is too close to the boundary - skip test for this bucket (%.4f - %.4f] and expected value %.4f\n", @@ -1135,5 +1073,67 @@ func checkBucketBoundary(metrics string, modelName string, metricName string, bu expectedCount = 1 } Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(modelName, metricName, bucketBoudary, expectedCount))) +} + +// send a single request with the given prompt and echo mode +func sendRequest(modelName string, prompt string, isStreaming bool, ttft int, prefillTimePerToken int, interTokenLatency int) *http.Client { + ctx := context.TODO() + args := []string{"cmd", "--model", modelName, "--mode", common.ModeEcho, + // "--kv-cache-transfer-latency", strconv.Itoa(kvcacheTransferLatency), + // "--kv-cache-transfer-time-per-token", strconv.Itoa(kvCacheTransferTimePerToken), + "--time-to-first-token", strconv.Itoa(ttft), + "--prefill-time-per-token", strconv.Itoa(prefillTimePerToken), + "--inter-token-latency", strconv.Itoa(interTokenLatency), + } + + client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + 
Expect(err).NotTo(HaveOccurred())
+
+	openaiclient, params := getOpenAIClientAndChatParams(client, modelName, prompt, isStreaming)
+
+	// send a single request in a serial way
+	_, err = openaiclient.Chat.Completions.New(ctx, params)
+	Expect(err).NotTo(HaveOccurred())
+
+	return client
+}
+
+func checkLatencyMertics(client *http.Client, modelName string, prompt string, ttft int, prefillTimePerToken int, interTokenLatency int) {
+	// wait a little bit and check metrics
+	time.Sleep(300 * time.Millisecond)
+	metricsResp, err := client.Get(metricsUrl)
+	Expect(err).NotTo(HaveOccurred())
+	Expect(metricsResp.StatusCode).To(Equal(http.StatusOK))
+
+	data, err := io.ReadAll(metricsResp.Body)
+	Expect(err).NotTo(HaveOccurred())
+	metrics := string(data)
+
+	numOfTokens := len(common.Tokenize(prompt))
+	var expectedPrefillTime float64
+	// TODO take into consideration remote prefill
+	if ttft > 0 {
+		// time-to-first-token overwrites calculation of prefill time based on number of input tokens
+		expectedPrefillTime = float64(ttft) / 1000
+
+	} else {
+		expectedPrefillTime = float64(numOfTokens*prefillTimePerToken) / 1000
+	}
+	expectedDecodeTime := float64(interTokenLatency*(numOfTokens-1)) / 1000
+	expectedE2ELatency := expectedPrefillTime + expectedDecodeTime
+
+	prevBoundary := math.Inf(-1)
+
+	for _, bucketBoudary := range common.RequestLatencyBucketsBoundaries {
+		checkBucketBoundary(metrics, modelName, prefillTimeMetricName, bucketBoudary, prevBoundary, expectedPrefillTime)
+		checkBucketBoundary(metrics, modelName, decodeTimeMetricName, bucketBoudary, prevBoundary, expectedDecodeTime)
+		checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, bucketBoudary, prevBoundary, expectedE2ELatency)
+
+		prevBoundary = bucketBoudary
+	}
+	// check the last bucket
+	lastBoundary := common.RequestLatencyBucketsBoundaries[len(common.RequestLatencyBucketsBoundaries)-1]
+	checkBucketBoundary(metrics, modelName, prefillTimeMetricName, math.Inf(1), lastBoundary, expectedPrefillTime)
+	checkBucketBoundary(metrics, modelName, decodeTimeMetricName, math.Inf(1), lastBoundary, expectedDecodeTime)
+	checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, math.Inf(1), lastBoundary, expectedE2ELatency)
+}

From 99bdec45415884e79bc17f94f2a1585446180d65 Mon Sep 17 00:00:00 2001
From: Maya Barnea
Date: Mon, 27 Oct 2025 21:27:41 +0200
Subject: [PATCH 06/14] Move common simulator test helper functions to
 test_utils.go, use the same model name in all tests, refactor the server
 start functions

Signed-off-by: Maya Barnea
---
 pkg/llm-d-inference-sim/failures_test.go  |  46 +-
 pkg/llm-d-inference-sim/lora_test.go      |  12 +-
 pkg/llm-d-inference-sim/metrics_test.go   | 680 +++++++---------------
 pkg/llm-d-inference-sim/seed_test.go      |   7 +-
 pkg/llm-d-inference-sim/server_test.go    |  20 +-
 pkg/llm-d-inference-sim/simulator_test.go | 183 +-----
 pkg/llm-d-inference-sim/test_utils.go     | 427 ++++++++++++++
 pkg/llm-d-inference-sim/tools_test.go     |  34 +-
 pkg/llm-d-inference-sim/worker_test.go    |  36 +-
 9 files changed, 751 insertions(+), 694 deletions(-)
 create mode 100644 pkg/llm-d-inference-sim/test_utils.go

diff --git a/pkg/llm-d-inference-sim/failures_test.go b/pkg/llm-d-inference-sim/failures_test.go
index da8f8576..1459eed5 100644
--- a/pkg/llm-d-inference-sim/failures_test.go
+++ b/pkg/llm-d-inference-sim/failures_test.go
@@ -126,15 +126,15 @@ var _ = Describe("Failures", func() {
 		BeforeEach(func() {
 			ctx = context.Background()
 			var err error
-			client, err = startServerWithArgs(ctx, "", []string{
-				"cmd", "--model", model,
+			client,
err = startServerWithArgs(ctx, []string{ + "cmd", "--model", testModel, "--failure-injection-rate", "100", - }, nil) + }) Expect(err).ToNot(HaveOccurred()) }) It("should always return an error response for chat completions", func() { - openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) _, err := openaiClient.Chat.Completions.New(ctx, params) Expect(err).To(HaveOccurred()) @@ -147,7 +147,7 @@ var _ = Describe("Failures", func() { }) It("should always return an error response for text completions", func() { - openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) _, err := openaiClient.Chat.Completions.New(ctx, params) Expect(err).To(HaveOccurred()) @@ -164,16 +164,16 @@ var _ = Describe("Failures", func() { BeforeEach(func() { ctx = context.Background() var err error - client, err = startServerWithArgs(ctx, "", []string{ - "cmd", "--model", model, + client, err = startServerWithArgs(ctx, []string{ + "cmd", "--model", testModel, "--failure-injection-rate", "100", "--failure-types", common.FailureTypeRateLimit, - }, nil) + }) Expect(err).ToNot(HaveOccurred()) }) It("should return only rate limit errors", func() { - openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) _, err := openaiClient.Chat.Completions.New(ctx, params) Expect(err).To(HaveOccurred()) @@ -182,7 +182,7 @@ var _ = Describe("Failures", func() { Expect(ok).To(BeTrue()) Expect(openaiError.StatusCode).To(Equal(429)) Expect(openaiError.Type).To(Equal(openaiserverapi.ErrorCodeToType(429))) - Expect(strings.Contains(openaiError.Message, model)).To(BeTrue()) + Expect(strings.Contains(openaiError.Message, testModel)).To(BeTrue()) }) }) @@ -190,16 +190,16 @@ var _ = Describe("Failures", func() { BeforeEach(func() { ctx = context.Background() var err error - client, err = startServerWithArgs(ctx, "", []string{ - "cmd", "--model", model, + client, err = startServerWithArgs(ctx, []string{ + "cmd", "--model", testModel, "--failure-injection-rate", "100", "--failure-types", common.FailureTypeInvalidAPIKey, common.FailureTypeServerError, - }, nil) + }) Expect(err).ToNot(HaveOccurred()) }) It("should return only specified error types", func() { - openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) // Make multiple requests to verify we get the expected error types for i := 0; i < 10; i++ { @@ -222,20 +222,20 @@ var _ = Describe("Failures", func() { BeforeEach(func() { ctx = context.Background() var err error - client, err = startServerWithArgs(ctx, "", []string{ - "cmd", "--model", model, + client, err = startServerWithArgs(ctx, []string{ + "cmd", "--model", testModel, "--failure-injection-rate", "0", - }, nil) + }) Expect(err).ToNot(HaveOccurred()) }) It("should never return errors and behave like random mode", func() { - openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) resp, err := openaiClient.Chat.Completions.New(ctx, params) Expect(err).ToNot(HaveOccurred()) 
Expect(resp.Choices).To(HaveLen(1)) Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty()) - Expect(resp.Model).To(Equal(model)) + Expect(resp.Model).To(Equal(testModel)) }) }) @@ -243,14 +243,14 @@ var _ = Describe("Failures", func() { DescribeTable("should return correct error for each failure type", func(failureType string, expectedStatusCode int, expectedErrorType string) { ctx := context.Background() - client, err := startServerWithArgs(ctx, "", []string{ - "cmd", "--model", model, + client, err := startServerWithArgs(ctx, []string{ + "cmd", "--model", testModel, "--failure-injection-rate", "100", "--failure-types", failureType, - }, nil) + }) Expect(err).ToNot(HaveOccurred()) - openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) _, err = openaiClient.Chat.Completions.New(ctx, params) Expect(err).To(HaveOccurred()) diff --git a/pkg/llm-d-inference-sim/lora_test.go b/pkg/llm-d-inference-sim/lora_test.go index 837a36fc..2bcd63c0 100644 --- a/pkg/llm-d-inference-sim/lora_test.go +++ b/pkg/llm-d-inference-sim/lora_test.go @@ -34,14 +34,14 @@ var _ = Describe("LoRAs", func() { Context("LoRAs config and load", func() { It("Should config, load and load LoRAs correctly", func() { ctx := context.TODO() - client, err := startServerWithArgs(ctx, "", - []string{"cmd", "--model", model, "--mode", common.ModeEcho, + client, err := startServerWithArgs(ctx, + []string{"cmd", "--model", testModel, "--mode", common.ModeEcho, "--lora-modules", "{\"name\":\"lora3\",\"path\":\"/path/to/lora3\"}", - "{\"name\":\"lora4\",\"path\":\"/path/to/lora4\"}"}, nil) + "{\"name\":\"lora4\",\"path\":\"/path/to/lora4\"}"}) Expect(err).NotTo(HaveOccurred()) // Request to lora3 - openaiclient, params := getOpenAIClientAndChatParams(client, "lora3", userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, "lora3", testUserMessage, false) resp, err := openaiclient.Chat.Completions.New(ctx, params) Expect(err).ToNot(HaveOccurred()) @@ -49,7 +49,7 @@ var _ = Describe("LoRAs", func() { Expect(string(resp.Object)).To(Equal(chatCompletionObject)) msg := resp.Choices[0].Message.Content - Expect(msg).Should(Equal(userMessage)) + Expect(msg).Should(Equal(testUserMessage)) // Unknown model, should return 404 params.Model = "lora1" @@ -88,7 +88,7 @@ var _ = Describe("LoRAs", func() { Expect(string(resp.Object)).To(Equal(chatCompletionObject)) msg = resp.Choices[0].Message.Content - Expect(msg).Should(Equal(userMessage)) + Expect(msg).Should(Equal(testUserMessage)) // Unload lora3 payload = map[string]string{ diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go index 0e9df371..4cc1b948 100644 --- a/pkg/llm-d-inference-sim/metrics_test.go +++ b/pkg/llm-d-inference-sim/metrics_test.go @@ -18,19 +18,12 @@ package llmdinferencesim import ( "context" - "errors" "fmt" "io" - "math" "net/http" "os" - "reflect" - "regexp" - "sort" - "strconv" "strings" "sync" - "testing" "time" "github.com/llm-d/llm-d-inference-sim/pkg/common" @@ -41,8 +34,6 @@ import ( ) const ( - metricsUrl = "http://localhost/metrics" - lora1 = "lora1" lora2 = "lora2" ) @@ -53,51 +44,51 @@ var lora2Arr = []string{lora2} var paramsLora1 openai.ChatCompletionNewParams = openai.ChatCompletionNewParams{ Messages: []openai.ChatCompletionMessageParamUnion{ - openai.UserMessage(userMessage), + openai.UserMessage(testUserMessage), }, Model: "lora1", } var paramsLora2 
openai.ChatCompletionNewParams = openai.ChatCompletionNewParams{ Messages: []openai.ChatCompletionMessageParamUnion{ - openai.UserMessage(userMessage), + openai.UserMessage(testUserMessage), }, Model: "lora2", } var paramsLora3 openai.ChatCompletionNewParams = openai.ChatCompletionNewParams{ Messages: []openai.ChatCompletionMessageParamUnion{ - openai.UserMessage(userMessage), + openai.UserMessage(testUserMessage), }, Model: "lora3", } var paramsLora4 openai.ChatCompletionNewParams = openai.ChatCompletionNewParams{ Messages: []openai.ChatCompletionMessageParamUnion{ - openai.UserMessage(userMessage), + openai.UserMessage(testUserMessage), }, Model: "lora4", } var paramsLora5 openai.ChatCompletionNewParams = openai.ChatCompletionNewParams{ Messages: []openai.ChatCompletionMessageParamUnion{ - openai.UserMessage(userMessage), + openai.UserMessage(testUserMessage), }, Model: "lora5", } var _ = Describe("Simulator metrics", Ordered, func() { - It("Should send correct running and waiting requests metrics", func() { + It("should send correct running and waiting requests metrics", func() { // Three requests, only two can run in parallel, we expect // two running requests and one waiting request in the metrics ctx := context.TODO() - args := []string{"cmd", "--model", modelName, "--mode", common.ModeRandom, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--time-to-first-token", "3000", "--max-num-seqs", "2"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, modelName, userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) for range 3 { go func() { @@ -128,7 +119,7 @@ var _ = Describe("Simulator metrics", Ordered, func() { args := []string{"cmd", "--model", modelName, "--mode", common.ModeRandom, "--time-to-first-token", "100", "--max-num-seqs", "4"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient( @@ -192,12 +183,12 @@ var _ = Describe("Simulator metrics", Ordered, func() { It("Should send correct lora metrics", func() { ctx := context.TODO() - args := []string{"cmd", "--model", model, "--mode", common.ModeRandom, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--time-to-first-token", "3000", "--lora-modules", "{\"name\":\"lora1\",\"path\":\"/path/to/lora1\"}", "{\"name\":\"lora2\",\"path\":\"/path/to/lora2\"}"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient( @@ -237,12 +228,12 @@ var _ = Describe("Simulator metrics", Ordered, func() { It("Should send correct lora metrics for parallel requests with delay", func() { ctx := context.TODO() - args := []string{"cmd", "--model", model, "--mode", common.ModeRandom, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--time-to-first-token", "3000", "--lora-modules", "{\"name\":\"lora1\",\"path\":\"/path/to/lora1\"}", "{\"name\":\"lora2\",\"path\":\"/path/to/lora2\"}"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient( @@ -312,12 +303,12 
@@ var _ = Describe("Simulator metrics", Ordered, func() { It("Should send correct lora metrics for parallel requests without delay", func() { ctx := context.TODO() - args := []string{"cmd", "--model", model, "--mode", common.ModeRandom, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--time-to-first-token", "3000", "--lora-modules", "{\"name\":\"lora1\",\"path\":\"/path/to/lora1\"}", "{\"name\":\"lora2\",\"path\":\"/path/to/lora2\"}"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient( @@ -392,17 +383,16 @@ var _ = Describe("Simulator metrics", Ordered, func() { }) It("Should send correct ttft and tpot metrics", func() { - modelName := "my_model" // Send one request, check that ttft and tpot are as defined in the simulator command line params ctx := context.TODO() // use mode echo to be sure that response is more than one token - this makes sure that tpot is reported to prometheus - args := []string{"cmd", "--model", modelName, "--mode", common.ModeEcho, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeEcho, "--time-to-first-token", "200", "--inter-token-latency", "100"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, modelName, userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) var reqWg, metricsWg sync.WaitGroup metricsWg.Add(1) @@ -430,83 +420,83 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(err).NotTo(HaveOccurred()) metrics := string(data) // ttft - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.001\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.005\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.01\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.02\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.04\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.06\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.08\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.1\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.25\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.5\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.75\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"1\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"2.5\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"5\"} 1")) - 
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"7.5\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"10\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"20\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"40\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"80\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"160\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"640\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"2560\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"+Inf\"} 1")) - // check tpot only is it exists in metrics, when a single - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.01\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.025\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.05\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.075\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.001\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.005\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.01\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.02\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.04\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.06\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.08\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.1\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.25\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.5\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.75\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"1\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"2.5\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"5\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"7.5\"} 1")) + 
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"10\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"20\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"40\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"80\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"160\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"640\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"2560\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"+Inf\"} 1")) + // tpot + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.01\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.025\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.05\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.075\"} 0")) metricsLines := strings.Split(metrics, "\n") // the following values should be greater than 0, we don't know the exact value since it depends on the random response length - count := findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.1\"}") + count := findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.1\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.15\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.15\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.2\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.2\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.3\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.3\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.4\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.4\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.5\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.5\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, 
"vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.75\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.75\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"1\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"1\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"2.5\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"2.5\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"5\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"5\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"7.5\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"7.5\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"10\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"10\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"20\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"20\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"40\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"40\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"80\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"80\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"+Inf\"}") + count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"+Inf\"}") Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) }() @@ -528,7 +518,7 @@ var _ = Describe("Simulator metrics", Ordered, func() { "--enable-kvcache", "true", "--kv-cache-size", "16", "--block-size", "8", "--time-to-first-token", "5000", "--tokenizers-cache-dir", tmpDir} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient( @@ -605,7 +595,7 @@ var _ = Describe("Simulator metrics", Ordered, func() { "--enable-kvcache", "true", 
"--kv-cache-size", "16", "--block-size", "8", "--time-to-first-token", "5000", "--tokenizers-cache-dir", tmpDir, "--max-num-seqs", "2"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient( @@ -670,7 +660,7 @@ var _ = Describe("Simulator metrics", Ordered, func() { Context("fake metrics", func() { It("Should respond with fake metrics to /metrics", func() { ctx := context.TODO() - args := []string{"cmd", "--model", model, "--mode", common.ModeRandom, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--fake-metrics", `{` + `"running-requests":10,` + @@ -702,7 +692,7 @@ var _ = Describe("Simulator metrics", Ordered, func() { `}`, } - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) resp, err := client.Get(metricsUrl) @@ -712,76 +702,76 @@ var _ = Describe("Simulator metrics", Ordered, func() { data, err := io.ReadAll(resp.Body) Expect(err).NotTo(HaveOccurred()) metrics := string(data) - Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"my_model\"} 10")) - Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"my_model\"} 30")) - Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"my_model\"} 0.4")) + Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"testmodel\"} 10")) + Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"testmodel\"} 30")) + Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"testmodel\"} 0.4")) Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora2\",waiting_lora_adapters=\"lora3\"} 1.257894567e+09")) Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora3\",waiting_lora_adapters=\"\"} 1.257894569e+09")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.001\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.005\"} 3")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.01\"} 6")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.02\"} 6")) - - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.01\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.025\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.05\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.075\"} 3")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.1\"} 6")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"my_model\",le=\"0.15\"} 6")) - - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="1"} 10`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="2"} 30`)) - 
Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="5"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="10"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="20"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="50"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="100"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="200"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="500"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="1000"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="my_model",le="+Inf"} 60`)) - - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="1"} 10`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="2"} 30`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="5"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="10"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="20"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="50"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="100"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="200"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="500"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="1000"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="my_model",le="+Inf"} 60`)) - - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="1"} 10`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="2"} 30`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="5"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="10"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="20"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="50"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="100"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="200"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="500"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="1000"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="my_model",le="+Inf"} 60`)) - - 
Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="length",model_name="my_model"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="remote_decode",model_name="my_model"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="stop",model_name="my_model"} 20`)) - Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="tool_calls",model_name="my_model"} 0`)) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.001\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.005\"} 3")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.01\"} 6")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.02\"} 6")) + + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.01\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.025\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.05\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.075\"} 3")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.1\"} 6")) + Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.15\"} 6")) + + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="1"} 10`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="2"} 30`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="5"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="10"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="20"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="50"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="100"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="200"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="500"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="1000"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="+Inf"} 60`)) + + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="1"} 10`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="2"} 30`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="5"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="10"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="20"} 60`)) + 
Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="50"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="100"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="200"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="500"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="1000"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="+Inf"} 60`)) + + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="1"} 10`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="2"} 30`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="5"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="10"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="20"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="50"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="100"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="200"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="500"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="1000"} 60`)) + Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="+Inf"} 60`)) + + Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="length",model_name="testmodel"} 0`)) + Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="remote_decode",model_name="testmodel"} 0`)) + Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="stop",model_name="testmodel"} 20`)) + Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="tool_calls",model_name="testmodel"} 0`)) }) }) Context("fake ttft metrics", func() { It("Should respond with fake ttft metrics to /metrics", func() { ctx := context.TODO() - args := []string{"cmd", "--model", model, "--mode", common.ModeRandom, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--fake-metrics", "{\"ttft-buckets-values\":[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]}", } - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) resp, err := client.Get(metricsUrl) @@ -792,45 +782,44 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(err).NotTo(HaveOccurred()) metrics := string(data) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.001\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.005\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.01\"} 0")) - 
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.02\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.04\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.06\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.08\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.1\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.25\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.5\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"0.75\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"1\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"2.5\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"5\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"7.5\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"10\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"20\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"40\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"80\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"160\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"640\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"2560\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"my_model\",le=\"+Inf\"} 1")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.001\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.005\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.01\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.02\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.04\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.06\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.08\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.1\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.25\"} 0")) + 
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.5\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.75\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"1\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"2.5\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"5\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"7.5\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"10\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"20\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"40\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"80\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"160\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"640\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"2560\"} 0")) + Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"+Inf\"} 1")) }) }) Context("latency metrics", func() { + numOfTokens := len(common.Tokenize(testUserMessage)) + DescribeTable("should calculate all latency related metrics correctly for a single request", func(testNamePrefix string, ttft int, prefillTimePerToken int, interTokenLatency int) { // send a single request with a prompt of 4 tokens and echo mode, so output tokens number of 4 too - modelName := "my_model" - prompt := "1 2 3 4" - - client := sendRequest(modelName, prompt, false, ttft, prefillTimePerToken, interTokenLatency) - checkLatencyMertics(client, modelName, prompt, ttft, prefillTimePerToken, interTokenLatency) + client := startServerAndSendRequest(testModel, testUserMessage, false, ttft, prefillTimePerToken, interTokenLatency) + checkLatencyMertics(client, testModel, numOfTokens, numOfTokens, ttft, prefillTimePerToken, interTokenLatency) - // same in streaming mode - client = sendRequest(modelName, prompt, true, ttft, prefillTimePerToken, interTokenLatency) - checkLatencyMertics(client, modelName, prompt, ttft, prefillTimePerToken, interTokenLatency) + // same in streaming modeq + client = startServerAndSendRequest(testModel, testUserMessage, true, ttft, prefillTimePerToken, interTokenLatency) + checkLatencyMertics(client, testModel, numOfTokens, numOfTokens, ttft, prefillTimePerToken, interTokenLatency) }, func(testNamePrefix string, ttft int, prefillTimePerToken int, interTokenLatency int) string { return fmt.Sprintf("%s\nttft: %d, prefillTimePerToken: %d, interTokenLatency: %d", testNamePrefix, ttft, prefillTimePerToken, interTokenLatency) @@ -844,296 +833,77 @@ var _ = Describe("Simulator metrics", Ordered, func() { }) }) -// isLoraMetricPresent checks if a matching metric exists -// metrics: the list of metrics -// running: list of loras in running_lora_adapters, the order does not matter -// waiting: list of loras in 
waiting_lora_adapters, the order does not matter
-func isLoraMetricPresent(metrics []string, running, waiting []string) bool {
-  return findLoraMetric(metrics, running, waiting) != ""
-}
-
-// getLoraTimestamp returns timestamp or nil, error
-func getLoraTimestamp(metrics []string, running, waiting []string) (*float64, error) {
-  metric := findLoraMetric(metrics, running, waiting)
-  if metric == "" {
-    return nil, nil // not found
-  }
-  return extractTimestamp(metric)
-}
-
-func extractTimestamp(metric string) (*float64, error) {
-  // Extract timestamp: last part after space
-  parts := strings.Split(metric, " ")
-  if len(parts) < 2 {
-    return nil, errors.New("invalid metric format")
-  }
-  timestampStr := parts[len(parts)-1]
-  timestamp, err := strconv.ParseFloat(timestampStr, 64)
-  Expect(err).NotTo(HaveOccurred())
-
-  return &timestamp, nil
-}
-
-func getLoraValidTimestamp(metrics []string, running, waiting []string) float64 {
-  timestamp, err := getLoraTimestamp(metrics, running, waiting)
-  Expect(err).NotTo(HaveOccurred())
-  Expect(timestamp).ToNot(BeNil())
-  return *timestamp
-}
-
-func getLastLoraMetrics(metrics []string) ([]string, error) {
-  lastTimestamp := float64(0)
-  var lastMetrics []string
-  for _, metric := range metrics {
-    if strings.HasPrefix(metric, "vllm:lora_requests_info") {
-      timestamp, err := extractTimestamp(metric)
-      if err != nil {
-        return nil, err
-      }
-      if lastTimestamp > *timestamp {
-        continue
-      }
-      lastTimestamp = *timestamp
-      if lastTimestamp < *timestamp {
-        lastMetrics = make([]string, 0)
-      }
-      lastMetrics = append(lastMetrics, metric)
+var _ = Describe("build125Buckets", Ordered, func() {
+  It("should create valid 125 buckets", func() {
+    // tests the build125Buckets function with various inputs.
+    tests := []struct {
+      name     string
+      maxValue int
+      want     []float64
+    }{
+      {
+        name:     "max_value zero",
+        maxValue: 0,
+        want:     []float64{}, // no bucket <= 0
+      },
+      {
+        name:     "max_value one",
+        maxValue: 1,
+        want:     []float64{1},
+      },
+      {
+        name:     "max_value five",
+        maxValue: 5,
+        want:     []float64{1, 2, 5},
+      },
+      {
+        name:     "max_value ten",
+        maxValue: 10,
+        want:     []float64{1, 2, 5, 10},
+      },
+      {
+        name:     "max_value 100",
+        maxValue: 100,
+        want:     []float64{1, 2, 5, 10, 20, 50, 100},
+      },
+      {
+        name:     "max_value 999",
+        maxValue: 999,
+        want:     []float64{1, 2, 5, 10, 20, 50, 100, 200, 500},
+      },
+      {
+        name:     "max_value 1024",
+        maxValue: 1024,
+        want:     []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000},
+      },
+      {
+        name:     "max_value 4096",
+        maxValue: 4096,
+        want:     []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000},
+      },
+      {
+        name:     "max_value 32768",
+        maxValue: 32768,
+        want:     []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000},
+      },
+      {
+        name:     "max_value just below power of 10",
+        maxValue: 99,
+        want:     []float64{1, 2, 5, 10, 20, 50},
+      },
+      {
+        name:     "max_value negative",
+        maxValue: -1,
+        want:     []float64{}, // no positive bucket <= -1
+      },
+    }
-    }
-  }
-  return lastMetrics, nil
-}
-
-// findLoraMetric finds the relevant metric by comparing with the given loras sets (ignoring order)
-// metrics: lines of metrics
-// running: list of running loras to find
-// waiting: list of waiting loras to find
-// Looks for a line with the given running and waiting loras sets, the comparison is order agnostic.
-// Return metric should match in both running and waiting sets.
-// E.g.
for input running=["l1", "l2", "l3"] and waiting=[] will return metric -// with running_lora_adapters=["l3", "l1", "l2"] and waiting_lora_adapters=[] -func findLoraMetric(metrics []string, running, waiting []string) string { - // sort input arrays before compare, create string of all values, separated by comma - sort.Strings(running) - sort.Strings(waiting) - runStr := strings.Join(running, ",") - waitStr := strings.Join(waiting, ",") - - // regex to extract lora metrics and values - re := regexp.MustCompile(`vllm:lora_requests_info\{.*running_lora_adapters="([^"]*)".*waiting_lora_adapters="([^"]*)".*\}\s+([0-9.e\+\-]+)`) - for _, metric := range metrics { - matches := re.FindStringSubmatch(metric) - if len(matches) == 4 { - // this line contains loraInfo metric, check running and waiting loras lists - // split and sort metric's running and waiting loras lists for the comparison - metricRun := splitString(matches[1]) - metricWait := splitString(matches[2]) - sort.Strings(metricRun) - sort.Strings(metricWait) - // if both lists are the same - return the metric - if strings.Join(metricRun, ",") == runStr && strings.Join(metricWait, ",") == waitStr { - return metric - } - } // if the metric is not in the required format - skip it - } - - // required metric was not found - return "" -} - -// splits the given string to array of strings with separator = "," -func splitString(str string) []string { - if str == "" { - return []string{} - } - return strings.Split(str, ",") -} - -func findMetric(metrics []string, metricPrefix string) string { - // regex to extract metrics and values - for _, metric := range metrics { - if strings.Contains(metric, metricPrefix) { - arr := strings.Split(metric, " ") - if len(arr) == 2 { - return arr[1] - } - break + for _, test := range tests { + got := build125Buckets(test.maxValue) + Expect(got).To(Equal(test.want)) + // if !reflect.DeepEqual(got, test.want) { + // t.Errorf("build125Buckets(%d) = %v, want %v", tt.maxValue, got, tt.want) + // } } - } - // required metric was not found - return "" -} - -func findIntMetric(metrics []string, metricPrefix string) *int { - valueStr := findMetric(metrics, metricPrefix) - if valueStr == "" { - return nil - } - - val, err := strconv.Atoi(valueStr) - if err != nil { - return nil - } - return &val -} - -// TestBuild125Buckets tests the build125Buckets function with various inputs. 
-func TestBuild125Buckets(t *testing.T) { - tests := []struct { - name string - maxValue int - want []float64 - }{ - { - name: "max_value zero", - maxValue: 0, - want: []float64{}, // no bucket <= 0 - }, - { - name: "max_value one", - maxValue: 1, - want: []float64{1}, - }, - { - name: "max_value five", - maxValue: 5, - want: []float64{1, 2, 5}, - }, - { - name: "max_value ten", - maxValue: 10, - want: []float64{1, 2, 5, 10}, - }, - { - name: "max_value 100", - maxValue: 100, - want: []float64{1, 2, 5, 10, 20, 50, 100}, - }, - { - name: "max_value 999", - maxValue: 999, - want: []float64{1, 2, 5, 10, 20, 50, 100, 200, 500}, - }, - { - name: "max_value 1024", - maxValue: 1024, - want: []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000}, - }, - { - name: "max_value 4096", - maxValue: 4096, - want: []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000}, - }, - { - name: "max_value 32768", - maxValue: 32768, - want: []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000}, - }, - { - name: "max_value just below power of 10", - maxValue: 99, - want: []float64{1, 2, 5, 10, 20, 50}, - }, - { - name: "max_value negative", - maxValue: -1, - want: []float64{}, // no positive bucket <= -1 - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got := build125Buckets(tt.maxValue) - if !reflect.DeepEqual(got, tt.want) { - t.Errorf("build125Buckets(%d) = %v, want %v", tt.maxValue, got, tt.want) - } - }) - } -} - -func getFloatBucketMetricLine(model string, metric string, bucketBoundary float64, count int) string { - buckerBoundStr := "+Inf" - if bucketBoundary != math.Inf(1) { - buckerBoundStr = fmt.Sprintf("%g", bucketBoundary) - } - return fmt.Sprintf("%s_bucket{model_name=\"%s\",le=\"%s\"} %d", metric, model, buckerBoundStr, count) -} - -func checkBucketBoundary(metrics string, modelName string, metricName string, bucketBoudary float64, - prevBoundary float64, expectedValue float64) { - if expectedValue > prevBoundary && bucketBoudary >= expectedValue && (bucketBoudary-expectedValue) < 0.005 { - // expected time is too close to the bucket's boudary - // it's possiblt that in theory we expect 1 in this bucket but will get 0 and this situation is ok - // since there is some additional calculation time - fmt.Printf("Expected value is too close to the boundary - skip test for this bucket (%.4f - %.4f] and expected value %.4f\n", - prevBoundary, bucketBoudary, expectedValue) - return - } - expectedCount := 0 - if bucketBoudary > expectedValue { - expectedCount = 1 - } - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(modelName, metricName, bucketBoudary, expectedCount))) -} - -// send a single request with the given prompt and echo mode -func sendRequest(modelName string, prompt string, isStreaming bool, ttft int, prefillTimePerToken int, interTokenLatency int) *http.Client { - ctx := context.TODO() - args := []string{"cmd", "--model", modelName, "--mode", common.ModeEcho, - // "--kv-cache-transfer-latency", strconv.Itoa(kvcacheTransferLatency), - // "--kv-cache-transfer-time-per-token", strconv.Itoa(kvCacheTransferTimePerToken), - "--time-to-first-token", strconv.Itoa(ttft), - "--prefill-time-per-token", strconv.Itoa(prefillTimePerToken), - "--inter-token-latency", strconv.Itoa(interTokenLatency), - } - - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) - Expect(err).NotTo(HaveOccurred()) - - openaiclient, params := getOpenAIClientAndChatParams(client, modelName, prompt, isStreaming) - - // send a single request in a serial 
way - _, err = openaiclient.Chat.Completions.New(ctx, params) - Expect(err).NotTo(HaveOccurred()) - - return client -} - -func checkLatencyMertics(client *http.Client, modelName string, prompt string, ttft int, prefillTimePerToken int, interTokenLatency int) { - // wait a little bit and check metrics - time.Sleep(300 * time.Millisecond) - metricsResp, err := client.Get(metricsUrl) - Expect(err).NotTo(HaveOccurred()) - Expect(metricsResp.StatusCode).To(Equal(http.StatusOK)) - - data, err := io.ReadAll(metricsResp.Body) - Expect(err).NotTo(HaveOccurred()) - metrics := string(data) - - numOfTokens := len(common.Tokenize(prompt)) - var expectedPrefillTime float64 - // TODO take into consideration remote prefill - if ttft > 0 { - // time-to-first-token overwrites calculation of prefill time based on number of input tokens - expectedPrefillTime = float64(ttft) / 1000 - - } else { - expectedPrefillTime = float64(numOfTokens*prefillTimePerToken) / 1000 - } - expectedDecodeTime := float64(interTokenLatency*(numOfTokens-1)) / 1000 - expectedE2ELatency := expectedPrefillTime + expectedDecodeTime - - prevBoundary := math.Inf(-1) - - for _, bucketBoudary := range common.RequestLatencyBucketsBoundaries { - checkBucketBoundary(metrics, modelName, prefillTimeMetricName, bucketBoudary, prevBoundary, expectedPrefillTime) - checkBucketBoundary(metrics, modelName, decodeTimeMetricName, bucketBoudary, prevBoundary, expectedDecodeTime) - checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, bucketBoudary, prevBoundary, expectedE2ELatency) - - prevBoundary = bucketBoudary - } - // check the last bucket - lastBoundary := common.RequestLatencyBucketsBoundaries[len(common.RequestLatencyBucketsBoundaries)-1] - checkBucketBoundary(metrics, modelName, prefillTimeMetricName, math.Inf(1), lastBoundary, expectedPrefillTime) - checkBucketBoundary(metrics, modelName, decodeTimeMetricName, math.Inf(1), lastBoundary, expectedDecodeTime) - checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, math.Inf(1), lastBoundary, expectedE2ELatency) -} + }) +}) diff --git a/pkg/llm-d-inference-sim/seed_test.go b/pkg/llm-d-inference-sim/seed_test.go index 4e10f1f8..210419e1 100644 --- a/pkg/llm-d-inference-sim/seed_test.go +++ b/pkg/llm-d-inference-sim/seed_test.go @@ -31,11 +31,10 @@ var _ = Describe("Simulator with seed", func() { // use a function so that httpClient is captured when running func() { ctx := context.TODO() - client, err := startServerWithArgs(ctx, common.ModeRandom, - []string{"cmd", "--model", model, "--mode", common.ModeRandom, "--seed", "100"}, nil) + client, err := startServerWithArgs(ctx, []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--seed", "100"}) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClentAndCompletionParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClentAndCompletionParams(client, testModel, testUserMessage, false) params.MaxTokens = openai.Int(10) resp, err := openaiclient.Completions.New(ctx, params) Expect(err).NotTo(HaveOccurred()) @@ -67,7 +66,7 @@ var _ = Describe("Simulator with seed", func() { client, err := startServer(ctx, common.ModeRandom) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClentAndCompletionParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClentAndCompletionParams(client, testModel, testUserMessage, false) resp, err := openaiclient.Completions.New(ctx, params) Expect(err).NotTo(HaveOccurred()) Expect(resp.Choices).ShouldNot(BeEmpty()) 
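As a rough orientation only: the metrics_test.go hunk above converts TestBuild125Buckets into a Ginkgo table test but does not show the helper itself. A 1-2-5 bucket builder consistent with the expectations asserted in that table could look like the sketch below; the simulator's actual build125Buckets may differ, and the name build125BucketsSketch (and the placeholder package name) are used here to make clear this is an illustration, not the patched code.

    // Sketch only, assuming the behavior pinned down by the table test above.
    package sketch

    import "math"

    // build125BucketsSketch returns the 1, 2, 5, 10, 20, 50, ... series of
    // bucket boundaries that are less than or equal to maxValue.
    func build125BucketsSketch(maxValue int) []float64 {
        buckets := []float64{}
        for exp := 0; ; exp++ {
            for _, mantissa := range []float64{1, 2, 5} {
                boundary := mantissa * math.Pow10(exp)
                if boundary > float64(maxValue) {
                    // e.g. maxValue=99 stops after 50, maxValue=1024 stops after 1000
                    return buckets
                }
                buckets = append(buckets, boundary)
            }
        }
    }

For instance, build125BucketsSketch(99) yields [1 2 5 10 20 50] and build125BucketsSketch(1024) yields [1 2 5 10 20 50 100 200 500 1000], matching the "just below power of 10" and "1024" cases in the table.
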
diff --git a/pkg/llm-d-inference-sim/server_test.go b/pkg/llm-d-inference-sim/server_test.go index 1f610562..0f648681 100644 --- a/pkg/llm-d-inference-sim/server_test.go +++ b/pkg/llm-d-inference-sim/server_test.go @@ -63,7 +63,7 @@ var _ = Describe("Server", func() { ctx := context.TODO() args := []string{"cmd", "--model", qwenModelName, "--mode", common.ModeRandom, "--tokenizers-cache-dir", tmpDir, "--max-model-len", "2048"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) reqBody := `{ @@ -92,7 +92,7 @@ var _ = Describe("Server", func() { ctx := context.TODO() args := []string{"cmd", "--model", qwenModelName, "--mode", common.ModeRandom, "--tokenizers-cache-dir", tmpDir, "--max-model-len", "2048"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) reqBody := `{ @@ -129,7 +129,7 @@ var _ = Describe("Server", func() { os.Args = oldArgs }() - os.Args = []string{"cmd", "--model", model, "--ssl-certfile", certFile, "--ssl-keyfile", keyFile} + os.Args = []string{"cmd", "--model", testModel, "--ssl-certfile", certFile, "--ssl-keyfile", keyFile} config, err := common.ParseCommandParamsAndLoadConfig() Expect(err).NotTo(HaveOccurred()) Expect(config.SSLEnabled()).To(BeTrue()) @@ -143,7 +143,7 @@ var _ = Describe("Server", func() { os.Args = oldArgs }() - os.Args = []string{"cmd", "--model", model, "--self-signed-certs"} + os.Args = []string{"cmd", "--model", testModel, "--self-signed-certs"} config, err := common.ParseCommandParamsAndLoadConfig() Expect(err).NotTo(HaveOccurred()) Expect(config.SSLEnabled()).To(BeTrue()) @@ -168,7 +168,7 @@ var _ = Describe("Server", func() { certFile, _, err := GenerateTempCerts(tempDir) Expect(err).NotTo(HaveOccurred()) - os.Args = []string{"cmd", "--model", model, "--ssl-certfile", certFile} + os.Args = []string{"cmd", "--model", testModel, "--ssl-certfile", certFile} _, err = common.ParseCommandParamsAndLoadConfig() Expect(err).To(HaveOccurred()) Expect(err.Error()).To(ContainSubstring("both ssl-certfile and ssl-keyfile must be provided together")) @@ -176,7 +176,7 @@ var _ = Describe("Server", func() { _, keyFile, err := GenerateTempCerts(tempDir) Expect(err).NotTo(HaveOccurred()) - os.Args = []string{"cmd", "--model", model, "--ssl-keyfile", keyFile} + os.Args = []string{"cmd", "--model", testModel, "--ssl-keyfile", keyFile} _, err = common.ParseCommandParamsAndLoadConfig() Expect(err).To(HaveOccurred()) Expect(err.Error()).To(ContainSubstring("both ssl-certfile and ssl-keyfile must be provided together")) @@ -187,9 +187,9 @@ var _ = Describe("Server", func() { certFile, keyFile, err := GenerateTempCerts(tempDir) Expect(err).NotTo(HaveOccurred()) - args := []string{"cmd", "--model", model, "--mode", common.ModeRandom, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--ssl-certfile", certFile, "--ssl-keyfile", keyFile} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) resp, err := client.Get("https://localhost/health") @@ -198,8 +198,8 @@ var _ = Describe("Server", func() { }) It("Should start HTTPS server with self-signed certificates", func(ctx SpecContext) { - args := []string{"cmd", "--model", model, "--mode", common.ModeRandom, "--self-signed-certs"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, 
nil) + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--self-signed-certs"} + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) resp, err := client.Get("https://localhost/health") diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index a461ff01..010b82ad 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -18,13 +18,10 @@ package llmdinferencesim import ( "context" - "crypto/tls" "errors" "fmt" "io" - "net" "net/http" - "os" "strings" "github.com/llm-d/llm-d-inference-sim/pkg/common" @@ -34,89 +31,10 @@ import ( "github.com/openai/openai-go/v3" "github.com/openai/openai-go/v3/option" "github.com/openai/openai-go/v3/packages/param" - "github.com/valyala/fasthttp/fasthttputil" - "k8s.io/klog/v2" ) -const model = "my_model" -const qwenModelName = "Qwen/Qwen2-0.5B" -const baseURL = "http://localhost/v1" -const userMessage = "This is a test." const invalidMaxTokensErrMsg = "Max completion tokens and max tokens should be positive" -var userMsgTokens int64 - -func startServer(ctx context.Context, mode string) (*http.Client, error) { - return startServerWithArgs(ctx, mode, nil, nil) -} - -func startServerWithArgs(ctx context.Context, mode string, args []string, envs map[string]string) (*http.Client, error) { - oldArgs := os.Args - defer func() { - os.Args = oldArgs - }() - - if args != nil { - os.Args = args - } else { - os.Args = []string{"cmd", "--model", model, "--mode", mode} - } - - if envs != nil { - for k, v := range envs { - err := os.Setenv(k, v) - Expect(err).NotTo(HaveOccurred()) - } - - defer func() { - for k := range envs { - err := os.Unsetenv(k) - Expect(err).NotTo(HaveOccurred()) - } - }() - } - - logger := klog.Background() - - s, err := New(logger) - if err != nil { - return nil, err - } - config, err := common.ParseCommandParamsAndLoadConfig() - if err != nil { - return nil, err - } - s.config = config - - // calculate number of tokens for user message, - // must be activated after parseCommandParamsAndLoadConfig since it initializes the random engine - userMsgTokens = int64(len(common.Tokenize(userMessage))) - - if err := s.initializeSim(ctx); err != nil { - return nil, err - } - - listener := fasthttputil.NewInmemoryListener() - - // start the http server - go func() { - if err := s.startServer(ctx, listener); err != nil { - logger.Error(err, "error starting server") - } - }() - - return &http.Client{ - Transport: &http.Transport{ - DialContext: func(_ context.Context, _, _ string) (net.Conn, error) { - return listener.Dial() - }, - TLSClientConfig: &tls.Config{ - InsecureSkipVerify: true, - }, - }, - }, nil -} - var _ = Describe("Simulator", func() { DescribeTable("chat completions streaming", @@ -125,7 +43,7 @@ var _ = Describe("Simulator", func() { client, err := startServer(ctx, mode) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, true) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, true) stream := openaiclient.Chat.Completions.NewStreaming(ctx, params) defer func() { err := stream.Close() @@ -161,7 +79,7 @@ var _ = Describe("Simulator", func() { Expect(dataset.IsValidText(msg)).To(BeTrue()) } else { // in case of echo mode check that the text is returned as-is - Expect(msg).Should(Equal(userMessage)) + Expect(msg).Should(Equal(testUserMessage)) } Expect(role).Should(Equal("assistant")) }, @@ -178,7 
+96,7 @@ var _ = Describe("Simulator", func() { client, err := startServer(ctx, mode) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClentAndCompletionParams(client, model, userMessage, true) + openaiclient, params := getOpenAIClentAndCompletionParams(client, testModel, testUserMessage, true) stream := openaiclient.Completions.NewStreaming(ctx, params) defer func() { err := stream.Close() @@ -210,7 +128,7 @@ var _ = Describe("Simulator", func() { Expect(dataset.IsValidText(text)).To(BeTrue()) } else { // in case of echo mode check that the text is returned as-is - Expect(text).Should(Equal(userMessage)) + Expect(text).Should(Equal(testUserMessage)) } }, func(mode string) string { @@ -226,7 +144,7 @@ var _ = Describe("Simulator", func() { client, err := startServer(ctx, mode) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) numTokens := 0 // if maxTokens and maxCompletionTokens are passsed // maxCompletionTokens is used @@ -271,7 +189,7 @@ var _ = Describe("Simulator", func() { Expect(dataset.IsValidText(msg)).To(BeTrue()) } else { // in case of echo mode check that the text is returned as-is - Expect(msg).Should(Equal(userMessage)) + Expect(msg).Should(Equal(testUserMessage)) } } }, @@ -303,7 +221,7 @@ var _ = Describe("Simulator", func() { client, err := startServer(ctx, mode) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClentAndCompletionParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClentAndCompletionParams(client, testModel, testUserMessage, false) numTokens := 0 if maxTokens != 0 { params.MaxTokens = param.NewOpt(int64(maxTokens)) @@ -342,7 +260,7 @@ var _ = Describe("Simulator", func() { Expect(dataset.IsValidText(text)).To(BeTrue()) } else { // in case of echo mode check that the text is returned as-is - Expect(text).Should(Equal(userMessage)) + Expect(text).Should(Equal(testUserMessage)) } } }, @@ -433,10 +351,10 @@ var _ = Describe("Simulator", func() { podNameEnv: testPod, podNsEnv: testNamespace, } - client, err := startServerWithArgs(ctx, common.ModeRandom, nil, envs) + client, err := startServerWithEnv(ctx, common.ModeRandom, envs) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClentAndCompletionParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClentAndCompletionParams(client, testModel, testUserMessage, false) var httpResp *http.Response resp, err := openaiclient.Completions.New(ctx, params, option.WithResponseInto(&httpResp)) Expect(err).NotTo(HaveOccurred()) @@ -461,10 +379,10 @@ var _ = Describe("Simulator", func() { podNameEnv: testPod, podNsEnv: testNamespace, } - client, err := startServerWithArgs(ctx, common.ModeRandom, nil, envs) + client, err := startServerWithEnv(ctx, common.ModeRandom, envs) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClentAndCompletionParams(client, model, userMessage, true) + openaiclient, params := getOpenAIClentAndCompletionParams(client, testModel, testUserMessage, true) var httpResp *http.Response resp, err := openaiclient.Completions.New(ctx, params, option.WithResponseInto(&httpResp)) Expect(err).NotTo(HaveOccurred()) @@ -485,14 +403,14 @@ var _ = Describe("Simulator", func() { It("Should reject requests exceeding context window", func() { ctx := context.TODO() // Start server with max-model-len=10 - args := []string{"cmd", 
"--model", model, "--mode", common.ModeRandom, "--max-model-len", "10"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--max-model-len", "10"} + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) // Test with raw HTTP to verify the error response format reqBody := `{ "messages": [{"role": "user", "content": "This is a test message"}], - "model": "my_model", + "model": "testmodel", "max_tokens": 8 }` @@ -513,7 +431,7 @@ var _ = Describe("Simulator", func() { Expect(string(body)).To(ContainSubstring("BadRequestError")) // Also test with OpenAI client to ensure it gets an error - openaiclient, params := getOpenAIClientAndChatParams(client, model, "This is a test message", false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, "This is a test message", false) params.MaxTokens = openai.Int(8) _, err = openaiclient.Chat.Completions.New(ctx, params) @@ -526,11 +444,11 @@ var _ = Describe("Simulator", func() { It("Should accept requests within context window", func() { ctx := context.TODO() // Start server with max-model-len=50 - args := []string{"cmd", "--model", model, "--mode", common.ModeEcho, "--max-model-len", "50"} - client, err := startServerWithArgs(ctx, common.ModeEcho, args, nil) + args := []string{"cmd", "--model", testModel, "--mode", common.ModeEcho, "--max-model-len", "50"} + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, "Hello", false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, "Hello", false) params.MaxTokens = openai.Int(5) // Send a request within the context window @@ -538,20 +456,20 @@ var _ = Describe("Simulator", func() { Expect(err).NotTo(HaveOccurred()) Expect(resp.Choices).To(HaveLen(1)) - Expect(resp.Model).To(Equal(model)) + Expect(resp.Model).To(Equal(testModel)) }) It("Should handle text completion requests exceeding context window", func() { ctx := context.TODO() // Start server with max-model-len=10 - args := []string{"cmd", "--model", model, "--mode", common.ModeRandom, "--max-model-len", "10"} - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--max-model-len", "10"} + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) // Test with raw HTTP for text completion reqBody := `{ "prompt": "This is a long test prompt with many words", - "model": "my_model", + "model": "testmodel", "max_tokens": 5 }` @@ -571,58 +489,3 @@ var _ = Describe("Simulator", func() { }) }) }) - -func sendSimpleChatRequest(envs map[string]string, streaming bool) *http.Response { - ctx := context.TODO() - - client, err := startServerWithArgs(ctx, common.ModeRandom, nil, envs) - Expect(err).NotTo(HaveOccurred()) - - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, streaming) - var httpResp *http.Response - resp, err := openaiclient.Chat.Completions.New(ctx, params, option.WithResponseInto(&httpResp)) - Expect(err).NotTo(HaveOccurred()) - Expect(resp).NotTo(BeNil()) - - Expect(resp.Choices).ShouldNot(BeEmpty()) - Expect(string(resp.Object)).To(Equal(chatCompletionObject)) - - return httpResp -} - -func getOpenAIClientAndChatParams(client option.HTTPClient, model string, message string, - streaming bool) (openai.Client, openai.ChatCompletionNewParams) { - 
openaiclient := openai.NewClient( - option.WithBaseURL(baseURL), - option.WithHTTPClient(client)) - - params := openai.ChatCompletionNewParams{ - Messages: []openai.ChatCompletionMessageParamUnion{ - openai.UserMessage(message), - }, - Model: model, - } - if streaming { - params.StreamOptions = openai.ChatCompletionStreamOptionsParam{IncludeUsage: param.NewOpt(true)} - } - return openaiclient, params -} - -// nolint -func getOpenAIClentAndCompletionParams(client option.HTTPClient, model string, message string, - streaming bool) (openai.Client, openai.CompletionNewParams) { - openaiclient := openai.NewClient( - option.WithBaseURL(baseURL), - option.WithHTTPClient(client)) - - params := openai.CompletionNewParams{ - Prompt: openai.CompletionNewParamsPromptUnion{ - OfString: openai.String(message), - }, - Model: openai.CompletionNewParamsModel(model), - } - if streaming { - params.StreamOptions = openai.ChatCompletionStreamOptionsParam{IncludeUsage: param.NewOpt(true)} - } - return openaiclient, params -} diff --git a/pkg/llm-d-inference-sim/test_utils.go b/pkg/llm-d-inference-sim/test_utils.go new file mode 100644 index 00000000..1919ec1c --- /dev/null +++ b/pkg/llm-d-inference-sim/test_utils.go @@ -0,0 +1,427 @@ +package llmdinferencesim + +import ( + "context" + "crypto/tls" + "errors" + "fmt" + "io" + "math" + "net" + "net/http" + "os" + "regexp" + "sort" + "strconv" + "strings" + "time" + + "github.com/llm-d/llm-d-inference-sim/pkg/common" + "github.com/openai/openai-go/v3" + "github.com/openai/openai-go/v3/option" + "github.com/openai/openai-go/v3/packages/param" + "github.com/valyala/fasthttp/fasthttputil" + "k8s.io/klog/v2" + + "github.com/onsi/gomega" +) + +const ( + qwenModelName = "Qwen/Qwen2-0.5B" + baseURL = "http://localhost/v1" + testModel = "testmodel" + testUserMessage = "This is a test." 
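+ // metricsUrl is the endpoint used by the tests to read the simulator's Prometheus metrics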
+ metricsUrl = "http://localhost/metrics" +) + +var userMsgTokens int64 + +// Starts server in the given mode, no additional arguments or environment variables +func startServer(ctx context.Context, mode string) (*http.Client, error) { + return startServerWithArgsAndEnv(ctx, mode, nil, nil) +} + +// Starts server in the given mode and environment variables +func startServerWithEnv(ctx context.Context, mode string, envs map[string]string) (*http.Client, error) { + return startServerWithArgsAndEnv(ctx, mode, nil, envs) +} + +// Starts server according the given arguments +// if args are defined - mode defined in args will override the mode defined by the mode parameter +func startServerWithArgs(ctx context.Context, args []string) (*http.Client, error) { + return startServerWithArgsAndEnv(ctx, "", args, nil) +} + +// Starts server according the given parmaters: mode, arguments and environment +// if args are defined - the mode parameter is discarded, value from args is used +func startServerWithArgsAndEnv(ctx context.Context, mode string, args []string, envs map[string]string) (*http.Client, error) { + oldArgs := os.Args + defer func() { + os.Args = oldArgs + }() + + if args != nil { + os.Args = args + } else { + os.Args = []string{"cmd", "--model", testModel, "--mode", mode} + } + + if envs != nil { + for k, v := range envs { + err := os.Setenv(k, v) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + defer func() { + for k := range envs { + err := os.Unsetenv(k) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + } + + logger := klog.Background() + + s, err := New(logger) + if err != nil { + return nil, err + } + config, err := common.ParseCommandParamsAndLoadConfig() + if err != nil { + return nil, err + } + s.config = config + + // calculate number of tokens for user message, + // must be activated after parseCommandParamsAndLoadConfig since it initializes the random engine + userMsgTokens = int64(len(common.Tokenize(testUserMessage))) + + if err := s.initializeSim(ctx); err != nil { + return nil, err + } + + listener := fasthttputil.NewInmemoryListener() + + // start the http server + go func() { + if err := s.startServer(ctx, listener); err != nil { + logger.Error(err, "error starting server") + } + }() + + return &http.Client{ + Transport: &http.Transport{ + DialContext: func(_ context.Context, _, _ string) (net.Conn, error) { + return listener.Dial() + }, + TLSClientConfig: &tls.Config{ + InsecureSkipVerify: true, + }, + }, + }, nil +} + +// startServerAndSendRequest - starts server configured according the given latency parameters in echo mode, +// sends a single request with the given prompt +func startServerAndSendRequest(modelName string, prompt string, isStreaming bool, ttft int, prefillTimePerToken int, interTokenLatency int) *http.Client { + ctx := context.TODO() + args := []string{"cmd", "--model", modelName, "--mode", common.ModeEcho, + // "--kv-cache-transfer-latency", strconv.Itoa(kvcacheTransferLatency), + // "--kv-cache-transfer-time-per-token", strconv.Itoa(kvCacheTransferTimePerToken), + "--time-to-first-token", strconv.Itoa(ttft), + "--prefill-time-per-token", strconv.Itoa(prefillTimePerToken), + "--inter-token-latency", strconv.Itoa(interTokenLatency), + } + + client, err := startServerWithArgs(ctx, args) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + openaiclient, params := getOpenAIClientAndChatParams(client, modelName, prompt, isStreaming) + + // send a single request in a serial way + _, err = openaiclient.Chat.Completions.New(ctx, params) + 
gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + return client +} + +// sendSimpleChatRequest starts server using the given environment variables and sends one chat completions request +func sendSimpleChatRequest(envs map[string]string, streaming bool) *http.Response { + ctx := context.TODO() + + client, err := startServerWithEnv(ctx, common.ModeRandom, envs) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, streaming) + var httpResp *http.Response + resp, err := openaiclient.Chat.Completions.New(ctx, params, option.WithResponseInto(&httpResp)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(resp).NotTo(gomega.BeNil()) + + gomega.Expect(resp.Choices).ShouldNot(gomega.BeEmpty()) + gomega.Expect(string(resp.Object)).To(gomega.Equal(chatCompletionObject)) + + return httpResp +} + +// getOpenAIClientAndChatParams - creates an openai client and params for /chat/completions call based on the given parameters +func getOpenAIClientAndChatParams(client option.HTTPClient, model string, message string, + streaming bool) (openai.Client, openai.ChatCompletionNewParams) { + openaiclient := openai.NewClient( + option.WithBaseURL(baseURL), + option.WithHTTPClient(client)) + + params := openai.ChatCompletionNewParams{ + Messages: []openai.ChatCompletionMessageParamUnion{ + openai.UserMessage(message), + }, + Model: model, + } + if streaming { + params.StreamOptions = openai.ChatCompletionStreamOptionsParam{IncludeUsage: param.NewOpt(true)} + } + return openaiclient, params +} + +// nolint +// getOpenAIClentAndCompletionParams - creates an openai client and params for /completions call based on the given parameters +func getOpenAIClentAndCompletionParams(client option.HTTPClient, model string, message string, + streaming bool) (openai.Client, openai.CompletionNewParams) { + openaiclient := openai.NewClient( + option.WithBaseURL(baseURL), + option.WithHTTPClient(client)) + + params := openai.CompletionNewParams{ + Prompt: openai.CompletionNewParamsPromptUnion{ + OfString: openai.String(message), + }, + Model: openai.CompletionNewParamsModel(model), + } + if streaming { + params.StreamOptions = openai.ChatCompletionStreamOptionsParam{IncludeUsage: param.NewOpt(true)} + } + return openaiclient, params +} + +// isLoraMetricPresent checks if a matching metric exists +// metrics: the list of metrics +// running: list of loras in running_lora_adapters, the order does not matter +// waiting: list of loras in waiting_lora_adapters, the order does not matter +func isLoraMetricPresent(metrics []string, running, waiting []string) bool { + return findLoraMetric(metrics, running, waiting) != "" +} + +// getLoraTimestamp returns timestamp or nil, error +func getLoraTimestamp(metrics []string, running, waiting []string) (*float64, error) { + metric := findLoraMetric(metrics, running, waiting) + if metric == "" { + return nil, nil // not found + } + return extractTimestamp(metric) +} + +// extractTimestamp gets timestamp from the given metric +func extractTimestamp(metric string) (*float64, error) { + // Extract timestamp: last part after space + parts := strings.Split(metric, " ") + if len(parts) < 2 { + return nil, errors.New("invalid metric format") + } + timestampStr := parts[len(parts)-1] + timestamp, err := strconv.ParseFloat(timestampStr, 64) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + return &timestamp, nil +} + +func getLoraValidTimestamp(metrics []string, running, waiting []string) float64 { + 
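// fails the test if the matching lora metric is missing or its timestamp cannot be parsed +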
timestamp, err := getLoraTimestamp(metrics, running, waiting) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(timestamp).ToNot(gomega.BeNil()) + return *timestamp +} + +func getLastLoraMetrics(metrics []string) ([]string, error) { + lastTimestamp := float64(0) + var lastMetrics []string + for _, metric := range metrics { + if strings.HasPrefix(metric, "vllm:lora_requests_info") { + timestamp, err := extractTimestamp(metric) + if err != nil { + return nil, err + } + if lastTimestamp > *timestamp { + continue + } + if lastTimestamp < *timestamp { + // a newer timestamp was found - drop the metrics collected for older timestamps + lastMetrics = make([]string, 0) + } + lastTimestamp = *timestamp + lastMetrics = append(lastMetrics, metric) + } + } + return lastMetrics, nil +} + +// findLoraMetric finds the relevant metric by comparing with the given loras sets (ignoring order) +// metrics: lines of metrics +// running: list of running loras to find +// waiting: list of waiting loras to find +// Looks for a line with the given running and waiting loras sets, the comparison is order agnostic. +// The returned metric must match both the running and the waiting sets. +// E.g. for input running=["l1", "l2", "l3"] and waiting=[], a metric +// with running_lora_adapters=["l3", "l1", "l2"] and waiting_lora_adapters=[] is returned +func findLoraMetric(metrics []string, running, waiting []string) string { + // sort input arrays before comparing, create a string of all values separated by commas + sort.Strings(running) + sort.Strings(waiting) + runStr := strings.Join(running, ",") + waitStr := strings.Join(waiting, ",") + + // regex to extract lora metrics and values + re := regexp.MustCompile(`vllm:lora_requests_info\{.*running_lora_adapters="([^"]*)".*waiting_lora_adapters="([^"]*)".*\}\s+([0-9.e\+\-]+)`) + for _, metric := range metrics { + matches := re.FindStringSubmatch(metric) + if len(matches) == 4 { + // this line contains loraInfo metric, check running and waiting loras lists + // split and sort metric's running and waiting loras lists for the comparison + metricRun := splitString(matches[1]) + metricWait := splitString(matches[2]) + sort.Strings(metricRun) + sort.Strings(metricWait) + // if both lists are the same - return the metric + if strings.Join(metricRun, ",") == runStr && strings.Join(metricWait, ",") == waitStr { + return metric + } + } // if the metric is not in the required format - skip it + } + + // required metric was not found + return "" +} + +// splits the given string into an array of strings with separator "," +func splitString(str string) []string { + if str == "" { + return []string{} + } + return strings.Split(str, ",") +} + +// findMetric returns the value for the first metric with the given prefix or an empty string if not found +func findMetric(metrics []string, metricPrefix string) string { + // look for the first metric line that contains the given prefix + for _, metric := range metrics { + if strings.Contains(metric, metricPrefix) { + arr := strings.Split(metric, " ") + if len(arr) == 2 { + return arr[1] + } + break + } + } + // required metric was not found + return "" +} + +// findIntMetric returns the value for the first metric with the given prefix as an int, or nil if not found +func findIntMetric(metrics []string, metricPrefix string) *int { + valueStr := findMetric(metrics, metricPrefix) + if valueStr == "" { + return nil + } + + val, err := strconv.Atoi(valueStr) + if err != nil { + return nil + } + return &val +} + +// getFloatBucketMetricLine builds a string which defines a bucket metric line for the given parameters +// model the model name +// metric the metric 
name +// bucketBoundary the upper bucket boundary, Inf(1) defines the last bucket +// count bucket samples count +func getFloatBucketMetricLine(model string, metric string, bucketBoundary float64, count int) string { + buckerBoundStr := "+Inf" + if bucketBoundary != math.Inf(1) { + buckerBoundStr = fmt.Sprintf("%g", bucketBoundary) + } + return fmt.Sprintf("%s_bucket{model_name=\"%s\",le=\"%s\"} %d", metric, model, buckerBoundStr, count) +} + +// checkBucketBoundary checks that the given bucket's samples count is valid according to the given parameters +// The scenario is a single request, so bucket counts can only be 0 or 1. +// Buckets lower than the expected value should have count 0, other buckets - count 1. +// Important note: since metrics represent real timing, the measured value can be a little higher than the expected one, +// which is based on the pure latency calculations. If the expected value is equal or very close to the +// upper boundary, we can get either count (0 or 1), so in this case we don't check this bucket +// metrics the full metrics response +// modelName the model name +// metricName the specific metric name +// bucketBoudary the upper boundary of the required bucket +// prevBoundary the upper boundary of the previous bucket +// expectedValue expected value in the histogram +func checkBucketBoundary(metrics string, modelName string, metricName string, bucketBoudary float64, + prevBoundary float64, expectedValue float64) { + if expectedValue > prevBoundary && bucketBoudary >= expectedValue && (bucketBoudary-expectedValue) < 0.005 { + // the expected time is too close to the bucket's boundary + // it's possible that in theory we expect 1 in this bucket but will get 0, and this situation is ok + // since there is some additional calculation time + fmt.Printf("Expected value is too close to the boundary - skip test for this bucket (%.4f - %.4f] and expected value %.4f\n", + prevBoundary, bucketBoudary, expectedValue) + return + } + expectedCount := 0 + if bucketBoudary > expectedValue { + expectedCount = 1 + } + gomega.Expect(metrics).To(gomega.ContainSubstring(getFloatBucketMetricLine(modelName, metricName, bucketBoudary, expectedCount))) +} + +// checkLatencyMertics sends a /metrics request and checks that latency related values are valid +// client the http client to be used for sending the request +// modelName the model name +// numOfInputTokens number of tokens in the input of the completion request we want to validate +// numOfOutputTokens number of tokens in the output of the completion request we want to validate +// ttft time to first token parameter +// prefillTimePerToken prefill time per input token +// interTokenLatency processing time per output token +func checkLatencyMertics(client *http.Client, modelName string, numOfInputTokens int, numOfOutputTokens int, ttft int, prefillTimePerToken int, interTokenLatency int) { + // wait a little bit and check metrics + time.Sleep(300 * time.Millisecond) + metricsResp, err := client.Get(metricsUrl) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(metricsResp.StatusCode).To(gomega.Equal(http.StatusOK)) + + data, err := io.ReadAll(metricsResp.Body) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + metrics := string(data) + + var expectedPrefillTime float64 + // TODO take into consideration remote prefill + if ttft > 0 { + // time-to-first-token overrides the prefill time calculation based on the number of input tokens + expectedPrefillTime = float64(ttft) / 1000 + + } else { + expectedPrefillTime = float64(numOfInputTokens*prefillTimePerToken) / 1000 + } + expectedDecodeTime := float64(interTokenLatency*(numOfOutputTokens-1)) / 
1000 + expectedE2ELatency := expectedPrefillTime + expectedDecodeTime + + prevBoundary := math.Inf(-1) + + for _, bucketBoudary := range common.RequestLatencyBucketsBoundaries { + checkBucketBoundary(metrics, modelName, prefillTimeMetricName, bucketBoudary, prevBoundary, expectedPrefillTime) + checkBucketBoundary(metrics, modelName, decodeTimeMetricName, bucketBoudary, prevBoundary, expectedDecodeTime) + checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, bucketBoudary, prevBoundary, expectedE2ELatency) + + prevBoundary = bucketBoudary + } + // check the last bucket + lastBoundary := common.RequestLatencyBucketsBoundaries[len(common.RequestLatencyBucketsBoundaries)-1] + checkBucketBoundary(metrics, modelName, prefillTimeMetricName, math.Inf(1), lastBoundary, expectedPrefillTime) + checkBucketBoundary(metrics, modelName, decodeTimeMetricName, math.Inf(1), lastBoundary, expectedDecodeTime) + checkBucketBoundary(metrics, modelName, e2eReqLatencyMetricName, math.Inf(1), lastBoundary, expectedE2ELatency) +} diff --git a/pkg/llm-d-inference-sim/tools_test.go b/pkg/llm-d-inference-sim/tools_test.go index c504c5d5..431d742e 100644 --- a/pkg/llm-d-inference-sim/tools_test.go +++ b/pkg/llm-d-inference-sim/tools_test.go @@ -377,7 +377,7 @@ var _ = Describe("Simulator for request with tools", func() { client, err := startServer(ctx, mode) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, true) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, true) params.ToolChoice = openai.ChatCompletionToolChoiceOptionUnionParam{OfAuto: param.NewOpt("required")} params.Tools = tools @@ -459,7 +459,7 @@ var _ = Describe("Simulator for request with tools", func() { client, err := startServer(ctx, mode) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) params.ToolChoice = openai.ChatCompletionToolChoiceOptionUnionParam{OfAuto: param.NewOpt("required")} params.Tools = tools @@ -569,8 +569,8 @@ var _ = Describe("Simulator for request with tools", func() { for _, invalidTool := range invalidTools { params := openai.ChatCompletionNewParams{ - Messages: []openai.ChatCompletionMessageParamUnion{openai.UserMessage(userMessage)}, - Model: model, + Messages: []openai.ChatCompletionMessageParamUnion{openai.UserMessage(testUserMessage)}, + Model: testModel, ToolChoice: openai.ChatCompletionToolChoiceOptionUnionParam{OfAuto: param.NewOpt("required")}, Tools: invalidTool, } @@ -588,16 +588,16 @@ var _ = Describe("Simulator for request with tools", func() { DescribeTable("array parameter, no streaming", func(mode string, minLength int, maxLength int, min float64, max float64) { ctx := context.TODO() - serverArgs := []string{"cmd", "--model", model, "--mode", mode, + serverArgs := []string{"cmd", "--model", testModel, "--mode", mode, "--min-tool-call-array-param-length", strconv.Itoa(minLength), "--max-tool-call-array-param-length", strconv.Itoa(maxLength), "--min-tool-call-number-param", fmt.Sprint(min), "--max-tool-call-number-param", fmt.Sprint(max), } - client, err := startServerWithArgs(ctx, common.ModeEcho, serverArgs, nil) + client, err := startServerWithArgs(ctx, serverArgs) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiclient, params := 
getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) params.ToolChoice = openai.ChatCompletionToolChoiceOptionUnionParam{OfAuto: param.NewOpt("required")} params.Tools = toolWithArray @@ -646,7 +646,7 @@ var _ = Describe("Simulator for request with tools", func() { client, err := startServer(ctx, mode) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) params.ToolChoice = openai.ChatCompletionToolChoiceOptionUnionParam{OfAuto: param.NewOpt("required")} params.Tools = toolWith3DArray @@ -699,7 +699,7 @@ var _ = Describe("Simulator for request with tools", func() { client, err := startServer(ctx, mode) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) params.ToolChoice = openai.ChatCompletionToolChoiceOptionUnionParam{OfAuto: param.NewOpt("required")} params.Tools = toolWithWrongMinMax @@ -718,7 +718,7 @@ var _ = Describe("Simulator for request with tools", func() { client, err := startServer(ctx, mode) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) params.ToolChoice = openai.ChatCompletionToolChoiceOptionUnionParam{OfAuto: param.NewOpt("required")} params.Tools = toolWithObjects @@ -773,7 +773,7 @@ var _ = Describe("Simulator for request with tools", func() { client, err := startServer(ctx, mode) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) params.ToolChoice = openai.ChatCompletionToolChoiceOptionUnionParam{OfAuto: param.NewOpt("required")} params.Tools = toolWithObjectAndArray @@ -818,13 +818,13 @@ var _ = Describe("Simulator for request with tools", func() { DescribeTable("tool with not required params", func(probability int, numberOfParams int) { ctx := context.TODO() - serverArgs := []string{"cmd", "--model", model, "--mode", common.ModeRandom, + serverArgs := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--tool-call-not-required-param-probability", strconv.Itoa(probability), } - client, err := startServerWithArgs(ctx, common.ModeEcho, serverArgs, nil) + client, err := startServerWithArgs(ctx, serverArgs) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) params.ToolChoice = openai.ChatCompletionToolChoiceOptionUnionParam{OfAuto: param.NewOpt("required")} params.Tools = toolWithoutRequiredParams @@ -854,15 +854,15 @@ var _ = Describe("Simulator for request with tools", func() { DescribeTable("tool with object with not required params", func(probability int, numberOfParams int, min int, max int) { ctx := context.TODO() - serverArgs := []string{"cmd", "--model", model, "--mode", common.ModeRandom, + serverArgs := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--object-tool-call-not-required-field-probability", strconv.Itoa(probability), "--min-tool-call-integer-param", 
strconv.Itoa(min), "--max-tool-call-integer-param", strconv.Itoa(max), } - client, err := startServerWithArgs(ctx, common.ModeEcho, serverArgs, nil) + client, err := startServerWithArgs(ctx, serverArgs) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) params.ToolChoice = openai.ChatCompletionToolChoiceOptionUnionParam{OfAuto: param.NewOpt("required")} params.Tools = toolWithObjectWithoutRequiredParams diff --git a/pkg/llm-d-inference-sim/worker_test.go b/pkg/llm-d-inference-sim/worker_test.go index bec8bcb6..21181842 100644 --- a/pkg/llm-d-inference-sim/worker_test.go +++ b/pkg/llm-d-inference-sim/worker_test.go @@ -33,18 +33,16 @@ import ( "github.com/openai/openai-go/v3/option" ) -const modelName = "testmodel" - var _ = Describe("Simulator requests scheduling", Ordered, func() { Context("Requests for already loaded loras should be handled first", func() { DescribeTable("Should process in correct order simultaneous requests to two loras", func(maxNumSeq string) { ctx := context.TODO() - args := []string{"cmd", "--model", model, "--mode", common.ModeEcho, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeEcho, "--time-to-first-token", "500", "--max-num-seqs", maxNumSeq, "--lora-modules", "{\"name\":\"lora1\",\"path\":\"/path/to/lora1\"}", "{\"name\":\"lora2\",\"path\":\"/path/to/lora2\"}"} - client, err := startServerWithArgs(ctx, common.ModeEcho, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient(option.WithBaseURL(baseURL), option.WithHTTPClient(client)) @@ -86,13 +84,13 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { DescribeTable("Should process in correct order delayed requests to two loras", func(maxNumSeq string, maxLoras string, checkOrder func([]int)) { ctx := context.TODO() - args := []string{"cmd", "--model", model, "--mode", common.ModeEcho, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeEcho, "--time-to-first-token", "1000", "--max-num-seqs", maxNumSeq, "--max-loras", maxLoras, "--lora-modules", "{\"name\":\"lora1\",\"path\":\"/path/to/lora1\"}", "{\"name\":\"lora2\",\"path\":\"/path/to/lora2\"}"} - client, err := startServerWithArgs(ctx, common.ModeEcho, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient(option.WithBaseURL(baseURL), @@ -127,7 +125,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { It("Should keep the order of requests with one worker", func() { ctx := context.TODO() - args := []string{"cmd", "--model", model, "--mode", common.ModeEcho, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeEcho, "--time-to-first-token", "500", "--max-num-seqs", "1", "--max-loras", "1", "--lora-modules", @@ -136,7 +134,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { "{\"name\":\"lora4\",\"path\":\"/path/to/lora4\"}", "{\"name\":\"lora2\",\"path\":\"/path/to/lora2\"}"} - client, err := startServerWithArgs(ctx, common.ModeEcho, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient(option.WithBaseURL(baseURL), @@ -167,7 +165,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { It("Should keep the order of requests with two workers", func() { ctx := 
context.TODO() - args := []string{"cmd", "--model", model, "--mode", common.ModeEcho, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeEcho, "--time-to-first-token", "500", "--max-num-seqs", "2", "--max-loras", "1", "--lora-modules", @@ -176,7 +174,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { "{\"name\":\"lora4\",\"path\":\"/path/to/lora4\"}", "{\"name\":\"lora2\",\"path\":\"/path/to/lora2\"}"} - client, err := startServerWithArgs(ctx, common.ModeEcho, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient(option.WithBaseURL(baseURL), @@ -207,7 +205,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { DescribeTable("Should keep the order of requests with multiple workers and loras", func(maxNumSeq string, maxLoras string, checkOrder func([]int)) { ctx := context.TODO() - args := []string{"cmd", "--model", model, "--mode", common.ModeEcho, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeEcho, "--time-to-first-token", "1000", "--max-num-seqs", maxNumSeq, "--max-loras", maxLoras, "--lora-modules", @@ -217,7 +215,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { "{\"name\":\"lora5\",\"path\":\"/path/to/lora5\"}", "{\"name\":\"lora2\",\"path\":\"/path/to/lora2\"}"} - client, err := startServerWithArgs(ctx, common.ModeEcho, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient(option.WithBaseURL(baseURL), @@ -258,7 +256,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { Context("Stress", func() { It("Should work correctly with many simultaneous requests", func() { ctx := context.TODO() - args := []string{"cmd", "--model", modelName, "--mode", common.ModeRandom, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--time-to-first-token", "3000", "--max-num-seqs", "12", "--max-loras", "2", "--lora-modules", "{\"name\":\"lora0\",\"path\":\"/path/to/lora0\"}", @@ -268,7 +266,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { "{\"name\":\"lora4\",\"path\":\"/path/to/lora4\"}", } - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient( @@ -282,7 +280,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { defer GinkgoRecover() params := openai.ChatCompletionNewParams{ Messages: []openai.ChatCompletionMessageParamUnion{ - openai.UserMessage(userMessage), + openai.UserMessage(testUserMessage), }, Model: fmt.Sprintf("lora%d", i%5), } @@ -331,7 +329,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { runningMetric := "vllm:num_requests_running{model_name=\"testmodel\"}" waitingMetric := "vllm:num_requests_waiting{model_name=\"testmodel\"}" ctx := context.TODO() - args := []string{"cmd", "--model", modelName, "--mode", common.ModeRandom, + args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--time-to-first-token", "2000", "--time-to-first-token-std-dev", "600", "--max-num-seqs", "1000", "--max-loras", "2", "--max-waiting-queue-length", "1500", "--lora-modules", @@ -339,7 +337,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { "{\"name\":\"lora1\",\"path\":\"/path/to/lora1\"}", } - client, err := startServerWithArgs(ctx, common.ModeRandom, args, nil) + client, err := 
startServerWithArgs(ctx, args) Expect(err).NotTo(HaveOccurred()) openaiclient := openai.NewClient( @@ -353,7 +351,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { defer GinkgoRecover() params := openai.ChatCompletionNewParams{ Messages: []openai.ChatCompletionMessageParamUnion{ - openai.UserMessage(userMessage), + openai.UserMessage(testUserMessage), }, Model: fmt.Sprintf("lora%d", i%2), } @@ -392,7 +390,7 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { defer GinkgoRecover() params := openai.ChatCompletionNewParams{ Messages: []openai.ChatCompletionMessageParamUnion{ - openai.UserMessage(userMessage), + openai.UserMessage(testUserMessage), }, Model: fmt.Sprintf("lora%d", i%2), } From 031e4619c95a3e8be3b11ffee2faf47611e16c10 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Mon, 27 Oct 2025 22:48:33 +0200 Subject: [PATCH 07/14] Add test for vllm:request_queue_time_seconds and vllm:request_inference_time_seconds Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/metrics_test.go | 53 ++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go index 4cc1b948..8d2cee97 100644 --- a/pkg/llm-d-inference-sim/metrics_test.go +++ b/pkg/llm-d-inference-sim/metrics_test.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "io" + "math" "net/http" "os" "strings" @@ -808,7 +809,7 @@ var _ = Describe("Simulator metrics", Ordered, func() { }) }) - Context("latency metrics", func() { + Context("single request latency metrics", func() { numOfTokens := len(common.Tokenize(testUserMessage)) DescribeTable("should calculate all latency related metrics correctly for a single request", @@ -831,6 +832,56 @@ var _ = Describe("Simulator metrics", Ordered, func() { Entry(nil, "prefill per token + inter token time", 0, 100, 100), ) }) + + Context("multiple requests latency metrics", func() { + It("should calculate waiting and inference time correctly", func() { + ctx := context.TODO() + args := []string{"cmd", "--model", testModel, "--mode", common.ModeEcho, + "--time-to-first-token", "1000", "--max-num-seqs", "1", + } + + client, err := startServerWithArgs(ctx, args) + Expect(err).NotTo(HaveOccurred()) + + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) + + var reqWg sync.WaitGroup + reqWg.Add(2) + + // send two requests + for range 2 { + go func() { + defer reqWg.Done() + defer GinkgoRecover() + + _, err := openaiclient.Chat.Completions.New(ctx, params) + Expect(err).NotTo(HaveOccurred()) + }() + } + + reqWg.Wait() + time.Sleep(300 * time.Millisecond) + metricsResp, err := client.Get(metricsUrl) + Expect(err).NotTo(HaveOccurred()) + Expect(metricsResp.StatusCode).To(Equal(http.StatusOK)) + + data, err := io.ReadAll(metricsResp.Body) + Expect(err).NotTo(HaveOccurred()) + metrics := string(data) + + for _, boundary := range common.RequestLatencyBucketsBoundaries { + if boundary < 1.5 { + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, reqInferenceTimeMetricName, boundary, 0))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, reqQueueTimeMetricName, boundary, 0))) + } else { + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, reqInferenceTimeMetricName, boundary, 2))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, reqQueueTimeMetricName, boundary, 1))) + } + } + 
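// cumulative +Inf buckets: inference time was recorded for both requests, queue time only for the request that had to wait +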
Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, reqInferenceTimeMetricName, math.Inf(1), 2))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, reqQueueTimeMetricName, math.Inf(1), 1))) + }) + }) }) var _ = Describe("build125Buckets", Ordered, func() { From e16f9f9fbdc352f05477e13cdf4d42f30e75db40 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Tue, 28 Oct 2025 10:18:19 +0200 Subject: [PATCH 08/14] Define constant for metrics names, use helper functions in metrics test for histogram buckets validation Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/metrics.go | 15 +- pkg/llm-d-inference-sim/metrics_test.go | 250 ++++++++---------------- pkg/llm-d-inference-sim/test_utils.go | 7 +- 3 files changed, 94 insertions(+), 178 deletions(-) diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go index 4322b024..725b3c6c 100644 --- a/pkg/llm-d-inference-sim/metrics.go +++ b/pkg/llm-d-inference-sim/metrics.go @@ -38,6 +38,11 @@ const ( reqInferenceTimeMetricName = "vllm:request_inference_time_seconds" prefillTimeMetricName = "vllm:request_prefill_time_seconds" decodeTimeMetricName = "vllm:request_decode_time_seconds" + ttftMetricName = "vllm:time_to_first_token_seconds" + tpotMetricName = "vllm:time_per_output_token_seconds" + generationTokensMetricName = "vllm:request_generation_tokens" + paramMaxTokensMetricName = "vllm:request_params_max_tokens" + promptTokensMetricName = "vllm:request_prompt_tokens" ) // createAndRegisterPrometheus creates and registers prometheus metrics used by vLLM simulator @@ -92,7 +97,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.ttft = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", - Name: "vllm:time_to_first_token_seconds", + Name: ttftMetricName, Help: "Histogram of time to first token in seconds.", Buckets: common.TTFTBucketsBoundaries, }, @@ -107,7 +112,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.tpot = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", - Name: "vllm:time_per_output_token_seconds", + Name: tpotMetricName, Help: "Histogram of time per output token in seconds.", Buckets: common.TPOTBucketsBoundaries, }, @@ -211,7 +216,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.requestPromptTokens = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", - Name: "vllm:request_prompt_tokens", + Name: promptTokensMetricName, Help: "Number of prefill tokens processed.", Buckets: build125Buckets(s.config.MaxModelLen), }, @@ -225,7 +230,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.requestGenerationTokens = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", - Name: "vllm:request_generation_tokens", + Name: generationTokensMetricName, Help: "Number of generation tokens processed.", Buckets: build125Buckets(s.config.MaxModelLen), }, @@ -239,7 +244,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.requestParamsMaxTokens = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: "", - Name: "vllm:request_params_max_tokens", + Name: paramMaxTokensMetricName, Help: "Histogram of the max_tokens request parameter.", Buckets: build125Buckets(s.config.MaxModelLen), }, diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go index 8d2cee97..b55068f8 100644 --- a/pkg/llm-d-inference-sim/metrics_test.go +++ 
b/pkg/llm-d-inference-sim/metrics_test.go @@ -148,30 +148,21 @@ var _ = Describe("Simulator metrics", Ordered, func() { data, err := io.ReadAll(metricsResp.Body) Expect(err).NotTo(HaveOccurred()) metrics := string(data) - // request_prompt_tokens_bucket - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="1"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="2"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="5"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="10"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="20"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="50"} 1`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="100"} 1`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="200"} 1`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="500"} 1`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="1000"} 1`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="+Inf"} 1`)) - // request_params_max_tokens_bucket - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="1"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="2"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="5"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="10"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="20"} 0`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="50"} 1`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="100"} 1`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="200"} 1`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="500"} 1`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="1000"} 1`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="+Inf"} 1`)) + // request_prompt_tokens_bucket and request_params_max_tokens_bucket + buckets := build125Buckets(1024) + + for _, boundary := range buckets { + if boundary <= 20 { + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, boundary, 0))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, boundary, 0))) + } else { + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, boundary, 1))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, boundary, 1))) + } + } + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, math.Inf(1), 1))) + 
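// the +Inf bucket of a histogram always equals the total number of observed samples +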
Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, math.Inf(1), 1))) + // request_generation_tokens // We do not verify the distribution of the number of tokens generated per request, // as the number of generated tokens is unpredictable in this test. @@ -420,84 +411,35 @@ var _ = Describe("Simulator metrics", Ordered, func() { data, err := io.ReadAll(metricsResp.Body) Expect(err).NotTo(HaveOccurred()) metrics := string(data) + // ttft - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.001\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.005\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.01\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.02\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.04\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.06\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.08\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.1\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.25\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.5\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.75\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"1\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"2.5\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"5\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"7.5\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"10\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"20\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"40\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"80\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"160\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"640\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"2560\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"+Inf\"} 1")) - // tpot - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.01\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.025\"} 0")) - 
Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.05\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.075\"} 0")) + for _, boundary := range common.TTFTBucketsBoundaries { + if boundary <= 0.1 { + // buckets up to 0.1 should be empty + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, ttftMetricName, boundary, 0))) + } else { + // buckets higher than 0.1 should contain a single sample + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, ttftMetricName, boundary, 1))) + } + } + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, ttftMetricName, math.Inf(1), 1))) + // tpot metricsLines := strings.Split(metrics, "\n") - // the following values should be greater than 0, we don't know the exact value since it depends on the random response length - count := findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.1\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.15\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.2\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.3\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.4\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.5\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.75\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"1\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"2.5\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"5\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"7.5\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"10\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"20\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, 
"vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"40\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"80\"}") - Expect(count).ToNot(BeNil()) - Expect(*count).To(BeNumerically(">", 0)) - count = findIntMetric(metricsLines, "vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"+Inf\"}") + var count *int + + for _, boundary := range common.TPOTBucketsBoundaries { + if boundary <= 0.075 { + // ensure that values for buckets up to 0.075 have count 0 + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, tpotMetricName, boundary, 0))) + } else { + // buckets higher than 0.75 should be greater than 0, we don't know the exact value since it depends on the random response length + count = findIntMetric(metricsLines, getFloatBucketMetricPrefix(testModel, tpotMetricName, 0.1)) + Expect(count).ToNot(BeNil()) + Expect(*count).To(BeNumerically(">", 0)) + } + } + count = findIntMetric(metricsLines, getFloatBucketMetricPrefix(testModel, tpotMetricName, math.Inf(1))) Expect(count).ToNot(BeNil()) Expect(*count).To(BeNumerically(">", 0)) }() @@ -709,53 +651,39 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora2\",waiting_lora_adapters=\"lora3\"} 1.257894567e+09")) Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora3\",waiting_lora_adapters=\"\"} 1.257894569e+09")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.001\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.005\"} 3")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.01\"} 6")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.02\"} 6")) - - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.01\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.025\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.05\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.075\"} 3")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.1\"} 6")) - Expect(metrics).To(ContainSubstring("vllm:time_per_output_token_seconds_bucket{model_name=\"testmodel\",le=\"0.15\"} 6")) - - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="1"} 10`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="2"} 30`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="5"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="10"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="20"} 60`)) - 
Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="50"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="100"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="200"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="500"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="1000"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_bucket{model_name="testmodel",le="+Inf"} 60`)) - - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="1"} 10`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="2"} 30`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="5"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="10"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="20"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="50"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="100"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="200"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="500"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="1000"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_prompt_tokens_bucket{model_name="testmodel",le="+Inf"} 60`)) - - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="1"} 10`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="2"} 30`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="5"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="10"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="20"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="50"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="100"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="200"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="500"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="1000"} 60`)) - Expect(metrics).To(ContainSubstring(`vllm:request_params_max_tokens_bucket{model_name="testmodel",le="+Inf"} 60`)) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, ttftMetricName, 0.001, 1))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, ttftMetricName, 0.005, 3))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, ttftMetricName, 0.01, 6))) + 
Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, ttftMetricName, 0.02, 6))) + + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, tpotMetricName, 0.01, 0))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, tpotMetricName, 0.025, 0))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, tpotMetricName, 0.05, 1))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, tpotMetricName, 0.075, 3))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, tpotMetricName, 0.1, 6))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, tpotMetricName, 0.15, 6))) + + buckets := build125Buckets(1024) + + for _, boudary := range buckets { + switch boudary { + case 1.0: + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, 1, 10))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, 1, 10))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, 1, 10))) + case 2.0: + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, 2, 30))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, 2, 30))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, 2, 30))) + default: + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, boudary, 60))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, boudary, 60))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, boudary, 60))) + } + } + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, math.Inf(1), 60))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, math.Inf(1), 60))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, math.Inf(1), 60))) Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="length",model_name="testmodel"} 0`)) Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="remote_decode",model_name="testmodel"} 0`)) @@ -783,29 +711,10 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(err).NotTo(HaveOccurred()) metrics := string(data) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.001\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.005\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.01\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.02\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.04\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.06\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.08\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.1\"} 0")) - 
Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.25\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.5\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"0.75\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"1\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"2.5\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"5\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"7.5\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"10\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"20\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"40\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"80\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"160\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"640\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"2560\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:time_to_first_token_seconds_bucket{model_name=\"testmodel\",le=\"+Inf\"} 1")) + for _, boundary := range common.TTFTBucketsBoundaries { + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, ttftMetricName, boundary, 0))) + } + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, ttftMetricName, math.Inf(1), 1))) }) }) @@ -952,9 +861,6 @@ var _ = Describe("build125Buckets", Ordered, func() { for _, test := range tests { got := build125Buckets(test.maxValue) Expect(got).To(Equal(test.want)) - // if !reflect.DeepEqual(got, test.want) { - // t.Errorf("build125Buckets(%d) = %v, want %v", tt.maxValue, got, tt.want) - // } } }) }) diff --git a/pkg/llm-d-inference-sim/test_utils.go b/pkg/llm-d-inference-sim/test_utils.go index 1919ec1c..c123ea85 100644 --- a/pkg/llm-d-inference-sim/test_utils.go +++ b/pkg/llm-d-inference-sim/test_utils.go @@ -344,11 +344,16 @@ func findIntMetric(metrics []string, metricPrefix string) *int { // bucketBoundary the upper bucket boundary, Inf(1) defines the last bucket // count bucket samples count func getFloatBucketMetricLine(model string, metric string, bucketBoundary float64, count int) string { + return fmt.Sprintf("%s %d", getFloatBucketMetricPrefix(model, metric, bucketBoundary), count) +} + +// same as getFloatBucketMetricLine but without the value part +func getFloatBucketMetricPrefix(model string, metric string, bucketBoundary float64) string { buckerBoundStr := "+Inf" if bucketBoundary != math.Inf(1) { buckerBoundStr = fmt.Sprintf("%g", bucketBoundary) } - return fmt.Sprintf("%s_bucket{model_name=\"%s\",le=\"%s\"} %d", metric, model, buckerBoundStr, count) + return fmt.Sprintf("%s_bucket{model_name=\"%s\",le=\"%s\"}", metric, model, buckerBoundStr) } // checkBucketBoundary checks that the given bucket's samples count is 
valid according the given parameters From 62e4c6aa27a0456d7c41ff8f2ee6c8f44d58ddb7 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Tue, 28 Oct 2025 12:09:14 +0200 Subject: [PATCH 09/14] - Add full list of supported metrics to readme - Create constants for all metrics - Define all latency related fake metrics in config - Add validation for new fake metrics in config Signed-off-by: Maya Barnea --- README.md | 13 +++++- pkg/common/config.go | 59 +++++++++++++++++++++---- pkg/llm-d-inference-sim/metrics.go | 15 ++++--- pkg/llm-d-inference-sim/metrics_test.go | 30 ++++++------- pkg/llm-d-inference-sim/test_utils.go | 10 ++++- pkg/llm-d-inference-sim/worker_test.go | 8 ++-- pkg/vllm-api/vllm-models.go | 3 -- 7 files changed, 100 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index c7c88c98..0853bd60 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,18 @@ In addition, it supports a subset of vLLM's Prometheus metrics. These metrics ar | vllm:lora_requests_info | Running stats on LoRA requests | | vllm:num_requests_running | Number of requests currently running on GPU | | vllm:num_requests_waiting | Prometheus metric for the number of queued requests | - +| vllm:e2e_request_latency_seconds | Histogram of end to end request latency in seconds | +| vllm:request_inference_time_seconds | Histogram of time spent in RUNNING phase for request | +| vllm:request_queue_time_seconds | Histogram of time spent in WAITING phase for request | +| vllm:request_prefill_time_seconds | Histogram of time spent in PREFILL phase for request | +| vllm:request_decode_time_seconds | Histogram of time spent in DECODE phase for request | +| vllm:time_to_first_token_seconds | Histogram of time to first token in seconds | +| vllm:time_per_output_token_seconds | Histogram of time per output token in seconds | +| vllm:request_generation_tokens | Number of generation tokens processed | +| vllm:request_params_max_tokens | Histogram of the max_tokens request parameter | +| vllm:request_prompt_tokens | Number of prefill tokens processed | +| vllm:request_success_total | Count of successfully processed requests | + The simulated inference has no connection with the model and LoRA adapters specified in the command line parameters or via the /v1/load_lora_adapter HTTP REST endpoint. The /v1/models endpoint returns simulated results based on those same command line parameters and those loaded via the /v1/load_lora_adapter HTTP REST endpoint. The simulator supports two modes of operation: diff --git a/pkg/common/config.go b/pkg/common/config.go index 49825a48..bc82087b 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ -232,16 +232,17 @@ type Metrics struct { WaitingRequests int64 `yaml:"waiting-requests" json:"waiting-requests"` // KVCacheUsagePercentage is the fraction of KV-cache blocks currently in use (from 0 to 1) KVCacheUsagePercentage float32 `yaml:"kv-cache-usage" json:"kv-cache-usage"` - // TTFTBuckets is an array of values for time-to-first-token buckets, - // each value in this array is a value for the corresponding bucket. + + // Histogram metrics - defined by array of values. + // Each value in this array is a value for the corresponding bucket. // Array may contain less values than number of buckets, all trailing missing values assumed as 0. + + // TTFTBuckets is an array of values for time-to-first-token buckets. 
 	// Buckets upper boundaries in seconds are:
 	// 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
 	// 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, 2560.0, +Inf
 	TTFTBucketValues []int `yaml:"ttft-buckets-values" json:"ttft-buckets-values"`
-	// TPOTBuckets is an array of values for time-per-output-token buckets,
-	// each value in this array is a value for the corresponding bucket.
-	// Array may contain less values than number of buckets, all trailing missing values assumed as 0.
+	// TPOTBuckets is an array of values for time-per-output-token buckets.
 	// Buckets upper boundaries in seconds are:
 	// 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
 	// 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, +Inf
@@ -253,13 +254,21 @@ type Metrics struct {
 	RequestParamsMaxTokens []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
 	// RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
 	RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`
-	// E2ERequestLatencyBucketValues is an array of values for e2e request latency buckets,
-	// each value in this array is a value for the corresponding bucket.
-	// Array may contain less values than number of buckets, all trailing missing values assumed as 0.
-	// Buckets upper boundaries in seconds are:
+
+	// Latency histograms - all have the same bucket upper boundaries in seconds:
 	// 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
 	// 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0, +Inf
+
+	// E2ERequestLatencyBucketValues is an array of values for e2e request latency buckets.
 	E2ERequestLatencyBucketValues []int `yaml:"e2erl-buckets-values" json:"e2erl-buckets-values"`
+	// ReqQueueTimeBucketValues is an array of values for request queue time buckets.
+	ReqQueueTimeBucketValues []int `yaml:"queue-time-buckets-values" json:"queue-time-buckets-values"`
+	// ReqInfTimeBucketValues is an array of values for request inference time buckets.
+	ReqInfTimeBucketValues []int `yaml:"inf-time-buckets-values" json:"inf-time-buckets-values"`
+	// ReqPrefillTimeBucketValues is an array of values for request prefill time buckets.
+	ReqPrefillTimeBucketValues []int `yaml:"prefill-time-buckets-values" json:"prefill-time-buckets-values"`
+	// ReqDecodeTimeBucketValues is an array of values for request decode time buckets.
+	ReqDecodeTimeBucketValues []int `yaml:"decode-time-buckets-values" json:"decode-time-buckets-values"`
 }
 
 type LorasMetrics struct {
@@ -595,6 +604,32 @@ func (c *Configuration) validate() error {
 				return errors.New("fake metrics request-params-max-tokens cannot contain negative values")
 			}
 		}
+
+		for _, v := range c.FakeMetrics.E2ERequestLatencyBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics e2erl-buckets-values cannot contain negative values")
+			}
+		}
+		for _, v := range c.FakeMetrics.ReqQueueTimeBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics queue-time-buckets-values cannot contain negative values")
+			}
+		}
+		for _, v := range c.FakeMetrics.ReqInfTimeBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics inf-time-buckets-values cannot contain negative values")
+			}
+		}
+		for _, v := range c.FakeMetrics.ReqPrefillTimeBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics prefill-time-buckets-values cannot contain negative values")
+			}
+		}
+		for _, v := range c.FakeMetrics.ReqDecodeTimeBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics decode-time-buckets-values cannot contain negative values")
+			}
+		}
 	}
 
 	if c.DPSize < 1 || c.DPSize > 8 {
diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go
index 725b3c6c..91c3dc25 100644
--- a/pkg/llm-d-inference-sim/metrics.go
+++ b/pkg/llm-d-inference-sim/metrics.go
@@ -43,6 +43,11 @@ const (
 	generationTokensMetricName = "vllm:request_generation_tokens"
 	paramMaxTokensMetricName   = "vllm:request_params_max_tokens"
 	promptTokensMetricName     = "vllm:request_prompt_tokens"
+	successTotalMetricName     = "vllm:request_success_total"
+	loraRequestsMetricName     = "vllm:lora_requests_info"
+	reqRunningMetricName       = "vllm:num_requests_running"
+	reqWaitingMetricName       = "vllm:num_requests_waiting"
+	gpuCacheUsageMetricName    = "vllm:gpu_cache_usage_perc"
 )
 
 // createAndRegisterPrometheus creates and registers prometheus metrics used by vLLM simulator
@@ -54,7 +59,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.loraInfo = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Subsystem: "",
-			Name:      "vllm:lora_requests_info",
+			Name:      loraRequestsMetricName,
 			Help:      "Running stats on lora requests.",
 		},
 		[]string{vllmapi.PromLabelMaxLora, vllmapi.PromLabelRunningLoraAdapters, vllmapi.PromLabelWaitingLoraAdapters},
@@ -68,7 +73,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.runningRequests = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Subsystem: "",
-			Name:      "vllm:num_requests_running",
+			Name:      reqRunningMetricName,
 			Help:      "Number of requests currently running on GPU.",
 		},
 		[]string{vllmapi.PromLabelModelName},
@@ -83,7 +88,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.waitingRequests = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Subsystem: "",
-			Name:      "vllm:num_requests_waiting",
+			Name:      reqWaitingMetricName,
 			Help:      "Prometheus metric for the number of queued requests.",
 		},
 		[]string{vllmapi.PromLabelModelName},
@@ -202,7 +207,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	s.metrics.kvCacheUsagePercentage = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Subsystem: "",
-			Name:      "vllm:gpu_cache_usage_perc",
+			Name:      gpuCacheUsageMetricName,
 			Help:      "Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).",
 		},
[]string{vllmapi.PromLabelModelName}, @@ -258,7 +263,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { s.metrics.requestSuccessTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Subsystem: "", - Name: "vllm:request_success_total", + Name: successTotalMetricName, Help: "Count of successfully processed requests.", }, []string{vllmapi.PromLabelModelName, vllmapi.PromLabelFinishReason}, diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go index b55068f8..a8f2f3ee 100644 --- a/pkg/llm-d-inference-sim/metrics_test.go +++ b/pkg/llm-d-inference-sim/metrics_test.go @@ -107,8 +107,8 @@ var _ = Describe("Simulator metrics", Ordered, func() { data, err := io.ReadAll(metricsResp.Body) Expect(err).NotTo(HaveOccurred()) metrics := string(data) - Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"testmodel\"} 2")) - Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"testmodel\"} 1")) + Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqRunningMetricName, 2))) + Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqWaitingMetricName, 1))) }) It("Should record correct prompt and generation token counts", func() { @@ -168,7 +168,7 @@ var _ = Describe("Simulator metrics", Ordered, func() { // as the number of generated tokens is unpredictable in this test. // Therefore, we only verify the number of requests and the total number of generated tokens, // and skip the bucket distribution. - Expect(metrics).To(ContainSubstring(`vllm:request_generation_tokens_count{model_name="testmodel"} 1`)) + Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, generationTokensMetricName+"_count", 1))) // request_success_total Expect(metrics).To(MatchRegexp(`vllm:request_success_total{finish_reason="(stop|length)",model_name="testmodel"} 1`)) }) @@ -512,9 +512,9 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(err).NotTo(HaveOccurred()) metrics := string(data) // Expect three running requests and two blocks in the kv cache - usage 2/16=0.125 - Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"Qwen/Qwen2-0.5B\"} 3")) - Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"Qwen/Qwen2-0.5B\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"Qwen/Qwen2-0.5B\"} 0.125")) + Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqRunningMetricName, 3))) + Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqWaitingMetricName, 0))) + Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, gpuCacheUsageMetricName, 0.125))) time.Sleep(4 * time.Second) metricsResp, err = client.Get(metricsUrl) @@ -525,9 +525,9 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(err).NotTo(HaveOccurred()) metrics = string(data) // The requests finished running, expect 0 usage - Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"Qwen/Qwen2-0.5B\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"Qwen/Qwen2-0.5B\"} 0")) - Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"Qwen/Qwen2-0.5B\"} 0")) + Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqRunningMetricName, 0))) + Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqWaitingMetricName, 0))) + 
Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, gpuCacheUsageMetricName, 0))) }() wg.Wait() }) @@ -592,9 +592,9 @@ var _ = Describe("Simulator metrics", Ordered, func() { // The requests were sent with 500 millisecond intervals, and the first two should be still running. // The third is waiting, and is still not in the kv-cache. // We expect one block in the kv-cache, usage 1/16=0.0625. - Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"Qwen/Qwen2-0.5B\"} 2")) - Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"Qwen/Qwen2-0.5B\"} 1")) - Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"Qwen/Qwen2-0.5B\"} 0.0625")) + Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqRunningMetricName, 2))) + Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, reqWaitingMetricName, 1))) + Expect(metrics).To(ContainSubstring(getCountMetricLine(qwenModelName, gpuCacheUsageMetricName, 0.0625))) }() wg.Wait() }) @@ -645,9 +645,9 @@ var _ = Describe("Simulator metrics", Ordered, func() { data, err := io.ReadAll(resp.Body) Expect(err).NotTo(HaveOccurred()) metrics := string(data) - Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"testmodel\"} 10")) - Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"testmodel\"} 30")) - Expect(metrics).To(ContainSubstring("vllm:gpu_cache_usage_perc{model_name=\"testmodel\"} 0.4")) + Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqRunningMetricName, 10))) + Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqWaitingMetricName, 30))) + Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, gpuCacheUsageMetricName, 0.4))) Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora2\",waiting_lora_adapters=\"lora3\"} 1.257894567e+09")) Expect(metrics).To(ContainSubstring("vllm:lora_requests_info{max_lora=\"1\",running_lora_adapters=\"lora4,lora3\",waiting_lora_adapters=\"\"} 1.257894569e+09")) diff --git a/pkg/llm-d-inference-sim/test_utils.go b/pkg/llm-d-inference-sim/test_utils.go index c123ea85..5368e770 100644 --- a/pkg/llm-d-inference-sim/test_utils.go +++ b/pkg/llm-d-inference-sim/test_utils.go @@ -245,7 +245,7 @@ func getLastLoraMetrics(metrics []string) ([]string, error) { lastTimestamp := float64(0) var lastMetrics []string for _, metric := range metrics { - if strings.HasPrefix(metric, "vllm:lora_requests_info") { + if strings.HasPrefix(metric, loraRequestsMetricName) { timestamp, err := extractTimestamp(metric) if err != nil { return nil, err @@ -347,6 +347,14 @@ func getFloatBucketMetricLine(model string, metric string, bucketBoundary float6 return fmt.Sprintf("%s %d", getFloatBucketMetricPrefix(model, metric, bucketBoundary), count) } +func getCountMetricPrefix(model string, metric string) string { + return fmt.Sprintf("%s{model_name=\"%s\"}", metric, model) +} + +func getCountMetricLine(model string, metric string, count float64) string { + return fmt.Sprintf("%s %g", getCountMetricPrefix(model, metric), count) +} + // same as getFloatBucketMetricLine but without the value part func getFloatBucketMetricPrefix(model string, metric string, bucketBoundary float64) string { buckerBoundStr := "+Inf" diff --git a/pkg/llm-d-inference-sim/worker_test.go b/pkg/llm-d-inference-sim/worker_test.go index 21181842..ce5ee076 100644 --- a/pkg/llm-d-inference-sim/worker_test.go +++ 
b/pkg/llm-d-inference-sim/worker_test.go @@ -300,8 +300,8 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { // max-num-seqs is 12, so number of running requests should be 12 // and the number of waiting requests 1000-12=988 - Expect(metrics).To(ContainSubstring("vllm:num_requests_running{model_name=\"testmodel\"} 12")) - Expect(metrics).To(ContainSubstring("vllm:num_requests_waiting{model_name=\"testmodel\"} 988")) + Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqRunningMetricName, 12))) + Expect(metrics).To(ContainSubstring(getCountMetricLine(testModel, reqWaitingMetricName, 988))) // max-loras is 2, so the last lora metric should be: // running: two loras (doesn't matter which two) @@ -326,8 +326,8 @@ var _ = Describe("Simulator requests scheduling", Ordered, func() { }) It("Should work correctly with many simultaneous requests with many workers", func() { - runningMetric := "vllm:num_requests_running{model_name=\"testmodel\"}" - waitingMetric := "vllm:num_requests_waiting{model_name=\"testmodel\"}" + runningMetric := getCountMetricPrefix(testModel, reqRunningMetricName) + waitingMetric := getCountMetricPrefix(testModel, reqWaitingMetricName) ctx := context.TODO() args := []string{"cmd", "--model", testModel, "--mode", common.ModeRandom, "--time-to-first-token", "2000", "--time-to-first-token-std-dev", "600", diff --git a/pkg/vllm-api/vllm-models.go b/pkg/vllm-api/vllm-models.go index 6c83af69..333a8284 100644 --- a/pkg/vllm-api/vllm-models.go +++ b/pkg/vllm-api/vllm-models.go @@ -26,9 +26,6 @@ const ( PromLabelMaxLora = "max_lora" PromLabelModelName = "model_name" PromLabelFinishReason = "finish_reason" - - VllmLoraRequestInfo = "vllm:lora_requests_info" - VllmNumRequestsRunning = "vllm:num_requests_running" ) // modelInfo defines data about model returned by /models API From b201e1ee4b1436315bc06945be322871055a3962 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Tue, 28 Oct 2025 12:10:16 +0200 Subject: [PATCH 10/14] add license to test_utils.go Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/test_utils.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pkg/llm-d-inference-sim/test_utils.go b/pkg/llm-d-inference-sim/test_utils.go index 5368e770..516f2f7f 100644 --- a/pkg/llm-d-inference-sim/test_utils.go +++ b/pkg/llm-d-inference-sim/test_utils.go @@ -1,3 +1,18 @@ +/* +Copyright 2025 The llm-d-inference-sim Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ package llmdinferencesim import ( From 3e85d382c9c4645c327c5af2dfbcd5bf30745b5a Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Tue, 28 Oct 2025 12:31:53 +0200 Subject: [PATCH 11/14] Set fake latency metrics if defined in configuration, added tests for latency fake metrics Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/metrics.go | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go index 91c3dc25..787d452c 100644 --- a/pkg/llm-d-inference-sim/metrics.go +++ b/pkg/llm-d-inference-sim/metrics.go @@ -170,7 +170,7 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error { ) if err := s.metrics.registry.Register(s.metrics.reqInferenceTime); err != nil { - s.logger.Error(err, "Prometheus request inerence time histogram register failed") + s.logger.Error(err, "Prometheus request inference time histogram register failed") return err } @@ -310,7 +310,23 @@ func (s *VllmSimulator) setInitialPrometheusMetrics() { } if s.config.FakeMetrics.E2ERequestLatencyBucketValues != nil { - s.initFakeHistogram(s.metrics.tpot, common.RequestLatencyBucketsBoundaries, s.config.FakeMetrics.E2ERequestLatencyBucketValues) + s.initFakeHistogram(s.metrics.e2eReqLatency, common.RequestLatencyBucketsBoundaries, s.config.FakeMetrics.E2ERequestLatencyBucketValues) + } + + if s.config.FakeMetrics.ReqQueueTimeBucketValues != nil { + s.initFakeHistogram(s.metrics.reqQueueTime, common.RequestLatencyBucketsBoundaries, s.config.FakeMetrics.ReqQueueTimeBucketValues) + } + + if s.config.FakeMetrics.ReqInfTimeBucketValues != nil { + s.initFakeHistogram(s.metrics.reqInferenceTime, common.RequestLatencyBucketsBoundaries, s.config.FakeMetrics.ReqInfTimeBucketValues) + } + + if s.config.FakeMetrics.ReqPrefillTimeBucketValues != nil { + s.initFakeHistogram(s.metrics.reqPrefillTime, common.RequestLatencyBucketsBoundaries, s.config.FakeMetrics.ReqPrefillTimeBucketValues) + } + + if s.config.FakeMetrics.ReqDecodeTimeBucketValues != nil { + s.initFakeHistogram(s.metrics.reqDecodeTime, common.RequestLatencyBucketsBoundaries, s.config.FakeMetrics.ReqDecodeTimeBucketValues) } } From 94be0aaac4e0d454aaad416d9ff2cbd4d71bb177 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Tue, 28 Oct 2025 12:40:49 +0200 Subject: [PATCH 12/14] add fake latency metrics test Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/metrics_test.go | 80 ++++++++++++++++++++----- 1 file changed, 66 insertions(+), 14 deletions(-) diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go index a8f2f3ee..e64b7323 100644 --- a/pkg/llm-d-inference-sim/metrics_test.go +++ b/pkg/llm-d-inference-sim/metrics_test.go @@ -664,26 +664,26 @@ var _ = Describe("Simulator metrics", Ordered, func() { Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, tpotMetricName, 0.15, 6))) buckets := build125Buckets(1024) + var expectedCount int - for _, boudary := range buckets { - switch boudary { + for _, boundary := range buckets { + switch boundary { case 1.0: - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, 1, 10))) - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, 1, 10))) - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, 1, 10))) + expectedCount = 10 case 2.0: - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, 2, 
30))) - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, 2, 30))) - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, 2, 30))) + expectedCount = 30 default: - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, boudary, 60))) - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, boudary, 60))) - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, boudary, 60))) + expectedCount = 60 } + + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, boundary, expectedCount))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, boundary, expectedCount))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, boundary, expectedCount))) + } - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, math.Inf(1), 60))) - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, math.Inf(1), 60))) - Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, math.Inf(1), 60))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, math.Inf(1), expectedCount))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, math.Inf(1), expectedCount))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, math.Inf(1), expectedCount))) Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="length",model_name="testmodel"} 0`)) Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="remote_decode",model_name="testmodel"} 0`)) @@ -718,6 +718,58 @@ var _ = Describe("Simulator metrics", Ordered, func() { }) }) + Context("fake latency metrics", func() { + It("should respond with valid fake latency metrics to /metrics", func() { + ctx := context.TODO() + args := []string{"cmd", "--model", testModel, "--mode", common.ModeEcho, + "--fake-metrics", + `{` + + `"e2erl-buckets-values":[0, 1, 2],` + + `"queue-time-buckets-values":[0, 1, 2],` + + `"inf-time-buckets-values":[0, 1, 2],` + + `"prefill-time-buckets-values":[0, 1, 2],` + + `"decode-time-buckets-values":[0, 1, 2]` + + `}`, + } + + client, err := startServerWithArgs(ctx, args) + Expect(err).NotTo(HaveOccurred()) + + resp, err := client.Get(metricsUrl) + Expect(err).NotTo(HaveOccurred()) + Expect(resp.StatusCode).To(Equal(http.StatusOK)) + + data, err := io.ReadAll(resp.Body) + Expect(err).NotTo(HaveOccurred()) + metrics := string(data) + + // buckets counts should be 0, 1, 3, 3, 3, ... 
+ var expectedCount int + + for i, boundary := range common.RequestLatencyBucketsBoundaries { + switch i { + case 0: + expectedCount = 0 + case 1: + expectedCount = 1 + default: + expectedCount = 3 + } + + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, e2eReqLatencyMetricName, boundary, expectedCount))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, reqInferenceTimeMetricName, boundary, expectedCount))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, reqQueueTimeMetricName, boundary, expectedCount))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, prefillTimeMetricName, boundary, expectedCount))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, decodeTimeMetricName, boundary, expectedCount))) + } + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, e2eReqLatencyMetricName, math.Inf(1), 3))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, reqInferenceTimeMetricName, math.Inf(1), 3))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, reqQueueTimeMetricName, math.Inf(1), 3))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, prefillTimeMetricName, math.Inf(1), 3))) + Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, decodeTimeMetricName, math.Inf(1), 3))) + }) + }) + Context("single request latency metrics", func() { numOfTokens := len(common.Tokenize(testUserMessage)) From 9e6f8c1e54e154ea35a37de9cb4b63024fe18d64 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Tue, 28 Oct 2025 12:50:26 +0200 Subject: [PATCH 13/14] fix sending latency metrics, use WriteToChannel function Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/simulator.go | 8 ++++---- pkg/llm-d-inference-sim/streaming.go | 2 +- pkg/llm-d-inference-sim/worker.go | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index cd180219..7f9bc249 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -492,7 +492,7 @@ func (s *VllmSimulator) addRequestToQueue(reqCtx *openaiserverapi.CompletionReqC func (s *VllmSimulator) handleCompletions(ctx *fasthttp.RequestCtx, isChatCompletion bool) { startTime := time.Now() defer func() { - s.metrics.e2eReqLatencyChan <- time.Since(startTime).Seconds() + common.WriteToChannel(s.metrics.e2eReqLatencyChan, time.Since(startTime).Seconds(), s.logger, "metrics.e2eReqLatencyChan") }() // Check if we should inject a failure @@ -623,7 +623,7 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r // report tpot in seconds common.WriteToChannel(s.metrics.tpotChan, (float64(perTokenLatency) / 1000), s.logger, "metrics.tpotChan") } - s.metrics.reqDecodeTimeChan <- time.Since(startDecode).Seconds() + common.WriteToChannel(s.metrics.reqDecodeTimeChan, time.Since(startDecode).Seconds(), s.logger, "metrics.reqDecodeTimeChan") s.sendCompletionResponse(reqCtx.HTTPReqCtx, resp) s.responseSentCallback(modelName, reqCtx.IsChatCompletion, reqCtx.CompletionReq.GetRequestID()) @@ -683,7 +683,7 @@ func (s *VllmSimulator) dequeue() *openaiserverapi.CompletionReqCtx { if ok && item.reqCtx != nil && s.loraIsLoaded(item.reqCtx.CompletionReq.GetModel()) { s.waitingQueue.Remove(elem) s.incrementLora(item.reqCtx.CompletionReq.GetModel()) - s.metrics.reqQueueTimeChan <- time.Since(item.enqueueTime).Seconds() + 
common.WriteToChannel(s.metrics.reqQueueTimeChan, time.Since(item.enqueueTime).Seconds(), s.logger, "metrics.reqQueueTimeChan") return item.reqCtx } } @@ -693,7 +693,7 @@ func (s *VllmSimulator) dequeue() *openaiserverapi.CompletionReqCtx { item, ok := elem.Value.(waitingQueueItem) if ok && item.reqCtx != nil && s.loadLora(item.reqCtx.CompletionReq.GetModel()) { s.waitingQueue.Remove(elem) - s.metrics.reqQueueTimeChan <- time.Since(item.enqueueTime).Seconds() + common.WriteToChannel(s.metrics.reqQueueTimeChan, time.Since(item.enqueueTime).Seconds(), s.logger, "metrics.reqQueueTimeChan") return item.reqCtx } } diff --git a/pkg/llm-d-inference-sim/streaming.go b/pkg/llm-d-inference-sim/streaming.go index 84320464..6be9a43e 100644 --- a/pkg/llm-d-inference-sim/streaming.go +++ b/pkg/llm-d-inference-sim/streaming.go @@ -151,7 +151,7 @@ func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writ } } - s.metrics.reqDecodeTimeChan <- time.Since(startDecode).Seconds() + common.WriteToChannel(s.metrics.reqDecodeTimeChan, time.Since(startDecode).Seconds(), s.logger, "metrics.reqDecodeTimeChan") // send the last chunk if finish reason is stop var chunk openaiserverapi.CompletionRespChunk diff --git a/pkg/llm-d-inference-sim/worker.go b/pkg/llm-d-inference-sim/worker.go index e2a6e504..674c283f 100644 --- a/pkg/llm-d-inference-sim/worker.go +++ b/pkg/llm-d-inference-sim/worker.go @@ -62,7 +62,7 @@ type requestProcessor interface { func (s *VllmSimulator) processRequest(reqCtx *openaiserverapi.CompletionReqCtx) { start := time.Now() defer func() { - s.metrics.reqInferenceTimeChan <- time.Since(start).Seconds() + common.WriteToChannel(s.metrics.reqInferenceTimeChan, time.Since(start).Seconds(), s.logger, "metrics.reqInferenceTimeChan") }() req := reqCtx.CompletionReq From 8d623a99311a1e3558517c4d4e3476acfc8b6a09 Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Tue, 28 Oct 2025 12:57:04 +0200 Subject: [PATCH 14/14] fix merge Signed-off-by: Maya Barnea --- pkg/llm-d-inference-sim/tools_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/llm-d-inference-sim/tools_test.go b/pkg/llm-d-inference-sim/tools_test.go index 431d742e..aa6f54c0 100644 --- a/pkg/llm-d-inference-sim/tools_test.go +++ b/pkg/llm-d-inference-sim/tools_test.go @@ -510,7 +510,7 @@ var _ = Describe("Simulator for request with tools", func() { client, err := startServer(ctx, mode) Expect(err).NotTo(HaveOccurred()) - openaiclient, params := getOpenAIClientAndChatParams(client, model, userMessage, false) + openaiclient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false) params.ToolChoice = openai.ToolChoiceOptionFunctionToolChoice(openai.ChatCompletionNamedToolChoiceFunctionParam{ Name: specificTool, })
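
Note on the fake histogram values used by patches 11 and 12: the helper initFakeHistogram is called for every latency histogram, but its body is not modified by this series, so it does not appear in any hunk above. The following is a minimal, hypothetical Go sketch of the idea only; the function name initFakeHistogramSketch, its signature, and the midpoint sampling strategy are illustrative assumptions, not the repository's actual implementation.

package sketch

import "github.com/prometheus/client_golang/prometheus"

// initFakeHistogramSketch pre-populates a histogram from per-bucket fake sample counts:
// for bucket i it observes a value that falls strictly inside that bucket, repeated
// values[i] times, so the cumulative bucket counters end up matching the configuration.
func initFakeHistogramSketch(hist *prometheus.HistogramVec, boundaries []float64, values []int, modelName string) {
	for i, count := range values {
		if i >= len(boundaries) {
			// Any trailing value would fall into the +Inf bucket; omitted in this sketch.
			break
		}
		lower := 0.0
		if i > 0 {
			lower = boundaries[i-1]
		}
		// The midpoint of (lower, upper] is guaranteed to land in bucket i.
		sample := (lower + boundaries[i]) / 2
		for j := 0; j < count; j++ {
			hist.WithLabelValues(modelName).Observe(sample)
		}
	}
}

With values of [0, 1, 2] and common.RequestLatencyBucketsBoundaries, a helper of this shape yields cumulative bucket counts of 0, 1, 3, 3, ..., which is what the fake latency metrics test added in patch 12 asserts.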