From 210fa8a7ccae3daf07a72a1153942a21e4a47100 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Fri, 5 Sep 2025 00:18:50 +1000 Subject: [PATCH 01/17] Validate max-num-seqs Signed-off-by: Qifan Deng --- pkg/common/config.go | 4 ++++ pkg/common/config_test.go | 10 ++++++++++ 2 files changed, 14 insertions(+) diff --git a/pkg/common/config.go b/pkg/common/config.go index 442f53b0..ca4be087 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ -373,6 +373,10 @@ func (c *Configuration) validate() error { return errors.New("max model len cannot be less than 1") } + if c.MaxNumSeqs < 1 { + return errors.New("max num seqs cannot be less than 1") + } + for _, lora := range c.LoraModules { if lora.Name == "" { return errors.New("empty LoRA name") diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go index 20aba9a4..815037f0 100644 --- a/pkg/common/config_test.go +++ b/pkg/common/config_test.go @@ -431,6 +431,16 @@ var _ = Describe("Simulator configuration", func() { args: []string{"cmd", "--data-parallel-size", "15", "--config", "../../manifests/config.yaml"}, }, + { + name: "invalid max-num-seqs", + args: []string{"cmd", "--max-num-seqs", "0", + "--config", "../../manifests/config.yaml"}, + }, + { + name: "invalid max-num-seqs", + args: []string{"cmd", "--max-num-seqs", "-1", + "--config", "../../manifests/config.yaml"}, + }, } for _, test := range invalidTests { From abdb2fa2a587ed4d8ff4f589417ea209eec894f0 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Fri, 5 Sep 2025 00:53:19 +1000 Subject: [PATCH 02/17] Validate PrefillTimeStdDev Signed-off-by: Qifan Deng --- pkg/common/config.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/common/config.go b/pkg/common/config.go index ca4be087..f63871f8 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ -338,6 +338,9 @@ func (c *Configuration) validate() error { if c.PrefillTimeStdDev < 0 { return errors.New("prefill time standard deviation cannot be negative") } + if float32(c.PrefillTimeStdDev) > 0.3*float32(c.PrefillTimePerToken) { + return errors.New("prefill time standard deviation cannot be more than 30% of prefill time per token") + } if c.KVCacheTransferTimePerToken < 0 { return errors.New("kv-cache tranfer time per token cannot be negative") From 1d30ea03aedc74ad46c7236f7ac0ac4e706f00aa Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Fri, 5 Sep 2025 01:14:09 +1000 Subject: [PATCH 03/17] Add param time-factor-under-load Signed-off-by: Qifan Deng --- pkg/common/config.go | 13 +++++++++++++ pkg/common/config_test.go | 10 ++++++++++ 2 files changed, 23 insertions(+) diff --git a/pkg/common/config.go b/pkg/common/config.go index f63871f8..7e9200a5 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ -104,6 +104,13 @@ type Configuration struct { // KVCacheTransferOverheadStdDev similar to TimeToFirstTokenStdDev KVCacheTransferTimeStdDev int `yaml:"kv-cache-transfer-time-std-dev" json:"kv-cache-transfer-time-std-dev"` + // TimeFactorUnderLoad is a multiplicative factor that affects the overall time taken for requests + // when parallel requests are being processed. The value of this factor must be >= 1.0, with a default of 1.0. + // If this factor is 1.0, no extra time is added. When the factor is x (where x > 1.0) and there are MaxNumSeqs + // requests, the total time will be multiplied by x. + // The extra time then decreases multiplicatively to 1.0 when the number of requests is less than MaxNumSeqs. 
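+	// Illustrative example: with a factor of 2.0 and MaxNumSeqs of 11, a fully loaded simulator doubles request times, while 6 running requests scale them by 1.5, the linear interpolation between the two endpoints.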
+	TimeFactorUnderLoad float64 `yaml:"time-factor-under-load" json:"time-factor-under-load"` + // Mode defines the simulator response generation mode, valid values: echo, random Mode string `yaml:"mode" json:"mode"` // Seed defines random seed for operations @@ -259,6 +266,7 @@ func newConfig() *Configuration { MaxModelLen: 1024, Mode: ModeRandom, Seed: time.Now().UnixNano(), + TimeFactorUnderLoad: 1.0, MaxToolCallIntegerParam: 100, MaxToolCallNumberParam: 100, MaxToolCallArrayParamLength: 5, @@ -362,6 +370,10 @@ func (c *Configuration) validate() error { return errors.New("kv-cache tranfer standard deviation cannot be more than 30% of kv-cache tranfer") } + if c.TimeFactorUnderLoad < 1.0 { + return errors.New("time factor under load cannot be less than 1.0") + } + if c.MaxLoras < 1 { return errors.New("max LoRAs cannot be less than 1") } @@ -509,6 +521,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) { f.IntVar(&config.TimeToFirstTokenStdDev, "time-to-first-token-std-dev", config.TimeToFirstTokenStdDev, "Standard deviation for time before the first token will be returned (in milliseconds)") f.IntVar(&config.KVCacheTransferLatencyStdDev, "kv-cache-transfer-latency-std-dev", config.KVCacheTransferLatencyStdDev, "Standard deviation for time for KV-cache transfer from a remote vLLM (in milliseconds)") f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)") + f.Float64Var(&config.TimeFactorUnderLoad, "time-factor-under-load", config.TimeFactorUnderLoad, "Time factor under load (must be >= 1.0)") f.IntVar(&config.MaxToolCallIntegerParam, "max-tool-call-integer-param", config.MaxToolCallIntegerParam, "Maximum possible value of integer parameters in a tool call") f.IntVar(&config.MinToolCallIntegerParam, "min-tool-call-integer-param", config.MinToolCallIntegerParam, "Minimum possible value of integer parameters in a tool call") diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go index 815037f0..1c0353ed 100644 --- a/pkg/common/config_test.go +++ b/pkg/common/config_test.go @@ -441,6 +441,16 @@ var _ = Describe("Simulator configuration", func() { args: []string{"cmd", "--max-num-seqs", "-1", "--config", "../../manifests/config.yaml"}, }, + { + name: "invalid time-factor-under-load", + args: []string{"cmd", "--time-factor-under-load", "0", + "--config", "../../manifests/config.yaml"}, + }, + { + name: "invalid time-factor-under-load", + args: []string{"cmd", "--time-factor-under-load", "-1", + "--config", "../../manifests/config.yaml"}, + }, } for _, test := range invalidTests { From 04542f27baf93878fc5b15da79a58fab6ff5a6ff Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Fri, 5 Sep 2025 03:33:30 +1000 Subject: [PATCH 04/17] The factor applies to time-to-first-token Signed-off-by: Qifan Deng --- manifests/dev-config.yaml | 46 +++++++++++++++ pkg/common/config.go | 11 ++++ pkg/llm-d-inference-sim/simulator.go | 8 +-- pkg/llm-d-inference-sim/simulator_test.go | 71 ++++++++++++++++++++--- pkg/llm-d-inference-sim/streaming.go | 8 +-- 5 files changed, 128 insertions(+), 16 deletions(-) create mode 100644 manifests/dev-config.yaml diff --git a/manifests/dev-config.yaml b/manifests/dev-config.yaml new file mode 100644 index 00000000..a3497c16 --- /dev/null +++ b/manifests/dev-config.yaml @@ -0,0 +1,46 @@ +block-size: 16 +data-parallel-size: 1 +enable-kvcache: false +event-batch-size: 16 +failure-injection-rate: 0 +failure-types: null +fake-metrics: + kv-cache-usage: 0.4 + running-requests: 10 +
waiting-requests: 30 +hash-seed: 'hashseed' +inter-token-latency: 50 +inter-token-latency-std-dev: 15 +kv-cache-size: 1024 +kv-cache-transfer-latency: 0 +kv-cache-transfer-latency-std-dev: 0 +kv-cache-transfer-time-per-token: 100 +kv-cache-transfer-time-std-dev: 30 +lora-modules: null +max-cpu-loras: 1 +max-loras: 1 +max-model-len: 1024 +max-num-seqs: 7 +max-tool-call-array-param-length: 5 +max-tool-call-integer-param: 100 +max-tool-call-number-param: 100 +min-tool-call-array-param-length: 1 +min-tool-call-integer-param: 0 +min-tool-call-number-param: 0 +mode: random +model: Qwen/Qwen2.5-1.5B-Instruct +object-tool-call-not-required-field-probability: 50 +port: 8000 +prefill-overhead: 80 +prefill-time-per-token: 20 +prefill-time-std-dev: 3 +seed: 1757050700239757600 +served-model-name: + - Qwen/Qwen2.5-1.5B-Instruct +time-factor-under-load: 5 +time-to-first-token: 0 +time-to-first-token-std-dev: 0 +tokenizers-cache-dir: '' +tool-call-not-required-param-probability: 50 +zmq-endpoint: tcp://localhost:5557 +zmq-max-connect-attempts: 0 \ No newline at end of file diff --git a/pkg/common/config.go b/pkg/common/config.go index 7e9200a5..6dc71023 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ -175,6 +175,17 @@ type Configuration struct { DPSize int `yaml:"data-parallel-size" json:"data-parallel-size"` } +func (c *Configuration) calcLoadFactor(runReqChan *chan int64) float64 { + if c.MaxNumSeqs <= 1 { + return 1.0 + } + return 1 + (c.TimeFactorUnderLoad-1)*float64(len(*runReqChan)-1)/float64(c.MaxNumSeqs-1) +} + +func (c *Configuration) GetTimeToFirstToken(runReqChan *chan int64) int { + return int(float64(c.TimeToFirstToken) * c.calcLoadFactor(runReqChan)) +} + type Metrics struct { // LoraMetrics LoraMetrics []LorasMetrics `json:"loras"` diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index 24446685..002c871b 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -537,7 +537,7 @@ func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) { finishReason = common.RemoteDecodeFinishReason } - s.sendResponse(reqCtx, responseTokens, toolCalls, displayModel, finishReason, &usageData) + s.sendResponse(reqCtx, responseTokens, toolCalls, displayModel, finishReason, &s.runReqChan, &usageData) } } reqCtx.Wg.Done() @@ -662,7 +662,7 @@ func (s *VllmSimulator) createCompletionResponse(isChatCompletion bool, respToke // finishReason - a pointer to string that represents finish reason, can be nil, stop, length, or tools // usageData - usage (tokens statistics) for this response func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, respTokens []string, toolCalls []openaiserverapi.ToolCall, - modelName string, finishReason string, usageData *openaiserverapi.Usage) { + modelName string, finishReason string, runReqChan *chan int64, usageData *openaiserverapi.Usage) { resp := s.createCompletionResponse(reqCtx.IsChatCompletion, respTokens, toolCalls, &finishReason, usageData, modelName, reqCtx.CompletionReq.IsDoRemoteDecode()) @@ -677,7 +677,7 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r nPromptTokens := usageData.PromptTokens nCachedPromptTokens := reqCtx.CompletionReq.GetNumberOfCachedPromptTokens() nGenTokens := usageData.CompletionTokens - ttft := s.getTimeToFirstToken(nPromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill()) + ttft := s.getTimeToFirstToken(nPromptTokens, nCachedPromptTokens, 
reqCtx.CompletionReq.IsDoRemotePrefill(), runReqChan) totalMillisToWait := ttft + s.getTotalInterTokenLatency(nGenTokens) time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond) @@ -696,7 +696,7 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r } // returns time to first token based on the current request's doRemotePrefill -func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptTokens int, doRemotePrefill bool) int { +func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptTokens int, doRemotePrefill bool, runReqChan *chan int64) int { if doRemotePrefill { if s.config.KVCacheTransferLatency == 0 && s.config.KVCacheTransferLatencyStdDev == 0 { // is disaggregated PD and ttft is calculated using number of prompt tokens diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index df43ff57..b6af37d4 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -798,7 +798,7 @@ var _ = Describe("Simulator", func() { simulator.config.TimeToFirstTokenStdDev = timeToFirstTokenStdDev simulator.config.KVCacheTransferLatency = kvCacheLatency simulator.config.KVCacheTransferLatencyStdDev = kvCacheLatencyStdDev - timeToFirst := simulator.getTimeToFirstToken(1, 0, doREmotePrefill) + timeToFirst := simulator.getTimeToFirstToken(1, 0, doREmotePrefill, &simulator.runReqChan) if doREmotePrefill { Expect(timeToFirst).To(BeNumerically(">=", int(float32(kvCacheLatency)*0.3))) Expect(timeToFirst).To(BeNumerically("<=", int(float32(kvCacheLatency)*1.7))) @@ -829,7 +829,7 @@ var _ = Describe("Simulator", func() { simulator.config.PrefillTimePerToken = 200 simulator.config.PrefillTimeStdDev = 80 - ttft := simulator.getTimeToFirstToken(128, 0, false) + ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) Expect(ttft).To(BeNumerically("==", timeToFirstToken)) }) @@ -842,7 +842,7 @@ var _ = Describe("Simulator", func() { simulator.config.PrefillTimePerToken = 200 simulator.config.PrefillTimeStdDev = 80 - ttft := simulator.getTimeToFirstToken(128, 0, false) + ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) Expect(ttft).NotTo(BeNumerically("==", 0)) }) @@ -853,7 +853,7 @@ var _ = Describe("Simulator", func() { simulator.config.PrefillTimePerToken = prefillTimePerToken simulator.config.PrefillTimeStdDev = stdDev - ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false) + ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false, &simulator.runReqChan) expectedTTFT := prefillOverhead + prefillTimePerToken*(nTokens-nCachedTokens) Expect(ttft).To(BeNumerically(">=", int(float64(expectedTTFT)*0.3))) @@ -881,7 +881,7 @@ var _ = Describe("Simulator", func() { simulator.config.PrefillTimePerToken = prefillTimePerToken simulator.config.PrefillTimeStdDev = 0 - ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false) + ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false, &simulator.runReqChan) expectedTTFT := prefillOverhead + prefillTimePerToken*(nTokens-nCachedTokens) Expect(ttft).To(Equal(expectedTTFT)) }, @@ -905,7 +905,7 @@ var _ = Describe("Simulator", func() { simulator.config.KVCacheTransferTimePerToken = 100 simulator.config.KVCacheTransferTimeStdDev = 0 - ttft := simulator.getTimeToFirstToken(128, 0, true) + ttft := simulator.getTimeToFirstToken(128, 0, true, &simulator.runReqChan) Expect(ttft).To(BeNumerically("==", 200)) }) @@ -916,7 +916,7 @@ var _ 
= Describe("Simulator", func() { simulator.config.KVCacheTransferTimePerToken = 100 simulator.config.KVCacheTransferTimeStdDev = 0 - ttft := simulator.getTimeToFirstToken(128, 0, true) + ttft := simulator.getTimeToFirstToken(128, 0, true, &simulator.runReqChan) Expect(ttft).To(BeNumerically("==", 12800)) }) @@ -927,7 +927,7 @@ var _ = Describe("Simulator", func() { simulator.config.KVCacheTransferTimePerToken = kvCacheTransTPT simulator.config.KVCacheTransferTimeStdDev = stddev - ttft := simulator.getTimeToFirstToken(nTokens, 0, true) + ttft := simulator.getTimeToFirstToken(nTokens, 0, true, &simulator.runReqChan) expectedTTFT := kvCacheTransTPT * nTokens Expect(ttft).To(BeNumerically(">=", int(float64(expectedTTFT)*0.3))) @@ -945,5 +945,60 @@ var _ = Describe("Simulator", func() { Entry("very long prompt", 150, 100, 20000), ) + It("when time-factor-under-load is 1, the time to first token should be equal to time-to-first-token", func() { + simulator.config.TimeToFirstToken = 42 + simulator.config.TimeToFirstTokenStdDev = 0 + simulator.config.TimeFactorUnderLoad = 1.0 + + simulator.runReqChan <- 100 + + ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) + Expect(ttft).To(Equal(42)) + }) + + It("when time-factor-under-load is > 1, but max-num-seqs is 1, the factor will not take effect", func() { + simulator.config.TimeToFirstToken = 42 + simulator.config.TimeToFirstTokenStdDev = 0 + simulator.config.TimeFactorUnderLoad = 100.0 + simulator.config.MaxNumSeqs = 1 + + for len(simulator.runReqChan) > 0 { + <-simulator.runReqChan + } + + simulator.runReqChan <- 1 + + ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) + Expect(ttft).To(Equal(42)) + }) + + DescribeTable("when time-factor-under-load is > 1, and the sim is fully loaded, the time to first token should be time-factor-under-load * time-to-first-token", + func(timeFactorUnderLoad float64, maxNumOfReq int) { + simulator.config.TimeToFirstToken = 42 + simulator.config.TimeToFirstTokenStdDev = 0 + simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad + simulator.config.MaxNumSeqs = maxNumOfReq + for len(simulator.runReqChan) > 0 { + <-simulator.runReqChan + } + for i := 0; i < maxNumOfReq; i++ { + simulator.runReqChan <- 1 + } + + ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) + Expect(ttft).To(Equal(int(float64(42) * timeFactorUnderLoad))) + + }, + func(timeFactorUnderLoad float64, maxNumOfReq int64) string { + return fmt.Sprintf("timeFactorUnderLoad: %f maxNumOfReq: %d", + timeFactorUnderLoad, maxNumOfReq) + }, + + Entry("factor: 1.5", 1.5, 70), + Entry("factor: 2.0", 2.0, 2), + Entry("factor: 100.0", 100.0, 150), + Entry("factor: 20000.0", 20000.0, 310), + ) + }) }) diff --git a/pkg/llm-d-inference-sim/streaming.go b/pkg/llm-d-inference-sim/streaming.go index 5ff1e240..a503af6d 100644 --- a/pkg/llm-d-inference-sim/streaming.go +++ b/pkg/llm-d-inference-sim/streaming.go @@ -69,11 +69,11 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons if len(toolCalls) > 0 { s.logger.Info("Going to send tools calls") for _, tc := range toolCalls { - s.sendTokenChunks(context, w, tc.Function.TokenizedArguments, &tc, finishReason) + s.sendTokenChunks(context, w, tc.Function.TokenizedArguments, &tc, finishReason, &s.runReqChan) } } else { s.logger.Info("Going to send text", "number of tokens", len(responseTokens)) - s.sendTokenChunks(context, w, responseTokens, nil, finishReason) + s.sendTokenChunks(context, w, responseTokens, nil, 
finishReason, &s.runReqChan) } } @@ -97,9 +97,9 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons // sendTokenChunks creates and sends response chunks func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, genTokens []string, - tc *openaiserverapi.ToolCall, finishReason string) { + tc *openaiserverapi.ToolCall, finishReason string, runReqChan *chan int64) { // time to first token delay - ttft := s.getTimeToFirstToken(context.nPromptTokens, context.nCachedPromptTokens, context.doRemotePrefill) + ttft := s.getTimeToFirstToken(context.nPromptTokens, context.nCachedPromptTokens, context.doRemotePrefill, runReqChan) time.Sleep(time.Duration(ttft) * time.Millisecond) for i, token := range genTokens { From 9ccbe9532ecc5e366a52f86c5b533d11acd0459b Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Fri, 5 Sep 2025 12:53:48 +1000 Subject: [PATCH 05/17] Test TTFT when partially loaded Signed-off-by: Qifan Deng --- pkg/llm-d-inference-sim/simulator_test.go | 33 ++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index b6af37d4..73912534 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -981,7 +981,7 @@ var _ = Describe("Simulator", func() { for len(simulator.runReqChan) > 0 { <-simulator.runReqChan } - for i := 0; i < maxNumOfReq; i++ { + for range maxNumOfReq { simulator.runReqChan <- 1 } @@ -1000,5 +1000,36 @@ var _ = Describe("Simulator", func() { Entry("factor: 20000.0", 20000.0, 310), ) + DescribeTable("when time-factor-under-load is > 1, and the sim is partially loaded, the time to first token should be linear interpolation between time-to-first-token and time-factor-under-load * time-to-first-token", + func(timeFactorUnderLoad float64, maxNumOfReq int, nCurrNumOfReq int) { + simulator.config.TimeToFirstToken = 42 + simulator.config.TimeToFirstTokenStdDev = 0 + simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad + simulator.config.MaxNumSeqs = maxNumOfReq + + for len(simulator.runReqChan) > 0 { + <-simulator.runReqChan + } + for range nCurrNumOfReq { + simulator.runReqChan <- 1 + } + + ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) + max := timeFactorUnderLoad * float64(42) + Expect(ttft).To(BeNumerically(">=", 42)) + Expect(ttft).To(BeNumerically("<=", max)) + + }, + func(timeFactorUnderLoad float64, maxNumOfReq int, nCurrNumOfReq int) string { + return fmt.Sprintf("timeFactorUnderLoad: %f maxNumOfReq: %d nCurrNumOfReq: %d", + timeFactorUnderLoad, maxNumOfReq, nCurrNumOfReq) + }, + + Entry("factor: 1.5", 1.5, 70, 35), + Entry("factor: 2.0", 2.0, 2, 1), + Entry("factor: 100.0", 100.0, 150, 75), + Entry("factor: 20000.0", 20000.0, 310, 155), + ) + }) }) From d21f9c7ef1c6cca796ab5152dd30ed7d7423ebf3 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Fri, 5 Sep 2025 13:49:48 +1000 Subject: [PATCH 06/17] Apply time factor under load to prefill and inter token latency Signed-off-by: Qifan Deng --- pkg/common/config.go | 12 +++++++ pkg/common/config_test.go | 44 +++++++++++++++++++++++ pkg/llm-d-inference-sim/simulator.go | 12 +++---- pkg/llm-d-inference-sim/simulator_test.go | 24 ++++++------- pkg/llm-d-inference-sim/streaming.go | 8 ++--- 5 files changed, 78 insertions(+), 22 deletions(-) diff --git a/pkg/common/config.go b/pkg/common/config.go index 6dc71023..d822daa5 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ 
-186,6 +186,18 @@ func (c *Configuration) GetTimeToFirstToken(runReqChan *chan int64) int { return int(float64(c.TimeToFirstToken) * c.calcLoadFactor(runReqChan)) } +func (c *Configuration) GetPrefillOverhead(runReqChan *chan int64) int { + return int(float64(c.PrefillOverhead) * c.calcLoadFactor(runReqChan)) +} + +func (c *Configuration) GetPrefillTimePerToken(runReqChan *chan int64) int { + return int(float64(c.PrefillTimePerToken) * c.calcLoadFactor(runReqChan)) +} + +func (c *Configuration) GetInterTokenLatency(runReqChan *chan int64) int { + return int(float64(c.InterTokenLatency) * c.calcLoadFactor(runReqChan)) +} + type Metrics struct { // LoraMetrics LoraMetrics []LorasMetrics `json:"loras"` diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go index 1c0353ed..4cf59136 100644 --- a/pkg/common/config_test.go +++ b/pkg/common/config_test.go @@ -461,4 +461,48 @@ var _ = Describe("Simulator configuration", func() { }) }) } + + It("when TimeFactorUnderLoad is 1.0, calcLoadFactor should give 1", func() { + c := newConfig() + c.TimeFactorUnderLoad = 1.0 + c.MaxNumSeqs = 11 + reqChan := make(chan int64, 3) + for i := 0; i < 3; i++ { + reqChan <- 1 + } + + factor := c.calcLoadFactor(&reqChan) + Expect(factor).To(BeNumerically("==", 1.0)) + close(reqChan) + }) + + It("when TimeFactorUnderLoad is > 1.0, and sim is fully loaded, calcLoadFactor should give TimeFactorUnderLoad", func() { + c := newConfig() + c.TimeFactorUnderLoad = 2.0 + c.MaxNumSeqs = 11 + reqChan := make(chan int64, c.MaxNumSeqs) + for i := 0; i < c.MaxNumSeqs; i++ { + reqChan <- 1 + } + + factor := c.calcLoadFactor(&reqChan) + Expect(factor).To(BeNumerically("==", c.TimeFactorUnderLoad)) + close(reqChan) + + }) + + It("when TimeFactorUnderLoad is > 1.0, and sim is partially loaded, calcLoadFactor should give a value between 1 and TimeFactorUnderLoad", func() { + c := newConfig() + c.TimeFactorUnderLoad = 2.0 + c.MaxNumSeqs = 11 + reqChan := make(chan int64, c.MaxNumSeqs) + for i := 0; i < c.MaxNumSeqs/2; i++ { + reqChan <- 1 + } + factor := c.calcLoadFactor(&reqChan) + Expect(factor).To(BeNumerically(">", 1.0)) + Expect(factor).To(BeNumerically("<", c.TimeFactorUnderLoad)) + close(reqChan) + + }) }) diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index 002c871b..9076c48b 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -537,7 +537,7 @@ func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) { finishReason = common.RemoteDecodeFinishReason } - s.sendResponse(reqCtx, responseTokens, toolCalls, displayModel, finishReason, &s.runReqChan, &usageData) + s.sendResponse(reqCtx, responseTokens, toolCalls, displayModel, finishReason, &usageData) } } reqCtx.Wg.Done() @@ -662,7 +662,7 @@ func (s *VllmSimulator) createCompletionResponse(isChatCompletion bool, respToke // finishReason - a pointer to string that represents finish reason, can be nil, stop, length, or tools // usageData - usage (tokens statistics) for this response func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, respTokens []string, toolCalls []openaiserverapi.ToolCall, - modelName string, finishReason string, runReqChan *chan int64, usageData *openaiserverapi.Usage) { + modelName string, finishReason string, usageData *openaiserverapi.Usage) { resp := s.createCompletionResponse(reqCtx.IsChatCompletion, respTokens, toolCalls, &finishReason, usageData, modelName, reqCtx.CompletionReq.IsDoRemoteDecode()) @@ -677,7 +677,7 @@ func (s 
*VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r nPromptTokens := usageData.PromptTokens nCachedPromptTokens := reqCtx.CompletionReq.GetNumberOfCachedPromptTokens() nGenTokens := usageData.CompletionTokens - ttft := s.getTimeToFirstToken(nPromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill(), runReqChan) + ttft := s.getTimeToFirstToken(nPromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill()) totalMillisToWait := ttft + s.getTotalInterTokenLatency(nGenTokens) time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond) @@ -696,7 +696,7 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r } // returns time to first token based on the current request's doRemotePrefill -func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptTokens int, doRemotePrefill bool, runReqChan *chan int64) int { +func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptTokens int, doRemotePrefill bool) int { if doRemotePrefill { if s.config.KVCacheTransferLatency == 0 && s.config.KVCacheTransferLatencyStdDev == 0 { // is disaggregated PD and ttft is calculated using number of prompt tokens @@ -708,11 +708,11 @@ func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptToke } if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 { // is aggregated PD and ttft is calculated using number of prompt tokens that are not in kv cache - prefillTime := s.config.PrefillOverhead + (nPromptTokens-nCachedPromptTokens)*s.config.PrefillTimePerToken + prefillTime := s.config.GetPrefillOverhead(&s.runReqChan) + (nPromptTokens-nCachedPromptTokens)*s.config.GetPrefillTimePerToken(&s.runReqChan) return common.RandomNorm(prefillTime, s.config.PrefillTimeStdDev) } // is aggregated PD and *not* using number of prompt tokens - return common.RandomNorm(s.config.TimeToFirstToken, s.config.TimeToFirstTokenStdDev) + return common.RandomNorm(s.config.GetTimeToFirstToken(&s.runReqChan), s.config.TimeToFirstTokenStdDev) } // returns inter token latency diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index 73912534..fa06830d 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -798,7 +798,7 @@ var _ = Describe("Simulator", func() { simulator.config.TimeToFirstTokenStdDev = timeToFirstTokenStdDev simulator.config.KVCacheTransferLatency = kvCacheLatency simulator.config.KVCacheTransferLatencyStdDev = kvCacheLatencyStdDev - timeToFirst := simulator.getTimeToFirstToken(1, 0, doREmotePrefill, &simulator.runReqChan) + timeToFirst := simulator.getTimeToFirstToken(1, 0, doREmotePrefill) if doREmotePrefill { Expect(timeToFirst).To(BeNumerically(">=", int(float32(kvCacheLatency)*0.3))) Expect(timeToFirst).To(BeNumerically("<=", int(float32(kvCacheLatency)*1.7))) @@ -829,7 +829,7 @@ var _ = Describe("Simulator", func() { simulator.config.PrefillTimePerToken = 200 simulator.config.PrefillTimeStdDev = 80 - ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) + ttft := simulator.getTimeToFirstToken(128, 0, false) Expect(ttft).To(BeNumerically("==", timeToFirstToken)) }) @@ -842,7 +842,7 @@ var _ = Describe("Simulator", func() { simulator.config.PrefillTimePerToken = 200 simulator.config.PrefillTimeStdDev = 80 - ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) + ttft := simulator.getTimeToFirstToken(128, 0, false) 
Expect(ttft).NotTo(BeNumerically("==", 0)) }) @@ -853,7 +853,7 @@ var _ = Describe("Simulator", func() { simulator.config.PrefillTimePerToken = prefillTimePerToken simulator.config.PrefillTimeStdDev = stdDev - ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false, &simulator.runReqChan) + ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false) expectedTTFT := prefillOverhead + prefillTimePerToken*(nTokens-nCachedTokens) Expect(ttft).To(BeNumerically(">=", int(float64(expectedTTFT)*0.3))) @@ -881,7 +881,7 @@ var _ = Describe("Simulator", func() { simulator.config.PrefillTimePerToken = prefillTimePerToken simulator.config.PrefillTimeStdDev = 0 - ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false, &simulator.runReqChan) + ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false) expectedTTFT := prefillOverhead + prefillTimePerToken*(nTokens-nCachedTokens) Expect(ttft).To(Equal(expectedTTFT)) }, @@ -905,7 +905,7 @@ var _ = Describe("Simulator", func() { simulator.config.KVCacheTransferTimePerToken = 100 simulator.config.KVCacheTransferTimeStdDev = 0 - ttft := simulator.getTimeToFirstToken(128, 0, true, &simulator.runReqChan) + ttft := simulator.getTimeToFirstToken(128, 0, true) Expect(ttft).To(BeNumerically("==", 200)) }) @@ -916,7 +916,7 @@ var _ = Describe("Simulator", func() { simulator.config.KVCacheTransferTimePerToken = 100 simulator.config.KVCacheTransferTimeStdDev = 0 - ttft := simulator.getTimeToFirstToken(128, 0, true, &simulator.runReqChan) + ttft := simulator.getTimeToFirstToken(128, 0, true) Expect(ttft).To(BeNumerically("==", 12800)) }) @@ -927,7 +927,7 @@ var _ = Describe("Simulator", func() { simulator.config.KVCacheTransferTimePerToken = kvCacheTransTPT simulator.config.KVCacheTransferTimeStdDev = stddev - ttft := simulator.getTimeToFirstToken(nTokens, 0, true, &simulator.runReqChan) + ttft := simulator.getTimeToFirstToken(nTokens, 0, true) expectedTTFT := kvCacheTransTPT * nTokens Expect(ttft).To(BeNumerically(">=", int(float64(expectedTTFT)*0.3))) @@ -952,7 +952,7 @@ var _ = Describe("Simulator", func() { simulator.runReqChan <- 100 - ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) + ttft := simulator.getTimeToFirstToken(128, 0, false) Expect(ttft).To(Equal(42)) }) @@ -968,7 +968,7 @@ var _ = Describe("Simulator", func() { simulator.runReqChan <- 1 - ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) + ttft := simulator.getTimeToFirstToken(128, 0, false) Expect(ttft).To(Equal(42)) }) @@ -985,7 +985,7 @@ var _ = Describe("Simulator", func() { simulator.runReqChan <- 1 } - ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) + ttft := simulator.getTimeToFirstToken(128, 0, false) Expect(ttft).To(Equal(int(float64(42) * timeFactorUnderLoad))) }, @@ -1014,7 +1014,7 @@ var _ = Describe("Simulator", func() { simulator.runReqChan <- 1 } - ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) + ttft := simulator.getTimeToFirstToken(128, 0, false) max := timeFactorUnderLoad * float64(42) Expect(ttft).To(BeNumerically(">=", 42)) Expect(ttft).To(BeNumerically("<=", max)) diff --git a/pkg/llm-d-inference-sim/streaming.go b/pkg/llm-d-inference-sim/streaming.go index a503af6d..5ff1e240 100644 --- a/pkg/llm-d-inference-sim/streaming.go +++ b/pkg/llm-d-inference-sim/streaming.go @@ -69,11 +69,11 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons if len(toolCalls) > 0 { s.logger.Info("Going to send tools 
calls") for _, tc := range toolCalls { - s.sendTokenChunks(context, w, tc.Function.TokenizedArguments, &tc, finishReason, &s.runReqChan) + s.sendTokenChunks(context, w, tc.Function.TokenizedArguments, &tc, finishReason) } } else { s.logger.Info("Going to send text", "number of tokens", len(responseTokens)) - s.sendTokenChunks(context, w, responseTokens, nil, finishReason, &s.runReqChan) + s.sendTokenChunks(context, w, responseTokens, nil, finishReason) } } @@ -97,9 +97,9 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons // sendTokenChunks creates and sends response chunks func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, genTokens []string, - tc *openaiserverapi.ToolCall, finishReason string, runReqChan *chan int64) { + tc *openaiserverapi.ToolCall, finishReason string) { // time to first token delay - ttft := s.getTimeToFirstToken(context.nPromptTokens, context.nCachedPromptTokens, context.doRemotePrefill, runReqChan) + ttft := s.getTimeToFirstToken(context.nPromptTokens, context.nCachedPromptTokens, context.doRemotePrefill) time.Sleep(time.Duration(ttft) * time.Millisecond) for i, token := range genTokens { From bbfcbe8dd1b144c898c732b563b2a552fde0d89f Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Fri, 5 Sep 2025 13:54:03 +1000 Subject: [PATCH 07/17] Improve param desc Signed-off-by: Qifan Deng --- pkg/common/config.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pkg/common/config.go b/pkg/common/config.go index d822daa5..05a1316a 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ -104,11 +104,12 @@ type Configuration struct { // KVCacheTransferOverheadStdDev similar to TimeToFirstTokenStdDev KVCacheTransferTimeStdDev int `yaml:"kv-cache-transfer-time-std-dev" json:"kv-cache-transfer-time-std-dev"` - // TimeFactorUnderLoad is a multiplicative factor that affects the overall time taken for requests - // when parallel requests are being processed. The value of this factor must be >= 1.0, with a default of 1.0. - // If this factor is 1.0, no extra time is added. When the factor is x (where x > 1.0) and there are MaxNumSeqs - // requests, the total time will be multiplied by x. - // The extra time then decreases multiplicatively to 1.0 when the number of requests is less than MaxNumSeqs. + // TimeFactorUnderLoad is a multiplicative factor that affects the overall time taken for requests when parallel + // requests are being processed. + // The value of this factor must be >= 1.0, with a default of 1.0. + // - If this factor is 1.0, no extra time is added. + // - When the factor is x (where x > 1.0) and there are MaxNumSeqs requests, the total time will be multiplied by x. + // - The extra time then decreases multiplicatively to 1.0 when the number of requests is less than MaxNumSeqs. 
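+	// Concretely: factor = 1+(TimeFactorUnderLoad-1)*(nRunningReqs-1)/(MaxNumSeqs-1), i.e. linear interpolation from 1.0 for a single running request up to TimeFactorUnderLoad at MaxNumSeqs (see calcLoadFactor).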
TimeFactorUnderLoad float64 `yaml:"time-factor-under-load" json:"time-factor-under-load"` // Mode defines the simulator response generation mode, valid values: echo, random From dd31c34a181946ea574dd6d0a9a8f2d53239be08 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Fri, 5 Sep 2025 17:17:33 +1000 Subject: [PATCH 08/17] Use nRunningReqs instead of runReqChan Signed-off-by: Qifan Deng --- .gitignore | 3 ++- pkg/common/config.go | 22 +++++++++++----------- pkg/common/config_test.go | 22 +++------------------- pkg/llm-d-inference-sim/simulator.go | 4 ++-- pkg/llm-d-inference-sim/simulator_test.go | 15 ++------------- 5 files changed, 20 insertions(+), 46 deletions(-) diff --git a/.gitignore b/.gitignore index 3906cfb9..950b0cb4 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ vendor .devcontainer # MacOSX .DS_Store -*.test \ No newline at end of file +*.test +manifests/dev-config.yaml diff --git a/pkg/common/config.go b/pkg/common/config.go index 05a1316a..9a54fe86 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ -104,7 +104,7 @@ type Configuration struct { // KVCacheTransferOverheadStdDev similar to TimeToFirstTokenStdDev KVCacheTransferTimeStdDev int `yaml:"kv-cache-transfer-time-std-dev" json:"kv-cache-transfer-time-std-dev"` - // TimeFactorUnderLoad is a multiplicative factor that affects the overall time taken for requests when parallel + // TimeFactorUnderLoad is a multiplicative factor that affects the overall time taken for requests when parallel // requests are being processed. // The value of this factor must be >= 1.0, with a default of 1.0. // - If this factor is 1.0, no extra time is added. @@ -176,27 +176,27 @@ type Configuration struct { DPSize int `yaml:"data-parallel-size" json:"data-parallel-size"` } -func (c *Configuration) calcLoadFactor(runReqChan *chan int64) float64 { +func (c *Configuration) calcLoadFactor(nRunningReqs int64) float64 { if c.MaxNumSeqs <= 1 { return 1.0 } - return 1 + (c.TimeFactorUnderLoad-1)*float64(len(*runReqChan)-1)/float64(c.MaxNumSeqs-1) + return 1 + (c.TimeFactorUnderLoad-1)*float64(nRunningReqs-1)/float64(c.MaxNumSeqs-1) } -func (c *Configuration) GetTimeToFirstToken(runReqChan *chan int64) int { - return int(float64(c.TimeToFirstToken) * c.calcLoadFactor(runReqChan)) +func (c *Configuration) GetTimeToFirstToken(nRunningReqs int64) int { + return int(float64(c.TimeToFirstToken) * c.calcLoadFactor(nRunningReqs)) } -func (c *Configuration) GetPrefillOverhead(runReqChan *chan int64) int { - return int(float64(c.PrefillOverhead) * c.calcLoadFactor(runReqChan)) +func (c *Configuration) GetPrefillOverhead(nRunningReqs int64) int { + return int(float64(c.PrefillOverhead) * c.calcLoadFactor(nRunningReqs)) } -func (c *Configuration) GetPrefillTimePerToken(runReqChan *chan int64) int { - return int(float64(c.PrefillTimePerToken) * c.calcLoadFactor(runReqChan)) +func (c *Configuration) GetPrefillTimePerToken(nRunningReqs int64) int { + return int(float64(c.PrefillTimePerToken) * c.calcLoadFactor(nRunningReqs)) } -func (c *Configuration) GetInterTokenLatency(runReqChan *chan int64) int { - return int(float64(c.InterTokenLatency) * c.calcLoadFactor(runReqChan)) +func (c *Configuration) GetInterTokenLatency(nRunningReqs int64) int { + return int(float64(c.InterTokenLatency) * c.calcLoadFactor(nRunningReqs)) } type Metrics struct { diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go index 4cf59136..1edbea07 100644 --- a/pkg/common/config_test.go +++ b/pkg/common/config_test.go @@ -466,28 +466,18 @@ var _ = 
Describe("Simulator configuration", func() { c := newConfig() c.TimeFactorUnderLoad = 1.0 c.MaxNumSeqs = 11 - reqChan := make(chan int64, 3) - for i := 0; i < 3; i++ { - reqChan <- 1 - } - factor := c.calcLoadFactor(&reqChan) + factor := c.calcLoadFactor(3) Expect(factor).To(BeNumerically("==", 1.0)) - close(reqChan) }) It("when TimeFactorUnderLoad is > 1.0, and sim is fully loaded, calcLoadFactor should give TimeFactorUnderLoad", func() { c := newConfig() c.TimeFactorUnderLoad = 2.0 c.MaxNumSeqs = 11 - reqChan := make(chan int64, c.MaxNumSeqs) - for i := 0; i < c.MaxNumSeqs; i++ { - reqChan <- 1 - } - factor := c.calcLoadFactor(&reqChan) + factor := c.calcLoadFactor(11) Expect(factor).To(BeNumerically("==", c.TimeFactorUnderLoad)) - close(reqChan) }) @@ -495,14 +485,8 @@ var _ = Describe("Simulator configuration", func() { c := newConfig() c.TimeFactorUnderLoad = 2.0 c.MaxNumSeqs = 11 - reqChan := make(chan int64, c.MaxNumSeqs) - for i := 0; i < c.MaxNumSeqs/2; i++ { - reqChan <- 1 - } - factor := c.calcLoadFactor(&reqChan) + factor := c.calcLoadFactor(6) Expect(factor).To(BeNumerically(">", 1.0)) Expect(factor).To(BeNumerically("<", c.TimeFactorUnderLoad)) - close(reqChan) - }) }) diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index 9076c48b..d9d691bc 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -708,11 +708,11 @@ func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptToke } if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 { // is aggregated PD and ttft is calculated using number of prompt tokens that are not in kv cache - prefillTime := s.config.GetPrefillOverhead(&s.runReqChan) + (nPromptTokens-nCachedPromptTokens)*s.config.GetPrefillTimePerToken(&s.runReqChan) + prefillTime := s.config.GetPrefillOverhead(s.nRunningReqs) + (nPromptTokens-nCachedPromptTokens)*s.config.GetPrefillTimePerToken(s.nRunningReqs) return common.RandomNorm(prefillTime, s.config.PrefillTimeStdDev) } // is aggregated PD and *not* using number of prompt tokens - return common.RandomNorm(s.config.GetTimeToFirstToken(&s.runReqChan), s.config.TimeToFirstTokenStdDev) + return common.RandomNorm(s.config.GetTimeToFirstToken(s.nRunningReqs), s.config.TimeToFirstTokenStdDev) } // returns inter token latency diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index fa06830d..9c9f8647 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -978,12 +978,7 @@ var _ = Describe("Simulator", func() { simulator.config.TimeToFirstTokenStdDev = 0 simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad simulator.config.MaxNumSeqs = maxNumOfReq - for len(simulator.runReqChan) > 0 { - <-simulator.runReqChan - } - for range maxNumOfReq { - simulator.runReqChan <- 1 - } + simulator.nRunningReqs = int64(maxNumOfReq) ttft := simulator.getTimeToFirstToken(128, 0, false) Expect(ttft).To(Equal(int(float64(42) * timeFactorUnderLoad))) @@ -1006,13 +1001,7 @@ var _ = Describe("Simulator", func() { simulator.config.TimeToFirstTokenStdDev = 0 simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad simulator.config.MaxNumSeqs = maxNumOfReq - - for len(simulator.runReqChan) > 0 { - <-simulator.runReqChan - } - for range nCurrNumOfReq { - simulator.runReqChan <- 1 - } + simulator.nRunningReqs = int64(nCurrNumOfReq) ttft := simulator.getTimeToFirstToken(128, 0, false) max := timeFactorUnderLoad * float64(42) From 
bb10ce7351add18bf0a41150278780becad80b80 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Sun, 7 Sep 2025 16:43:15 +1000 Subject: [PATCH 09/17] unstage manifests/dev-config.yaml Signed-off-by: Qifan Deng --- manifests/dev-config.yaml | 46 --------------------------------------- 1 file changed, 46 deletions(-) delete mode 100644 manifests/dev-config.yaml diff --git a/manifests/dev-config.yaml b/manifests/dev-config.yaml deleted file mode 100644 index a3497c16..00000000 --- a/manifests/dev-config.yaml +++ /dev/null @@ -1,46 +0,0 @@ -block-size: 16 -data-parallel-size: 1 -enable-kvcache: false -event-batch-size: 16 -failure-injection-rate: 0 -failure-types: null -fake-metrics: - kv-cache-usage: 0.4 - running-requests: 10 - waiting-requests: 30 -hash-seed: 'hashseed' -inter-token-latency: 50 -inter-token-latency-std-dev: 15 -kv-cache-size: 1024 -kv-cache-transfer-latency: 0 -kv-cache-transfer-latency-std-dev: 0 -kv-cache-transfer-time-per-token: 100 -kv-cache-transfer-time-std-dev: 30 -lora-modules: null -max-cpu-loras: 1 -max-loras: 1 -max-model-len: 1024 -max-num-seqs: 7 -max-tool-call-array-param-length: 5 -max-tool-call-integer-param: 100 -max-tool-call-number-param: 100 -min-tool-call-array-param-length: 1 -min-tool-call-integer-param: 0 -min-tool-call-number-param: 0 -mode: random -model: Qwen/Qwen2.5-1.5B-Instruct -object-tool-call-not-required-field-probability: 50 -port: 8000 -prefill-overhead: 80 -prefill-time-per-token: 20 -prefill-time-std-dev: 3 -seed: 1757050700239757600 -served-model-name: - - Qwen/Qwen2.5-1.5B-Instruct -time-factor-under-load: 5 -time-to-first-token: 0 -time-to-first-token-std-dev: 0 -tokenizers-cache-dir: '' -tool-call-not-required-param-probability: 50 -zmq-endpoint: tcp://localhost:5557 -zmq-max-connect-attempts: 0 \ No newline at end of file From ed59319b695328e9fbf3b8d4c180db6d397cc84e Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Sun, 7 Sep 2025 16:48:49 +1000 Subject: [PATCH 10/17] Update readme Signed-off-by: Qifan Deng --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 535e77fc..29d4d405 100644 --- a/README.md +++ b/README.md @@ -115,6 +115,7 @@ For more details see the [README hunk body not preserved; patches 11 and 12 of the series are missing here] From e3e3ffd51c0460ae68695092947b1412a8db8fb2 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Sun, 7 Sep 2025 21:13:13 +1000 Subject: [PATCH 13/17] Calc inter token latency based on load instead of one calc for the whole request Signed-off-by: Qifan Deng --- pkg/llm-d-inference-sim/simulator.go | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index 72cb357e..ad653c7c 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -674,15 +674,12 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r } // calculate how long to wait before returning the response, time is based on number of tokens - nPromptTokens := usageData.PromptTokens nCachedPromptTokens := reqCtx.CompletionReq.GetNumberOfCachedPromptTokens() - nGenTokens := usageData.CompletionTokens - ttft := s.getTimeToFirstToken(nPromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill()) - - time.Sleep(time.Duration(ttft) * time.Microsecond) - for range nGenTokens - 1 { + ttft := s.getTimeToFirstToken(usageData.PromptTokens, nCachedPromptTokens,
reqCtx.CompletionReq.IsDoRemotePrefill()) + time.Sleep(time.Duration(ttft) * time.Millisecond) + for range usageData.CompletionTokens - 1 { perTokenLatency := s.getInterTokenLatency() - time.Sleep(time.Duration(perTokenLatency) * time.Microsecond) + time.Sleep(time.Duration(perTokenLatency) * time.Millisecond) } ctx.Response.Header.SetContentType("application/json") From 75415e30719452f035d041508de5fd716578626d Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Tue, 9 Sep 2025 16:22:13 +1000 Subject: [PATCH 14/17] Move methods to simulator Signed-off-by: Qifan Deng --- pkg/common/config.go | 23 ----------------- pkg/common/config_test.go | 28 --------------------- pkg/llm-d-inference-sim/simulator.go | 29 +++++++++++++++++++--- pkg/llm-d-inference-sim/simulator_test.go | 30 +++++++++++++++++++++++ 4 files changed, 56 insertions(+), 54 deletions(-) diff --git a/pkg/common/config.go b/pkg/common/config.go index 9a54fe86..c367c029 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ -176,29 +176,6 @@ type Configuration struct { DPSize int `yaml:"data-parallel-size" json:"data-parallel-size"` } -func (c *Configuration) calcLoadFactor(nRunningReqs int64) float64 { - if c.MaxNumSeqs <= 1 { - return 1.0 - } - return 1 + (c.TimeFactorUnderLoad-1)*float64(nRunningReqs-1)/float64(c.MaxNumSeqs-1) -} - -func (c *Configuration) GetTimeToFirstToken(nRunningReqs int64) int { - return int(float64(c.TimeToFirstToken) * c.calcLoadFactor(nRunningReqs)) -} - -func (c *Configuration) GetPrefillOverhead(nRunningReqs int64) int { - return int(float64(c.PrefillOverhead) * c.calcLoadFactor(nRunningReqs)) -} - -func (c *Configuration) GetPrefillTimePerToken(nRunningReqs int64) int { - return int(float64(c.PrefillTimePerToken) * c.calcLoadFactor(nRunningReqs)) -} - -func (c *Configuration) GetInterTokenLatency(nRunningReqs int64) int { - return int(float64(c.InterTokenLatency) * c.calcLoadFactor(nRunningReqs)) -} - type Metrics struct { // LoraMetrics LoraMetrics []LorasMetrics `json:"loras"` diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go index 1edbea07..1c0353ed 100644 --- a/pkg/common/config_test.go +++ b/pkg/common/config_test.go @@ -461,32 +461,4 @@ var _ = Describe("Simulator configuration", func() { }) }) } - - It("when TimeFactorUnderLoad is 1.0, calcLoadFactor should give 1", func() { - c := newConfig() - c.TimeFactorUnderLoad = 1.0 - c.MaxNumSeqs = 11 - - factor := c.calcLoadFactor(3) - Expect(factor).To(BeNumerically("==", 1.0)) - }) - - It("when TimeFactorUnderLoad is > 1.0, and sim is fully loaded, calcLoadFactor should give TimeFactorUnderLoad", func() { - c := newConfig() - c.TimeFactorUnderLoad = 2.0 - c.MaxNumSeqs = 11 - - factor := c.calcLoadFactor(11) - Expect(factor).To(BeNumerically("==", c.TimeFactorUnderLoad)) - - }) - - It("when TimeFactorUnderLoad is > 1.0, and sim is partially loaded, calcLoadFactor should give a value between 1 and TimeFactorUnderLoad", func() { - c := newConfig() - c.TimeFactorUnderLoad = 2.0 - c.MaxNumSeqs = 11 - factor := c.calcLoadFactor(6) - Expect(factor).To(BeNumerically(">", 1.0)) - Expect(factor).To(BeNumerically("<", c.TimeFactorUnderLoad)) - }) }) diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index ad653c7c..dcf85873 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -709,16 +709,16 @@ func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptToke } if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 { // is 
aggregated PD and ttft is calculated using number of prompt tokens that are not in kv cache - prefillTime := s.config.GetPrefillOverhead(s.nRunningReqs) + (nPromptTokens-nCachedPromptTokens)*s.config.GetPrefillTimePerToken(s.nRunningReqs) + prefillTime := s.GetPrefillOverhead() + (nPromptTokens-nCachedPromptTokens)*s.GetPrefillTimePerToken() return common.RandomNorm(prefillTime, s.config.PrefillTimeStdDev) } // is aggregated PD and *not* using number of prompt tokens - return common.RandomNorm(s.config.GetTimeToFirstToken(s.nRunningReqs), s.config.TimeToFirstTokenStdDev) + return common.RandomNorm(s.GetTimeToFirstToken(), s.config.TimeToFirstTokenStdDev) } // returns inter token latency func (s *VllmSimulator) getInterTokenLatency() int { - return common.RandomNorm(s.config.GetInterTokenLatency(s.nRunningReqs), s.config.InterTokenLatencyStdDev) + return common.RandomNorm(s.GetInterTokenLatency(), s.config.InterTokenLatencyStdDev) } // createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist @@ -812,3 +812,26 @@ func (s *VllmSimulator) showConfig(dp bool) error { s.logger.Info("Configuration:", "", string(cfgJSON)) return nil } + +func (s *VllmSimulator) getRealtimeFactor() float64 { + if s.config.MaxNumSeqs <= 1 { + return 1.0 + } + return 1 + (s.config.TimeFactorUnderLoad-1)*float64(s.nRunningReqs-1)/float64(s.config.MaxNumSeqs-1) +} + +func (s *VllmSimulator) GetTimeToFirstToken() int { + return int(float64(s.config.TimeToFirstToken) * s.getRealtimeFactor()) +} + +func (s *VllmSimulator) GetPrefillOverhead() int { + return int(float64(s.config.PrefillOverhead) * s.getRealtimeFactor()) +} + +func (s *VllmSimulator) GetPrefillTimePerToken() int { + return int(float64(s.config.PrefillTimePerToken) * s.getRealtimeFactor()) +} + +func (s *VllmSimulator) GetInterTokenLatency() int { + return int(float64(s.config.InterTokenLatency) * s.getRealtimeFactor()) +} diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index 008b31ad..0b3b2212 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -1025,5 +1025,35 @@ var _ = Describe("Simulator", func() { Entry("factor: 20000.0", 20000.0, 310, 155), ) + It("when TimeFactorUnderLoad is 1.0, calcLoadFactor should give 1", func() { + simulator.config.TimeFactorUnderLoad = 1.0 + simulator.config.MaxNumSeqs = 11 + simulator.nRunningReqs = 3 + + factor := simulator.getRealtimeFactor() + Expect(factor).To(BeNumerically("==", 1.0)) + }) + + It("when TimeFactorUnderLoad is > 1.0, and sim is fully loaded, calcLoadFactor should give TimeFactorUnderLoad", func() { + simulator.config.TimeFactorUnderLoad = 2.0 + simulator.config.MaxNumSeqs = 11 + simulator.nRunningReqs = 11 + + factor := simulator.getRealtimeFactor() + Expect(factor).To(BeNumerically("==", simulator.config.TimeFactorUnderLoad)) + + }) + + It("when TimeFactorUnderLoad is > 1.0, and sim is partially loaded, calcLoadFactor should give a value between 1 and TimeFactorUnderLoad", func() { + simulator.config.TimeFactorUnderLoad = 2.0 + simulator.config.MaxNumSeqs = 11 + simulator.nRunningReqs = 6 + + factor := simulator.getRealtimeFactor() + Expect(factor).To(BeNumerically(">", 1.0)) + Expect(factor).To(BeNumerically("<", simulator.config.TimeFactorUnderLoad)) + }) + }) + }) From 00c172455c05eaecc46a8563cd22d2fceabf9152 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Tue, 9 Sep 2025 16:26:47 +1000 Subject: [PATCH 
15/17] Rename helper func Signed-off-by: Qifan Deng --- pkg/llm-d-inference-sim/simulator.go | 10 +++++----- pkg/llm-d-inference-sim/simulator_test.go | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index dcf85873..f042a6fb 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -813,7 +813,7 @@ func (s *VllmSimulator) showConfig(dp bool) error { return nil } -func (s *VllmSimulator) getRealtimeFactor() float64 { +func (s *VllmSimulator) getCurrTimeFactorUnderLoad() float64 { if s.config.MaxNumSeqs <= 1 { return 1.0 } @@ -821,17 +821,17 @@ func (s *VllmSimulator) getRealtimeFactor() float64 { } func (s *VllmSimulator) GetTimeToFirstToken() int { - return int(float64(s.config.TimeToFirstToken) * s.getRealtimeFactor()) + return int(float64(s.config.TimeToFirstToken) * s.getCurrTimeFactorUnderLoad()) } func (s *VllmSimulator) GetPrefillOverhead() int { - return int(float64(s.config.PrefillOverhead) * s.getRealtimeFactor()) + return int(float64(s.config.PrefillOverhead) * s.getCurrTimeFactorUnderLoad()) } func (s *VllmSimulator) GetPrefillTimePerToken() int { - return int(float64(s.config.PrefillTimePerToken) * s.getRealtimeFactor()) + return int(float64(s.config.PrefillTimePerToken) * s.getCurrTimeFactorUnderLoad()) } func (s *VllmSimulator) GetInterTokenLatency() int { - return int(float64(s.config.InterTokenLatency) * s.getRealtimeFactor()) + return int(float64(s.config.InterTokenLatency) * s.getCurrTimeFactorUnderLoad()) } diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index 0b3b2212..9fa75c4e 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -1030,7 +1030,7 @@ var _ = Describe("Simulator", func() { simulator.config.MaxNumSeqs = 11 simulator.nRunningReqs = 3 - factor := simulator.getRealtimeFactor() + factor := simulator.getCurrTimeFactorUnderLoad() Expect(factor).To(BeNumerically("==", 1.0)) }) @@ -1039,7 +1039,7 @@ var _ = Describe("Simulator", func() { simulator.config.MaxNumSeqs = 11 simulator.nRunningReqs = 11 - factor := simulator.getRealtimeFactor() + factor := simulator.getCurrTimeFactorUnderLoad() Expect(factor).To(BeNumerically("==", simulator.config.TimeFactorUnderLoad)) }) @@ -1049,7 +1049,7 @@ var _ = Describe("Simulator", func() { simulator.config.MaxNumSeqs = 11 simulator.nRunningReqs = 6 - factor := simulator.getRealtimeFactor() + factor := simulator.getCurrTimeFactorUnderLoad() Expect(factor).To(BeNumerically(">", 1.0)) Expect(factor).To(BeNumerically("<", simulator.config.TimeFactorUnderLoad)) }) From f22745bb119e21e246baf5b51236160af2ec8a43 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Tue, 9 Sep 2025 16:29:44 +1000 Subject: [PATCH 16/17] Rename helper func Signed-off-by: Qifan Deng --- pkg/llm-d-inference-sim/simulator.go | 10 +++++----- pkg/llm-d-inference-sim/simulator_test.go | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index f042a6fb..5ba1b7e7 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -813,7 +813,7 @@ func (s *VllmSimulator) showConfig(dp bool) error { return nil } -func (s *VllmSimulator) getCurrTimeFactorUnderLoad() float64 { +func (s *VllmSimulator) getCurrFactor() float64 { if s.config.MaxNumSeqs <= 1 { return 1.0 } @@ -821,17 +821,17 @@ func (s *VllmSimulator) 
getCurrTimeFactorUnderLoad() float64 { } func (s *VllmSimulator) GetTimeToFirstToken() int { - return int(float64(s.config.TimeToFirstToken) * s.getCurrTimeFactorUnderLoad()) + return int(float64(s.config.TimeToFirstToken) * s.getCurrFactor()) } func (s *VllmSimulator) GetPrefillOverhead() int { - return int(float64(s.config.PrefillOverhead) * s.getCurrTimeFactorUnderLoad()) + return int(float64(s.config.PrefillOverhead) * s.getCurrFactor()) } func (s *VllmSimulator) GetPrefillTimePerToken() int { - return int(float64(s.config.PrefillTimePerToken) * s.getCurrTimeFactorUnderLoad()) + return int(float64(s.config.PrefillTimePerToken) * s.getCurrFactor()) } func (s *VllmSimulator) GetInterTokenLatency() int { - return int(float64(s.config.InterTokenLatency) * s.getCurrTimeFactorUnderLoad()) + return int(float64(s.config.InterTokenLatency) * s.getCurrFactor()) } diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index 9fa75c4e..fd0c3ed9 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -1030,7 +1030,7 @@ var _ = Describe("Simulator", func() { simulator.config.MaxNumSeqs = 11 simulator.nRunningReqs = 3 - factor := simulator.getCurrTimeFactorUnderLoad() + factor := simulator.getCurrFactor() Expect(factor).To(BeNumerically("==", 1.0)) }) @@ -1039,7 +1039,7 @@ var _ = Describe("Simulator", func() { simulator.config.MaxNumSeqs = 11 simulator.nRunningReqs = 11 - factor := simulator.getCurrTimeFactorUnderLoad() + factor := simulator.getCurrFactor() Expect(factor).To(BeNumerically("==", simulator.config.TimeFactorUnderLoad)) }) @@ -1049,7 +1049,7 @@ var _ = Describe("Simulator", func() { simulator.config.MaxNumSeqs = 11 simulator.nRunningReqs = 6 - factor := simulator.getCurrTimeFactorUnderLoad() + factor := simulator.getCurrFactor() Expect(factor).To(BeNumerically(">", 1.0)) Expect(factor).To(BeNumerically("<", simulator.config.TimeFactorUnderLoad)) }) From a624b0b120c0b4ff6e0c788de8b0211435834f3d Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Tue, 9 Sep 2025 17:14:31 +1000 Subject: [PATCH 17/17] Fix inter token latency test Signed-off-by: Qifan Deng --- pkg/llm-d-inference-sim/simulator_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index fd0c3ed9..17485001 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -777,6 +777,8 @@ var _ = Describe("Simulator", func() { func(interTokenLatency int, stddev int, numberOfTokens int) { simulator.config.InterTokenLatency = interTokenLatency simulator.config.InterTokenLatencyStdDev = stddev + simulator.config.MaxNumSeqs = 1 + simulator.config.TimeFactorUnderLoad = 1.0 latency := 0 for range numberOfTokens - 1 {
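
The final helper can be exercised in isolation. Below is a minimal standalone sketch of the load scaling the series converges on; currFactor mirrors the final getCurrFactor, while the package wrapper and the sample values (factor 2.0, MaxNumSeqs 11, a 42ms base TTFT) are illustrative only:

package main

import "fmt"

// currFactor mirrors VllmSimulator.getCurrFactor: linear interpolation from
// 1.0 (a single running request) up to timeFactorUnderLoad when nRunningReqs
// reaches maxNumSeqs.
func currFactor(timeFactorUnderLoad float64, maxNumSeqs int, nRunningReqs int64) float64 {
	if maxNumSeqs <= 1 {
		return 1.0
	}
	return 1 + (timeFactorUnderLoad-1)*float64(nRunningReqs-1)/float64(maxNumSeqs-1)
}

func main() {
	for _, n := range []int64{1, 6, 11} {
		f := currFactor(2.0, 11, n)
		// Scale a 42ms base time-to-first-token by the current factor.
		fmt.Printf("running=%2d factor=%.2f ttft=%dms\n", n, f, int(42*f))
	}
}

With these inputs it prints factors 1.00, 1.50, and 2.00 (TTFT 42ms, 63ms, 84ms), matching the expectations exercised in simulator_test.go.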