From 0b7c39e2cb6bc5ac78c0f7631c065d5bd5b1d3de Mon Sep 17 00:00:00 2001
From: Qifan Deng <dev.llmd@qifand.com>
Date: Sun, 24 Aug 2025 22:15:00 +1000
Subject: [PATCH 01/19] Fix comments on prefill arg in completion request
 interface

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
---
 pkg/openai-server-api/request.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/openai-server-api/request.go b/pkg/openai-server-api/request.go
index d368a211..afab801d 100644
--- a/pkg/openai-server-api/request.go
+++ b/pkg/openai-server-api/request.go
@@ -53,9 +53,9 @@ type CompletionRequest interface {
 	GetToolChoice() string
 	// GetMaxCompletionTokens returns the maximum completion tokens requested
 	GetMaxCompletionTokens() *int64
-	// IsDoRemoteDecode() returns true if do_remote_decode field is true in the request, this means that this is prefill request
+	// IsDoRemoteDecode() returns true if do_remote_decode field is true in the request, this means that this is decode request
 	IsDoRemoteDecode() bool
-	// IsDoRemotePrefill() returns true if do_remote_prefill field is true in the request, this means that this is decode request
+	// IsDoRemotePrefill() returns true if do_remote_prefill field is true in the request, this means that this is prefill request
 	IsDoRemotePrefill() bool
 }
 

From e0d61de4392aed8924ba2929799c7857ccee2784 Mon Sep 17 00:00:00 2001
From: Qifan Deng <dev.llmd@qifand.com>
Date: Mon, 25 Aug 2025 00:45:07 +1000
Subject: [PATCH 02/19] Add TTFT calculation based on prefill overhead. TODO:
 kv-cache transfer overhead

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
---
 pkg/common/config.go                      | 17 +++++++
 pkg/common/config_test.go                 |  4 ++
 pkg/llm-d-inference-sim/simulator.go      | 33 +++++++++++--
 pkg/llm-d-inference-sim/simulator_test.go | 60 ++++++++++++++++++++++-
 pkg/llm-d-inference-sim/streaming.go      | 14 +++---
 5 files changed, 116 insertions(+), 12 deletions(-)

diff --git a/pkg/common/config.go b/pkg/common/config.go
index 3d5f6ac1..439fc038 100644
--- a/pkg/common/config.go
+++ b/pkg/common/config.go
@@ -65,6 +65,11 @@ type Configuration struct {
 	// in milliseconds, optional, default is 0, can't be more than 30% of TimeToFirstToken, will not
 	// cause the actual time to first token to differ by more than 70% from TimeToFirstToken
 	TimeToFirstTokenStdDev int `yaml:"time-to-first-token-std-dev" json:"time-to-first-token-std-dev"`
+
+	// PrefillOverhead time taken to prefill the context, in milliseconds
+	PrefillOverhead           int    `yaml:"prefill-overhead" json:"prefill-overhead"`
+	PrefillOverheadComplexity string `yaml:"prefill-overhead-complexity" json:"prefill-overhead-complexity"`
+
 	// InterTokenLatency time between generated tokens, in milliseconds
 	InterTokenLatency int `yaml:"inter-token-latency" json:"inter-token-latency"`
 	// InterTokenLatencyStdDev standard deviation for time between generated tokens, in milliseconds,
@@ -295,6 +300,16 @@ func (c *Configuration) validate() error {
 	if float32(c.TimeToFirstTokenStdDev) > 0.3*float32(c.TimeToFirstToken) {
 		return errors.New("time to first token standard deviation cannot be more than 30% of time to first token")
 	}
+	if c.PrefillOverhead < 0 {
+		return errors.New("prefill overhead cannot be negative")
+	} else if c.PrefillOverhead == 0 {
+		if c.PrefillOverheadComplexity != "" {
+			return errors.New("prefill overhead complexity is set, but prefill overhead is 0")
+		}
+	}
+	if c.PrefillOverheadComplexity != "" && c.PrefillOverheadComplexity != "n^2" && c.PrefillOverheadComplexity != "nlog(n)" {
+		return errors.New("prefill overhead complexity should be either \"n^2\" or \"nlog(n)\"")
+	}
 	if c.KVCacheTransferLatency < 0 {
 		return errors.New("kv-cache tranfer time cannot be negative")
 	}
@@ -400,6 +415,8 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 	f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode, echo - returns the same text that was sent in the request, for chat completion returns the last message, random - returns random sentence from a bank of pre-defined sentences")
 	f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
 	f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)")
+	f.IntVar(&config.PrefillOverhead, "prefill-overhead", config.PrefillOverhead, "Time to prefill in milliseconds. This argument is ignored if <time-to-first-token> is not 0.")
+	f.StringVar(&config.PrefillOverheadComplexity, "prefill-overhead-complexity", config.PrefillOverheadComplexity, "Complexity of prefill based on token length. Options are \"n^2\" and \"nlog(n)\". Default is \"n^2\".")
 	f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")
 	f.IntVar(&config.InterTokenLatencyStdDev, "inter-token-latency-std-dev", config.InterTokenLatencyStdDev, "Standard deviation for time between generated tokens (in milliseconds)")
 	f.IntVar(&config.TimeToFirstTokenStdDev, "time-to-first-token-std-dev", config.TimeToFirstTokenStdDev, "Standard deviation for time before the first token will be returned (in milliseconds)")
diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go
index f50c40a9..f7cf2e16 100644
--- a/pkg/common/config_test.go
+++ b/pkg/common/config_test.go
@@ -388,6 +388,10 @@ var _ = Describe("Simulator configuration", func() {
 			name: "invalid (negative) zmq-max-connect-attempts for config file",
 			args: []string{"cmd", "--config", "../../manifests/invalid-config.yaml"},
 		},
+		{
+			name: "<prefill-overhead> must be set when <prefill-overhead-complexity> is set",
+			args: []string{"cmd", "--prefill-overhead-complexity", "n^2", "--config", "../../manifests/config.yaml"},
+		},
 	}
 
 	for _, test := range invalidTests {
diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go
index d9813996..93154712 100644
--- a/pkg/llm-d-inference-sim/simulator.go
+++ b/pkg/llm-d-inference-sim/simulator.go
@@ -22,6 +22,7 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
+	"math"
 	"net"
 	"os"
 	"strings"
@@ -465,7 +466,7 @@ func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) {
 							model:            displayModel,
 							doRemotePrefill:  req.IsDoRemotePrefill(),
 						},
-						responseTokens, toolCalls, finishReason, usageDataToSend,
+						usageDataToSend.PromptTokens, responseTokens, toolCalls, finishReason, usageDataToSend,
 					)
 				} else {
 					if req.IsDoRemoteDecode() {
@@ -633,8 +634,9 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques
 	}
 
 	// calculate how long to wait before returning the response, time is based on number of tokens
-	numOfTokens := usageData.CompletionTokens
-	totalMillisToWait := s.getTimeToFirstToken(doRemotePrefill) + s.getTotalInterTokenLatency(numOfTokens)
+	nPromptTokens := usageData.PromptTokens
+	nGenTokens := usageData.CompletionTokens
+	totalMillisToWait := s.getTimeToFirstToken(doRemotePrefill, nPromptTokens) + s.getTotalInterTokenLatency(nGenTokens)
 	time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond)
 
 	ctx.Response.Header.SetContentType("application/json")
@@ -652,7 +654,14 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques
 }
 
 // returns time to first token based on the current request's doRemotePrefill
-func (s *VllmSimulator) getTimeToFirstToken(doRemotePrefill bool) int {
+func (s *VllmSimulator) getTimeToFirstToken(doRemotePrefill bool, nPromptTokens int) int {
+	if s.config.TimeToFirstToken == 0 && s.config.PrefillOverhead != 0 {
+		if nPromptTokens <= 1 {
+			return s.config.PrefillOverhead
+		}
+		return s.calcPrefillOverhead(nPromptTokens)
+	}
+
 	mean := float64(s.config.TimeToFirstToken)
 	stddev := float64(s.config.TimeToFirstTokenStdDev)
 	if doRemotePrefill {
@@ -678,6 +687,22 @@ func (s *VllmSimulator) getTotalInterTokenLatency(numOfTokens int) int {
 	return total
 }
 
+// calc the prefill overhead against number of tokens
+func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int) int {
+	pfOverhead := s.config.PrefillOverhead
+	complexity := s.config.PrefillOverheadComplexity
+	// policies of different complexities of prefill implementation
+	switch complexity {
+	case "n^2", "":
+		// this is simple implementation of n^2
+		return pfOverhead * nPromptTokens * nPromptTokens
+	case "nlog(n)":
+		return int(float64(pfOverhead) * (float64(nPromptTokens) * math.Log2(float64(nPromptTokens))))
+	}
+
+	return 0
+}
+
 // createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist
 func (s *VllmSimulator) createModelsResponse() *vllmapi.ModelsResponse {
 	modelsResp := vllmapi.ModelsResponse{Object: "list", Data: []vllmapi.ModelsResponseModelInfo{}}
diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go
index 88d87759..5fc462a0 100644
--- a/pkg/llm-d-inference-sim/simulator_test.go
+++ b/pkg/llm-d-inference-sim/simulator_test.go
@@ -21,6 +21,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"math"
 	"net"
 	"net/http"
 	"os"
@@ -801,7 +802,7 @@ var _ = Describe("Simulator", func() {
 				simulator.config.TimeToFirstTokenStdDev = timeToFirstTokenStdDev
 				simulator.config.KVCacheTransferLatency = kvCacheLatency
 				simulator.config.KVCacheTransferLatencyStdDev = kvCacheLatencyStdDev
-				timeToFirst := simulator.getTimeToFirstToken(doREmotePrefill)
+				timeToFirst := simulator.getTimeToFirstToken(doREmotePrefill, 1)
 				if doREmotePrefill {
 					Expect(timeToFirst).To(BeNumerically(">=", int(float32(kvCacheLatency)*0.3)))
 					Expect(timeToFirst).To(BeNumerically("<=", int(float32(kvCacheLatency)*1.7)))
@@ -822,6 +823,63 @@ var _ = Describe("Simulator", func() {
 			Entry(nil, 10000, 0, 1000, 0, true),
 			Entry(nil, 10000, 0, 1000, 0, false),
 		)
+
+		It("when <time-to-first-token> is not 0, ignore <prefill-overhead>", func() {
+			timeToFirstToken := 10000
+			prefillOverhead := 100
+			simulator.config.TimeToFirstToken = timeToFirstToken
+			simulator.config.PrefillOverhead = prefillOverhead
+			timeToFirst := simulator.getTimeToFirstToken(false, 1)
+			Expect(timeToFirst).To(BeNumerically(">=", int(float32(timeToFirstToken)*0.3)))
+			Expect(timeToFirst).To(BeNumerically("<=", int(float32(timeToFirstToken)*1.7)))
+		})
+
+		It("when <time-to-first-token> is 0, use <prefill-overhead>", func() {
+			simulator.config.TimeToFirstToken = 0
+			simulator.config.PrefillOverhead = 100
+			timeToFirst := simulator.getTimeToFirstToken(false, 1)
+			Expect(timeToFirst).To(BeNumerically(">=", 100))
+		})
+
+		DescribeTable("time to first token is super linear of prefill against number of prompt tokens",
+			func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
+				for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
+					square := prefillOverhead * nTokens * nTokens
+					simulator.config.PrefillOverhead = prefillOverhead
+					timeToFirst := simulator.getTimeToFirstToken(false, nTokens)
+					diffRatio := math.Abs(float64(timeToFirst-square)) / float64(square)
+					Expect(diffRatio).To(BeNumerically("<", tolerance))
+				}
+			},
+			func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
+				return fmt.Sprintf("prefillOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d",
+					prefillOverhead, tolerance, minNTokens, maxNTokens)
+			},
+			Entry("small numbers", 100, 0.1, 1, 10),
+			Entry("medium numbers, larger range", 200, 0.1, 50, 100),
+			Entry("large numbers", 150, 0.05, 20000, 20010),
+		)
+
+		DescribeTable("time to first token is log-linear of prefill against number of prompt tokens",
+			func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
+				simulator.config.PrefillOverheadComplexity = "nlog(n)"
+
+				for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
+					nlogn := int(float64(prefillOverhead) * float64(nTokens) * math.Log2(float64(nTokens)))
+					simulator.config.PrefillOverhead = prefillOverhead
+					timeToFirst := simulator.getTimeToFirstToken(false, nTokens)
+					diffRatio := math.Abs(float64(timeToFirst-nlogn)) / float64(nlogn)
+					Expect(diffRatio).To(BeNumerically("<", tolerance))
+				}
+			},
+			func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
+				return fmt.Sprintf("prefillOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d",
+					prefillOverhead, tolerance, minNTokens, maxNTokens)
+			},
+			Entry("small numbers", 100, 0.1, 2, 10),
+			Entry("medium numbers, larger range", 200, 0.1, 50, 100),
+			Entry("large numbers", 150, 0.05, 20000, 20010),
+		)
 	})
 
 	Context("fake metrics", func() {
diff --git a/pkg/llm-d-inference-sim/streaming.go b/pkg/llm-d-inference-sim/streaming.go
index 969f29af..e2295244 100644
--- a/pkg/llm-d-inference-sim/streaming.go
+++ b/pkg/llm-d-inference-sim/streaming.go
@@ -39,7 +39,7 @@ type streamingContext struct {
 // as defined by isChatCompletion
 // response content is wrapped according SSE format
 // First token is send after timeToFirstToken milliseconds, every other token is sent after interTokenLatency milliseconds
-func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, responseTokens []string, toolCalls []openaiserverapi.ToolCall,
+func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, nPromptTokens int, responseTokens []string, toolCalls []openaiserverapi.ToolCall,
 	finishReason string, usageData *openaiserverapi.Usage) {
 	context.ctx.SetContentType("text/event-stream")
 	context.ctx.SetStatusCode(fasthttp.StatusOK)
@@ -67,11 +67,11 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons
 			if len(toolCalls) > 0 {
 				s.logger.Info("Going to send tools calls")
 				for _, tc := range toolCalls {
-					s.sendTokenChunks(context, w, tc.Function.TokenizedArguments, &tc, finishReason)
+					s.sendTokenChunks(context, w, nPromptTokens, tc.Function.TokenizedArguments, &tc, finishReason)
 				}
 			} else {
 				s.logger.Info("Going to send text", "number of tokens", len(responseTokens))
-				s.sendTokenChunks(context, w, responseTokens, nil, finishReason)
+				s.sendTokenChunks(context, w, nPromptTokens, responseTokens, nil, finishReason)
 			}
 		}
 
@@ -94,11 +94,11 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons
 }
 
 // sendTokenChunks creates and sends response chunks
-func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, tokens []string, tc *openaiserverapi.ToolCall, finishReason string) {
+func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, nPromptTokens int, genTokens []string, tc *openaiserverapi.ToolCall, finishReason string) {
 	// time to first token delay
-	time.Sleep(time.Duration(s.getTimeToFirstToken(context.doRemotePrefill)) * time.Millisecond)
+	time.Sleep(time.Duration(s.getTimeToFirstToken(context.doRemotePrefill, nPromptTokens)) * time.Millisecond)
 
-	for i, token := range tokens {
+	for i, token := range genTokens {
 		if i != 0 {
 			time.Sleep(time.Duration(s.getInterTokenLatency()) * time.Millisecond)
 		}
@@ -119,7 +119,7 @@ func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writ
 
 		var chunk openaiserverapi.CompletionRespChunk
 		var finishReasonToSend *string
-		if i == len(tokens)-1 && (finishReason == common.LengthFinishReason || finishReason == common.ToolsFinishReason) {
+		if i == len(genTokens)-1 && (finishReason == common.LengthFinishReason || finishReason == common.ToolsFinishReason) {
 			finishReasonToSend = &finishReason
 		}
 		if context.isChatCompletion {

From a199aeaa214c72b369bd5647a1d2c577a03a0973 Mon Sep 17 00:00:00 2001
From: Qifan Deng <dev.llmd@qifand.com>
Date: Mon, 25 Aug 2025 12:02:04 +1000
Subject: [PATCH 03/19] Rename prefill-overhead-complexity to
 prefill-complexity

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
---
 pkg/common/config.go                      | 10 +++++-----
 pkg/common/config_test.go                 |  4 ++--
 pkg/llm-d-inference-sim/simulator.go      |  2 +-
 pkg/llm-d-inference-sim/simulator_test.go |  2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/pkg/common/config.go b/pkg/common/config.go
index 439fc038..c8e956f9 100644
--- a/pkg/common/config.go
+++ b/pkg/common/config.go
@@ -67,8 +67,8 @@ type Configuration struct {
 	TimeToFirstTokenStdDev int `yaml:"time-to-first-token-std-dev" json:"time-to-first-token-std-dev"`
 
 	// PrefillOverhead time taken to prefill the context, in milliseconds
-	PrefillOverhead           int    `yaml:"prefill-overhead" json:"prefill-overhead"`
-	PrefillOverheadComplexity string `yaml:"prefill-overhead-complexity" json:"prefill-overhead-complexity"`
+	PrefillOverhead   int    `yaml:"prefill-overhead" json:"prefill-overhead"`
+	PrefillComplexity string `yaml:"prefill-complexity" json:"prefill-complexity"`
 
 	// InterTokenLatency time between generated tokens, in milliseconds
 	InterTokenLatency int `yaml:"inter-token-latency" json:"inter-token-latency"`
@@ -303,11 +303,11 @@ func (c *Configuration) validate() error {
 	if c.PrefillOverhead < 0 {
 		return errors.New("prefill overhead cannot be negative")
 	} else if c.PrefillOverhead == 0 {
-		if c.PrefillOverheadComplexity != "" {
+		if c.PrefillComplexity != "" {
 			return errors.New("prefill overhead complexity is set, but prefill overhead is 0")
 		}
 	}
-	if c.PrefillOverheadComplexity != "" && c.PrefillOverheadComplexity != "n^2" && c.PrefillOverheadComplexity != "nlog(n)" {
+	if c.PrefillComplexity != "" && c.PrefillComplexity != "n^2" && c.PrefillComplexity != "nlog(n)" {
 		return errors.New("prefill overhead complexity should be either \"n^2\" or \"nlog(n)\"")
 	}
 	if c.KVCacheTransferLatency < 0 {
@@ -416,7 +416,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 	f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
 	f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)")
 	f.IntVar(&config.PrefillOverhead, "prefill-overhead", config.PrefillOverhead, "Time to prefill in milliseconds. This argument is ignored if <time-to-first-token> is not 0.")
-	f.StringVar(&config.PrefillOverheadComplexity, "prefill-overhead-complexity", config.PrefillOverheadComplexity, "Complexity of prefill based on token length. Options are \"n^2\" and \"nlog(n)\". Default is \"n^2\".")
+	f.StringVar(&config.PrefillComplexity, "prefill-complexity", config.PrefillComplexity, "Complexity of prefill based on token length. Options are \"n^2\" and \"nlog(n)\". Default is \"n^2\".")
 	f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")
 	f.IntVar(&config.InterTokenLatencyStdDev, "inter-token-latency-std-dev", config.InterTokenLatencyStdDev, "Standard deviation for time between generated tokens (in milliseconds)")
 	f.IntVar(&config.TimeToFirstTokenStdDev, "time-to-first-token-std-dev", config.TimeToFirstTokenStdDev, "Standard deviation for time before the first token will be returned (in milliseconds)")
diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go
index f7cf2e16..830e55a0 100644
--- a/pkg/common/config_test.go
+++ b/pkg/common/config_test.go
@@ -389,8 +389,8 @@ var _ = Describe("Simulator configuration", func() {
 			args: []string{"cmd", "--config", "../../manifests/invalid-config.yaml"},
 		},
 		{
-			name: "<prefill-overhead> must be set when <prefill-overhead-complexity> is set",
-			args: []string{"cmd", "--prefill-overhead-complexity", "n^2", "--config", "../../manifests/config.yaml"},
+			name: "<prefill-overhead> must be set when <prefill-complexity> is set",
+			args: []string{"cmd", "--prefill-complexity", "n^2", "--config", "../../manifests/config.yaml"},
 		},
 	}
 
diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go
index 93154712..7603ada2 100644
--- a/pkg/llm-d-inference-sim/simulator.go
+++ b/pkg/llm-d-inference-sim/simulator.go
@@ -690,7 +690,7 @@ func (s *VllmSimulator) getTotalInterTokenLatency(numOfTokens int) int {
 // calc the prefill overhead against number of tokens
 func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int) int {
 	pfOverhead := s.config.PrefillOverhead
-	complexity := s.config.PrefillOverheadComplexity
+	complexity := s.config.PrefillComplexity
 	// policies of different complexities of prefill implementation
 	switch complexity {
 	case "n^2", "":
diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go
index 5fc462a0..3584165a 100644
--- a/pkg/llm-d-inference-sim/simulator_test.go
+++ b/pkg/llm-d-inference-sim/simulator_test.go
@@ -862,7 +862,7 @@ var _ = Describe("Simulator", func() {
 
 		DescribeTable("time to first token is log-linear of prefill against number of prompt tokens",
 			func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
-				simulator.config.PrefillOverheadComplexity = "nlog(n)"
+				simulator.config.PrefillComplexity = "nlog(n)"
 
 				for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
 					nlogn := int(float64(prefillOverhead) * float64(nTokens) * math.Log2(float64(nTokens)))

From cecb32c8fb1215c3813b564a343b4db8f0d6462e Mon Sep 17 00:00:00 2001
From: Qifan Deng <dev.llmd@qifand.com>
Date: Mon, 25 Aug 2025 15:18:04 +1000
Subject: [PATCH 04/19] Calc kv cache transfer overhead based on prompt length

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
---
 pkg/common/config.go                      | 24 +++++-
 pkg/common/config_test.go                 | 12 +++
 pkg/llm-d-inference-sim/simulator.go      | 34 +++++++--
 pkg/llm-d-inference-sim/simulator_test.go | 89 ++++++++++++++++++++---
 pkg/llm-d-inference-sim/streaming.go      |  2 +-
 5 files changed, 142 insertions(+), 19 deletions(-)

diff --git a/pkg/common/config.go b/pkg/common/config.go
index c8e956f9..f4ea6198 100644
--- a/pkg/common/config.go
+++ b/pkg/common/config.go
@@ -67,7 +67,9 @@ type Configuration struct {
 	TimeToFirstTokenStdDev int `yaml:"time-to-first-token-std-dev" json:"time-to-first-token-std-dev"`
 
 	// PrefillOverhead time taken to prefill the context, in milliseconds
-	PrefillOverhead   int    `yaml:"prefill-overhead" json:"prefill-overhead"`
+	// PrefillOverhead along with PrefillComplexity defines the time taken to prefill the context
+	PrefillOverhead int `yaml:"prefill-overhead" json:"prefill-overhead"`
+	// options are "n^2" and "nlog(n)"
 	PrefillComplexity string `yaml:"prefill-complexity" json:"prefill-complexity"`
 
 	// InterTokenLatency time between generated tokens, in milliseconds
@@ -85,6 +87,13 @@ type Configuration struct {
 	// KVCacheTransferLatency
 	KVCacheTransferLatencyStdDev int `yaml:"kv-cache-transfer-latency-std-dev" json:"kv-cache-transfer-latency-std-dev"`
 
+	// KVCacheTransfer overhead time taken to transfer kv-cache from another vLLM instance in case P/D is activated,
+	// in milliseconds.
+	// KVCacheTransferOverhead along with KVCacheTransferComplexity defines the time taken to transfer kv-cache.
+	KVCacheTransferOverhead int `yaml:"kv-cache-transfer-overhead" json:"kv-cache-transfer-overhead"`
+	// options are "linear" and "in-place", default is "linear"
+	KVCacheTransferComplexity string `yaml:"kv-cache-transfer-complexity" json:"kv-cache-transfer-complexity"`
+
 	// Mode defines the simulator response generation mode, valid values: echo, random
 	Mode string `yaml:"mode" json:"mode"`
 	// Seed defines random seed for operations
@@ -319,6 +328,17 @@ func (c *Configuration) validate() error {
 	if float32(c.KVCacheTransferLatencyStdDev) > 0.3*float32(c.KVCacheTransferLatency) {
 		return errors.New("kv-cache tranfer standard deviation cannot be more than 30% of kv-cache tranfer")
 	}
+	if c.KVCacheTransferOverhead < 0 {
+		return errors.New("kv-cache transfer overhead cannot be negative")
+	} else if c.KVCacheTransferOverhead == 0 {
+		if c.KVCacheTransferComplexity != "" {
+			return errors.New("kv-cache transfer complexity is set, but kv-cache transfer overhead is 0")
+		}
+	}
+	if c.KVCacheTransferComplexity != "" && c.KVCacheTransferComplexity != "linear" && c.KVCacheTransferComplexity != "in-place" {
+		return errors.New("kv-cache transfer complexity should be either \"linear\" or \"in-place\"")
+	}
+
 	if c.MaxLoras < 1 {
 		return errors.New("max LoRAs cannot be less than 1")
 	}
@@ -422,6 +442,8 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 	f.IntVar(&config.TimeToFirstTokenStdDev, "time-to-first-token-std-dev", config.TimeToFirstTokenStdDev, "Standard deviation for time before the first token will be returned (in milliseconds)")
 	f.IntVar(&config.KVCacheTransferLatencyStdDev, "kv-cache-transfer-latency-std-dev", config.KVCacheTransferLatencyStdDev, "Standard deviation for time for KV-cache transfer from a remote vLLM (in milliseconds)")
 	f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)")
+	f.IntVar(&config.KVCacheTransferOverhead, "kv-cache-transfer-overhead", config.KVCacheTransferOverhead, "Time to transfer kv-cache in milliseconds. This argument is ignored if <kv-cache-transfer-latency> is not set.")
+	f.StringVar(&config.KVCacheTransferComplexity, "kv-cache-transfer-complexity", config.KVCacheTransferComplexity, "Complexity of kv-cache transfer based on token length. Options are \"linear\" and \"in-place\". Default is \"linear\".")
 
 	f.IntVar(&config.MaxToolCallIntegerParam, "max-tool-call-integer-param", config.MaxToolCallIntegerParam, "Maximum possible value of integer parameters in a tool call")
 	f.IntVar(&config.MinToolCallIntegerParam, "min-tool-call-integer-param", config.MinToolCallIntegerParam, "Minimum possible value of integer parameters in a tool call")
diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go
index 830e55a0..373b8b80 100644
--- a/pkg/common/config_test.go
+++ b/pkg/common/config_test.go
@@ -392,6 +392,18 @@ var _ = Describe("Simulator configuration", func() {
 			name: "<prefill-overhead> must be set when <prefill-complexity> is set",
 			args: []string{"cmd", "--prefill-complexity", "n^2", "--config", "../../manifests/config.yaml"},
 		},
+		{
+			name: "<prefill-complexity> should not be 'xxx'",
+			args: []string{"cmd", "--prefill-complexity", "xxx", "--config", "../../manifests/config.yaml"},
+		},
+		{
+			name: "<kv-cache-transfer-overhead> must be set when <kv-cache-transfer-complexity> is set",
+			args: []string{"cmd", "--kv-cache-transfer-complexity", "linear", "--config", "../../manifests/config.yaml"},
+		},
+		{
+			name: "<kv-cache-transfer-complexity> should not be 'xxx'",
+			args: []string{"cmd", "--kv-cache-transfer-complexity", "xxx", "--config", "../../manifests/config.yaml"},
+		},
 	}
 
 	for _, test := range invalidTests {
diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go
index 7603ada2..93797291 100644
--- a/pkg/llm-d-inference-sim/simulator.go
+++ b/pkg/llm-d-inference-sim/simulator.go
@@ -636,7 +636,7 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques
 	// calculate how long to wait before returning the response, time is based on number of tokens
 	nPromptTokens := usageData.PromptTokens
 	nGenTokens := usageData.CompletionTokens
-	totalMillisToWait := s.getTimeToFirstToken(doRemotePrefill, nPromptTokens) + s.getTotalInterTokenLatency(nGenTokens)
+	totalMillisToWait := s.getTimeToFirstToken(nPromptTokens, doRemotePrefill) + s.getTotalInterTokenLatency(nGenTokens)
 	time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond)
 
 	ctx.Response.Header.SetContentType("application/json")
@@ -654,13 +654,17 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques
 }
 
 // returns time to first token based on the current request's doRemotePrefill
-func (s *VllmSimulator) getTimeToFirstToken(doRemotePrefill bool, nPromptTokens int) int {
+func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, doRemotePrefill bool) int {
 	if s.config.TimeToFirstToken == 0 && s.config.PrefillOverhead != 0 {
 		if nPromptTokens <= 1 {
-			return s.config.PrefillOverhead
+			if !doRemotePrefill {
+				return s.config.PrefillOverhead
+			}
+			return s.config.KVCacheTransferOverhead
 		}
-		return s.calcPrefillOverhead(nPromptTokens)
+		return s.calcPrefillOverhead(nPromptTokens, doRemotePrefill)
 	}
+	fmt.Printf("get time to first token %d, nPromptTokens %d, doRemotePrefill %v\n", s.config.TimeToFirstToken, nPromptTokens, doRemotePrefill)
 
 	mean := float64(s.config.TimeToFirstToken)
 	stddev := float64(s.config.TimeToFirstTokenStdDev)
@@ -688,7 +692,10 @@ func (s *VllmSimulator) getTotalInterTokenLatency(numOfTokens int) int {
 }
 
 // calc the prefill overhead against number of tokens
-func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int) int {
+func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int, doRemotePrefill bool) int {
+	if doRemotePrefill {
+		return s.calcRemotePrefillOverhead(nPromptTokens)
+	}
 	pfOverhead := s.config.PrefillOverhead
 	complexity := s.config.PrefillComplexity
 	// policies of different complexities of prefill implementation
@@ -699,7 +706,24 @@ func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int) int {
 	case "nlog(n)":
 		return int(float64(pfOverhead) * (float64(nPromptTokens) * math.Log2(float64(nPromptTokens))))
 	}
+	// should never reach here
+	return 0
+}
 
+// calc the remote prefill overhead against number of tokens
+func (s *VllmSimulator) calcRemotePrefillOverhead(nPromptTokens int) int {
+	overhead := s.config.KVCacheTransferOverhead
+	complexity := s.config.KVCacheTransferComplexity
+	switch complexity {
+	case "linear", "":
+		fmt.Printf("linear complexity, overhead %d, nPromptTokens %d\n", overhead, nPromptTokens)
+		return overhead * nPromptTokens
+	case "in-place":
+		// when the context is already filled
+		// this is a simple implementation which return a defined overhead
+		return overhead
+	}
+	// should never reach here
 	return 0
 }
 
diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go
index 3584165a..11853564 100644
--- a/pkg/llm-d-inference-sim/simulator_test.go
+++ b/pkg/llm-d-inference-sim/simulator_test.go
@@ -802,7 +802,7 @@ var _ = Describe("Simulator", func() {
 				simulator.config.TimeToFirstTokenStdDev = timeToFirstTokenStdDev
 				simulator.config.KVCacheTransferLatency = kvCacheLatency
 				simulator.config.KVCacheTransferLatencyStdDev = kvCacheLatencyStdDev
-				timeToFirst := simulator.getTimeToFirstToken(doREmotePrefill, 1)
+				timeToFirst := simulator.getTimeToFirstToken(1, doREmotePrefill)
 				if doREmotePrefill {
 					Expect(timeToFirst).To(BeNumerically(">=", int(float32(kvCacheLatency)*0.3)))
 					Expect(timeToFirst).To(BeNumerically("<=", int(float32(kvCacheLatency)*1.7)))
@@ -826,29 +826,30 @@ var _ = Describe("Simulator", func() {
 
 		It("when <time-to-first-token> is not 0, ignore <prefill-overhead>", func() {
 			timeToFirstToken := 10000
-			prefillOverhead := 100
 			simulator.config.TimeToFirstToken = timeToFirstToken
-			simulator.config.PrefillOverhead = prefillOverhead
-			timeToFirst := simulator.getTimeToFirstToken(false, 1)
+			simulator.config.PrefillOverhead = 100
+			timeToFirst := simulator.getTimeToFirstToken(1, false)
 			Expect(timeToFirst).To(BeNumerically(">=", int(float32(timeToFirstToken)*0.3)))
 			Expect(timeToFirst).To(BeNumerically("<=", int(float32(timeToFirstToken)*1.7)))
 		})
 
-		It("when <time-to-first-token> is 0, use <prefill-overhead>", func() {
+		It("when <time-to-first-token> is 0, and <prefill-overhead> is not 0, use <prefill-overhead>", func() {
 			simulator.config.TimeToFirstToken = 0
 			simulator.config.PrefillOverhead = 100
-			timeToFirst := simulator.getTimeToFirstToken(false, 1)
+			timeToFirst := simulator.getTimeToFirstToken(1, false)
 			Expect(timeToFirst).To(BeNumerically(">=", 100))
 		})
 
 		DescribeTable("time to first token is super linear of prefill against number of prompt tokens",
 			func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
+				simulator.config.PrefillComplexity = "n^2"
 				for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
-					square := prefillOverhead * nTokens * nTokens
 					simulator.config.PrefillOverhead = prefillOverhead
-					timeToFirst := simulator.getTimeToFirstToken(false, nTokens)
+					timeToFirst := simulator.getTimeToFirstToken(nTokens, false)
+
+					square := prefillOverhead * nTokens * nTokens
 					diffRatio := math.Abs(float64(timeToFirst-square)) / float64(square)
-					Expect(diffRatio).To(BeNumerically("<", tolerance))
+					Expect(diffRatio).To(BeNumerically("<=", tolerance))
 				}
 			},
 			func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
@@ -865,11 +866,12 @@ var _ = Describe("Simulator", func() {
 				simulator.config.PrefillComplexity = "nlog(n)"
 
 				for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
-					nlogn := int(float64(prefillOverhead) * float64(nTokens) * math.Log2(float64(nTokens)))
 					simulator.config.PrefillOverhead = prefillOverhead
-					timeToFirst := simulator.getTimeToFirstToken(false, nTokens)
+					timeToFirst := simulator.getTimeToFirstToken(nTokens, false)
+
+					nlogn := int(float64(prefillOverhead) * float64(nTokens) * math.Log2(float64(nTokens)))
 					diffRatio := math.Abs(float64(timeToFirst-nlogn)) / float64(nlogn)
-					Expect(diffRatio).To(BeNumerically("<", tolerance))
+					Expect(diffRatio).To(BeNumerically("<=", tolerance))
 				}
 			},
 			func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
@@ -880,6 +882,69 @@ var _ = Describe("Simulator", func() {
 			Entry("medium numbers, larger range", 200, 0.1, 50, 100),
 			Entry("large numbers", 150, 0.05, 20000, 20010),
 		)
+
+		It("when <kv-cache-transfer-latency> not 0, ignore <kv-cache-transfer-overhead>", func() {
+			overhead := 100
+			simulator.config.KVCacheTransferLatency = 1000
+			simulator.config.KVCacheTransferOverhead = overhead
+			timeToFirst := simulator.getTimeToFirstToken(1, false)
+			Expect(timeToFirst).To(BeNumerically(">=", overhead))
+		})
+
+		It("when <kv-cache-transfer-latency> is 0, and <kv-cache-transfer-overhead> is not 0, use <kv-cache-transfer-overhead>", func() {
+			overhead := 100
+			simulator.config.KVCacheTransferLatency = 0
+			simulator.config.KVCacheTransferOverhead = overhead
+			timeToFirst := simulator.getTimeToFirstToken(1, false)
+			Expect(timeToFirst).To(BeNumerically(">", 0))
+		})
+
+		DescribeTable("When remote kv cache transfer is enabled with \"linear\" policy, time to first token is linear of kv cache transfer against number of prompt tokens",
+			func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
+				simulator.config.TimeToFirstToken = 0
+				simulator.config.PrefillOverhead = 1
+				simulator.config.KVCacheTransferComplexity = "linear"
+
+				for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
+					simulator.config.KVCacheTransferOverhead = kvCacheOverhead
+					timeToFirst := simulator.getTimeToFirstToken(nTokens, true)
+
+					linear := kvCacheOverhead * nTokens
+					diffRatio := math.Abs(float64(timeToFirst-linear)) / float64(linear)
+					Expect(diffRatio).To(BeNumerically("<=", tolerance))
+				}
+			},
+			func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
+				return fmt.Sprintf("kvCacheOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d",
+					kvCacheOverhead, tolerance, minNTokens, maxNTokens)
+			},
+			Entry("small numbers", 100, 0.1, 1, 10),
+			Entry("medium numbers, larger range", 200, 0.1, 50, 100),
+			Entry("large numbers", 150, 0.05, 20000, 20010),
+		)
+
+		DescribeTable("When remote kv cache transfer is enabled with \"in-place\" policy, time to first token should not be impacted by number of prompt tokens",
+			func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
+				simulator.config.TimeToFirstToken = 0
+				simulator.config.PrefillOverhead = 1
+				simulator.config.KVCacheTransferComplexity = "in-place"
+				for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
+					simulator.config.KVCacheTransferOverhead = kvCacheOverhead
+					timeToFirst := simulator.getTimeToFirstToken(nTokens, true)
+
+					inPlace := kvCacheOverhead
+					diffRatio := math.Abs(float64(timeToFirst-inPlace)) / float64(inPlace)
+					Expect(diffRatio).To(BeNumerically("<=", tolerance))
+				}
+			},
+			func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
+				return fmt.Sprintf("kvCacheOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d",
+					kvCacheOverhead, tolerance, minNTokens, maxNTokens)
+			},
+			Entry("small numbers", 100, 0.1, 1, 10),
+			Entry("medium numbers, larger range", 200, 0.1, 50, 100),
+			Entry("large numbers", 150, 0.05, 20000, 20010),
+		)
 	})
 
 	Context("fake metrics", func() {
diff --git a/pkg/llm-d-inference-sim/streaming.go b/pkg/llm-d-inference-sim/streaming.go
index e2295244..d234114a 100644
--- a/pkg/llm-d-inference-sim/streaming.go
+++ b/pkg/llm-d-inference-sim/streaming.go
@@ -96,7 +96,7 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, nPrompt
 // sendTokenChunks creates and sends response chunks
 func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, nPromptTokens int, genTokens []string, tc *openaiserverapi.ToolCall, finishReason string) {
 	// time to first token delay
-	time.Sleep(time.Duration(s.getTimeToFirstToken(context.doRemotePrefill, nPromptTokens)) * time.Millisecond)
+	time.Sleep(time.Duration(s.getTimeToFirstToken(nPromptTokens, context.doRemotePrefill)) * time.Millisecond)
 
 	for i, token := range genTokens {
 		if i != 0 {

From 0c80d58382cbccdaaca8c559b7f20e7f42aca54c Mon Sep 17 00:00:00 2001
From: Qifan Deng <dev.llmd@qifand.com>
Date: Mon, 25 Aug 2025 15:29:08 +1000
Subject: [PATCH 05/19] Add invalid test cases for args prefill-overhead and
 kv-cache-transfer-overhead

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
---
 pkg/common/config_test.go | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go
index 373b8b80..cae17fd5 100644
--- a/pkg/common/config_test.go
+++ b/pkg/common/config_test.go
@@ -388,6 +388,11 @@ var _ = Describe("Simulator configuration", func() {
 			name: "invalid (negative) zmq-max-connect-attempts for config file",
 			args: []string{"cmd", "--config", "../../manifests/invalid-config.yaml"},
 		},
+		{
+			name: "invalid (negative) prefill-overhead",
+			args: []string{"cmd", "--prefill-overhead", "-1",
+				"--config", "../../manifests/config.yaml"},
+		},
 		{
 			name: "<prefill-overhead> must be set when <prefill-complexity> is set",
 			args: []string{"cmd", "--prefill-complexity", "n^2", "--config", "../../manifests/config.yaml"},
@@ -396,6 +401,11 @@ var _ = Describe("Simulator configuration", func() {
 			name: "<prefill-complexity> should not be 'xxx'",
 			args: []string{"cmd", "--prefill-complexity", "xxx", "--config", "../../manifests/config.yaml"},
 		},
+		{
+			name: "invalid (negative) kv-cache-transfer-overhead",
+			args: []string{"cmd", "--kv-cache-transfer-overhead", "-1",
+				"--config", "../../manifests/config.yaml"},
+		},
 		{
 			name: "<kv-cache-transfer-overhead> must be set when <kv-cache-transfer-complexity> is set",
 			args: []string{"cmd", "--kv-cache-transfer-complexity", "linear", "--config", "../../manifests/config.yaml"},

From 18d30756a7173798e9bf0ed010478829b6b14d9a Mon Sep 17 00:00:00 2001
From: Qifan Deng <dev.llmd@qifand.com>
Date: Mon, 25 Aug 2025 17:12:17 +1000
Subject: [PATCH 06/19] Add standard deviation in utils

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
---
 pkg/common/utils.go      | 19 +++++++++++++++++++
 pkg/common/utils_test.go | 12 ++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/pkg/common/utils.go b/pkg/common/utils.go
index 2cb4ad66..39555c2d 100644
--- a/pkg/common/utils.go
+++ b/pkg/common/utils.go
@@ -261,3 +261,22 @@ func init() {
 func Tokenize(text string) []string {
 	return re.FindAllString(text, -1)
 }
+
+// StdDevInt returns the population standard deviation of data, or 0 when data is empty.
+func StdDevInt(data []int) float64 {
+	if len(data) == 0 {
+		return 0 // nothing to measure; also avoids dividing by zero below
+	}
+	n := float64(len(data))
+	var sum float64
+	for _, value := range data {
+		sum += float64(value)
+	}
+	mean := sum / n // float division: integer division would truncate the mean
+	var sumSquares float64
+	for _, value := range data {
+		diff := float64(value) - mean
+		sumSquares += diff * diff
+	}
+	return math.Sqrt(sumSquares / n) // population variance (divide by n, not n-1)
+}
diff --git a/pkg/common/utils_test.go b/pkg/common/utils_test.go
index dd6cadab..81341078 100644
--- a/pkg/common/utils_test.go
+++ b/pkg/common/utils_test.go
@@ -156,4 +156,16 @@ var _ = Describe("Utils", Ordered, func() {
 		}
 	})
 
+	Context("Standard Deviation", func() {
+		It("should return 0 for a single element", func() {
+			data := []int{42}
+			Expect(StdDevInt(data)).To(Equal(0.0))
+		})
+
+		It("should return the correct standard deviation for multiple elements", func() {
+			data := []int{1, 2, 3, 4, 5}
+			Expect(StdDevInt(data)).To(Equal(1.4142135623730951))
+		})
+	})
+
 })

From 1fd0a9af12e6be85ba8cde29314be1e8ba39a8e0 Mon Sep 17 00:00:00 2001
From: Qifan Deng <dev.llmd@qifand.com>
Date: Mon, 25 Aug 2025 20:59:30 +1000
Subject: [PATCH 07/19] Add stddev for prefill overhead and kvcache trans
 overhead

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
---
 pkg/common/config.go                      |  12 +++
 pkg/common/config_test.go                 |  10 ++
 pkg/llm-d-inference-sim/simulator.go      |  18 ++--
 pkg/llm-d-inference-sim/simulator_test.go | 109 +++++++++++++---------
 4 files changed, 94 insertions(+), 55 deletions(-)

diff --git a/pkg/common/config.go b/pkg/common/config.go
index f4ea6198..10367a49 100644
--- a/pkg/common/config.go
+++ b/pkg/common/config.go
@@ -69,6 +69,8 @@ type Configuration struct {
 	// PrefillOverhead time taken to prefill the context, in milliseconds
 	// PrefillOverhead along with PrefillComplexity defines the time taken to prefill the context
 	PrefillOverhead int `yaml:"prefill-overhead" json:"prefill-overhead"`
+	// PrefillOverheadStdDev similar to TimeToFirstTokenStdDev
+	PrefillOverheadStdDev int `yaml:"prefill-overhead-std-dev" json:"prefill-overhead-std-dev"`
 	// options are "n^2" and "nlog(n)"
 	PrefillComplexity string `yaml:"prefill-complexity" json:"prefill-complexity"`
 
@@ -91,6 +93,8 @@ type Configuration struct {
 	// in milliseconds.
 	// KVCacheTransferOverhead along with KVCacheTransferComplexity defines the time taken to transfer kv-cache.
 	KVCacheTransferOverhead int `yaml:"kv-cache-transfer-overhead" json:"kv-cache-transfer-overhead"`
+	// KVCacheTransferOverheadStdDev similar to TimeToFirstTokenStdDev
+	KVCacheTransferOverheadStdDev int `yaml:"kv-cache-transfer-overhead-std-dev" json:"kv-cache-transfer-overhead-std-dev"`
 	// options are "linear" and "in-place", default is "linear"
 	KVCacheTransferComplexity string `yaml:"kv-cache-transfer-complexity" json:"kv-cache-transfer-complexity"`
 
@@ -316,6 +320,9 @@ func (c *Configuration) validate() error {
 			return errors.New("prefill overhead complexity is set, but prefill overhead is 0")
 		}
 	}
+	if c.PrefillOverheadStdDev < 0 {
+		return errors.New("prefill overhead standard deviation cannot be negative")
+	}
 	if c.PrefillComplexity != "" && c.PrefillComplexity != "n^2" && c.PrefillComplexity != "nlog(n)" {
 		return errors.New("prefill overhead complexity should be either \"n^2\" or \"nlog(n)\"")
 	}
@@ -335,6 +342,9 @@ func (c *Configuration) validate() error {
 			return errors.New("kv-cache transfer complexity is set, but kv-cache transfer overhead is 0")
 		}
 	}
+	if c.KVCacheTransferOverheadStdDev < 0 {
+		return errors.New("kv-cache transfer overhead standard deviation cannot be negative")
+	}
 	if c.KVCacheTransferComplexity != "" && c.KVCacheTransferComplexity != "linear" && c.KVCacheTransferComplexity != "in-place" {
 		return errors.New("kv-cache transfer complexity should be either \"linear\" or \"in-place\"")
 	}
@@ -436,6 +446,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 	f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
 	f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)")
 	f.IntVar(&config.PrefillOverhead, "prefill-overhead", config.PrefillOverhead, "Time to prefill in milliseconds. This argument is ignored if <time-to-first-token> is not 0.")
+	f.IntVar(&config.PrefillOverheadStdDev, "prefill-overhead-std-dev", config.PrefillOverheadStdDev, "Standard deviation for time to prefill (in milliseconds)")
 	f.StringVar(&config.PrefillComplexity, "prefill-complexity", config.PrefillComplexity, "Complexity of prefill based on token length. Options are \"n^2\" and \"nlog(n)\". Default is \"n^2\".")
 	f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")
 	f.IntVar(&config.InterTokenLatencyStdDev, "inter-token-latency-std-dev", config.InterTokenLatencyStdDev, "Standard deviation for time between generated tokens (in milliseconds)")
@@ -443,6 +454,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 	f.IntVar(&config.KVCacheTransferLatencyStdDev, "kv-cache-transfer-latency-std-dev", config.KVCacheTransferLatencyStdDev, "Standard deviation for time for KV-cache transfer from a remote vLLM (in milliseconds)")
 	f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)")
 	f.IntVar(&config.KVCacheTransferOverhead, "kv-cache-transfer-overhead", config.KVCacheTransferOverhead, "Time to transfer kv-cache in milliseconds. This argument is ignored if <kv-cache-transfer-latency> is not set.")
+	f.IntVar(&config.KVCacheTransferOverheadStdDev, "kv-cache-transfer-overhead-std-dev", config.KVCacheTransferOverheadStdDev, "Standard deviation for time to transfer kv-cache (in milliseconds)")
 	f.StringVar(&config.KVCacheTransferComplexity, "kv-cache-transfer-complexity", config.KVCacheTransferComplexity, "Complexity of kv-cache transfer based on token length. Options are \"linear\" and \"in-place\". Default is \"linear\".")
 
 	f.IntVar(&config.MaxToolCallIntegerParam, "max-tool-call-integer-param", config.MaxToolCallIntegerParam, "Maximum possible value of integer parameters in a tool call")
diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go
index cae17fd5..54651c97 100644
--- a/pkg/common/config_test.go
+++ b/pkg/common/config_test.go
@@ -393,6 +393,11 @@ var _ = Describe("Simulator configuration", func() {
 			args: []string{"cmd", "--prefill-overhead", "-1",
 				"--config", "../../manifests/config.yaml"},
 		},
+		{
+			name: "invalid (negative) prefill-overhead-std-dev",
+			args: []string{"cmd", "--prefill-overhead-std-dev", "-1",
+				"--config", "../../manifests/config.yaml"},
+		},
 		{
 			name: "<prefill-overhead> must be set when <prefill-complexity> is set",
 			args: []string{"cmd", "--prefill-complexity", "n^2", "--config", "../../manifests/config.yaml"},
@@ -406,6 +411,11 @@ var _ = Describe("Simulator configuration", func() {
 			args: []string{"cmd", "--kv-cache-transfer-overhead", "-1",
 				"--config", "../../manifests/config.yaml"},
 		},
+		{
+			name: "invalid (negative) kv-cache-transfer-overhead-std-dev",
+			args: []string{"cmd", "--kv-cache-transfer-overhead-std-dev", "-1",
+				"--config", "../../manifests/config.yaml"},
+		},
 		{
 			name: "<kv-cache-transfer-overhead> must be set when <kv-cache-transfer-complexity> is set",
 			args: []string{"cmd", "--kv-cache-transfer-complexity", "linear", "--config", "../../manifests/config.yaml"},
diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go
index 93797291..5f628d33 100644
--- a/pkg/llm-d-inference-sim/simulator.go
+++ b/pkg/llm-d-inference-sim/simulator.go
@@ -664,7 +664,6 @@ func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, doRemotePrefill b
 		}
 		return s.calcPrefillOverhead(nPromptTokens, doRemotePrefill)
 	}
-	fmt.Printf("get time to first token %d, nPromptTokens %d, doRemotePrefill %v\n", s.config.TimeToFirstToken, nPromptTokens, doRemotePrefill)
 
 	mean := float64(s.config.TimeToFirstToken)
 	stddev := float64(s.config.TimeToFirstTokenStdDev)
@@ -699,32 +698,31 @@ func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int, doRemotePrefill b
 	pfOverhead := s.config.PrefillOverhead
 	complexity := s.config.PrefillComplexity
 	// policies of different complexities of prefill implementation
+	overhead := 0
 	switch complexity {
 	case "n^2", "":
 		// this is simple implementation of n^2
-		return pfOverhead * nPromptTokens * nPromptTokens
+		overhead = pfOverhead * nPromptTokens * nPromptTokens
 	case "nlog(n)":
-		return int(float64(pfOverhead) * (float64(nPromptTokens) * math.Log2(float64(nPromptTokens))))
+		overhead = int(float64(pfOverhead) * (float64(nPromptTokens) * math.Log2(float64(nPromptTokens))))
 	}
-	// should never reach here
-	return 0
+	return int(common.RandomNorm(float64(overhead), float64(s.config.PrefillOverheadStdDev)))
 }
 
 // calc the remote prefill overhead against number of tokens
 func (s *VllmSimulator) calcRemotePrefillOverhead(nPromptTokens int) int {
 	overhead := s.config.KVCacheTransferOverhead
 	complexity := s.config.KVCacheTransferComplexity
+	total := 0
 	switch complexity {
 	case "linear", "":
-		fmt.Printf("linear complexity, overhead %d, nPromptTokens %d\n", overhead, nPromptTokens)
-		return overhead * nPromptTokens
+		total = overhead * nPromptTokens
 	case "in-place":
 		// when the context is already filled
 		// this is a simple implementation which return a defined overhead
-		return overhead
+		total = overhead
 	}
-	// should never reach here
-	return 0
+	return int(common.RandomNorm(float64(total), float64(s.config.KVCacheTransferOverheadStdDev)))
 }
 
 // createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist
diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go
index 11853564..8cc21b9e 100644
--- a/pkg/llm-d-inference-sim/simulator_test.go
+++ b/pkg/llm-d-inference-sim/simulator_test.go
@@ -841,46 +841,57 @@ var _ = Describe("Simulator", func() {
 		})
 
 		DescribeTable("time to first token is super linear of prefill against number of prompt tokens",
-			func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
+			func(prefillOverhead int, PrefillOverheadStdDev int, minNTokens int, maxNTokens int) {
+				simulator.config.TimeToFirstToken = 0
 				simulator.config.PrefillComplexity = "n^2"
+				simulator.config.PrefillOverhead = prefillOverhead
+				simulator.config.PrefillOverheadStdDev = PrefillOverheadStdDev
+
 				for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
-					simulator.config.PrefillOverhead = prefillOverhead
 					timeToFirst := simulator.getTimeToFirstToken(nTokens, false)
 
-					square := prefillOverhead * nTokens * nTokens
-					diffRatio := math.Abs(float64(timeToFirst-square)) / float64(square)
-					Expect(diffRatio).To(BeNumerically("<=", tolerance))
+					n2 := prefillOverhead * nTokens * nTokens
+					n2logn := n2 * int(math.Log2(float64(nTokens)))
+					nlogn := prefillOverhead * nTokens * int(math.Log2(float64(nTokens)))
+
+					Expect(timeToFirst).To(BeNumerically(">", int(float64(nlogn)*0.3)))
+					Expect(timeToFirst).To(BeNumerically("<", int(float64(n2logn)*1.7)))
 				}
 			},
-			func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
-				return fmt.Sprintf("prefillOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d",
-					prefillOverhead, tolerance, minNTokens, maxNTokens)
+			func(prefillOverhead int, PrefillOverheadStdDev int, minNTokens int, maxNTokens int) string {
+				return fmt.Sprintf("prefillOverhead: %d stddev: %d minNTokens: %d maxNTokens: %d",
+					prefillOverhead, PrefillOverheadStdDev, minNTokens, maxNTokens)
 			},
-			Entry("small numbers", 100, 0.1, 1, 10),
-			Entry("medium numbers, larger range", 200, 0.1, 50, 100),
-			Entry("large numbers", 150, 0.05, 20000, 20010),
+			Entry("small numbers", 100, 50, 2, 10),
+			Entry("medium numbers, larger range", 200, 100, 50, 100),
+			Entry("large numbers", 150, 125, 20000, 20010),
+			Entry("stddev is 0", 150, 0, 20000, 20010),
 		)
 
 		DescribeTable("time to first token is log-linear of prefill against number of prompt tokens",
-			func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
+			func(prefillOverhead int, prefillOverheadStdDev int, minNTokens int, maxNTokens int) {
+				simulator.config.TimeToFirstToken = 0
 				simulator.config.PrefillComplexity = "nlog(n)"
+				simulator.config.PrefillOverhead = prefillOverhead
+				simulator.config.PrefillOverheadStdDev = prefillOverheadStdDev
 
 				for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
-					simulator.config.PrefillOverhead = prefillOverhead
 					timeToFirst := simulator.getTimeToFirstToken(nTokens, false)
 
-					nlogn := int(float64(prefillOverhead) * float64(nTokens) * math.Log2(float64(nTokens)))
-					diffRatio := math.Abs(float64(timeToFirst-nlogn)) / float64(nlogn)
-					Expect(diffRatio).To(BeNumerically("<=", tolerance))
+					logn := prefillOverhead * int(math.Log2(float64(nTokens)))
+					n2 := prefillOverhead * nTokens * nTokens
+					Expect(timeToFirst).To(BeNumerically(">", int(float64(logn)*0.3)))
+					Expect(timeToFirst).To(BeNumerically("<", int(float64(n2)*1.7)))
 				}
 			},
-			func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
-				return fmt.Sprintf("prefillOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d",
-					prefillOverhead, tolerance, minNTokens, maxNTokens)
+			func(prefillOverhead int, prefillOverheadStdDev int, minNTokens int, maxNTokens int) string {
+				return fmt.Sprintf("prefillOverhead: %d stddev: %d minNTokens: %d maxNTokens: %d",
+					prefillOverhead, prefillOverheadStdDev, minNTokens, maxNTokens)
 			},
-			Entry("small numbers", 100, 0.1, 2, 10),
-			Entry("medium numbers, larger range", 200, 0.1, 50, 100),
-			Entry("large numbers", 150, 0.05, 20000, 20010),
+			Entry("small numbers", 100, 50, 2, 10),
+			Entry("medium numbers, larger range", 200, 100, 50, 100),
+			Entry("large numbers", 150, 125, 20000, 20010),
+			Entry("stddev is 0", 150, 0, 20000, 20010),
 		)
 
 		It("when <kv-cache-transfer-latency> not 0, ignore <kv-cache-transfer-overhead>", func() {
@@ -900,50 +911,58 @@ var _ = Describe("Simulator", func() {
 		})
 
 		DescribeTable("When remote kv cache transfer is enabled with \"linear\" policy, time to first token is linear of kv cache transfer against number of prompt tokens",
-			func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
+			func(kvCacheOverhead int, stddev int, minNTokens int, maxNTokens int) {
 				simulator.config.TimeToFirstToken = 0
 				simulator.config.PrefillOverhead = 1
 				simulator.config.KVCacheTransferComplexity = "linear"
+				simulator.config.KVCacheTransferOverheadStdDev = stddev
+				simulator.config.KVCacheTransferOverhead = kvCacheOverhead
 
 				for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
-					simulator.config.KVCacheTransferOverhead = kvCacheOverhead
 					timeToFirst := simulator.getTimeToFirstToken(nTokens, true)
 
-					linear := kvCacheOverhead * nTokens
-					diffRatio := math.Abs(float64(timeToFirst-linear)) / float64(linear)
-					Expect(diffRatio).To(BeNumerically("<=", tolerance))
+					n2 := kvCacheOverhead * nTokens * nTokens
+					logn := kvCacheOverhead * int(math.Log2(float64(nTokens)))
+					Expect(timeToFirst).To(BeNumerically(">", int(float64(logn)*0.3)))
+					Expect(timeToFirst).To(BeNumerically("<", int(float64(n2)*1.7)))
 				}
 			},
-			func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
-				return fmt.Sprintf("kvCacheOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d",
-					kvCacheOverhead, tolerance, minNTokens, maxNTokens)
+			func(kvCacheOverhead int, stddev int, minNTokens int, maxNTokens int) string {
+				return fmt.Sprintf("kvCacheOverhead: %d stddev: %d minNTokens: %d maxNTokens: %d",
+					kvCacheOverhead, stddev, minNTokens, maxNTokens)
 			},
-			Entry("small numbers", 100, 0.1, 1, 10),
-			Entry("medium numbers, larger range", 200, 0.1, 50, 100),
-			Entry("large numbers", 150, 0.05, 20000, 20010),
+			Entry("small numbers", 100, 50, 2, 10),
+			Entry("medium numbers, larger range", 200, 180, 50, 100),
+			Entry("large numbers", 150, 70, 20000, 20010),
+			Entry("stddev is 0", 150, 0, 20000, 20010),
 		)
 
 		DescribeTable("When remote kv cache transfer is enabled with \"in-place\" policy, time to first token should not be impacted by number of prompt tokens",
-			func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
+			func(kvCacheTransOverhead int, kvCacheTransOverheadStdDev int, minNTokens int, maxNTokens int) {
 				simulator.config.TimeToFirstToken = 0
 				simulator.config.PrefillOverhead = 1
 				simulator.config.KVCacheTransferComplexity = "in-place"
+				simulator.config.KVCacheTransferOverheadStdDev = kvCacheTransOverheadStdDev
+				simulator.config.KVCacheTransferOverhead = kvCacheTransOverhead
+
+				var ttfts []int
 				for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
-					simulator.config.KVCacheTransferOverhead = kvCacheOverhead
 					timeToFirst := simulator.getTimeToFirstToken(nTokens, true)
-
-					inPlace := kvCacheOverhead
-					diffRatio := math.Abs(float64(timeToFirst-inPlace)) / float64(inPlace)
-					Expect(diffRatio).To(BeNumerically("<=", tolerance))
+					ttfts = append(ttfts, timeToFirst)
 				}
+				// get stdv of ttfts
+				stdv := common.StdDevInt(ttfts)
+				fmt.Printf("ttfts: %v, stdv: %f\n", ttfts, stdv)
+				Expect(stdv).To(BeNumerically("<=", kvCacheTransOverheadStdDev))
 			},
-			func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
-				return fmt.Sprintf("kvCacheOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d",
-					kvCacheOverhead, tolerance, minNTokens, maxNTokens)
+			func(kvCacheTransOverhead int, kvCacheTransOverheadStdDev int, minNTokens int, maxNTokens int) string {
+				return fmt.Sprintf("kvCacheTransferOverhead: %d kvCacheTransferOverheadStdDev: %d minNTokens: %d maxNTokens: %d",
+					kvCacheTransOverhead, kvCacheTransOverheadStdDev, minNTokens, maxNTokens)
 			},
-			Entry("small numbers", 100, 0.1, 1, 10),
-			Entry("medium numbers, larger range", 200, 0.1, 50, 100),
-			Entry("large numbers", 150, 0.05, 20000, 20010),
+			Entry("small numbers", 100, 50, 2, 10),
+			Entry("medium numbers, larger range", 200, 150, 50, 100),
+			Entry("large numbers", 150, 200, 20000, 20010),
+			Entry("stddev is 0", 150, 0, 20000, 20010),
 		)
 	})
 

From 1e8f33d6e55422730d423002597aeb011ec4bc2d Mon Sep 17 00:00:00 2001
From: Qifan Deng <dev.llmd@qifand.com>
Date: Mon, 25 Aug 2025 21:58:35 +1000
Subject: [PATCH 08/19] Fix test condition when remote p/d is enabled and
 in-place policy is used

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
---
 pkg/llm-d-inference-sim/simulator_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go
index 8cc21b9e..df023cf1 100644
--- a/pkg/llm-d-inference-sim/simulator_test.go
+++ b/pkg/llm-d-inference-sim/simulator_test.go
@@ -953,7 +953,7 @@ var _ = Describe("Simulator", func() {
 				// get stdv of ttfts
 				stdv := common.StdDevInt(ttfts)
 				fmt.Printf("ttfts: %v, stdv: %f\n", ttfts, stdv)
-				Expect(stdv).To(BeNumerically("<=", kvCacheTransOverheadStdDev))
+				Expect(stdv).To(BeNumerically("<=", int(float64(kvCacheTransOverheadStdDev)*1.7)))
 			},
 			func(kvCacheTransOverhead int, kvCacheTransOverheadStdDev int, minNTokens int, maxNTokens int) string {
 				return fmt.Sprintf("kvCacheTransferOverhead: %d kvCacheTransferOverheadStdDev: %d minNTokens: %d maxNTokens: %d",

From dff8d3ddcd1846266eba3c10aa18697eae02f858 Mon Sep 17 00:00:00 2001
From: Qifan Deng <dev.llmd@qifand.com>
Date: Fri, 29 Aug 2025 21:47:59 +0800
Subject: [PATCH 09/19] Use simplified implementation of ttft

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
---
 pkg/common/config.go                      |  69 +++++-----
 pkg/llm-d-inference-sim/simulator.go      |  44 ++-----
 pkg/llm-d-inference-sim/simulator_test.go | 147 +++++++---------------
 3 files changed, 86 insertions(+), 174 deletions(-)

diff --git a/pkg/common/config.go b/pkg/common/config.go
index 10367a49..45aa7acf 100644
--- a/pkg/common/config.go
+++ b/pkg/common/config.go
@@ -66,14 +66,6 @@ type Configuration struct {
 	// cause the actual time to first token to differ by more than 70% from TimeToFirstToken
 	TimeToFirstTokenStdDev int `yaml:"time-to-first-token-std-dev" json:"time-to-first-token-std-dev"`
 
-	// PrefillOverhead time taken to prefill the context, in milliseconds
-	// PrefillOverhead along with PrefillComplexity defines the time taken to prefill the context
-	PrefillOverhead int `yaml:"prefill-overhead" json:"prefill-overhead"`
-	// PrefillOverheadStdDev similar to TimeToFirstTokenStdDev
-	PrefillOverheadStdDev int `yaml:"prefill-overhead-std-dev" json:"prefill-overhead-std-dev"`
-	// options are "n^2" and "nlog(n)"
-	PrefillComplexity string `yaml:"prefill-complexity" json:"prefill-complexity"`
-
 	// InterTokenLatency time between generated tokens, in milliseconds
 	InterTokenLatency int `yaml:"inter-token-latency" json:"inter-token-latency"`
 	// InterTokenLatencyStdDev standard deviation for time between generated tokens, in milliseconds,
@@ -89,14 +81,20 @@ type Configuration struct {
 	// KVCacheTransferLatency
 	KVCacheTransferLatencyStdDev int `yaml:"kv-cache-transfer-latency-std-dev" json:"kv-cache-transfer-latency-std-dev"`
 
+	// $Total Prefill Time = PrefillOverhead + n * PrefillTimePerToken$
+	// the assumption is that n is less than k, where k is the number of parallelism units of GPU
+	// PrefillOverhead time taken to prefill the context, in milliseconds
+	PrefillOverhead     int `yaml:"prefill-overhead" json:"prefill-overhead"`
+	PrefillTimePerToken int `yaml:"prefill-time-per-token" json:"prefill-time-per-token"`
+	// PrefillTimeStdDev similar to TimeToFirstTokenStdDev
+	PrefillTimeStdDev int `yaml:"prefill-time-std-dev" json:"prefill-time-std-dev"`
+	// $Total KV Cache Transfer Time = n * KVCacheTransferTimePerToken$
+	// the assumption is that the cache blocks are all missed at the remote pod
 	// KVCacheTransfer overhead time taken to transfer kv-cache from another vLLM instance in case P/D is activated,
 	// in milliseconds.
-	// KVCacheTransferOverhead along with KVCacheTransferComplexity defines the time taken to transfer kv-cache.
-	KVCacheTransferOverhead int `yaml:"kv-cache-transfer-overhead" json:"kv-cache-transfer-overhead"`
+	KVCacheTransferTimePerToken int `yaml:"kv-cache-transfer-time-per-token" json:"kv-cache-transfer-time-per-token"`
 	// KVCacheTransferOverheadStdDev similar to TimeToFirstTokenStdDev
-	KVCacheTransferOverheadStdDev int `yaml:"kv-cache-transfer-overhead-std-dev" json:"kv-cache-transfer-overhead-std-dev"`
-	// options are "linear" and "in-place", default is "linear"
-	KVCacheTransferComplexity string `yaml:"kv-cache-transfer-complexity" json:"kv-cache-transfer-complexity"`
+	KVCacheTransferTimeStdDev int `yaml:"kv-cache-transfer-time-std-dev" json:"kv-cache-transfer-time-std-dev"`
 
 	// Mode defines the simulator response generation mode, valid values: echo, random
 	Mode string `yaml:"mode" json:"mode"`
@@ -313,19 +311,24 @@ func (c *Configuration) validate() error {
 	if float32(c.TimeToFirstTokenStdDev) > 0.3*float32(c.TimeToFirstToken) {
 		return errors.New("time to first token standard deviation cannot be more than 30% of time to first token")
 	}
+
 	if c.PrefillOverhead < 0 {
 		return errors.New("prefill overhead cannot be negative")
-	} else if c.PrefillOverhead == 0 {
-		if c.PrefillComplexity != "" {
-			return errors.New("prefill overhead complexity is set, but prefill overhead is 0")
-		}
 	}
-	if c.PrefillOverheadStdDev < 0 {
-		return errors.New("prefill overhead standard deviation cannot be negative")
+	if c.PrefillTimePerToken < 0 {
+		return errors.New("prefill time per token cannot be negative")
+	}
+	if c.PrefillTimeStdDev < 0 {
+		return errors.New("prefill time standard deviation cannot be negative")
+	}
+
+	if c.KVCacheTransferTimePerToken < 0 {
+		return errors.New("kv-cache tranfer time per token cannot be negative")
 	}
-	if c.PrefillComplexity != "" && c.PrefillComplexity != "n^2" && c.PrefillComplexity != "nlog(n)" {
-		return errors.New("prefill overhead complexity should be either \"n^2\" or \"nlog(n)\"")
+	if c.KVCacheTransferTimeStdDev < 0 {
+		return errors.New("kv-cache tranfer time standard deviation cannot be negative")
 	}
+
 	if c.KVCacheTransferLatency < 0 {
 		return errors.New("kv-cache tranfer time cannot be negative")
 	}
@@ -335,19 +338,6 @@ func (c *Configuration) validate() error {
 	if float32(c.KVCacheTransferLatencyStdDev) > 0.3*float32(c.KVCacheTransferLatency) {
 		return errors.New("kv-cache tranfer standard deviation cannot be more than 30% of kv-cache tranfer")
 	}
-	if c.KVCacheTransferOverhead < 0 {
-		return errors.New("kv-cache transfer overhead cannot be negative")
-	} else if c.KVCacheTransferOverhead == 0 {
-		if c.KVCacheTransferComplexity != "" {
-			return errors.New("kv-cache transfer complexity is set, but kv-cache transfer overhead is 0")
-		}
-	}
-	if c.KVCacheTransferOverheadStdDev < 0 {
-		return errors.New("kv-cache transfer overhead standard deviation cannot be negative")
-	}
-	if c.KVCacheTransferComplexity != "" && c.KVCacheTransferComplexity != "linear" && c.KVCacheTransferComplexity != "in-place" {
-		return errors.New("kv-cache transfer complexity should be either \"linear\" or \"in-place\"")
-	}
 
 	if c.MaxLoras < 1 {
 		return errors.New("max LoRAs cannot be less than 1")
@@ -445,17 +435,18 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 	f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode, echo - returns the same text that was sent in the request, for chat completion returns the last message, random - returns random sentence from a bank of pre-defined sentences")
 	f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
 	f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)")
+
 	f.IntVar(&config.PrefillOverhead, "prefill-overhead", config.PrefillOverhead, "Time to prefill in milliseconds. This argument is ignored if <time-to-first-token> is not 0.")
-	f.IntVar(&config.PrefillOverheadStdDev, "prefill-overhead-std-dev", config.PrefillOverheadStdDev, "Standard deviation for time to prefill (in milliseconds)")
-	f.StringVar(&config.PrefillComplexity, "prefill-complexity", config.PrefillComplexity, "Complexity of prefill based on token length. Options are \"n^2\" and \"nlog(n)\". Default is \"n^2\".")
+	f.IntVar(&config.PrefillTimePerToken, "prefill-time-per-token", config.PrefillTimePerToken, "Time to prefill per token (in milliseconds)")
+	f.IntVar(&config.PrefillTimeStdDev, "prefill-time-std-dev", config.PrefillTimeStdDev, "Standard deviation for time to prefill (in milliseconds)")
+	f.IntVar(&config.KVCacheTransferTimePerToken, "kv-cache-transfer-time-per-token", config.KVCacheTransferTimePerToken, "Time for KV-cache transfer per token from a remote vLLM (in milliseconds)")
+	f.IntVar(&config.KVCacheTransferTimeStdDev, "kv-cache-transfer-time-std-dev", config.KVCacheTransferTimeStdDev, "Standard deviation for time for KV-cache transfer per token from a remote vLLM (in milliseconds)")
+
 	f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")
 	f.IntVar(&config.InterTokenLatencyStdDev, "inter-token-latency-std-dev", config.InterTokenLatencyStdDev, "Standard deviation for time between generated tokens (in milliseconds)")
 	f.IntVar(&config.TimeToFirstTokenStdDev, "time-to-first-token-std-dev", config.TimeToFirstTokenStdDev, "Standard deviation for time before the first token will be returned (in milliseconds)")
 	f.IntVar(&config.KVCacheTransferLatencyStdDev, "kv-cache-transfer-latency-std-dev", config.KVCacheTransferLatencyStdDev, "Standard deviation for time for KV-cache transfer from a remote vLLM (in milliseconds)")
 	f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)")
-	f.IntVar(&config.KVCacheTransferOverhead, "kv-cache-transfer-overhead", config.KVCacheTransferOverhead, "Time to transfer kv-cache in milliseconds. This argument is ignored if <kv-cache-transfer-latency> is not set.")
-	f.IntVar(&config.KVCacheTransferOverheadStdDev, "kv-cache-transfer-overhead-std-dev", config.KVCacheTransferOverheadStdDev, "Standard deviation for time to transfer kv-cache (in milliseconds)")
-	f.StringVar(&config.KVCacheTransferComplexity, "kv-cache-transfer-complexity", config.KVCacheTransferComplexity, "Complexity of kv-cache transfer based on token length. Options are \"linear\" and \"in-place\". Default is \"linear\".")
 
 	f.IntVar(&config.MaxToolCallIntegerParam, "max-tool-call-integer-param", config.MaxToolCallIntegerParam, "Maximum possible value of integer parameters in a tool call")
 	f.IntVar(&config.MinToolCallIntegerParam, "min-tool-call-integer-param", config.MinToolCallIntegerParam, "Minimum possible value of integer parameters in a tool call")
diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go
index 5f628d33..d446db19 100644
--- a/pkg/llm-d-inference-sim/simulator.go
+++ b/pkg/llm-d-inference-sim/simulator.go
@@ -22,7 +22,6 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
-	"math"
 	"net"
 	"os"
 	"strings"
@@ -655,13 +654,7 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques
 
 // returns time to first token based on the current request's doRemotePrefill
 func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, doRemotePrefill bool) int {
-	if s.config.TimeToFirstToken == 0 && s.config.PrefillOverhead != 0 {
-		if nPromptTokens <= 1 {
-			if !doRemotePrefill {
-				return s.config.PrefillOverhead
-			}
-			return s.config.KVCacheTransferOverhead
-		}
+	if s.config.TimeToFirstToken == 0 {
 		return s.calcPrefillOverhead(nPromptTokens, doRemotePrefill)
 	}
 
@@ -695,34 +688,21 @@ func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int, doRemotePrefill b
 	if doRemotePrefill {
 		return s.calcRemotePrefillOverhead(nPromptTokens)
 	}
-	pfOverhead := s.config.PrefillOverhead
-	complexity := s.config.PrefillComplexity
-	// policies of different complexities of prefill implementation
-	overhead := 0
-	switch complexity {
-	case "n^2", "":
-		// this is simple implementation of n^2
-		overhead = pfOverhead * nPromptTokens * nPromptTokens
-	case "nlog(n)":
-		overhead = int(float64(pfOverhead) * (float64(nPromptTokens) * math.Log2(float64(nPromptTokens))))
-	}
-	return int(common.RandomNorm(float64(overhead), float64(s.config.PrefillOverheadStdDev)))
+
+	constOverhead := s.config.PrefillOverhead
+	ptpt := s.config.PrefillTimePerToken
+	prefillTime := constOverhead + nPromptTokens*ptpt
+
+	stdDev := s.config.PrefillTimeStdDev
+	return int(common.RandomNorm(float64(prefillTime), float64(stdDev)))
 }
 
 // calc the remote prefill overhead against number of tokens
 func (s *VllmSimulator) calcRemotePrefillOverhead(nPromptTokens int) int {
-	overhead := s.config.KVCacheTransferOverhead
-	complexity := s.config.KVCacheTransferComplexity
-	total := 0
-	switch complexity {
-	case "linear", "":
-		total = overhead * nPromptTokens
-	case "in-place":
-		// when the context is already filled
-		// this is a simple implementation which return a defined overhead
-		total = overhead
-	}
-	return int(common.RandomNorm(float64(total), float64(s.config.KVCacheTransferOverheadStdDev)))
+	kvCacheTransTPT := s.config.KVCacheTransferTimePerToken
+	kvCacheTransT := kvCacheTransTPT * nPromptTokens
+	stdDev := s.config.KVCacheTransferTimeStdDev
+	return int(common.RandomNorm(float64(kvCacheTransT), float64(stdDev)))
 }
 
 // createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist
diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go
index df023cf1..bcf78266 100644
--- a/pkg/llm-d-inference-sim/simulator_test.go
+++ b/pkg/llm-d-inference-sim/simulator_test.go
@@ -21,7 +21,6 @@ import (
 	"errors"
 	"fmt"
 	"io"
-	"math"
 	"net"
 	"net/http"
 	"os"
@@ -828,142 +827,84 @@ var _ = Describe("Simulator", func() {
 			timeToFirstToken := 10000
 			simulator.config.TimeToFirstToken = timeToFirstToken
 			simulator.config.PrefillOverhead = 100
-			timeToFirst := simulator.getTimeToFirstToken(1, false)
-			Expect(timeToFirst).To(BeNumerically(">=", int(float32(timeToFirstToken)*0.3)))
-			Expect(timeToFirst).To(BeNumerically("<=", int(float32(timeToFirstToken)*1.7)))
+			ttft := simulator.getTimeToFirstToken(1, false)
+			Expect(ttft).To(BeNumerically(">=", int(float32(timeToFirstToken)*0.3)))
+			Expect(ttft).To(BeNumerically("<=", int(float32(timeToFirstToken)*1.7)))
 		})
 
 		It("when <time-to-first-token> is 0, and <prefill-overhead> is not 0, use <prefill-overhead>", func() {
 			simulator.config.TimeToFirstToken = 0
 			simulator.config.PrefillOverhead = 100
-			timeToFirst := simulator.getTimeToFirstToken(1, false)
-			Expect(timeToFirst).To(BeNumerically(">=", 100))
+			ttft := simulator.getTimeToFirstToken(1, false)
+			Expect(ttft).To(BeNumerically(">=", 100))
 		})
 
-		DescribeTable("time to first token is super linear of prefill against number of prompt tokens",
-			func(prefillOverhead int, PrefillOverheadStdDev int, minNTokens int, maxNTokens int) {
+		DescribeTable("time to first token is against number of prompt tokens",
+			func(prefillOverhead int, prefillTimePerToken int, stdDev int, nTokens int) {
 				simulator.config.TimeToFirstToken = 0
-				simulator.config.PrefillComplexity = "n^2"
 				simulator.config.PrefillOverhead = prefillOverhead
-				simulator.config.PrefillOverheadStdDev = PrefillOverheadStdDev
+				simulator.config.PrefillTimePerToken = prefillTimePerToken
+				simulator.config.PrefillTimeStdDev = stdDev
 
-				for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
-					timeToFirst := simulator.getTimeToFirstToken(nTokens, false)
+				ttft := simulator.getTimeToFirstToken(nTokens, false)
 
-					n2 := prefillOverhead * nTokens * nTokens
-					n2logn := n2 * int(math.Log2(float64(nTokens)))
-					nlogn := prefillOverhead * nTokens * int(math.Log2(float64(nTokens)))
+				expectedTTFT := prefillOverhead + prefillTimePerToken*nTokens
+				Expect(ttft).To(BeNumerically(">=", int(float64(expectedTTFT)*0.3)))
+				Expect(ttft).To(BeNumerically("<=", int(float64(expectedTTFT)*1.7)))
 
-					Expect(timeToFirst).To(BeNumerically(">", int(float64(nlogn)*0.3)))
-					Expect(timeToFirst).To(BeNumerically("<", int(float64(n2logn)*1.7)))
-				}
-			},
-			func(prefillOverhead int, PrefillOverheadStdDev int, minNTokens int, maxNTokens int) string {
-				return fmt.Sprintf("prefillOverhead: %d stddev: %d minNTokens: %d maxNTokens: %d",
-					prefillOverhead, PrefillOverheadStdDev, minNTokens, maxNTokens)
-			},
-			Entry("small numbers", 100, 50, 2, 10),
-			Entry("medium numbers, larger range", 200, 100, 50, 100),
-			Entry("large numbers", 150, 125, 20000, 20010),
-			Entry("stddev is 0", 150, 0, 20000, 20010),
-		)
-
-		DescribeTable("time to first token is log-linear of prefill against number of prompt tokens",
-			func(prefillOverhead int, prefillOverheadStdDev int, minNTokens int, maxNTokens int) {
-				simulator.config.TimeToFirstToken = 0
-				simulator.config.PrefillComplexity = "nlog(n)"
-				simulator.config.PrefillOverhead = prefillOverhead
-				simulator.config.PrefillOverheadStdDev = prefillOverheadStdDev
-
-				for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
-					timeToFirst := simulator.getTimeToFirstToken(nTokens, false)
-
-					logn := prefillOverhead * int(math.Log2(float64(nTokens)))
-					n2 := prefillOverhead * nTokens * nTokens
-					Expect(timeToFirst).To(BeNumerically(">", int(float64(logn)*0.3)))
-					Expect(timeToFirst).To(BeNumerically("<", int(float64(n2)*1.7)))
-				}
 			},
-			func(prefillOverhead int, prefillOverheadStdDev int, minNTokens int, maxNTokens int) string {
-				return fmt.Sprintf("prefillOverhead: %d stddev: %d minNTokens: %d maxNTokens: %d",
-					prefillOverhead, prefillOverheadStdDev, minNTokens, maxNTokens)
+			func(prefillOverhead int, prefillTimePerToken, stdDev int, nTokens int) string {
+				return fmt.Sprintf("prefillOverhead: %d, prefillTimePerToken: %d, stdDev: %d, nTokens: %d",
+					prefillOverhead, prefillTimePerToken, stdDev, nTokens)
 			},
-			Entry("small numbers", 100, 50, 2, 10),
-			Entry("medium numbers, larger range", 200, 100, 50, 100),
-			Entry("large numbers", 150, 125, 20000, 20010),
-			Entry("stddev is 0", 150, 0, 20000, 20010),
+			Entry("single token", 100, 50, 70, 1),
+			Entry("stddev is 0", 100, 50, 0, 1),
+			Entry("medium overhead, 512 tokens", 200, 1000, 150, 512),
+			Entry("large overhead, 1024 tokens", 2000, 3000, 1800, 1024),
+			Entry("very long prompt", 150, 200, 100, 20000),
 		)
 
 		It("when <kv-cache-transfer-latency> not 0, ignore <kv-cache-transfer-overhead>", func() {
 			overhead := 100
 			simulator.config.KVCacheTransferLatency = 1000
-			simulator.config.KVCacheTransferOverhead = overhead
-			timeToFirst := simulator.getTimeToFirstToken(1, false)
-			Expect(timeToFirst).To(BeNumerically(">=", overhead))
+			simulator.config.KVCacheTransferTimePerToken = overhead
+			ttft := simulator.getTimeToFirstToken(1, false)
+			Expect(ttft).To(BeNumerically(">=", overhead))
 		})
 
 		It("when <kv-cache-transfer-latency> is 0, and <kv-cache-transfer-overhead> is not 0, use <kv-cache-transfer-overhead>", func() {
 			overhead := 100
 			simulator.config.KVCacheTransferLatency = 0
-			simulator.config.KVCacheTransferOverhead = overhead
-			timeToFirst := simulator.getTimeToFirstToken(1, false)
-			Expect(timeToFirst).To(BeNumerically(">", 0))
+			simulator.config.KVCacheTransferTimePerToken = overhead
+			ttft := simulator.getTimeToFirstToken(1, false)
+			Expect(ttft).To(BeNumerically(">", 0))
 		})
 
-		DescribeTable("When remote kv cache transfer is enabled with \"linear\" policy, time to first token is linear of kv cache transfer against number of prompt tokens",
-			func(kvCacheOverhead int, stddev int, minNTokens int, maxNTokens int) {
+		DescribeTable("kv cache transfer time against number of prompt tokens",
+			func(kvCacheTransTPT int, stddev int, nTokens int) {
 				simulator.config.TimeToFirstToken = 0
 				simulator.config.PrefillOverhead = 1
-				simulator.config.KVCacheTransferComplexity = "linear"
-				simulator.config.KVCacheTransferOverheadStdDev = stddev
-				simulator.config.KVCacheTransferOverhead = kvCacheOverhead
+				simulator.config.KVCacheTransferTimePerToken = kvCacheTransTPT
+				simulator.config.KVCacheTransferTimeStdDev = stddev
 
-				for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
-					timeToFirst := simulator.getTimeToFirstToken(nTokens, true)
+				ttft := simulator.getTimeToFirstToken(nTokens, true)
 
-					n2 := kvCacheOverhead * nTokens * nTokens
-					logn := kvCacheOverhead * int(math.Log2(float64(nTokens)))
-					Expect(timeToFirst).To(BeNumerically(">", int(float64(logn)*0.3)))
-					Expect(timeToFirst).To(BeNumerically("<", int(float64(n2)*1.7)))
-				}
-			},
-			func(kvCacheOverhead int, stddev int, minNTokens int, maxNTokens int) string {
-				return fmt.Sprintf("kvCacheOverhead: %d stddev: %d minNTokens: %d maxNTokens: %d",
-					kvCacheOverhead, stddev, minNTokens, maxNTokens)
-			},
-			Entry("small numbers", 100, 50, 2, 10),
-			Entry("medium numbers, larger range", 200, 180, 50, 100),
-			Entry("large numbers", 150, 70, 20000, 20010),
-			Entry("stddev is 0", 150, 0, 20000, 20010),
-		)
+				expectedTTFT := kvCacheTransTPT * nTokens
+				Expect(ttft).To(BeNumerically(">=", int(float64(expectedTTFT)*0.3)))
+				Expect(ttft).To(BeNumerically("<=", int(float64(expectedTTFT)*1.7)))
 
-		DescribeTable("When remote kv cache transfer is enabled with \"in-place\" policy, time to first token should not be impacted by number of prompt tokens",
-			func(kvCacheTransOverhead int, kvCacheTransOverheadStdDev int, minNTokens int, maxNTokens int) {
-				simulator.config.TimeToFirstToken = 0
-				simulator.config.PrefillOverhead = 1
-				simulator.config.KVCacheTransferComplexity = "in-place"
-				simulator.config.KVCacheTransferOverheadStdDev = kvCacheTransOverheadStdDev
-				simulator.config.KVCacheTransferOverhead = kvCacheTransOverhead
-
-				var ttfts []int
-				for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
-					timeToFirst := simulator.getTimeToFirstToken(nTokens, true)
-					ttfts = append(ttfts, timeToFirst)
-				}
-				// get stdv of ttfts
-				stdv := common.StdDevInt(ttfts)
-				fmt.Printf("ttfts: %v, stdv: %f\n", ttfts, stdv)
-				Expect(stdv).To(BeNumerically("<=", int(float64(kvCacheTransOverheadStdDev)*1.7)))
 			},
-			func(kvCacheTransOverhead int, kvCacheTransOverheadStdDev int, minNTokens int, maxNTokens int) string {
-				return fmt.Sprintf("kvCacheTransferOverhead: %d kvCacheTransferOverheadStdDev: %d minNTokens: %d maxNTokens: %d",
-					kvCacheTransOverhead, kvCacheTransOverheadStdDev, minNTokens, maxNTokens)
+			func(kvCacheTransferTimePerToken int, stddev int, nTokens int) string {
+				return fmt.Sprintf("kvCacheTransferTimePerToken: %d stddev: %d nTokens: %d",
+					kvCacheTransferTimePerToken, stddev, nTokens)
 			},
-			Entry("small numbers", 100, 50, 2, 10),
-			Entry("medium numbers, larger range", 200, 150, 50, 100),
-			Entry("large numbers", 150, 200, 20000, 20010),
-			Entry("stddev is 0", 150, 0, 20000, 20010),
+			Entry("single token", 100, 70, 1),
+			Entry("stddev is 0", 100, 0, 1),
+			Entry("medium overhead, 512 tokens", 200, 150, 512),
+			Entry("large overhead, 1024 tokens", 2000, 1800, 1024),
+			Entry("very long prompt", 150, 100, 20000),
 		)
+
 	})
 
 	Context("fake metrics", func() {

From 0910dbf64533f4c2508cafd6b4c6580919f8b053 Mon Sep 17 00:00:00 2001
From: Qifan Deng <dev.llmd@qifand.com>
Date: Fri, 29 Aug 2025 21:50:36 +0800
Subject: [PATCH 10/19] Add sep lines in readme params

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index f274b2b1..ed5ee511 100644
--- a/README.md
+++ b/README.md
@@ -101,6 +101,7 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 - `mode`: the simulator mode, optional, by default `random`
     - `echo`: returns the same text that was sent in the request
     - `random`: returns a sentence chosen at random from a set of pre-defined sentences
+---
 - `time-to-first-token`: the time to the first token (in milliseconds), optional, by default zero
 - `time-to-first-token-std-dev`: standard deviation for time before the first token will be returned, in milliseconds, optional, default is 0, can't be more than 30% of `time-to-first-token`, will not cause the actual time to first token to differ by more than 70% from `time-to-first-token`
 - `inter-token-latency`: the time to 'generate' each additional token (in milliseconds), optional, by default zero
@@ -108,6 +109,7 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 - `kv-cache-transfer-latency`: time for KV-cache transfer from a remote vLLM (in milliseconds), by default zero. Usually much shorter than `time-to-first-token`
 - `kv-cache-transfer-latency-std-dev`: standard deviation for time to "transfer" kv-cache from another vLLM instance in case P/D is activated, in milliseconds, optional, default is 0, can't be more than 30% of `kv-cache-transfer-latency`, will not cause the actual latency to differ by more than 70% from `kv-cache-transfer-latency`
 - `seed`: random seed for operations (if not set, current Unix time in nanoseconds is used)
+---
 - `max-tool-call-integer-param`: the maximum possible value of integer parameters in a tool call, optional, defaults to 100
 - `min-tool-call-integer-param`: the minimum possible value of integer parameters in a tool call, optional, defaults to 0
 - `max-tool-call-number-param`: the maximum possible value of number (float) parameters in a tool call, optional, defaults to 100
@@ -116,6 +118,7 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 - `min-tool-call-array-param-length`: the minimum possible length of array parameters in a tool call, optional, defaults to 1
 - `tool-call-not-required-param-probability`: the probability to add a parameter, that is not required, in a tool call, optional, defaults to 50
 - `object-tool-call-not-required-field-probability`: the probability to add a field, that is not required, in an object in a tool call, optional, defaults to 50
+---
 - `enable-kvcache`: if true, the KV cache support will be enabled in the simulator. In this case, the KV cache will be simulated, and ZQM events will be published when a KV cache block is added or evicted. 
 - `kv-cache-size`: the maximum number of token blocks in kv cache
 - `block-size`: token block size for contiguous chunks of tokens, possible values: 8,16,32,64,128
@@ -124,6 +127,7 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 - `zmq-endpoint`: ZMQ address to publish events
 - `zmq-max-connect-attempts`: the maximum number of ZMQ connection attempts, defaults to 0, maximum: 10
 - `event-batch-size`: the maximum number of kv-cache events to be sent together, defaults to 16
+---
 - `fake-metrics`: represents a predefined set of metrics to be sent to Prometheus as a substitute for the real metrics. When specified, only these fake metrics will be reported — real metrics and fake metrics will never be reported together. The set should include values for 
     - `running-requests`
     - `waiting-requests`

From 5f9fe1b38368af3abd06fd1b4bcfb778d5580698 Mon Sep 17 00:00:00 2001
From: Qifan Deng <dev.llmd@qifand.com>
Date: Fri, 29 Aug 2025 22:00:38 +0800
Subject: [PATCH 11/19] Update readme with explanation of new ttft

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
---
 README.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/README.md b/README.md
index ed5ee511..087a6137 100644
--- a/README.md
+++ b/README.md
@@ -108,6 +108,13 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 - `inter-token-latency-std-dev`: standard deviation for time between generated tokens, in milliseconds, optional, default is 0, can't be more than 30% of `inter-token-latency`, will not cause the actual inter token latency to differ by more than 70% from `inter-token-latency`
 - `kv-cache-transfer-latency`: time for KV-cache transfer from a remote vLLM (in milliseconds), by default zero. Usually much shorter than `time-to-first-token`
 - `kv-cache-transfer-latency-std-dev`: standard deviation for time to "transfer" kv-cache from another vLLM instance in case P/D is activated, in milliseconds, optional, default is 0, can't be more than 30% of `kv-cache-transfer-latency`, will not cause the actual latency to differ by more than 70% from `kv-cache-transfer-latency`
+---
+- `prefill-overhead`: constant overhead time for prefill (in milliseconds), optional, by default zero, used in calculating time to first token, this will be ignored if `time-to-first-token` is not `0`
+- `prefill-time-per-token`: time taken to process each prompt token during prefill (in milliseconds), optional, by default zero, this will be ignored if `time-to-first-token` is not `0`
+- `prefill-time-std-dev`: similar to `time-to-first-token-std-dev`, but is applied on the final prefill time, which is calculated by `prefill-overhead`, `prefill-time-per-token`, and number of prompt tokens, this will be ignored if `time-to-first-token` is not `0`
+- `kv-cache-transfer-time-per-token`: time taken to transfer cache for each token in case P/D is enabled (in milliseconds), optional, by default zero, this will be ignored if `kv-cache-transfer-latency` is not `0`
+- `kv-cache-transfer-time-std-dev`: similar to `time-to-first-token-std-dev`, but is applied on the final kv cache transfer time in case P/D is enabled (in milliseconds), which is calculated by `kv-cache-transfer-time-per-token` and number of prompt tokens, this will be ignored if `kv-cache-transfer-latency` is not `0`
+---
 - `seed`: random seed for operations (if not set, current Unix time in nanoseconds is used)
 ---
 - `max-tool-call-integer-param`: the maximum possible value of integer parameters in a tool call, optional, defaults to 100

From 049c10e3d0948d82075dd7f5a8dc40cea9766362 Mon Sep 17 00:00:00 2001
From: Qifan Deng <dev.llmd@qifand.com>
Date: Fri, 29 Aug 2025 22:09:36 +0800
Subject: [PATCH 12/19] Fix ttft new params tests

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
---
 pkg/llm-d-inference-sim/simulator_test.go | 46 +++++++++++++++--------
 1 file changed, 31 insertions(+), 15 deletions(-)

diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go
index bcf78266..d0338dfd 100644
--- a/pkg/llm-d-inference-sim/simulator_test.go
+++ b/pkg/llm-d-inference-sim/simulator_test.go
@@ -824,19 +824,29 @@ var _ = Describe("Simulator", func() {
 		)
 
 		It("when <time-to-first-token> is not 0, ignore <prefill-overhead>", func() {
-			timeToFirstToken := 10000
+			timeToFirstToken := 1000
 			simulator.config.TimeToFirstToken = timeToFirstToken
+			simulator.config.TimeToFirstTokenStdDev = 0
+
 			simulator.config.PrefillOverhead = 100
-			ttft := simulator.getTimeToFirstToken(1, false)
-			Expect(ttft).To(BeNumerically(">=", int(float32(timeToFirstToken)*0.3)))
-			Expect(ttft).To(BeNumerically("<=", int(float32(timeToFirstToken)*1.7)))
+			simulator.config.PrefillTimePerToken = 200
+			simulator.config.PrefillTimeStdDev = 80
+
+			ttft := simulator.getTimeToFirstToken(128, false)
+
+			Expect(ttft).To(BeNumerically("==", timeToFirstToken))
 		})
 
 		It("when <time-to-first-token> is 0, and <prefill-overhead> is not 0, use <prefill-overhead>", func() {
 			simulator.config.TimeToFirstToken = 0
+			simulator.config.TimeToFirstTokenStdDev = 0
+
 			simulator.config.PrefillOverhead = 100
-			ttft := simulator.getTimeToFirstToken(1, false)
-			Expect(ttft).To(BeNumerically(">=", 100))
+			simulator.config.PrefillTimePerToken = 200
+			simulator.config.PrefillTimeStdDev = 80
+
+			ttft := simulator.getTimeToFirstToken(128, false)
+			Expect(ttft).NotTo(BeNumerically("==", 0))
 		})
 
 		DescribeTable("time to first token is against number of prompt tokens",
@@ -865,19 +875,25 @@ var _ = Describe("Simulator", func() {
 		)
 
 		It("when <kv-cache-transfer-latency> not 0, ignore <kv-cache-transfer-overhead>", func() {
-			overhead := 100
-			simulator.config.KVCacheTransferLatency = 1000
-			simulator.config.KVCacheTransferTimePerToken = overhead
-			ttft := simulator.getTimeToFirstToken(1, false)
-			Expect(ttft).To(BeNumerically(">=", overhead))
+			simulator.config.KVCacheTransferLatency = 200
+			simulator.config.KVCacheTransferLatencyStdDev = 0
+
+			simulator.config.KVCacheTransferTimePerToken = 100
+			simulator.config.KVCacheTransferTimeStdDev = 0
+
+			ttft := simulator.getTimeToFirstToken(128, false)
+			Expect(ttft).To(BeNumerically("==", 200))
 		})
 
 		It("when <kv-cache-transfer-latency> is 0, and <kv-cache-transfer-overhead> is not 0, use <kv-cache-transfer-overhead>", func() {
-			overhead := 100
 			simulator.config.KVCacheTransferLatency = 0
-			simulator.config.KVCacheTransferTimePerToken = overhead
-			ttft := simulator.getTimeToFirstToken(1, false)
-			Expect(ttft).To(BeNumerically(">", 0))
+			simulator.config.KVCacheTransferLatencyStdDev = 0
+
+			simulator.config.KVCacheTransferTimePerToken = 100
+			simulator.config.KVCacheTransferTimeStdDev = 0
+
+			ttft := simulator.getTimeToFirstToken(128, false)
+			Expect(ttft).To(BeNumerically("==", 12800))
 		})
 
 		DescribeTable("kv cache transfer time against number of prompt tokens",

From 9886b94e4bc0280f6af9271762ddcf89f5187edc Mon Sep 17 00:00:00 2001
From: Qifan Deng <dev.llmd@qifand.com>
Date: Fri, 29 Aug 2025 22:18:23 +0800
Subject: [PATCH 13/19] Fix kv cache transfer tests and impl

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
---
 pkg/llm-d-inference-sim/simulator.go      | 38 +++++++++++++----------
 pkg/llm-d-inference-sim/simulator_test.go |  4 +--
 2 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go
index d446db19..59fb8e83 100644
--- a/pkg/llm-d-inference-sim/simulator.go
+++ b/pkg/llm-d-inference-sim/simulator.go
@@ -654,16 +654,18 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques
 
 // returns time to first token based on the current request's doRemotePrefill
 func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, doRemotePrefill bool) int {
-	if s.config.TimeToFirstToken == 0 {
+	if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 {
 		return s.calcPrefillOverhead(nPromptTokens, doRemotePrefill)
 	}
 
-	mean := float64(s.config.TimeToFirstToken)
-	stddev := float64(s.config.TimeToFirstTokenStdDev)
-	if doRemotePrefill {
-		mean = float64(s.config.KVCacheTransferLatency)
-		stddev = float64(s.config.KVCacheTransferLatencyStdDev)
+	if !doRemotePrefill {
+		mean := float64(s.config.TimeToFirstToken)
+		stddev := float64(s.config.TimeToFirstTokenStdDev)
+		return int(common.RandomNorm(mean, stddev))
 	}
+
+	mean := float64(s.config.KVCacheTransferLatency)
+	stddev := float64(s.config.KVCacheTransferLatencyStdDev)
 	return int(common.RandomNorm(mean, stddev))
 }
 
@@ -685,22 +687,24 @@ func (s *VllmSimulator) getTotalInterTokenLatency(numOfTokens int) int {
 
 // calc the prefill overhead against number of tokens
 func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int, doRemotePrefill bool) int {
-	if doRemotePrefill {
-		return s.calcRemotePrefillOverhead(nPromptTokens)
-	}
+	if !doRemotePrefill {
+		constOverhead := s.config.PrefillOverhead
+		ptpt := s.config.PrefillTimePerToken
+		prefillTime := constOverhead + nPromptTokens*ptpt
 
-	constOverhead := s.config.PrefillOverhead
-	ptpt := s.config.PrefillTimePerToken
-	prefillTime := constOverhead + nPromptTokens*ptpt
+		stdDev := s.config.PrefillTimeStdDev
+		return int(common.RandomNorm(float64(prefillTime), float64(stdDev)))
+	}
 
-	stdDev := s.config.PrefillTimeStdDev
-	return int(common.RandomNorm(float64(prefillTime), float64(stdDev)))
-}
+	if s.config.KVCacheTransferLatency != 0 || s.config.KVCacheTransferLatencyStdDev != 0 {
+		mean := float64(s.config.KVCacheTransferLatency)
+		stddev := float64(s.config.KVCacheTransferLatencyStdDev)
+		return int(common.RandomNorm(mean, stddev))
+	}
 
-// calc the remote prefill overhead against number of tokens
-func (s *VllmSimulator) calcRemotePrefillOverhead(nPromptTokens int) int {
 	kvCacheTransTPT := s.config.KVCacheTransferTimePerToken
 	kvCacheTransT := kvCacheTransTPT * nPromptTokens
+
 	stdDev := s.config.KVCacheTransferTimeStdDev
 	return int(common.RandomNorm(float64(kvCacheTransT), float64(stdDev)))
 }
diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go
index d0338dfd..c06b57fa 100644
--- a/pkg/llm-d-inference-sim/simulator_test.go
+++ b/pkg/llm-d-inference-sim/simulator_test.go
@@ -881,7 +881,7 @@ var _ = Describe("Simulator", func() {
 			simulator.config.KVCacheTransferTimePerToken = 100
 			simulator.config.KVCacheTransferTimeStdDev = 0
 
-			ttft := simulator.getTimeToFirstToken(128, false)
+			ttft := simulator.getTimeToFirstToken(128, true)
 			Expect(ttft).To(BeNumerically("==", 200))
 		})
 
@@ -892,7 +892,7 @@ var _ = Describe("Simulator", func() {
 			simulator.config.KVCacheTransferTimePerToken = 100
 			simulator.config.KVCacheTransferTimeStdDev = 0
 
-			ttft := simulator.getTimeToFirstToken(128, false)
+			ttft := simulator.getTimeToFirstToken(128, true)
 			Expect(ttft).To(BeNumerically("==", 12800))
 		})
 

From 904e18d0f88a2c7e95de548691db01f210e3bb35 Mon Sep 17 00:00:00 2001
From: Qifan Deng <dev.llmd@qifand.com>
Date: Fri, 29 Aug 2025 22:28:28 +0800
Subject: [PATCH 14/19] Fix invalid config test of new ttft params

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
---
 pkg/common/config_test.go | 29 +++++++++--------------------
 1 file changed, 9 insertions(+), 20 deletions(-)

diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go
index 33b07fea..7d5fae13 100644
--- a/pkg/common/config_test.go
+++ b/pkg/common/config_test.go
@@ -407,35 +407,24 @@ var _ = Describe("Simulator configuration", func() {
 				"--config", "../../manifests/config.yaml"},
 		},
 		{
-			name: "invalid (negative) prefill-overhead-std-dev",
-			args: []string{"cmd", "--prefill-overhead-std-dev", "-1",
+			name: "invalid (negative) prefill-time-per-token",
+			args: []string{"cmd", "--prefill-time-per-token", "-1",
 				"--config", "../../manifests/config.yaml"},
 		},
 		{
-			name: "<prefill-overhead> must be set when <prefill-complexity> is set",
-			args: []string{"cmd", "--prefill-complexity", "n^2", "--config", "../../manifests/config.yaml"},
-		},
-		{
-			name: "<prefill-complexity> should not be 'xxx'",
-			args: []string{"cmd", "--prefill-complexity", "xxx", "--config", "../../manifests/config.yaml"},
-		},
-		{
-			name: "invalid (negative) kv-cache-transfer-overhead",
-			args: []string{"cmd", "--kv-cache-transfer-overhead", "-1",
+			name: "invalid (negative) prefill-time-std-dev",
+			args: []string{"cmd", "--prefill-time-std-dev", "-1",
 				"--config", "../../manifests/config.yaml"},
 		},
 		{
-			name: "invalid (negative) kv-cache-transfer-overhead-std-dev",
-			args: []string{"cmd", "--kv-cache-transfer-overhead-std-dev", "-1",
+			name: "invalid (negative) kv-cache-transfer-time-per-token",
+			args: []string{"cmd", "--kv-cache-transfer-time-per-token", "-1",
 				"--config", "../../manifests/config.yaml"},
 		},
 		{
-			name: "<kv-cache-transfer-overhead> must be set when <kv-cache-transfer-complexity> is set",
-			args: []string{"cmd", "--kv-cache-transfer-complexity", "linear", "--config", "../../manifests/config.yaml"},
-		},
-		{
-			name: "<kv-cache-transfer-complexity> should not be 'xxx'",
-			args: []string{"cmd", "--kv-cache-transfer-complexity", "xxx", "--config", "../../manifests/config.yaml"},
+			name: "invalid (negative) kv-cache-transfer-time-std-dev",
+			args: []string{"cmd", "--kv-cache-transfer-time-std-dev", "-1",
+				"--config", "../../manifests/config.yaml"},
 		},
 	}
 

From 4078dbd5c25940069988775d8b8f245dc9a4840a Mon Sep 17 00:00:00 2001
From: Qifan Deng <dev.llmd@qifand.com>
Date: Fri, 29 Aug 2025 22:34:05 +0800
Subject: [PATCH 15/19] Revert "Add standard deviation in utils"

This reverts commit 18d30756a7173798e9bf0ed010478829b6b14d9a.

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
---
 pkg/common/utils.go      | 19 -------------------
 pkg/common/utils_test.go | 12 ------------
 2 files changed, 31 deletions(-)

diff --git a/pkg/common/utils.go b/pkg/common/utils.go
index 5295b3dd..d3ea5b44 100644
--- a/pkg/common/utils.go
+++ b/pkg/common/utils.go
@@ -328,22 +328,3 @@ func init() {
 func Tokenize(text string) []string {
 	return re.FindAllString(text, -1)
 }
-
-// Calculate standard deviation of an int array
-func StdDevInt(data []int) float64 {
-	var sum int
-	for _, value := range data {
-		sum += value
-	}
-	mean := sum / len(data)
-
-	var sumSquares int
-	for _, value := range data {
-		diff := value - mean
-		sumSquares += diff * diff
-	}
-
-	variance := sumSquares / len(data)
-
-	return math.Sqrt(float64(variance))
-}
diff --git a/pkg/common/utils_test.go b/pkg/common/utils_test.go
index 4dac4a4c..b8f3285e 100644
--- a/pkg/common/utils_test.go
+++ b/pkg/common/utils_test.go
@@ -168,16 +168,4 @@ var _ = Describe("Utils", Ordered, func() {
 		}
 	})
 
-	Context("Standard Deviation", func() {
-		It("should return 0 for a single element", func() {
-			data := []int{42}
-			Expect(StdDevInt(data)).To(Equal(0.0))
-		})
-
-		It("should return the correct standard deviation for multiple elements", func() {
-			data := []int{1, 2, 3, 4, 5}
-			Expect(StdDevInt(data)).To(Equal(1.4142135623730951))
-		})
-	})
-
 })

From 91e702f82c43b9258df039d43cd718422da2deb8 Mon Sep 17 00:00:00 2001
From: Qifan Deng <dev.llmd@qifand.com>
Date: Sun, 31 Aug 2025 22:14:22 +0800
Subject: [PATCH 16/19] Remove additional variables in prefill time calculation

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
---
 pkg/llm-d-inference-sim/simulator.go | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go
index 3d555e63..f142de1c 100644
--- a/pkg/llm-d-inference-sim/simulator.go
+++ b/pkg/llm-d-inference-sim/simulator.go
@@ -701,22 +701,15 @@ func (s *VllmSimulator) getTotalInterTokenLatency(numOfTokens int) int {
 // calc the prefill overhead against number of tokens
 func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int, doRemotePrefill bool) int {
 	if !doRemotePrefill {
-		constOverhead := s.config.PrefillOverhead
-		ptpt := s.config.PrefillTimePerToken
-		prefillTime := constOverhead + nPromptTokens*ptpt
-
-		stdDev := s.config.PrefillTimeStdDev
-		return int(common.RandomNorm(float64(prefillTime), float64(stdDev)))
+		prefillTime := s.config.PrefillOverhead + nPromptTokens*s.config.PrefillTimePerToken
+		return int(common.RandomNorm(float64(prefillTime), float64(s.config.PrefillTimeStdDev)))
 	}
 
 	if s.config.KVCacheTransferLatency != 0 || s.config.KVCacheTransferLatencyStdDev != 0 {
-		mean := float64(s.config.KVCacheTransferLatency)
-		stddev := float64(s.config.KVCacheTransferLatencyStdDev)
-		return int(common.RandomNorm(mean, stddev))
+		return int(common.RandomNorm(float64(s.config.KVCacheTransferLatency), float64(s.config.KVCacheTransferLatencyStdDev)))
 	}
 
-	kvCacheTransTPT := s.config.KVCacheTransferTimePerToken
-	kvCacheTransT := kvCacheTransTPT * nPromptTokens
+	kvCacheTransT := s.config.KVCacheTransferTimePerToken * nPromptTokens
 
 	stdDev := s.config.KVCacheTransferTimeStdDev
 	return int(common.RandomNorm(float64(kvCacheTransT), float64(stdDev)))

From 8430ea369f4781abe3838fc58793a2871e141b6e Mon Sep 17 00:00:00 2001
From: Qifan Deng <dev.llmd@qifand.com>
Date: Sun, 31 Aug 2025 22:24:51 +0800
Subject: [PATCH 17/19] Improve is remote prefill/decode interface doc

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
---
 pkg/openai-server-api/request.go | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/pkg/openai-server-api/request.go b/pkg/openai-server-api/request.go
index afab801d..b23104f8 100644
--- a/pkg/openai-server-api/request.go
+++ b/pkg/openai-server-api/request.go
@@ -53,9 +53,13 @@ type CompletionRequest interface {
 	GetToolChoice() string
 	// GetMaxCompletionTokens returns the maximum completion tokens requested
 	GetMaxCompletionTokens() *int64
-	// IsDoRemoteDecode() returns true if do_remote_decode field is true in the request, this means that this is decode request
+	// IsDoRemoteDecode returns true if the do_remote_decode field is true in the request.
+	// When the field is true, the decode phase is done on a remote pod while the
+	// prefill phase is done on the local pod, so this is a prefill request.
 	IsDoRemoteDecode() bool
-	// IsDoRemotePrefill() returns true if do_remote_prefill field is true in the request, this means that this is prefill request
+	// IsDoRemotePrefill returns true if the do_remote_prefill field is true in the request.
+	// When the field is true, the prefill phase is done on a remote pod while the
+	// decode phase is done on the local pod, so this is a decode request.
 	IsDoRemotePrefill() bool
 }
 

From a5305c82fa8100ffe7bddc55bc79ad9d29cfaae3 Mon Sep 17 00:00:00 2001
From: Qifan Deng <dev.llmd@qifand.com>
Date: Sun, 31 Aug 2025 22:39:39 +0800
Subject: [PATCH 18/19] Improve implementation of ttft calc

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
---
 pkg/llm-d-inference-sim/simulator.go | 43 ++++++++++------------------
 1 file changed, 15 insertions(+), 28 deletions(-)

diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go
index f142de1c..948bae95 100644
--- a/pkg/llm-d-inference-sim/simulator.go
+++ b/pkg/llm-d-inference-sim/simulator.go
@@ -667,19 +667,23 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques
 
 // returns time to first token based on the current request's doRemotePrefill
 func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, doRemotePrefill bool) int {
-	if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 {
-		return s.calcPrefillOverhead(nPromptTokens, doRemotePrefill)
+	if doRemotePrefill {
+		if s.config.KVCacheTransferLatency == 0 && s.config.KVCacheTransferLatencyStdDev == 0 {
+			// is disaggregated PD and ttft is calculated using number of prompt tokens
+			kvCacheTransT := s.config.KVCacheTransferTimePerToken * nPromptTokens
+			stdDev := s.config.KVCacheTransferTimeStdDev
+			return int(common.RandomNorm(float64(kvCacheTransT), float64(stdDev)))
+		}
+		// is disaggregated PD and *not* using number of prompt tokens
+		return int(common.RandomNorm(float64(s.config.KVCacheTransferLatency), float64(s.config.KVCacheTransferLatencyStdDev)))
 	}
-
-	if !doRemotePrefill {
-		mean := float64(s.config.TimeToFirstToken)
-		stddev := float64(s.config.TimeToFirstTokenStdDev)
-		return int(common.RandomNorm(mean, stddev))
+	if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 {
+		// is aggregated PD and ttft is calculated using number of prompt tokens
+		prefillTime := s.config.PrefillOverhead + nPromptTokens*s.config.PrefillTimePerToken
+		return int(common.RandomNorm(float64(prefillTime), float64(s.config.PrefillTimeStdDev)))
 	}
-
-	mean := float64(s.config.KVCacheTransferLatency)
-	stddev := float64(s.config.KVCacheTransferLatencyStdDev)
-	return int(common.RandomNorm(mean, stddev))
+	// is aggregated PD and *not* using number of prompt tokens
+	return int(common.RandomNorm(float64(s.config.TimeToFirstToken), float64(s.config.TimeToFirstTokenStdDev)))
 }
 
 // returns inter token latency
@@ -698,23 +702,6 @@ func (s *VllmSimulator) getTotalInterTokenLatency(numOfTokens int) int {
 	return total
 }
 
-// calc the prefill overhead against number of tokens
-func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int, doRemotePrefill bool) int {
-	if !doRemotePrefill {
-		prefillTime := s.config.PrefillOverhead + nPromptTokens*s.config.PrefillTimePerToken
-		return int(common.RandomNorm(float64(prefillTime), float64(s.config.PrefillTimeStdDev)))
-	}
-
-	if s.config.KVCacheTransferLatency != 0 || s.config.KVCacheTransferLatencyStdDev != 0 {
-		return int(common.RandomNorm(float64(s.config.KVCacheTransferLatency), float64(s.config.KVCacheTransferLatencyStdDev)))
-	}
-
-	kvCacheTransT := s.config.KVCacheTransferTimePerToken * nPromptTokens
-
-	stdDev := s.config.KVCacheTransferTimeStdDev
-	return int(common.RandomNorm(float64(kvCacheTransT), float64(stdDev)))
-}
-
 // createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist
 func (s *VllmSimulator) createModelsResponse() *vllmapi.ModelsResponse {
 	modelsResp := vllmapi.ModelsResponse{Object: "list", Data: []vllmapi.ModelsResponseModelInfo{}}

From b74b3aa17a807f2f04ddfab5070eab0a8335a8ac Mon Sep 17 00:00:00 2001
From: Qifan Deng <dev.llmd@qifand.com>
Date: Sun, 31 Aug 2025 22:46:09 +0800
Subject: [PATCH 19/19] Remove unnecessary variable

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
---
 pkg/llm-d-inference-sim/simulator.go | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go
index 948bae95..2ecade9d 100644
--- a/pkg/llm-d-inference-sim/simulator.go
+++ b/pkg/llm-d-inference-sim/simulator.go
@@ -671,8 +671,7 @@ func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, doRemotePrefill b
 		if s.config.KVCacheTransferLatency == 0 && s.config.KVCacheTransferLatencyStdDev == 0 {
 			// is disaggregated PD and ttft is calculated using number of prompt tokens
 			kvCacheTransT := s.config.KVCacheTransferTimePerToken * nPromptTokens
-			stdDev := s.config.KVCacheTransferTimeStdDev
-			return int(common.RandomNorm(float64(kvCacheTransT), float64(stdDev)))
+			return int(common.RandomNorm(float64(kvCacheTransT), float64(s.config.KVCacheTransferTimeStdDev)))
 		}
 		// is disaggregated PD and *not* using number of prompt tokens
 		return int(common.RandomNorm(float64(s.config.KVCacheTransferLatency), float64(s.config.KVCacheTransferLatencyStdDev)))