Skip to content

Commit 58544fe

Browse files
committed
Calc kv cache transfer overhead based on prompt length
Signed-off-by: Qifan Deng <[email protected]>
1 parent 65d27c1 commit 58544fe

File tree

5 files changed

+142
-19
lines changed

5 files changed

+142
-19
lines changed

pkg/common/config.go

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,9 @@ type Configuration struct {
6767
TimeToFirstTokenStdDev int `yaml:"time-to-first-token-std-dev" json:"time-to-first-token-std-dev"`
6868

6969
// PrefillOverhead time taken to prefill the context, in milliseconds
70-
PrefillOverhead int `yaml:"prefill-overhead" json:"prefill-overhead"`
70+
// PrefillOverhead along with PrefillComplexity defines the time taken to prefill the context
71+
PrefillOverhead int `yaml:"prefill-overhead" json:"prefill-overhead"`
72+
// options are "n^2" and "nlog(n)"
7173
PrefillComplexity string `yaml:"prefill-complexity" json:"prefill-complexity"`
7274

7375
// InterTokenLatency time between generated tokens, in milliseconds
@@ -85,6 +87,13 @@ type Configuration struct {
8587
// KVCacheTransferLatency
8688
KVCacheTransferLatencyStdDev int `yaml:"kv-cache-transfer-latency-std-dev" json:"kv-cache-transfer-latency-std-dev"`
8789

90+
// KVCacheTransfer overhead time taken to transfer kv-cache from another vLLM instance in case P/D is activated,
91+
// in milliseconds.
92+
// KVCacheTransferOverhead along with KVCacheTransferComplexity defines the time taken to transfer kv-cache.
93+
KVCacheTransferOverhead int `yaml:"kv-cache-transfer-overhead" json:"kv-cache-transfer-overhead"`
94+
// options are "linear" and "in-place", default is "linear"
95+
KVCacheTransferComplexity string `yaml:"kv-cache-transfer-complexity" json:"kv-cache-transfer-complexity"`
96+
8897
// Mode defines the simulator response generation mode, valid values: echo, random
8998
Mode string `yaml:"mode" json:"mode"`
9099
// Seed defines random seed for operations
@@ -319,6 +328,17 @@ func (c *Configuration) validate() error {
319328
if float32(c.KVCacheTransferLatencyStdDev) > 0.3*float32(c.KVCacheTransferLatency) {
320329
return errors.New("kv-cache transfer standard deviation cannot be more than 30% of kv-cache transfer")
321330
}
331+
if c.KVCacheTransferOverhead < 0 {
332+
return errors.New("kv-cache transfer overhead cannot be negative")
333+
} else if c.KVCacheTransferOverhead == 0 {
334+
if c.KVCacheTransferComplexity != "" {
335+
return errors.New("kv-cache transfer complexity is set, but kv-cache transfer overhead is 0")
336+
}
337+
}
338+
if c.KVCacheTransferComplexity != "" && c.KVCacheTransferComplexity != "linear" && c.KVCacheTransferComplexity != "in-place" {
339+
return errors.New("kv-cache transfer complexity should be either \"linear\" or \"in-place\"")
340+
}
341+
322342
if c.MaxLoras < 1 {
323343
return errors.New("max LoRAs cannot be less than 1")
324344
}
@@ -422,6 +442,8 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
422442
f.IntVar(&config.TimeToFirstTokenStdDev, "time-to-first-token-std-dev", config.TimeToFirstTokenStdDev, "Standard deviation for time before the first token will be returned (in milliseconds)")
423443
f.IntVar(&config.KVCacheTransferLatencyStdDev, "kv-cache-transfer-latency-std-dev", config.KVCacheTransferLatencyStdDev, "Standard deviation for time for KV-cache transfer from a remote vLLM (in milliseconds)")
424444
f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)")
445+
f.IntVar(&config.KVCacheTransferOverhead, "kv-cache-transfer-overhead", config.KVCacheTransferOverhead, "Time to transfer kv-cache in milliseconds. This argument is ignored if <kv-cache-transfer-latency> is set.")
446+
f.StringVar(&config.KVCacheTransferComplexity, "kv-cache-transfer-complexity", config.KVCacheTransferComplexity, "Complexity of kv-cache transfer based on token length. Options are \"linear\" and \"in-place\". Default is \"linear\".")
425447

426448
f.IntVar(&config.MaxToolCallIntegerParam, "max-tool-call-integer-param", config.MaxToolCallIntegerParam, "Maximum possible value of integer parameters in a tool call")
427449
f.IntVar(&config.MinToolCallIntegerParam, "min-tool-call-integer-param", config.MinToolCallIntegerParam, "Minimum possible value of integer parameters in a tool call")

pkg/common/config_test.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,18 @@ var _ = Describe("Simulator configuration", func() {
392392
name: "<prefill-overhead> must be set when <prefill-complexity> is set",
393393
args: []string{"cmd", "--prefill-complexity", "n^2", "--config", "../../manifests/config.yaml"},
394394
},
395+
{
396+
name: "<prefill-complexity> should not be 'xxx'",
397+
args: []string{"cmd", "--prefill-complexity", "xxx", "--config", "../../manifests/config.yaml"},
398+
},
399+
{
400+
name: "<kv-cache-transfer-overhead> must be set when <kv-cache-transfer-complexity> is set",
401+
args: []string{"cmd", "--kv-cache-transfer-complexity", "linear", "--config", "../../manifests/config.yaml"},
402+
},
403+
{
404+
name: "<kv-cache-transfer-complexity> should not be 'xxx'",
405+
args: []string{"cmd", "--kv-cache-transfer-complexity", "xxx", "--config", "../../manifests/config.yaml"},
406+
},
395407
}
396408

397409
for _, test := range invalidTests {

pkg/llm-d-inference-sim/simulator.go

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -636,7 +636,7 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques
636636
// calculate how long to wait before returning the response, time is based on number of tokens
637637
nPromptTokens := usageData.PromptTokens
638638
nGenTokens := usageData.CompletionTokens
639-
totalMillisToWait := s.getTimeToFirstToken(doRemotePrefill, nPromptTokens) + s.getTotalInterTokenLatency(nGenTokens)
639+
totalMillisToWait := s.getTimeToFirstToken(nPromptTokens, doRemotePrefill) + s.getTotalInterTokenLatency(nGenTokens)
640640
time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond)
641641

642642
ctx.Response.Header.SetContentType("application/json")
@@ -654,13 +654,17 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques
654654
}
655655

656656
// returns time to first token based on the current request's doRemotePrefill
657-
func (s *VllmSimulator) getTimeToFirstToken(doRemotePrefill bool, nPromptTokens int) int {
657+
func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, doRemotePrefill bool) int {
658658
if s.config.TimeToFirstToken == 0 && s.config.PrefillOverhead != 0 {
659659
if nPromptTokens <= 1 {
660-
return s.config.PrefillOverhead
660+
if !doRemotePrefill {
661+
return s.config.PrefillOverhead
662+
}
663+
return s.config.KVCacheTransferOverhead
661664
}
662-
return s.calcPrefillOverhead(nPromptTokens)
665+
return s.calcPrefillOverhead(nPromptTokens, doRemotePrefill)
663666
}
667+
fmt.Printf("get time to first token %d, nPromptTokens %d, doRemotePrefill %v\n", s.config.TimeToFirstToken, nPromptTokens, doRemotePrefill)
664668

665669
mean := float64(s.config.TimeToFirstToken)
666670
stddev := float64(s.config.TimeToFirstTokenStdDev)
@@ -688,7 +692,10 @@ func (s *VllmSimulator) getTotalInterTokenLatency(numOfTokens int) int {
688692
}
689693

690694
// calc the prefill overhead against number of tokens
691-
func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int) int {
695+
func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int, doRemotePrefill bool) int {
696+
if doRemotePrefill {
697+
return s.calcRemotePrefillOverhead(nPromptTokens)
698+
}
692699
pfOverhead := s.config.PrefillOverhead
693700
complexity := s.config.PrefillComplexity
694701
// policies of different complexities of prefill implementation
@@ -699,7 +706,24 @@ func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int) int {
699706
case "nlog(n)":
700707
return int(float64(pfOverhead) * (float64(nPromptTokens) * math.Log2(float64(nPromptTokens))))
701708
}
709+
// should never reach here
710+
return 0
711+
}
702712

713+
// calc the remote prefill overhead against number of tokens
714+
func (s *VllmSimulator) calcRemotePrefillOverhead(nPromptTokens int) int {
715+
overhead := s.config.KVCacheTransferOverhead
716+
complexity := s.config.KVCacheTransferComplexity
717+
switch complexity {
718+
case "linear", "":
719+
fmt.Printf("linear complexity, overhead %d, nPromptTokens %d\n", overhead, nPromptTokens)
720+
return overhead * nPromptTokens
721+
case "in-place":
722+
// when the context is already filled
723+
// this is a simple implementation which return a defined overhead
724+
return overhead
725+
}
726+
// should never reach here
703727
return 0
704728
}
705729

pkg/llm-d-inference-sim/simulator_test.go

Lines changed: 77 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -802,7 +802,7 @@ var _ = Describe("Simulator", func() {
802802
simulator.config.TimeToFirstTokenStdDev = timeToFirstTokenStdDev
803803
simulator.config.KVCacheTransferLatency = kvCacheLatency
804804
simulator.config.KVCacheTransferLatencyStdDev = kvCacheLatencyStdDev
805-
timeToFirst := simulator.getTimeToFirstToken(doREmotePrefill, 1)
805+
timeToFirst := simulator.getTimeToFirstToken(1, doREmotePrefill)
806806
if doREmotePrefill {
807807
Expect(timeToFirst).To(BeNumerically(">=", int(float32(kvCacheLatency)*0.3)))
808808
Expect(timeToFirst).To(BeNumerically("<=", int(float32(kvCacheLatency)*1.7)))
@@ -826,29 +826,30 @@ var _ = Describe("Simulator", func() {
826826

827827
It("when <time-to-first-token> is not 0, ignore <prefill-overhead>", func() {
828828
timeToFirstToken := 10000
829-
prefillOverhead := 100
830829
simulator.config.TimeToFirstToken = timeToFirstToken
831-
simulator.config.PrefillOverhead = prefillOverhead
832-
timeToFirst := simulator.getTimeToFirstToken(false, 1)
830+
simulator.config.PrefillOverhead = 100
831+
timeToFirst := simulator.getTimeToFirstToken(1, false)
833832
Expect(timeToFirst).To(BeNumerically(">=", int(float32(timeToFirstToken)*0.3)))
834833
Expect(timeToFirst).To(BeNumerically("<=", int(float32(timeToFirstToken)*1.7)))
835834
})
836835

837-
It("when <time-to-first-token> is 0, use <prefill-overhead>", func() {
836+
It("when <time-to-first-token> is 0, and <prefill-overhead> is not 0, use <prefill-overhead>", func() {
838837
simulator.config.TimeToFirstToken = 0
839838
simulator.config.PrefillOverhead = 100
840-
timeToFirst := simulator.getTimeToFirstToken(false, 1)
839+
timeToFirst := simulator.getTimeToFirstToken(1, false)
841840
Expect(timeToFirst).To(BeNumerically(">=", 100))
842841
})
843842

844843
DescribeTable("time to first token is super linear of prefill against number of prompt tokens",
845844
func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
845+
simulator.config.PrefillComplexity = "n^2"
846846
for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
847-
square := prefillOverhead * nTokens * nTokens
848847
simulator.config.PrefillOverhead = prefillOverhead
849-
timeToFirst := simulator.getTimeToFirstToken(false, nTokens)
848+
timeToFirst := simulator.getTimeToFirstToken(nTokens, false)
849+
850+
square := prefillOverhead * nTokens * nTokens
850851
diffRatio := math.Abs(float64(timeToFirst-square)) / float64(square)
851-
Expect(diffRatio).To(BeNumerically("<", tolerance))
852+
Expect(diffRatio).To(BeNumerically("<=", tolerance))
852853
}
853854
},
854855
func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
@@ -865,11 +866,12 @@ var _ = Describe("Simulator", func() {
865866
simulator.config.PrefillComplexity = "nlog(n)"
866867

867868
for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
868-
nlogn := int(float64(prefillOverhead) * float64(nTokens) * math.Log2(float64(nTokens)))
869869
simulator.config.PrefillOverhead = prefillOverhead
870-
timeToFirst := simulator.getTimeToFirstToken(false, nTokens)
870+
timeToFirst := simulator.getTimeToFirstToken(nTokens, false)
871+
872+
nlogn := int(float64(prefillOverhead) * float64(nTokens) * math.Log2(float64(nTokens)))
871873
diffRatio := math.Abs(float64(timeToFirst-nlogn)) / float64(nlogn)
872-
Expect(diffRatio).To(BeNumerically("<", tolerance))
874+
Expect(diffRatio).To(BeNumerically("<=", tolerance))
873875
}
874876
},
875877
func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
@@ -880,6 +882,69 @@ var _ = Describe("Simulator", func() {
880882
Entry("medium numbers, larger range", 200, 0.1, 50, 100),
881883
Entry("large numbers", 150, 0.05, 20000, 20010),
882884
)
885+
886+
It("when <kv-cache-transfer-latency> not 0, ignore <kv-cache-transfer-overhead>", func() {
887+
overhead := 100
888+
simulator.config.KVCacheTransferLatency = 1000
889+
simulator.config.KVCacheTransferOverhead = overhead
890+
timeToFirst := simulator.getTimeToFirstToken(1, false)
891+
Expect(timeToFirst).To(BeNumerically(">=", overhead))
892+
})
893+
894+
It("when <kv-cache-transfer-latency> is 0, and <kv-cache-transfer-overhead> is not 0, use <kv-cache-transfer-overhead>", func() {
895+
overhead := 100
896+
simulator.config.KVCacheTransferLatency = 0
897+
simulator.config.KVCacheTransferOverhead = overhead
898+
timeToFirst := simulator.getTimeToFirstToken(1, false)
899+
Expect(timeToFirst).To(BeNumerically(">", 0))
900+
})
901+
902+
DescribeTable("When remote kv cache transfer is enabled with \"linear\" policy, time to first token is linear of kv cache transfer against number of prompt tokens",
903+
func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
904+
simulator.config.TimeToFirstToken = 0
905+
simulator.config.PrefillOverhead = 1
906+
simulator.config.KVCacheTransferComplexity = "linear"
907+
908+
for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
909+
simulator.config.KVCacheTransferOverhead = kvCacheOverhead
910+
timeToFirst := simulator.getTimeToFirstToken(nTokens, true)
911+
912+
linear := kvCacheOverhead * nTokens
913+
diffRatio := math.Abs(float64(timeToFirst-linear)) / float64(linear)
914+
Expect(diffRatio).To(BeNumerically("<=", tolerance))
915+
}
916+
},
917+
func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
918+
return fmt.Sprintf("kvCacheOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d",
919+
kvCacheOverhead, tolerance, minNTokens, maxNTokens)
920+
},
921+
Entry("small numbers", 100, 0.1, 1, 10),
922+
Entry("medium numbers, larger range", 200, 0.1, 50, 100),
923+
Entry("large numbers", 150, 0.05, 20000, 20010),
924+
)
925+
926+
DescribeTable("When remote kv cache transfer is enabled with \"in-place\" policy, time to first token should not be impacted by number of prompt tokens",
927+
func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
928+
simulator.config.TimeToFirstToken = 0
929+
simulator.config.PrefillOverhead = 1
930+
simulator.config.KVCacheTransferComplexity = "in-place"
931+
for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
932+
simulator.config.KVCacheTransferOverhead = kvCacheOverhead
933+
timeToFirst := simulator.getTimeToFirstToken(nTokens, true)
934+
935+
inPlace := kvCacheOverhead
936+
diffRatio := math.Abs(float64(timeToFirst-inPlace)) / float64(inPlace)
937+
Expect(diffRatio).To(BeNumerically("<=", tolerance))
938+
}
939+
},
940+
func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
941+
return fmt.Sprintf("kvCacheOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d",
942+
kvCacheOverhead, tolerance, minNTokens, maxNTokens)
943+
},
944+
Entry("small numbers", 100, 0.1, 1, 10),
945+
Entry("medium numbers, larger range", 200, 0.1, 50, 100),
946+
Entry("large numbers", 150, 0.05, 20000, 20010),
947+
)
883948
})
884949

885950
Context("fake metrics", func() {

pkg/llm-d-inference-sim/streaming.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, nPrompt
9696
// sendTokenChunks creates and sends response chunks
9797
func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, nPromptTokens int, genTokens []string, tc *openaiserverapi.ToolCall, finishReason string) {
9898
// time to first token delay
99-
time.Sleep(time.Duration(s.getTimeToFirstToken(context.doRemotePrefill, nPromptTokens)) * time.Millisecond)
99+
time.Sleep(time.Duration(s.getTimeToFirstToken(nPromptTokens, context.doRemotePrefill)) * time.Millisecond)
100100

101101
for i, token := range genTokens {
102102
if i != 0 {

0 commit comments

Comments (0)