Add standard deviation (stddev) options for prefill overhead and KV-cache transfer overhead

pancak3 · pancak3 · commit 679e6ff3ff33 · 2025-08-25T20:59:30.000+10:00
Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
diff --git a/pkg/common/config.go b/pkg/common/config.go
@@ -69,6 +69,8 @@ type Configuration struct {
 	// PrefillOverhead time taken to prefill the context, in milliseconds
 	// PrefillOverhead along with PrefillComplexity defines the time taken to prefill the context
 	PrefillOverhead int `yaml:"prefill-overhead" json:"prefill-overhead"`
+	// PrefillOverheadStdDev is the standard deviation of the computed prefill time, in milliseconds (similar to TimeToFirstTokenStdDev)
+	PrefillOverheadStdDev int `yaml:"prefill-overhead-std-dev" json:"prefill-overhead-std-dev"`
 	// options are "n^2" and "nlog(n)"
 	PrefillComplexity string `yaml:"prefill-complexity" json:"prefill-complexity"`
 
@@ -91,6 +93,8 @@ type Configuration struct {
 	// in milliseconds.
 	// KVCacheTransferOverhead along with KVCacheTransferComplexity defines the time taken to transfer kv-cache.
 	KVCacheTransferOverhead int `yaml:"kv-cache-transfer-overhead" json:"kv-cache-transfer-overhead"`
+	// KVCacheTransferOverheadStdDev is the standard deviation of the computed kv-cache transfer time, in milliseconds (similar to TimeToFirstTokenStdDev)
+	KVCacheTransferOverheadStdDev int `yaml:"kv-cache-transfer-overhead-std-dev" json:"kv-cache-transfer-overhead-std-dev"`
 	// options are "linear" and "in-place", default is "linear"
 	KVCacheTransferComplexity string `yaml:"kv-cache-transfer-complexity" json:"kv-cache-transfer-complexity"`
 
@@ -316,6 +320,9 @@ func (c *Configuration) validate() error {
 			return errors.New("prefill overhead complexity is set, but prefill overhead is 0")
 		}
 	}
+	if c.PrefillOverheadStdDev < 0 {
+		return errors.New("prefill overhead standard deviation cannot be negative")
+	}
 	if c.PrefillComplexity != "" && c.PrefillComplexity != "n^2" && c.PrefillComplexity != "nlog(n)" {
 		return errors.New("prefill overhead complexity should be either \"n^2\" or \"nlog(n)\"")
 	}
@@ -335,6 +342,9 @@ func (c *Configuration) validate() error {
 			return errors.New("kv-cache transfer complexity is set, but kv-cache transfer overhead is 0")
 		}
 	}
+	if c.KVCacheTransferOverheadStdDev < 0 {
+		return errors.New("kv-cache transfer overhead standard deviation cannot be negative")
+	}
 	if c.KVCacheTransferComplexity != "" && c.KVCacheTransferComplexity != "linear" && c.KVCacheTransferComplexity != "in-place" {
 		return errors.New("kv-cache transfer complexity should be either \"linear\" or \"in-place\"")
 	}
@@ -436,13 +446,15 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 	f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
 	f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)")
 	f.IntVar(&config.PrefillOverhead, "prefill-overhead", config.PrefillOverhead, "Time to prefill in milliseconds. This argument is ignored if <time-to-first-token> is not 0.")
+	f.IntVar(&config.PrefillOverheadStdDev, "prefill-overhead-std-dev", config.PrefillOverheadStdDev, "Standard deviation for time to prefill (in milliseconds)")
 	f.StringVar(&config.PrefillComplexity, "prefill-complexity", config.PrefillComplexity, "Complexity of prefill based on token length. Options are \"n^2\" and \"nlog(n)\". Default is \"n^2\".")
 	f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")
 	f.IntVar(&config.InterTokenLatencyStdDev, "inter-token-latency-std-dev", config.InterTokenLatencyStdDev, "Standard deviation for time between generated tokens (in milliseconds)")
 	f.IntVar(&config.TimeToFirstTokenStdDev, "time-to-first-token-std-dev", config.TimeToFirstTokenStdDev, "Standard deviation for time before the first token will be returned (in milliseconds)")
 	f.IntVar(&config.KVCacheTransferLatencyStdDev, "kv-cache-transfer-latency-std-dev", config.KVCacheTransferLatencyStdDev, "Standard deviation for time for KV-cache transfer from a remote vLLM (in milliseconds)")
 	f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)")
 	f.IntVar(&config.KVCacheTransferOverhead, "kv-cache-transfer-overhead", config.KVCacheTransferOverhead, "Time to transfer kv-cache in milliseconds. This argument is ignored if <kv-cache-transfer-latency> is not set.")
+	f.IntVar(&config.KVCacheTransferOverheadStdDev, "kv-cache-transfer-overhead-std-dev", config.KVCacheTransferOverheadStdDev, "Standard deviation for time to transfer kv-cache (in milliseconds)")
 	f.StringVar(&config.KVCacheTransferComplexity, "kv-cache-transfer-complexity", config.KVCacheTransferComplexity, "Complexity of kv-cache transfer based on token length. Options are \"linear\" and \"in-place\". Default is \"linear\".")
 
 	f.IntVar(&config.MaxToolCallIntegerParam, "max-tool-call-integer-param", config.MaxToolCallIntegerParam, "Maximum possible value of integer parameters in a tool call")
diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go
@@ -393,6 +393,11 @@ var _ = Describe("Simulator configuration", func() {
 			args: []string{"cmd", "--prefill-overhead", "-1",
 				"--config", "../../manifests/config.yaml"},
 		},
+		{
+			name: "invalid (negative) prefill-overhead-std-dev",
+			args: []string{"cmd", "--prefill-overhead-std-dev", "-1",
+				"--config", "../../manifests/config.yaml"},
+		},
 		{
 			name: "<prefill-overhead> must be set when <prefill-complexity> is set",
 			args: []string{"cmd", "--prefill-complexity", "n^2", "--config", "../../manifests/config.yaml"},
@@ -406,6 +411,11 @@ var _ = Describe("Simulator configuration", func() {
 			args: []string{"cmd", "--kv-cache-transfer-overhead", "-1",
 				"--config", "../../manifests/config.yaml"},
 		},
+		{
+			name: "invalid (negative) kv-cache-transfer-overhead-std-dev",
+			args: []string{"cmd", "--kv-cache-transfer-overhead-std-dev", "-1",
+				"--config", "../../manifests/config.yaml"},
+		},
 		{
 			name: "<kv-cache-transfer-overhead> must be set when <kv-cache-transfer-complexity> is set",
 			args: []string{"cmd", "--kv-cache-transfer-complexity", "linear", "--config", "../../manifests/config.yaml"},
diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go
@@ -664,7 +664,6 @@ func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, doRemotePrefill b
 		}
 		return s.calcPrefillOverhead(nPromptTokens, doRemotePrefill)
 	}
-	fmt.Printf("get time to first token %d, nPromptTokens %d, doRemotePrefill %v\n", s.config.TimeToFirstToken, nPromptTokens, doRemotePrefill)
 
 	mean := float64(s.config.TimeToFirstToken)
 	stddev := float64(s.config.TimeToFirstTokenStdDev)
@@ -699,32 +698,31 @@ func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int, doRemotePrefill b
 	pfOverhead := s.config.PrefillOverhead
 	complexity := s.config.PrefillComplexity
 	// policies of different complexities of prefill implementation
+	overhead := 0
 	switch complexity {
 	case "n^2", "":
 		// this is simple implementation of n^2
-		return pfOverhead * nPromptTokens * nPromptTokens
+		overhead = pfOverhead * nPromptTokens * nPromptTokens
 	case "nlog(n)":
-		return int(float64(pfOverhead) * (float64(nPromptTokens) * math.Log2(float64(nPromptTokens))))
+		overhead = int(float64(pfOverhead) * (float64(nPromptTokens) * math.Log2(float64(nPromptTokens))))
 	}
-	// should never reach here
-	return 0
+	return int(common.RandomNorm(float64(overhead), float64(s.config.PrefillOverheadStdDev)))
 }
 
 // calc the remote prefill overhead against number of tokens
 func (s *VllmSimulator) calcRemotePrefillOverhead(nPromptTokens int) int {
 	overhead := s.config.KVCacheTransferOverhead
 	complexity := s.config.KVCacheTransferComplexity
+	total := 0
 	switch complexity {
 	case "linear", "":
-		fmt.Printf("linear complexity, overhead %d, nPromptTokens %d\n", overhead, nPromptTokens)
-		return overhead * nPromptTokens
+		total = overhead * nPromptTokens
 	case "in-place":
 		// when the context is already filled
 		// this is a simple implementation which return a defined overhead
-		return overhead
+		total = overhead
 	}
-	// should never reach here
-	return 0
+	return int(common.RandomNorm(float64(total), float64(s.config.KVCacheTransferOverheadStdDev)))
 }
 
 // createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist
diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go
@@ -841,46 +841,57 @@ var _ = Describe("Simulator", func() {
 		})
 
 		DescribeTable("time to first token is super linear of prefill against number of prompt tokens",
-			func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
+			func(prefillOverhead int, PrefillOverheadStdDev int, minNTokens int, maxNTokens int) {
+				simulator.config.TimeToFirstToken = 0
 				simulator.config.PrefillComplexity = "n^2"
+				simulator.config.PrefillOverhead = prefillOverhead
+				simulator.config.PrefillOverheadStdDev = PrefillOverheadStdDev
+
 				for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
-					simulator.config.PrefillOverhead = prefillOverhead
 					timeToFirst := simulator.getTimeToFirstToken(nTokens, false)
 
-					square := prefillOverhead * nTokens * nTokens
-					diffRatio := math.Abs(float64(timeToFirst-square)) / float64(square)
-					Expect(diffRatio).To(BeNumerically("<=", tolerance))
+					n2 := prefillOverhead * nTokens * nTokens
+					n2logn := n2 * int(math.Log2(float64(nTokens)))
+					nlogn := prefillOverhead * nTokens * int(math.Log2(float64(nTokens)))
+
+					Expect(timeToFirst).To(BeNumerically(">", int(float64(nlogn)*0.3)))
+					Expect(timeToFirst).To(BeNumerically("<", int(float64(n2logn)*1.7)))
 				}
 			},
-			func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
-				return fmt.Sprintf("prefillOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d",
-					prefillOverhead, tolerance, minNTokens, maxNTokens)
+			func(prefillOverhead int, PrefillOverheadStdDev int, minNTokens int, maxNTokens int) string {
+				return fmt.Sprintf("prefillOverhead: %d stddev: %d minNTokens: %d maxNTokens: %d",
+					prefillOverhead, PrefillOverheadStdDev, minNTokens, maxNTokens)
 			},
-			Entry("small numbers", 100, 0.1, 1, 10),
-			Entry("medium numbers, larger range", 200, 0.1, 50, 100),
-			Entry("large numbers", 150, 0.05, 20000, 20010),
+			Entry("small numbers", 100, 50, 2, 10),
+			Entry("medium numbers, larger range", 200, 100, 50, 100),
+			Entry("large numbers", 150, 125, 20000, 20010),
+			Entry("stddev is 0", 150, 0, 20000, 20010),
 		)
 
 		DescribeTable("time to first token is log-linear of prefill against number of prompt tokens",
-			func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
+			func(prefillOverhead int, prefillOverheadStdDev int, minNTokens int, maxNTokens int) {
+				simulator.config.TimeToFirstToken = 0
 				simulator.config.PrefillComplexity = "nlog(n)"
+				simulator.config.PrefillOverhead = prefillOverhead
+				simulator.config.PrefillOverheadStdDev = prefillOverheadStdDev
 
 				for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
-					simulator.config.PrefillOverhead = prefillOverhead
 					timeToFirst := simulator.getTimeToFirstToken(nTokens, false)
 
-					nlogn := int(float64(prefillOverhead) * float64(nTokens) * math.Log2(float64(nTokens)))
-					diffRatio := math.Abs(float64(timeToFirst-nlogn)) / float64(nlogn)
-					Expect(diffRatio).To(BeNumerically("<=", tolerance))
+					logn := prefillOverhead * int(math.Log2(float64(nTokens)))
+					n2 := prefillOverhead * nTokens * nTokens
+					Expect(timeToFirst).To(BeNumerically(">", int(float64(logn)*0.3)))
+					Expect(timeToFirst).To(BeNumerically("<", int(float64(n2)*1.7)))
 				}
 			},
-			func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
-				return fmt.Sprintf("prefillOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d",
-					prefillOverhead, tolerance, minNTokens, maxNTokens)
+			func(prefillOverhead int, prefillOverheadStdDev int, minNTokens int, maxNTokens int) string {
+				return fmt.Sprintf("prefillOverhead: %d stddev: %d minNTokens: %d maxNTokens: %d",
+					prefillOverhead, prefillOverheadStdDev, minNTokens, maxNTokens)
 			},
-			Entry("small numbers", 100, 0.1, 2, 10),
-			Entry("medium numbers, larger range", 200, 0.1, 50, 100),
-			Entry("large numbers", 150, 0.05, 20000, 20010),
+			Entry("small numbers", 100, 50, 2, 10),
+			Entry("medium numbers, larger range", 200, 100, 50, 100),
+			Entry("large numbers", 150, 125, 20000, 20010),
+			Entry("stddev is 0", 150, 0, 20000, 20010),
 		)
 
 		It("when <kv-cache-transfer-latency> not 0, ignore <kv-cache-transfer-overhead>", func() {
@@ -900,50 +911,58 @@ var _ = Describe("Simulator", func() {
 		})
 
 		DescribeTable("When remote kv cache transfer is enabled with \"linear\" policy, time to first token is linear of kv cache transfer against number of prompt tokens",
-			func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
+			func(kvCacheOverhead int, stddev int, minNTokens int, maxNTokens int) {
 				simulator.config.TimeToFirstToken = 0
 				simulator.config.PrefillOverhead = 1
 				simulator.config.KVCacheTransferComplexity = "linear"
+				simulator.config.KVCacheTransferOverheadStdDev = stddev
+				simulator.config.KVCacheTransferOverhead = kvCacheOverhead
 
 				for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
-					simulator.config.KVCacheTransferOverhead = kvCacheOverhead
 					timeToFirst := simulator.getTimeToFirstToken(nTokens, true)
 
-					linear := kvCacheOverhead * nTokens
-					diffRatio := math.Abs(float64(timeToFirst-linear)) / float64(linear)
-					Expect(diffRatio).To(BeNumerically("<=", tolerance))
+					n2 := kvCacheOverhead * nTokens * nTokens
+					logn := kvCacheOverhead * int(math.Log2(float64(nTokens)))
+					Expect(timeToFirst).To(BeNumerically(">", int(float64(logn)*0.3)))
+					Expect(timeToFirst).To(BeNumerically("<", int(float64(n2)*1.7)))
 				}
 			},
-			func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
-				return fmt.Sprintf("kvCacheOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d",
-					kvCacheOverhead, tolerance, minNTokens, maxNTokens)
+			func(kvCacheOverhead int, stddev int, minNTokens int, maxNTokens int) string {
+				return fmt.Sprintf("kvCacheOverhead: %d stddev: %d minNTokens: %d maxNTokens: %d",
+					kvCacheOverhead, stddev, minNTokens, maxNTokens)
 			},
-			Entry("small numbers", 100, 0.1, 1, 10),
-			Entry("medium numbers, larger range", 200, 0.1, 50, 100),
-			Entry("large numbers", 150, 0.05, 20000, 20010),
+			Entry("small numbers", 100, 50, 2, 10),
+			Entry("medium numbers, larger range", 200, 180, 50, 100),
+			Entry("large numbers", 150, 70, 20000, 20010),
+			Entry("stddev is 0", 150, 0, 20000, 20010),
 		)
 
 		DescribeTable("When remote kv cache transfer is enabled with \"in-place\" policy, time to first token should not be impacted by number of prompt tokens",
-			func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
+			func(kvCacheTransOverhead int, kvCacheTransOverheadStdDev int, minNTokens int, maxNTokens int) {
 				simulator.config.TimeToFirstToken = 0
 				simulator.config.PrefillOverhead = 1
 				simulator.config.KVCacheTransferComplexity = "in-place"
+				simulator.config.KVCacheTransferOverheadStdDev = kvCacheTransOverheadStdDev
+				simulator.config.KVCacheTransferOverhead = kvCacheTransOverhead
+
+				var ttfts []int
 				for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
-					simulator.config.KVCacheTransferOverhead = kvCacheOverhead
 					timeToFirst := simulator.getTimeToFirstToken(nTokens, true)
-
-					inPlace := kvCacheOverhead
-					diffRatio := math.Abs(float64(timeToFirst-inPlace)) / float64(inPlace)
-					Expect(diffRatio).To(BeNumerically("<=", tolerance))
+					ttfts = append(ttfts, timeToFirst)
 				}
+				// compute the standard deviation of the collected time-to-first-token samples
+				stdv := common.StdDevInt(ttfts)
+				fmt.Printf("ttfts: %v, stdv: %f\n", ttfts, stdv)
+				Expect(stdv).To(BeNumerically("<=", kvCacheTransOverheadStdDev))
 			},
-			func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
-				return fmt.Sprintf("kvCacheOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d",
-					kvCacheOverhead, tolerance, minNTokens, maxNTokens)
+			func(kvCacheTransOverhead int, kvCacheTransOverheadStdDev int, minNTokens int, maxNTokens int) string {
+				return fmt.Sprintf("kvCacheTransferOverhead: %d kvCacheTransferOverheadStdDev: %d minNTokens: %d maxNTokens: %d",
+					kvCacheTransOverhead, kvCacheTransOverheadStdDev, minNTokens, maxNTokens)
 			},
-			Entry("small numbers", 100, 0.1, 1, 10),
-			Entry("medium numbers, larger range", 200, 0.1, 50, 100),
-			Entry("large numbers", 150, 0.05, 20000, 20010),
+			Entry("small numbers", 100, 50, 2, 10),
+			Entry("medium numbers, larger range", 200, 150, 50, 100),
+			Entry("large numbers", 150, 200, 20000, 20010),
+			Entry("stddev is 0", 150, 0, 20000, 20010),
 		)
 	})