Skip to content

Commit d95d66d

Browse files
committed
Add stddev for prefill overhead and kvcache trans overhead
Signed-off-by: Qifan Deng <[email protected]>
1 parent 62be531 commit d95d66d

File tree

4 files changed

+94
-55
lines changed

4 files changed

+94
-55
lines changed

pkg/common/config.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,8 @@ type Configuration struct {
7676
// PrefillOverhead time taken to prefill the context, in milliseconds
7777
// PrefillOverhead along with PrefillComplexity defines the time taken to prefill the context
7878
PrefillOverhead int `yaml:"prefill-overhead" json:"prefill-overhead"`
79+
// PrefillOverheadStdDev is the standard deviation of the time to prefill, in milliseconds (similar to TimeToFirstTokenStdDev)
80+
PrefillOverheadStdDev int `yaml:"prefill-overhead-std-dev" json:"prefill-overhead-std-dev"`
7981
// options are "n^2" and "nlog(n)"
8082
PrefillComplexity string `yaml:"prefill-complexity" json:"prefill-complexity"`
8183

@@ -98,6 +100,8 @@ type Configuration struct {
98100
// in milliseconds.
99101
// KVCacheTransferOverhead along with KVCacheTransferComplexity defines the time taken to transfer kv-cache.
100102
KVCacheTransferOverhead int `yaml:"kv-cache-transfer-overhead" json:"kv-cache-transfer-overhead"`
103+
// KVCacheTransferOverheadStdDev is the standard deviation of the time to transfer kv-cache, in milliseconds (similar to TimeToFirstTokenStdDev)
104+
KVCacheTransferOverheadStdDev int `yaml:"kv-cache-transfer-overhead-std-dev" json:"kv-cache-transfer-overhead-std-dev"`
101105
// options are "linear" and "in-place", default is "linear"
102106
KVCacheTransferComplexity string `yaml:"kv-cache-transfer-complexity" json:"kv-cache-transfer-complexity"`
103107

@@ -328,6 +332,9 @@ func (c *Configuration) validate() error {
328332
return errors.New("prefill overhead complexity is set, but prefill overhead is 0")
329333
}
330334
}
335+
if c.PrefillOverheadStdDev < 0 {
336+
return errors.New("prefill overhead standard deviation cannot be negative")
337+
}
331338
if c.PrefillComplexity != "" && c.PrefillComplexity != "n^2" && c.PrefillComplexity != "nlog(n)" {
332339
return errors.New("prefill overhead complexity should be either \"n^2\" or \"nlog(n)\"")
333340
}
@@ -347,6 +354,9 @@ func (c *Configuration) validate() error {
347354
return errors.New("kv-cache transfer complexity is set, but kv-cache transfer overhead is 0")
348355
}
349356
}
357+
if c.KVCacheTransferOverheadStdDev < 0 {
358+
return errors.New("kv-cache transfer overhead standard deviation cannot be negative")
359+
}
350360
if c.KVCacheTransferComplexity != "" && c.KVCacheTransferComplexity != "linear" && c.KVCacheTransferComplexity != "in-place" {
351361
return errors.New("kv-cache transfer complexity should be either \"linear\" or \"in-place\"")
352362
}
@@ -469,13 +479,15 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
469479
f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
470480
f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)")
471481
f.IntVar(&config.PrefillOverhead, "prefill-overhead", config.PrefillOverhead, "Time to prefill in milliseconds. This argument is ignored if <time-to-first-token> is not 0.")
482+
f.IntVar(&config.PrefillOverheadStdDev, "prefill-overhead-std-dev", config.PrefillOverheadStdDev, "Standard deviation for time to prefill (in milliseconds)")
472483
f.StringVar(&config.PrefillComplexity, "prefill-complexity", config.PrefillComplexity, "Complexity of prefill based on token length. Options are \"n^2\" and \"nlog(n)\". Default is \"n^2\".")
473484
f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")
474485
f.IntVar(&config.InterTokenLatencyStdDev, "inter-token-latency-std-dev", config.InterTokenLatencyStdDev, "Standard deviation for time between generated tokens (in milliseconds)")
475486
f.IntVar(&config.TimeToFirstTokenStdDev, "time-to-first-token-std-dev", config.TimeToFirstTokenStdDev, "Standard deviation for time before the first token will be returned (in milliseconds)")
476487
f.IntVar(&config.KVCacheTransferLatencyStdDev, "kv-cache-transfer-latency-std-dev", config.KVCacheTransferLatencyStdDev, "Standard deviation for time for KV-cache transfer from a remote vLLM (in milliseconds)")
477488
f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)")
478489
f.IntVar(&config.KVCacheTransferOverhead, "kv-cache-transfer-overhead", config.KVCacheTransferOverhead, "Time to transfer kv-cache in milliseconds. This argument is ignored if <kv-cache-transfer-latency> is not set.")
490+
f.IntVar(&config.KVCacheTransferOverheadStdDev, "kv-cache-transfer-overhead-std-dev", config.KVCacheTransferOverheadStdDev, "Standard deviation for time to transfer kv-cache (in milliseconds)")
479491
f.StringVar(&config.KVCacheTransferComplexity, "kv-cache-transfer-complexity", config.KVCacheTransferComplexity, "Complexity of kv-cache transfer based on token length. Options are \"linear\" and \"in-place\". Default is \"linear\".")
480492

481493
f.IntVar(&config.MaxToolCallIntegerParam, "max-tool-call-integer-param", config.MaxToolCallIntegerParam, "Maximum possible value of integer parameters in a tool call")

pkg/common/config_test.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,11 @@ var _ = Describe("Simulator configuration", func() {
406406
args: []string{"cmd", "--prefill-overhead", "-1",
407407
"--config", "../../manifests/config.yaml"},
408408
},
409+
{
410+
name: "invalid (negative) prefill-overhead-std-dev",
411+
args: []string{"cmd", "--prefill-overhead-std-dev", "-1",
412+
"--config", "../../manifests/config.yaml"},
413+
},
409414
{
410415
name: "<prefill-overhead> must be set when <prefill-complexity> is set",
411416
args: []string{"cmd", "--prefill-complexity", "n^2", "--config", "../../manifests/config.yaml"},
@@ -419,6 +424,11 @@ var _ = Describe("Simulator configuration", func() {
419424
args: []string{"cmd", "--kv-cache-transfer-overhead", "-1",
420425
"--config", "../../manifests/config.yaml"},
421426
},
427+
{
428+
name: "invalid (negative) kv-cache-transfer-overhead-std-dev",
429+
args: []string{"cmd", "--kv-cache-transfer-overhead-std-dev", "-1",
430+
"--config", "../../manifests/config.yaml"},
431+
},
422432
{
423433
name: "<kv-cache-transfer-overhead> must be set when <kv-cache-transfer-complexity> is set",
424434
args: []string{"cmd", "--kv-cache-transfer-complexity", "linear", "--config", "../../manifests/config.yaml"},

pkg/llm-d-inference-sim/simulator.go

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -677,7 +677,6 @@ func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, doRemotePrefill b
677677
}
678678
return s.calcPrefillOverhead(nPromptTokens, doRemotePrefill)
679679
}
680-
fmt.Printf("get time to first token %d, nPromptTokens %d, doRemotePrefill %v\n", s.config.TimeToFirstToken, nPromptTokens, doRemotePrefill)
681680

682681
mean := float64(s.config.TimeToFirstToken)
683682
stddev := float64(s.config.TimeToFirstTokenStdDev)
@@ -712,32 +711,31 @@ func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int, doRemotePrefill b
712711
pfOverhead := s.config.PrefillOverhead
713712
complexity := s.config.PrefillComplexity
714713
// policies of different complexities of prefill implementation
714+
overhead := 0
715715
switch complexity {
716716
case "n^2", "":
717717
// this is a simple implementation of n^2
718-
return pfOverhead * nPromptTokens * nPromptTokens
718+
overhead = pfOverhead * nPromptTokens * nPromptTokens
719719
case "nlog(n)":
720-
return int(float64(pfOverhead) * (float64(nPromptTokens) * math.Log2(float64(nPromptTokens))))
720+
overhead = int(float64(pfOverhead) * (float64(nPromptTokens) * math.Log2(float64(nPromptTokens))))
721721
}
722-
// should never reach here
723-
return 0
722+
return int(common.RandomNorm(float64(overhead), float64(s.config.PrefillOverheadStdDev)))
724723
}
725724

726725
// calc the remote prefill overhead against number of tokens
727726
func (s *VllmSimulator) calcRemotePrefillOverhead(nPromptTokens int) int {
728727
overhead := s.config.KVCacheTransferOverhead
729728
complexity := s.config.KVCacheTransferComplexity
729+
total := 0
730730
switch complexity {
731731
case "linear", "":
732-
fmt.Printf("linear complexity, overhead %d, nPromptTokens %d\n", overhead, nPromptTokens)
733-
return overhead * nPromptTokens
732+
total = overhead * nPromptTokens
734733
case "in-place":
735734
// when the context is already filled
736735
// this is a simple implementation which returns a defined overhead
737-
return overhead
736+
total = overhead
738737
}
739-
// should never reach here
740-
return 0
738+
return int(common.RandomNorm(float64(total), float64(s.config.KVCacheTransferOverheadStdDev)))
741739
}
742740

743741
// createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist

pkg/llm-d-inference-sim/simulator_test.go

Lines changed: 64 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -847,46 +847,57 @@ var _ = Describe("Simulator", func() {
847847
})
848848

849849
DescribeTable("time to first token is super linear of prefill against number of prompt tokens",
850-
func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
850+
func(prefillOverhead int, PrefillOverheadStdDev int, minNTokens int, maxNTokens int) {
851+
simulator.config.TimeToFirstToken = 0
851852
simulator.config.PrefillComplexity = "n^2"
853+
simulator.config.PrefillOverhead = prefillOverhead
854+
simulator.config.PrefillOverheadStdDev = PrefillOverheadStdDev
855+
852856
for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
853-
simulator.config.PrefillOverhead = prefillOverhead
854857
timeToFirst := simulator.getTimeToFirstToken(nTokens, false)
855858

856-
square := prefillOverhead * nTokens * nTokens
857-
diffRatio := math.Abs(float64(timeToFirst-square)) / float64(square)
858-
Expect(diffRatio).To(BeNumerically("<=", tolerance))
859+
n2 := prefillOverhead * nTokens * nTokens
860+
n2logn := n2 * int(math.Log2(float64(nTokens)))
861+
nlogn := prefillOverhead * nTokens * int(math.Log2(float64(nTokens)))
862+
863+
Expect(timeToFirst).To(BeNumerically(">", int(float64(nlogn)*0.3)))
864+
Expect(timeToFirst).To(BeNumerically("<", int(float64(n2logn)*1.7)))
859865
}
860866
},
861-
func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
862-
return fmt.Sprintf("prefillOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d",
863-
prefillOverhead, tolerance, minNTokens, maxNTokens)
867+
func(prefillOverhead int, PrefillOverheadStdDev int, minNTokens int, maxNTokens int) string {
868+
return fmt.Sprintf("prefillOverhead: %d stddev: %d minNTokens: %d maxNTokens: %d",
869+
prefillOverhead, PrefillOverheadStdDev, minNTokens, maxNTokens)
864870
},
865-
Entry("small numbers", 100, 0.1, 1, 10),
866-
Entry("medium numbers, larger range", 200, 0.1, 50, 100),
867-
Entry("large numbers", 150, 0.05, 20000, 20010),
871+
Entry("small numbers", 100, 50, 2, 10),
872+
Entry("medium numbers, larger range", 200, 100, 50, 100),
873+
Entry("large numbers", 150, 125, 20000, 20010),
874+
Entry("stddev is 0", 150, 0, 20000, 20010),
868875
)
869876

870877
DescribeTable("time to first token is log-linear of prefill against number of prompt tokens",
871-
func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
878+
func(prefillOverhead int, prefillOverheadStdDev int, minNTokens int, maxNTokens int) {
879+
simulator.config.TimeToFirstToken = 0
872880
simulator.config.PrefillComplexity = "nlog(n)"
881+
simulator.config.PrefillOverhead = prefillOverhead
882+
simulator.config.PrefillOverheadStdDev = prefillOverheadStdDev
873883

874884
for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
875-
simulator.config.PrefillOverhead = prefillOverhead
876885
timeToFirst := simulator.getTimeToFirstToken(nTokens, false)
877886

878-
nlogn := int(float64(prefillOverhead) * float64(nTokens) * math.Log2(float64(nTokens)))
879-
diffRatio := math.Abs(float64(timeToFirst-nlogn)) / float64(nlogn)
880-
Expect(diffRatio).To(BeNumerically("<=", tolerance))
887+
logn := prefillOverhead * int(math.Log2(float64(nTokens)))
888+
n2 := prefillOverhead * nTokens * nTokens
889+
Expect(timeToFirst).To(BeNumerically(">", int(float64(logn)*0.3)))
890+
Expect(timeToFirst).To(BeNumerically("<", int(float64(n2)*1.7)))
881891
}
882892
},
883-
func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
884-
return fmt.Sprintf("prefillOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d",
885-
prefillOverhead, tolerance, minNTokens, maxNTokens)
893+
func(prefillOverhead int, prefillOverheadStdDev int, minNTokens int, maxNTokens int) string {
894+
return fmt.Sprintf("prefillOverhead: %d stddev: %d minNTokens: %d maxNTokens: %d",
895+
prefillOverhead, prefillOverheadStdDev, minNTokens, maxNTokens)
886896
},
887-
Entry("small numbers", 100, 0.1, 2, 10),
888-
Entry("medium numbers, larger range", 200, 0.1, 50, 100),
889-
Entry("large numbers", 150, 0.05, 20000, 20010),
897+
Entry("small numbers", 100, 50, 2, 10),
898+
Entry("medium numbers, larger range", 200, 100, 50, 100),
899+
Entry("large numbers", 150, 125, 20000, 20010),
900+
Entry("stddev is 0", 150, 0, 20000, 20010),
890901
)
891902

892903
It("when <kv-cache-transfer-latency> not 0, ignore <kv-cache-transfer-overhead>", func() {
@@ -906,50 +917,58 @@ var _ = Describe("Simulator", func() {
906917
})
907918

908919
DescribeTable("When remote kv cache transfer is enabled with \"linear\" policy, time to first token is linear of kv cache transfer against number of prompt tokens",
909-
func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
920+
func(kvCacheOverhead int, stddev int, minNTokens int, maxNTokens int) {
910921
simulator.config.TimeToFirstToken = 0
911922
simulator.config.PrefillOverhead = 1
912923
simulator.config.KVCacheTransferComplexity = "linear"
924+
simulator.config.KVCacheTransferOverheadStdDev = stddev
925+
simulator.config.KVCacheTransferOverhead = kvCacheOverhead
913926

914927
for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
915-
simulator.config.KVCacheTransferOverhead = kvCacheOverhead
916928
timeToFirst := simulator.getTimeToFirstToken(nTokens, true)
917929

918-
linear := kvCacheOverhead * nTokens
919-
diffRatio := math.Abs(float64(timeToFirst-linear)) / float64(linear)
920-
Expect(diffRatio).To(BeNumerically("<=", tolerance))
930+
n2 := kvCacheOverhead * nTokens * nTokens
931+
logn := kvCacheOverhead * int(math.Log2(float64(nTokens)))
932+
Expect(timeToFirst).To(BeNumerically(">", int(float64(logn)*0.3)))
933+
Expect(timeToFirst).To(BeNumerically("<", int(float64(n2)*1.7)))
921934
}
922935
},
923-
func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
924-
return fmt.Sprintf("kvCacheOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d",
925-
kvCacheOverhead, tolerance, minNTokens, maxNTokens)
936+
func(kvCacheOverhead int, stddev int, minNTokens int, maxNTokens int) string {
937+
return fmt.Sprintf("kvCacheOverhead: %d stddev: %d minNTokens: %d maxNTokens: %d",
938+
kvCacheOverhead, stddev, minNTokens, maxNTokens)
926939
},
927-
Entry("small numbers", 100, 0.1, 1, 10),
928-
Entry("medium numbers, larger range", 200, 0.1, 50, 100),
929-
Entry("large numbers", 150, 0.05, 20000, 20010),
940+
Entry("small numbers", 100, 50, 2, 10),
941+
Entry("medium numbers, larger range", 200, 180, 50, 100),
942+
Entry("large numbers", 150, 70, 20000, 20010),
943+
Entry("stddev is 0", 150, 0, 20000, 20010),
930944
)
931945

932946
DescribeTable("When remote kv cache transfer is enabled with \"in-place\" policy, time to first token should not be impacted by number of prompt tokens",
933-
func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
947+
func(kvCacheTransOverhead int, kvCacheTransOverheadStdDev int, minNTokens int, maxNTokens int) {
934948
simulator.config.TimeToFirstToken = 0
935949
simulator.config.PrefillOverhead = 1
936950
simulator.config.KVCacheTransferComplexity = "in-place"
951+
simulator.config.KVCacheTransferOverheadStdDev = kvCacheTransOverheadStdDev
952+
simulator.config.KVCacheTransferOverhead = kvCacheTransOverhead
953+
954+
var ttfts []int
937955
for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
938-
simulator.config.KVCacheTransferOverhead = kvCacheOverhead
939956
timeToFirst := simulator.getTimeToFirstToken(nTokens, true)
940-
941-
inPlace := kvCacheOverhead
942-
diffRatio := math.Abs(float64(timeToFirst-inPlace)) / float64(inPlace)
943-
Expect(diffRatio).To(BeNumerically("<=", tolerance))
957+
ttfts = append(ttfts, timeToFirst)
944958
}
959+
// compute the standard deviation of the collected time-to-first-token samples
960+
stdv := common.StdDevInt(ttfts)
961+
fmt.Printf("ttfts: %v, stdv: %f\n", ttfts, stdv)
962+
Expect(stdv).To(BeNumerically("<=", kvCacheTransOverheadStdDev))
945963
},
946-
func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
947-
return fmt.Sprintf("kvCacheOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d",
948-
kvCacheOverhead, tolerance, minNTokens, maxNTokens)
964+
func(kvCacheTransOverhead int, kvCacheTransOverheadStdDev int, minNTokens int, maxNTokens int) string {
965+
return fmt.Sprintf("kvCacheTransferOverhead: %d kvCacheTransferOverheadStdDev: %d minNTokens: %d maxNTokens: %d",
966+
kvCacheTransOverhead, kvCacheTransOverheadStdDev, minNTokens, maxNTokens)
949967
},
950-
Entry("small numbers", 100, 0.1, 1, 10),
951-
Entry("medium numbers, larger range", 200, 0.1, 50, 100),
952-
Entry("large numbers", 150, 0.05, 20000, 20010),
968+
Entry("small numbers", 100, 50, 2, 10),
969+
Entry("medium numbers, larger range", 200, 150, 50, 100),
970+
Entry("large numbers", 150, 200, 20000, 20010),
971+
Entry("stddev is 0", 150, 0, 20000, 20010),
953972
)
954973
})
955974
})

0 commit comments

Comments
 (0)