Skip to content

Commit 1fd0a9a

Browse files
committed
Add stddev for prefill overhead and kvcache trans overhead
Signed-off-by: Qifan Deng <[email protected]>
1 parent 18d3075 commit 1fd0a9a

File tree

4 files changed

+94
-55
lines changed

4 files changed

+94
-55
lines changed

pkg/common/config.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,8 @@ type Configuration struct {
6969
// PrefillOverhead time taken to prefill the context, in milliseconds
7070
// PrefillOverhead along with PrefillComplexity defines the time taken to prefill the context
7171
PrefillOverhead int `yaml:"prefill-overhead" json:"prefill-overhead"`
72+
// PrefillOverheadStdDev is the standard deviation of the prefill overhead, in milliseconds (similar to TimeToFirstTokenStdDev)
73+
PrefillOverheadStdDev int `yaml:"prefill-overhead-std-dev" json:"prefill-overhead-std-dev"`
7274
// options are "n^2" and "nlog(n)"
7375
PrefillComplexity string `yaml:"prefill-complexity" json:"prefill-complexity"`
7476

@@ -91,6 +93,8 @@ type Configuration struct {
9193
// in milliseconds.
9294
// KVCacheTransferOverhead along with KVCacheTransferComplexity defines the time taken to transfer kv-cache.
9395
KVCacheTransferOverhead int `yaml:"kv-cache-transfer-overhead" json:"kv-cache-transfer-overhead"`
96+
// KVCacheTransferOverheadStdDev is the standard deviation of the kv-cache transfer overhead, in milliseconds (similar to TimeToFirstTokenStdDev)
97+
KVCacheTransferOverheadStdDev int `yaml:"kv-cache-transfer-overhead-std-dev" json:"kv-cache-transfer-overhead-std-dev"`
9498
// options are "linear" and "in-place", default is "linear"
9599
KVCacheTransferComplexity string `yaml:"kv-cache-transfer-complexity" json:"kv-cache-transfer-complexity"`
96100

@@ -316,6 +320,9 @@ func (c *Configuration) validate() error {
316320
return errors.New("prefill overhead complexity is set, but prefill overhead is 0")
317321
}
318322
}
323+
if c.PrefillOverheadStdDev < 0 {
324+
return errors.New("prefill overhead standard deviation cannot be negative")
325+
}
319326
if c.PrefillComplexity != "" && c.PrefillComplexity != "n^2" && c.PrefillComplexity != "nlog(n)" {
320327
return errors.New("prefill overhead complexity should be either \"n^2\" or \"nlog(n)\"")
321328
}
@@ -335,6 +342,9 @@ func (c *Configuration) validate() error {
335342
return errors.New("kv-cache transfer complexity is set, but kv-cache transfer overhead is 0")
336343
}
337344
}
345+
if c.KVCacheTransferOverheadStdDev < 0 {
346+
return errors.New("kv-cache transfer overhead standard deviation cannot be negative")
347+
}
338348
if c.KVCacheTransferComplexity != "" && c.KVCacheTransferComplexity != "linear" && c.KVCacheTransferComplexity != "in-place" {
339349
return errors.New("kv-cache transfer complexity should be either \"linear\" or \"in-place\"")
340350
}
@@ -436,13 +446,15 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
436446
f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
437447
f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)")
438448
f.IntVar(&config.PrefillOverhead, "prefill-overhead", config.PrefillOverhead, "Time to prefill in milliseconds. This argument is ignored if <time-to-first-token> is not 0.")
449+
f.IntVar(&config.PrefillOverheadStdDev, "prefill-overhead-std-dev", config.PrefillOverheadStdDev, "Standard deviation for time to prefill (in milliseconds)")
439450
f.StringVar(&config.PrefillComplexity, "prefill-complexity", config.PrefillComplexity, "Complexity of prefill based on token length. Options are \"n^2\" and \"nlog(n)\". Default is \"n^2\".")
440451
f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")
441452
f.IntVar(&config.InterTokenLatencyStdDev, "inter-token-latency-std-dev", config.InterTokenLatencyStdDev, "Standard deviation for time between generated tokens (in milliseconds)")
442453
f.IntVar(&config.TimeToFirstTokenStdDev, "time-to-first-token-std-dev", config.TimeToFirstTokenStdDev, "Standard deviation for time before the first token will be returned (in milliseconds)")
443454
f.IntVar(&config.KVCacheTransferLatencyStdDev, "kv-cache-transfer-latency-std-dev", config.KVCacheTransferLatencyStdDev, "Standard deviation for time for KV-cache transfer from a remote vLLM (in milliseconds)")
444455
f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)")
445456
f.IntVar(&config.KVCacheTransferOverhead, "kv-cache-transfer-overhead", config.KVCacheTransferOverhead, "Time to transfer kv-cache in milliseconds. This argument is ignored if <kv-cache-transfer-latency> is not set.")
457+
f.IntVar(&config.KVCacheTransferOverheadStdDev, "kv-cache-transfer-overhead-std-dev", config.KVCacheTransferOverheadStdDev, "Standard deviation for time to transfer kv-cache (in milliseconds)")
446458
f.StringVar(&config.KVCacheTransferComplexity, "kv-cache-transfer-complexity", config.KVCacheTransferComplexity, "Complexity of kv-cache transfer based on token length. Options are \"linear\" and \"in-place\". Default is \"linear\".")
447459

448460
f.IntVar(&config.MaxToolCallIntegerParam, "max-tool-call-integer-param", config.MaxToolCallIntegerParam, "Maximum possible value of integer parameters in a tool call")

pkg/common/config_test.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,11 @@ var _ = Describe("Simulator configuration", func() {
393393
args: []string{"cmd", "--prefill-overhead", "-1",
394394
"--config", "../../manifests/config.yaml"},
395395
},
396+
{
397+
name: "invalid (negative) prefill-overhead-std-dev",
398+
args: []string{"cmd", "--prefill-overhead-std-dev", "-1",
399+
"--config", "../../manifests/config.yaml"},
400+
},
396401
{
397402
name: "<prefill-overhead> must be set when <prefill-complexity> is set",
398403
args: []string{"cmd", "--prefill-complexity", "n^2", "--config", "../../manifests/config.yaml"},
@@ -406,6 +411,11 @@ var _ = Describe("Simulator configuration", func() {
406411
args: []string{"cmd", "--kv-cache-transfer-overhead", "-1",
407412
"--config", "../../manifests/config.yaml"},
408413
},
414+
{
415+
name: "invalid (negative) kv-cache-transfer-overhead-std-dev",
416+
args: []string{"cmd", "--kv-cache-transfer-overhead-std-dev", "-1",
417+
"--config", "../../manifests/config.yaml"},
418+
},
409419
{
410420
name: "<kv-cache-transfer-overhead> must be set when <kv-cache-transfer-complexity> is set",
411421
args: []string{"cmd", "--kv-cache-transfer-complexity", "linear", "--config", "../../manifests/config.yaml"},

pkg/llm-d-inference-sim/simulator.go

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -664,7 +664,6 @@ func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, doRemotePrefill b
664664
}
665665
return s.calcPrefillOverhead(nPromptTokens, doRemotePrefill)
666666
}
667-
fmt.Printf("get time to first token %d, nPromptTokens %d, doRemotePrefill %v\n", s.config.TimeToFirstToken, nPromptTokens, doRemotePrefill)
668667

669668
mean := float64(s.config.TimeToFirstToken)
670669
stddev := float64(s.config.TimeToFirstTokenStdDev)
@@ -699,32 +698,31 @@ func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int, doRemotePrefill b
699698
pfOverhead := s.config.PrefillOverhead
700699
complexity := s.config.PrefillComplexity
701700
// policies of different complexities of prefill implementation
701+
overhead := 0
702702
switch complexity {
703703
case "n^2", "":
704704
// this is a simple implementation of n^2
705-
return pfOverhead * nPromptTokens * nPromptTokens
705+
overhead = pfOverhead * nPromptTokens * nPromptTokens
706706
case "nlog(n)":
707-
return int(float64(pfOverhead) * (float64(nPromptTokens) * math.Log2(float64(nPromptTokens))))
707+
overhead = int(float64(pfOverhead) * (float64(nPromptTokens) * math.Log2(float64(nPromptTokens))))
708708
}
709-
// should never reach here
710-
return 0
709+
return int(common.RandomNorm(float64(overhead), float64(s.config.PrefillOverheadStdDev)))
711710
}
712711

713712
// calc the remote prefill overhead against number of tokens
714713
func (s *VllmSimulator) calcRemotePrefillOverhead(nPromptTokens int) int {
715714
overhead := s.config.KVCacheTransferOverhead
716715
complexity := s.config.KVCacheTransferComplexity
716+
total := 0
717717
switch complexity {
718718
case "linear", "":
719-
fmt.Printf("linear complexity, overhead %d, nPromptTokens %d\n", overhead, nPromptTokens)
720-
return overhead * nPromptTokens
719+
total = overhead * nPromptTokens
721720
case "in-place":
722721
// when the context is already filled
723722
// this is a simple implementation which returns a defined overhead
724-
return overhead
723+
total = overhead
725724
}
726-
// should never reach here
727-
return 0
725+
return int(common.RandomNorm(float64(total), float64(s.config.KVCacheTransferOverheadStdDev)))
728726
}
729727

730728
// createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist

pkg/llm-d-inference-sim/simulator_test.go

Lines changed: 64 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -841,46 +841,57 @@ var _ = Describe("Simulator", func() {
841841
})
842842

843843
DescribeTable("time to first token is super linear of prefill against number of prompt tokens",
844-
func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
844+
func(prefillOverhead int, PrefillOverheadStdDev int, minNTokens int, maxNTokens int) {
845+
simulator.config.TimeToFirstToken = 0
845846
simulator.config.PrefillComplexity = "n^2"
847+
simulator.config.PrefillOverhead = prefillOverhead
848+
simulator.config.PrefillOverheadStdDev = PrefillOverheadStdDev
849+
846850
for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
847-
simulator.config.PrefillOverhead = prefillOverhead
848851
timeToFirst := simulator.getTimeToFirstToken(nTokens, false)
849852

850-
square := prefillOverhead * nTokens * nTokens
851-
diffRatio := math.Abs(float64(timeToFirst-square)) / float64(square)
852-
Expect(diffRatio).To(BeNumerically("<=", tolerance))
853+
n2 := prefillOverhead * nTokens * nTokens
854+
n2logn := n2 * int(math.Log2(float64(nTokens)))
855+
nlogn := prefillOverhead * nTokens * int(math.Log2(float64(nTokens)))
856+
857+
Expect(timeToFirst).To(BeNumerically(">", int(float64(nlogn)*0.3)))
858+
Expect(timeToFirst).To(BeNumerically("<", int(float64(n2logn)*1.7)))
853859
}
854860
},
855-
func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
856-
return fmt.Sprintf("prefillOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d",
857-
prefillOverhead, tolerance, minNTokens, maxNTokens)
861+
func(prefillOverhead int, PrefillOverheadStdDev int, minNTokens int, maxNTokens int) string {
862+
return fmt.Sprintf("prefillOverhead: %d stddev: %d minNTokens: %d maxNTokens: %d",
863+
prefillOverhead, PrefillOverheadStdDev, minNTokens, maxNTokens)
858864
},
859-
Entry("small numbers", 100, 0.1, 1, 10),
860-
Entry("medium numbers, larger range", 200, 0.1, 50, 100),
861-
Entry("large numbers", 150, 0.05, 20000, 20010),
865+
Entry("small numbers", 100, 50, 2, 10),
866+
Entry("medium numbers, larger range", 200, 100, 50, 100),
867+
Entry("large numbers", 150, 125, 20000, 20010),
868+
Entry("stddev is 0", 150, 0, 20000, 20010),
862869
)
863870

864871
DescribeTable("time to first token is log-linear of prefill against number of prompt tokens",
865-
func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
872+
func(prefillOverhead int, prefillOverheadStdDev int, minNTokens int, maxNTokens int) {
873+
simulator.config.TimeToFirstToken = 0
866874
simulator.config.PrefillComplexity = "nlog(n)"
875+
simulator.config.PrefillOverhead = prefillOverhead
876+
simulator.config.PrefillOverheadStdDev = prefillOverheadStdDev
867877

868878
for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
869-
simulator.config.PrefillOverhead = prefillOverhead
870879
timeToFirst := simulator.getTimeToFirstToken(nTokens, false)
871880

872-
nlogn := int(float64(prefillOverhead) * float64(nTokens) * math.Log2(float64(nTokens)))
873-
diffRatio := math.Abs(float64(timeToFirst-nlogn)) / float64(nlogn)
874-
Expect(diffRatio).To(BeNumerically("<=", tolerance))
881+
logn := prefillOverhead * int(math.Log2(float64(nTokens)))
882+
n2 := prefillOverhead * nTokens * nTokens
883+
Expect(timeToFirst).To(BeNumerically(">", int(float64(logn)*0.3)))
884+
Expect(timeToFirst).To(BeNumerically("<", int(float64(n2)*1.7)))
875885
}
876886
},
877-
func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
878-
return fmt.Sprintf("prefillOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d",
879-
prefillOverhead, tolerance, minNTokens, maxNTokens)
887+
func(prefillOverhead int, prefillOverheadStdDev int, minNTokens int, maxNTokens int) string {
888+
return fmt.Sprintf("prefillOverhead: %d stddev: %d minNTokens: %d maxNTokens: %d",
889+
prefillOverhead, prefillOverheadStdDev, minNTokens, maxNTokens)
880890
},
881-
Entry("small numbers", 100, 0.1, 2, 10),
882-
Entry("medium numbers, larger range", 200, 0.1, 50, 100),
883-
Entry("large numbers", 150, 0.05, 20000, 20010),
891+
Entry("small numbers", 100, 50, 2, 10),
892+
Entry("medium numbers, larger range", 200, 100, 50, 100),
893+
Entry("large numbers", 150, 125, 20000, 20010),
894+
Entry("stddev is 0", 150, 0, 20000, 20010),
884895
)
885896

886897
It("when <kv-cache-transfer-latency> not 0, ignore <kv-cache-transfer-overhead>", func() {
@@ -900,50 +911,58 @@ var _ = Describe("Simulator", func() {
900911
})
901912

902913
DescribeTable("When remote kv cache transfer is enabled with \"linear\" policy, time to first token is linear of kv cache transfer against number of prompt tokens",
903-
func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
914+
func(kvCacheOverhead int, stddev int, minNTokens int, maxNTokens int) {
904915
simulator.config.TimeToFirstToken = 0
905916
simulator.config.PrefillOverhead = 1
906917
simulator.config.KVCacheTransferComplexity = "linear"
918+
simulator.config.KVCacheTransferOverheadStdDev = stddev
919+
simulator.config.KVCacheTransferOverhead = kvCacheOverhead
907920

908921
for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
909-
simulator.config.KVCacheTransferOverhead = kvCacheOverhead
910922
timeToFirst := simulator.getTimeToFirstToken(nTokens, true)
911923

912-
linear := kvCacheOverhead * nTokens
913-
diffRatio := math.Abs(float64(timeToFirst-linear)) / float64(linear)
914-
Expect(diffRatio).To(BeNumerically("<=", tolerance))
924+
n2 := kvCacheOverhead * nTokens * nTokens
925+
logn := kvCacheOverhead * int(math.Log2(float64(nTokens)))
926+
Expect(timeToFirst).To(BeNumerically(">", int(float64(logn)*0.3)))
927+
Expect(timeToFirst).To(BeNumerically("<", int(float64(n2)*1.7)))
915928
}
916929
},
917-
func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
918-
return fmt.Sprintf("kvCacheOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d",
919-
kvCacheOverhead, tolerance, minNTokens, maxNTokens)
930+
func(kvCacheOverhead int, stddev int, minNTokens int, maxNTokens int) string {
931+
return fmt.Sprintf("kvCacheOverhead: %d stddev: %d minNTokens: %d maxNTokens: %d",
932+
kvCacheOverhead, stddev, minNTokens, maxNTokens)
920933
},
921-
Entry("small numbers", 100, 0.1, 1, 10),
922-
Entry("medium numbers, larger range", 200, 0.1, 50, 100),
923-
Entry("large numbers", 150, 0.05, 20000, 20010),
934+
Entry("small numbers", 100, 50, 2, 10),
935+
Entry("medium numbers, larger range", 200, 180, 50, 100),
936+
Entry("large numbers", 150, 70, 20000, 20010),
937+
Entry("stddev is 0", 150, 0, 20000, 20010),
924938
)
925939

926940
DescribeTable("When remote kv cache transfer is enabled with \"in-place\" policy, time to first token should not be impacted by number of prompt tokens",
927-
func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) {
941+
func(kvCacheTransOverhead int, kvCacheTransOverheadStdDev int, minNTokens int, maxNTokens int) {
928942
simulator.config.TimeToFirstToken = 0
929943
simulator.config.PrefillOverhead = 1
930944
simulator.config.KVCacheTransferComplexity = "in-place"
945+
simulator.config.KVCacheTransferOverheadStdDev = kvCacheTransOverheadStdDev
946+
simulator.config.KVCacheTransferOverhead = kvCacheTransOverhead
947+
948+
var ttfts []int
931949
for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ {
932-
simulator.config.KVCacheTransferOverhead = kvCacheOverhead
933950
timeToFirst := simulator.getTimeToFirstToken(nTokens, true)
934-
935-
inPlace := kvCacheOverhead
936-
diffRatio := math.Abs(float64(timeToFirst-inPlace)) / float64(inPlace)
937-
Expect(diffRatio).To(BeNumerically("<=", tolerance))
951+
ttfts = append(ttfts, timeToFirst)
938952
}
953+
// get stdv of ttfts
954+
stdv := common.StdDevInt(ttfts)
955+
fmt.Printf("ttfts: %v, stdv: %f\n", ttfts, stdv)
956+
Expect(stdv).To(BeNumerically("<=", kvCacheTransOverheadStdDev))
939957
},
940-
func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) string {
941-
return fmt.Sprintf("kvCacheOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d",
942-
kvCacheOverhead, tolerance, minNTokens, maxNTokens)
958+
func(kvCacheTransOverhead int, kvCacheTransOverheadStdDev int, minNTokens int, maxNTokens int) string {
959+
return fmt.Sprintf("kvCacheTransferOverhead: %d kvCacheTransferOverheadStdDev: %d minNTokens: %d maxNTokens: %d",
960+
kvCacheTransOverhead, kvCacheTransOverheadStdDev, minNTokens, maxNTokens)
943961
},
944-
Entry("small numbers", 100, 0.1, 1, 10),
945-
Entry("medium numbers, larger range", 200, 0.1, 50, 100),
946-
Entry("large numbers", 150, 0.05, 20000, 20010),
962+
Entry("small numbers", 100, 50, 2, 10),
963+
Entry("medium numbers, larger range", 200, 150, 50, 100),
964+
Entry("large numbers", 150, 200, 20000, 20010),
965+
Entry("stddev is 0", 150, 0, 20000, 20010),
947966
)
948967
})
949968

0 commit comments

Comments
 (0)