Skip to content

Commit 2bcfedb

Browse files
authored
Adjust request "processing time" to current load (#189)
* Validate max-num-seqs Signed-off-by: Qifan Deng <[email protected]> * Validate PrefillTimeStdDev Signed-off-by: Qifan Deng <[email protected]> * Add param time-factor-under-load Signed-off-by: Qifan Deng <[email protected]> * The factor applies on time-to-first-token Signed-off-by: Qifan Deng <[email protected]> * Test TTFT when partially loaded Signed-off-by: Qifan Deng <[email protected]> * Apply time factor under load to prefill and inter token latency Signed-off-by: Qifan Deng <[email protected]> * Improve param desc Signed-off-by: Qifan Deng <[email protected]> * Use nRunningReqs instead of runReqChan Signed-off-by: Qifan Deng <[email protected]> * unstage manifests/dev-config.yaml Signed-off-by: Qifan Deng <[email protected]> * Update readme Signed-off-by: Qifan Deng <[email protected]> * Restore changes for inter token latency (lost due to conflicts resolve) Signed-off-by: Qifan Deng <[email protected]> * Calc inter token latency based on load instead of one-calc-for-whole request Signed-off-by: Qifan Deng <[email protected]> * Calc inter token latency based on load instead of one-calc-for-whole request Signed-off-by: Qifan Deng <[email protected]> * Move methods to simulator Signed-off-by: Qifan Deng <[email protected]> * Rename helper func Signed-off-by: Qifan Deng <[email protected]> * Rename helper func Signed-off-by: Qifan Deng <[email protected]> * Fix inter token latency test Signed-off-by: Qifan Deng <[email protected]> --------- Signed-off-by: Qifan Deng <[email protected]>
1 parent 40ec02c commit 2bcfedb

File tree

6 files changed

+189
-19
lines changed

6 files changed

+189
-19
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@ vendor
55
.devcontainer
66
# MacOSX
77
.DS_Store
8-
*.test
8+
*.test
9+
manifests/dev-config.yaml

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
115115
- `kv-cache-transfer-time-per-token`: time taken to transfer cache for each token in case P/D is enabled (in milliseconds), optional, by default zero, this will be ignored if `kv-cache-transfer-latency` is not `0`
116116
- `kv-cache-transfer-time-std-dev`: similar to `time-to-first-token-std-dev`, but is applied on the final kv cache transfer time in case P/D is enabled (in milliseconds), which is calculated by `kv-cache-transfer-time-per-token` and number of prompt tokens, this will be ignored if `kv-cache-transfer-latency` is not `0`
117117
---
118+
- `time-factor-under-load`: a multiplicative factor that affects the overall time taken for requests when parallel requests are being processed. The value of this factor must be >= 1.0, with a default of 1.0. If this factor is 1.0, no extra time is added. When the factor is x (where x > 1.0) and there are `max-num-seqs` running requests, the total time will be multiplied by x. When fewer than `max-num-seqs` requests are running, the applied factor decreases linearly towards 1.0, which is reached when only a single request is running.
118119
- `seed`: random seed for operations (if not set, current Unix time in nanoseconds is used)
119120
---
120121
- `max-tool-call-integer-param`: the maximum possible value of integer parameters in a tool call, optional, defaults to 100

pkg/common/config.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,14 @@ type Configuration struct {
104104
// KVCacheTransferTimeStdDev similar to TimeToFirstTokenStdDev
105105
KVCacheTransferTimeStdDev int `yaml:"kv-cache-transfer-time-std-dev" json:"kv-cache-transfer-time-std-dev"`
106106

107+
// TimeFactorUnderLoad is a multiplicative factor that affects the overall time taken for requests when parallel
108+
// requests are being processed.
109+
// The value of this factor must be >= 1.0, with a default of 1.0.
110+
// - If this factor is 1.0, no extra time is added.
111+
// - When the factor is x (where x > 1.0) and there are MaxNumSeqs requests, the total time will be multiplied by x.
112+
// - When fewer than MaxNumSeqs requests are running, the applied factor decreases linearly towards 1.0, which is reached when only a single request is running.
113+
TimeFactorUnderLoad float64 `yaml:"time-factor-under-load" json:"time-factor-under-load"`
114+
107115
// Mode defines the simulator response generation mode, valid values: echo, random
108116
Mode string `yaml:"mode" json:"mode"`
109117
// Seed defines random seed for operations
@@ -259,6 +267,7 @@ func newConfig() *Configuration {
259267
MaxModelLen: 1024,
260268
Mode: ModeRandom,
261269
Seed: time.Now().UnixNano(),
270+
TimeFactorUnderLoad: 1.0,
262271
MaxToolCallIntegerParam: 100,
263272
MaxToolCallNumberParam: 100,
264273
MaxToolCallArrayParamLength: 5,
@@ -338,6 +347,9 @@ func (c *Configuration) validate() error {
338347
if c.PrefillTimeStdDev < 0 {
339348
return errors.New("prefill time standard deviation cannot be negative")
340349
}
350+
if float32(c.PrefillTimeStdDev) > 0.3*float32(c.PrefillTimePerToken) {
351+
return errors.New("prefill time standard deviation cannot be more than 30% of prefill time per token")
352+
}
341353

342354
if c.KVCacheTransferTimePerToken < 0 {
343355
return errors.New("kv-cache tranfer time per token cannot be negative")
@@ -359,6 +371,10 @@ func (c *Configuration) validate() error {
359371
return errors.New("kv-cache tranfer standard deviation cannot be more than 30% of kv-cache tranfer")
360372
}
361373

374+
if c.TimeFactorUnderLoad < 1.0 {
375+
return errors.New("time factor under load cannot be less than 1.0")
376+
}
377+
362378
if c.MaxLoras < 1 {
363379
return errors.New("max LoRAs cannot be less than 1")
364380
}
@@ -373,6 +389,10 @@ func (c *Configuration) validate() error {
373389
return errors.New("max model len cannot be less than 1")
374390
}
375391

392+
if c.MaxNumSeqs < 1 {
393+
return errors.New("max num seqs cannot be less than 1")
394+
}
395+
376396
for _, lora := range c.LoraModules {
377397
if lora.Name == "" {
378398
return errors.New("empty LoRA name")
@@ -502,6 +522,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
502522
f.IntVar(&config.TimeToFirstTokenStdDev, "time-to-first-token-std-dev", config.TimeToFirstTokenStdDev, "Standard deviation for time before the first token will be returned (in milliseconds)")
503523
f.IntVar(&config.KVCacheTransferLatencyStdDev, "kv-cache-transfer-latency-std-dev", config.KVCacheTransferLatencyStdDev, "Standard deviation for time for KV-cache transfer from a remote vLLM (in milliseconds)")
504524
f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)")
525+
f.Float64Var(&config.TimeFactorUnderLoad, "time-factor-under-load", config.TimeFactorUnderLoad, "Time factor under load (must be >= 1.0)")
505526

506527
f.IntVar(&config.MaxToolCallIntegerParam, "max-tool-call-integer-param", config.MaxToolCallIntegerParam, "Maximum possible value of integer parameters in a tool call")
507528
f.IntVar(&config.MinToolCallIntegerParam, "min-tool-call-integer-param", config.MinToolCallIntegerParam, "Minimum possible value of integer parameters in a tool call")

pkg/common/config_test.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,26 @@ var _ = Describe("Simulator configuration", func() {
431431
args: []string{"cmd", "--data-parallel-size", "15",
432432
"--config", "../../manifests/config.yaml"},
433433
},
434+
{
435+
name: "invalid max-num-seqs",
436+
args: []string{"cmd", "--max-num-seqs", "0",
437+
"--config", "../../manifests/config.yaml"},
438+
},
439+
{
440+
name: "invalid max-num-seqs",
441+
args: []string{"cmd", "--max-num-seqs", "-1",
442+
"--config", "../../manifests/config.yaml"},
443+
},
444+
{
445+
name: "invalid time-factor-under-load",
446+
args: []string{"cmd", "--time-factor-under-load", "0",
447+
"--config", "../../manifests/config.yaml"},
448+
},
449+
{
450+
name: "invalid time-factor-under-load",
451+
args: []string{"cmd", "--time-factor-under-load", "-1",
452+
"--config", "../../manifests/config.yaml"},
453+
},
434454
}
435455

436456
for _, test := range invalidTests {

pkg/llm-d-inference-sim/simulator.go

Lines changed: 32 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -672,12 +672,13 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r
672672
}
673673

674674
// calculate how long to wait before returning the response, time is based on number of tokens
675-
nPromptTokens := usageData.PromptTokens
676675
nCachedPromptTokens := reqCtx.CompletionReq.GetNumberOfCachedPromptTokens()
677-
nGenTokens := usageData.CompletionTokens
678-
ttft := s.getTimeToFirstToken(nPromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill())
679-
totalMillisToWait := ttft + s.getTotalInterTokenLatency(nGenTokens)
680-
time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond)
676+
ttft := s.getTimeToFirstToken(usageData.PromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill())
677+
time.Sleep(time.Duration(ttft) * time.Millisecond)
678+
for range usageData.CompletionTokens - 1 {
679+
perTokenLatency := s.getInterTokenLatency()
680+
time.Sleep(time.Duration(perTokenLatency) * time.Millisecond)
681+
}
681682

682683
ctx.Response.Header.SetContentType("application/json")
683684
ctx.Response.Header.SetStatusCode(fasthttp.StatusOK)
@@ -706,25 +707,16 @@ func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptToke
706707
}
707708
if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 {
708709
// is aggregated PD and ttft is calculated using number of prompt tokens that are not in kv cache
709-
prefillTime := s.config.PrefillOverhead + (nPromptTokens-nCachedPromptTokens)*s.config.PrefillTimePerToken
710+
prefillTime := s.GetPrefillOverhead() + (nPromptTokens-nCachedPromptTokens)*s.GetPrefillTimePerToken()
710711
return common.RandomNorm(prefillTime, s.config.PrefillTimeStdDev)
711712
}
712713
// is aggregated PD and *not* using number of prompt tokens
713-
return common.RandomNorm(s.config.TimeToFirstToken, s.config.TimeToFirstTokenStdDev)
714+
return common.RandomNorm(s.GetTimeToFirstToken(), s.config.TimeToFirstTokenStdDev)
714715
}
715716

716717
// returns inter token latency
717718
func (s *VllmSimulator) getInterTokenLatency() int {
718-
return common.RandomNorm(s.config.InterTokenLatency, s.config.InterTokenLatencyStdDev)
719-
}
720-
721-
// returns total inter token latency for the given number of tokens
722-
func (s *VllmSimulator) getTotalInterTokenLatency(numOfTokens int) int {
723-
total := 0
724-
for range numOfTokens - 1 {
725-
total += s.getInterTokenLatency()
726-
}
727-
return total
719+
return common.RandomNorm(s.GetInterTokenLatency(), s.config.InterTokenLatencyStdDev)
728720
}
729721

730722
// createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist
@@ -818,3 +810,26 @@ func (s *VllmSimulator) showConfig(dp bool) error {
818810
s.logger.Info("Configuration:", "", string(cfgJSON))
819811
return nil
820812
}
813+
814+
func (s *VllmSimulator) getCurrFactor() float64 {
815+
if s.config.MaxNumSeqs <= 1 {
816+
return 1.0
817+
}
818+
return 1 + (s.config.TimeFactorUnderLoad-1)*float64(s.nRunningReqs-1)/float64(s.config.MaxNumSeqs-1)
819+
}
820+
821+
func (s *VllmSimulator) GetTimeToFirstToken() int {
822+
return int(float64(s.config.TimeToFirstToken) * s.getCurrFactor())
823+
}
824+
825+
func (s *VllmSimulator) GetPrefillOverhead() int {
826+
return int(float64(s.config.PrefillOverhead) * s.getCurrFactor())
827+
}
828+
829+
func (s *VllmSimulator) GetPrefillTimePerToken() int {
830+
return int(float64(s.config.PrefillTimePerToken) * s.getCurrFactor())
831+
}
832+
833+
func (s *VllmSimulator) GetInterTokenLatency() int {
834+
return int(float64(s.config.InterTokenLatency) * s.getCurrFactor())
835+
}

pkg/llm-d-inference-sim/simulator_test.go

Lines changed: 113 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -787,7 +787,14 @@ var _ = Describe("Simulator", func() {
787787
func(interTokenLatency int, stddev int, numberOfTokens int) {
788788
simulator.config.InterTokenLatency = interTokenLatency
789789
simulator.config.InterTokenLatencyStdDev = stddev
790-
latency := simulator.getTotalInterTokenLatency(numberOfTokens)
790+
simulator.config.MaxNumSeqs = 1
791+
simulator.config.TimeFactorUnderLoad = 1.0
792+
793+
latency := 0
794+
for range numberOfTokens - 1 {
795+
latency += simulator.getInterTokenLatency()
796+
}
797+
791798
Expect(latency).To(BeNumerically(">=", int(float32(interTokenLatency)*0.3*float32(numberOfTokens))))
792799
Expect(latency).To(BeNumerically("<=", int(float32(interTokenLatency)*1.7*float32(numberOfTokens))))
793800
},
@@ -955,5 +962,110 @@ var _ = Describe("Simulator", func() {
955962
Entry("very long prompt", 150, 100, 20000),
956963
)
957964

965+
It("when time-factor-under-load is 1, the time to first token should be equal to time-to-first-token", func() {
966+
simulator.config.TimeToFirstToken = 42
967+
simulator.config.TimeToFirstTokenStdDev = 0
968+
simulator.config.TimeFactorUnderLoad = 1.0
969+
970+
simulator.runReqChan <- 100
971+
972+
ttft := simulator.getTimeToFirstToken(128, 0, false)
973+
Expect(ttft).To(Equal(42))
974+
})
975+
976+
It("when time-factor-under-load is > 1, but max-num-seqs is 1, the factor will not take effect", func() {
977+
simulator.config.TimeToFirstToken = 42
978+
simulator.config.TimeToFirstTokenStdDev = 0
979+
simulator.config.TimeFactorUnderLoad = 100.0
980+
simulator.config.MaxNumSeqs = 1
981+
982+
for len(simulator.runReqChan) > 0 {
983+
<-simulator.runReqChan
984+
}
985+
986+
simulator.runReqChan <- 1
987+
988+
ttft := simulator.getTimeToFirstToken(128, 0, false)
989+
Expect(ttft).To(Equal(42))
990+
})
991+
992+
DescribeTable("when time-factor-under-load is > 1, and the sim is fully loaded, the time to first token should be time-factor-under-load * time-to-first-token",
993+
func(timeFactorUnderLoad float64, maxNumOfReq int) {
994+
simulator.config.TimeToFirstToken = 42
995+
simulator.config.TimeToFirstTokenStdDev = 0
996+
simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad
997+
simulator.config.MaxNumSeqs = maxNumOfReq
998+
simulator.nRunningReqs = int64(maxNumOfReq)
999+
1000+
ttft := simulator.getTimeToFirstToken(128, 0, false)
1001+
Expect(ttft).To(Equal(int(float64(42) * timeFactorUnderLoad)))
1002+
1003+
},
1004+
func(timeFactorUnderLoad float64, maxNumOfReq int64) string {
1005+
return fmt.Sprintf("timeFactorUnderLoad: %f maxNumOfReq: %d",
1006+
timeFactorUnderLoad, maxNumOfReq)
1007+
},
1008+
1009+
Entry("factor: 1.5", 1.5, 70),
1010+
Entry("factor: 2.0", 2.0, 2),
1011+
Entry("factor: 100.0", 100.0, 150),
1012+
Entry("factor: 20000.0", 20000.0, 310),
1013+
)
1014+
1015+
DescribeTable("when time-factor-under-load is > 1, and the sim is partially loaded, the time to first token should be linear interpolation between time-to-first-token and time-factor-under-load * time-to-first-token",
1016+
func(timeFactorUnderLoad float64, maxNumOfReq int, nCurrNumOfReq int) {
1017+
simulator.config.TimeToFirstToken = 42
1018+
simulator.config.TimeToFirstTokenStdDev = 0
1019+
simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad
1020+
simulator.config.MaxNumSeqs = maxNumOfReq
1021+
simulator.nRunningReqs = int64(nCurrNumOfReq)
1022+
1023+
ttft := simulator.getTimeToFirstToken(128, 0, false)
1024+
max := timeFactorUnderLoad * float64(42)
1025+
Expect(ttft).To(BeNumerically(">=", 42))
1026+
Expect(ttft).To(BeNumerically("<=", max))
1027+
1028+
},
1029+
func(timeFactorUnderLoad float64, maxNumOfReq int, nCurrNumOfReq int) string {
1030+
return fmt.Sprintf("timeFactorUnderLoad: %f maxNumOfReq: %d nCurrNumOfReq: %d",
1031+
timeFactorUnderLoad, maxNumOfReq, nCurrNumOfReq)
1032+
},
1033+
1034+
Entry("factor: 1.5", 1.5, 70, 35),
1035+
Entry("factor: 2.0", 2.0, 2, 1),
1036+
Entry("factor: 100.0", 100.0, 150, 75),
1037+
Entry("factor: 20000.0", 20000.0, 310, 155),
1038+
)
1039+
1040+
It("when TimeFactorUnderLoad is 1.0, calcLoadFactor should give 1", func() {
1041+
simulator.config.TimeFactorUnderLoad = 1.0
1042+
simulator.config.MaxNumSeqs = 11
1043+
simulator.nRunningReqs = 3
1044+
1045+
factor := simulator.getCurrFactor()
1046+
Expect(factor).To(BeNumerically("==", 1.0))
1047+
})
1048+
1049+
It("when TimeFactorUnderLoad is > 1.0, and sim is fully loaded, calcLoadFactor should give TimeFactorUnderLoad", func() {
1050+
simulator.config.TimeFactorUnderLoad = 2.0
1051+
simulator.config.MaxNumSeqs = 11
1052+
simulator.nRunningReqs = 11
1053+
1054+
factor := simulator.getCurrFactor()
1055+
Expect(factor).To(BeNumerically("==", simulator.config.TimeFactorUnderLoad))
1056+
1057+
})
1058+
1059+
It("when TimeFactorUnderLoad is > 1.0, and sim is partially loaded, calcLoadFactor should give a value between 1 and TimeFactorUnderLoad", func() {
1060+
simulator.config.TimeFactorUnderLoad = 2.0
1061+
simulator.config.MaxNumSeqs = 11
1062+
simulator.nRunningReqs = 6
1063+
1064+
factor := simulator.getCurrFactor()
1065+
Expect(factor).To(BeNumerically(">", 1.0))
1066+
Expect(factor).To(BeNumerically("<", simulator.config.TimeFactorUnderLoad))
1067+
})
1068+
9581069
})
1070+
9591071
})

0 commit comments

Comments
 (0)