Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ vendor
.devcontainer
# MacOSX
.DS_Store
*.test
*.test
manifests/dev-config.yaml
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
- `kv-cache-transfer-time-per-token`: time taken to transfer cache for each token in case P/D is enabled (in milliseconds), optional, by default zero, this will be ignored if `kv-cache-transfer-latency` is not `0`
- `kv-cache-transfer-time-std-dev`: similar to `time-to-first-token-std-dev`, but is applied on the final kv cache transfer time in case P/D is enabled (in milliseconds), which is calculated by `kv-cache-transfer-time-per-token` and number of prompt tokens, this will be ignored if `kv-cache-transfer-latency` is not `0`
---
- `time-factor-under-load`: a multiplicative factor that affects the overall time taken for requests when parallel requests are being processed. The value of this factor must be >= 1.0, with a default of 1.0. If this factor is 1.0, no extra time is added. When the factor is x (where x > 1.0) and there are `max-num-seqs` requests, the total time will be multiplied by x. The applied factor then decreases linearly toward 1.0 as the number of running requests drops below `max-num-seqs`.
- `seed`: random seed for operations (if not set, current Unix time in nanoseconds is used)
---
- `max-tool-call-integer-param`: the maximum possible value of integer parameters in a tool call, optional, defaults to 100
Expand Down
21 changes: 21 additions & 0 deletions pkg/common/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,14 @@ type Configuration struct {
// KVCacheTransferTimeStdDev similar to TimeToFirstTokenStdDev
KVCacheTransferTimeStdDev int `yaml:"kv-cache-transfer-time-std-dev" json:"kv-cache-transfer-time-std-dev"`

// TimeFactorUnderLoad is a multiplicative factor that affects the overall time taken for requests when parallel
// requests are being processed.
// The value of this factor must be >= 1.0, with a default of 1.0.
// - If this factor is 1.0, no extra time is added.
// - When the factor is x (where x > 1.0) and there are MaxNumSeqs requests, the total time will be multiplied by x.
// - The applied factor then decreases linearly toward 1.0 as the number of running requests drops below MaxNumSeqs.
TimeFactorUnderLoad float64 `yaml:"time-factor-under-load" json:"time-factor-under-load"`

// Mode defines the simulator response generation mode, valid values: echo, random
Mode string `yaml:"mode" json:"mode"`
// Seed defines random seed for operations
Expand Down Expand Up @@ -259,6 +267,7 @@ func newConfig() *Configuration {
MaxModelLen: 1024,
Mode: ModeRandom,
Seed: time.Now().UnixNano(),
TimeFactorUnderLoad: 1.0,
MaxToolCallIntegerParam: 100,
MaxToolCallNumberParam: 100,
MaxToolCallArrayParamLength: 5,
Expand Down Expand Up @@ -338,6 +347,9 @@ func (c *Configuration) validate() error {
if c.PrefillTimeStdDev < 0 {
return errors.New("prefill time standard deviation cannot be negative")
}
if float32(c.PrefillTimeStdDev) > 0.3*float32(c.PrefillTimePerToken) {
return errors.New("prefill time standard deviation cannot be more than 30% of prefill time per token")
}

if c.KVCacheTransferTimePerToken < 0 {
return errors.New("kv-cache tranfer time per token cannot be negative")
Expand All @@ -359,6 +371,10 @@ func (c *Configuration) validate() error {
return errors.New("kv-cache tranfer standard deviation cannot be more than 30% of kv-cache tranfer")
}

if c.TimeFactorUnderLoad < 1.0 {
return errors.New("time factor under load cannot be less than 1.0")
}

if c.MaxLoras < 1 {
return errors.New("max LoRAs cannot be less than 1")
}
Expand All @@ -373,6 +389,10 @@ func (c *Configuration) validate() error {
return errors.New("max model len cannot be less than 1")
}

if c.MaxNumSeqs < 1 {
return errors.New("max num seqs cannot be less than 1")
}

for _, lora := range c.LoraModules {
if lora.Name == "" {
return errors.New("empty LoRA name")
Expand Down Expand Up @@ -502,6 +522,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
f.IntVar(&config.TimeToFirstTokenStdDev, "time-to-first-token-std-dev", config.TimeToFirstTokenStdDev, "Standard deviation for time before the first token will be returned (in milliseconds)")
f.IntVar(&config.KVCacheTransferLatencyStdDev, "kv-cache-transfer-latency-std-dev", config.KVCacheTransferLatencyStdDev, "Standard deviation for time for KV-cache transfer from a remote vLLM (in milliseconds)")
f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)")
f.Float64Var(&config.TimeFactorUnderLoad, "time-factor-under-load", config.TimeFactorUnderLoad, "Time factor under load (must be >= 1.0)")

f.IntVar(&config.MaxToolCallIntegerParam, "max-tool-call-integer-param", config.MaxToolCallIntegerParam, "Maximum possible value of integer parameters in a tool call")
f.IntVar(&config.MinToolCallIntegerParam, "min-tool-call-integer-param", config.MinToolCallIntegerParam, "Minimum possible value of integer parameters in a tool call")
Expand Down
20 changes: 20 additions & 0 deletions pkg/common/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,26 @@ var _ = Describe("Simulator configuration", func() {
args: []string{"cmd", "--data-parallel-size", "15",
"--config", "../../manifests/config.yaml"},
},
{
name: "invalid max-num-seqs",
args: []string{"cmd", "--max-num-seqs", "0",
"--config", "../../manifests/config.yaml"},
},
{
name: "invalid max-num-seqs",
args: []string{"cmd", "--max-num-seqs", "-1",
"--config", "../../manifests/config.yaml"},
},
{
name: "invalid time-factor-under-load",
args: []string{"cmd", "--time-factor-under-load", "0",
"--config", "../../manifests/config.yaml"},
},
{
name: "invalid time-factor-under-load",
args: []string{"cmd", "--time-factor-under-load", "-1",
"--config", "../../manifests/config.yaml"},
},
}

for _, test := range invalidTests {
Expand Down
49 changes: 32 additions & 17 deletions pkg/llm-d-inference-sim/simulator.go
Original file line number Diff line number Diff line change
Expand Up @@ -674,12 +674,13 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r
}

// calculate how long to wait before returning the response, time is based on number of tokens
nPromptTokens := usageData.PromptTokens
nCachedPromptTokens := reqCtx.CompletionReq.GetNumberOfCachedPromptTokens()
nGenTokens := usageData.CompletionTokens
ttft := s.getTimeToFirstToken(nPromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill())
totalMillisToWait := ttft + s.getTotalInterTokenLatency(nGenTokens)
time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond)
ttft := s.getTimeToFirstToken(usageData.PromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill())
time.Sleep(time.Duration(ttft) * time.Millisecond)
for range usageData.CompletionTokens - 1 {
perTokenLatency := s.getInterTokenLatency()
time.Sleep(time.Duration(perTokenLatency) * time.Millisecond)
}

ctx.Response.Header.SetContentType("application/json")
ctx.Response.Header.SetStatusCode(fasthttp.StatusOK)
Expand Down Expand Up @@ -708,25 +709,16 @@ func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptToke
}
if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 {
// is aggregated PD and ttft is calculated using number of prompt tokens that are not in kv cache
prefillTime := s.config.PrefillOverhead + (nPromptTokens-nCachedPromptTokens)*s.config.PrefillTimePerToken
prefillTime := s.GetPrefillOverhead() + (nPromptTokens-nCachedPromptTokens)*s.GetPrefillTimePerToken()
return common.RandomNorm(prefillTime, s.config.PrefillTimeStdDev)
}
// is aggregated PD and *not* using number of prompt tokens
return common.RandomNorm(s.config.TimeToFirstToken, s.config.TimeToFirstTokenStdDev)
return common.RandomNorm(s.GetTimeToFirstToken(), s.config.TimeToFirstTokenStdDev)
}

// returns inter token latency
func (s *VllmSimulator) getInterTokenLatency() int {
return common.RandomNorm(s.config.InterTokenLatency, s.config.InterTokenLatencyStdDev)
}

// returns total inter token latency for the given number of tokens
func (s *VllmSimulator) getTotalInterTokenLatency(numOfTokens int) int {
total := 0
for range numOfTokens - 1 {
total += s.getInterTokenLatency()
}
return total
return common.RandomNorm(s.GetInterTokenLatency(), s.config.InterTokenLatencyStdDev)
}

// createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist
Expand Down Expand Up @@ -820,3 +812,26 @@ func (s *VllmSimulator) showConfig(dp bool) error {
s.logger.Info("Configuration:", "", string(cfgJSON))
return nil
}

// getCurrFactor returns the multiplier applied to the configured latencies for
// the current load.
//
// The factor is a linear interpolation over the number of running requests:
// 1.0 with a single running request and config.TimeFactorUnderLoad with
// config.MaxNumSeqs running requests. When MaxNumSeqs is 1 or less there is no
// range to interpolate over, so the factor is always 1.0.
//
// The running-request count is clamped to [1, MaxNumSeqs] before interpolating.
// Without the clamp a count of 0 yields a factor below 1.0 (shortening the
// configured latencies) and a count above MaxNumSeqs yields a factor above
// TimeFactorUnderLoad.
func (s *VllmSimulator) getCurrFactor() float64 {
	if s.config.MaxNumSeqs <= 1 {
		return 1.0
	}

	capacity := float64(s.config.MaxNumSeqs)
	running := float64(s.nRunningReqs)
	if running < 1 {
		running = 1
	}
	if running > capacity {
		running = capacity
	}

	return 1 + (s.config.TimeFactorUnderLoad-1)*(running-1)/(capacity-1)
}

// GetTimeToFirstToken returns config.TimeToFirstToken multiplied by the current
// load factor (see getCurrFactor). The product is truncated toward zero by the
// conversion back to int.
func (s *VllmSimulator) GetTimeToFirstToken() int {
	base := float64(s.config.TimeToFirstToken)
	scaled := base * s.getCurrFactor()
	return int(scaled)
}

// GetPrefillOverhead returns config.PrefillOverhead multiplied by the current
// load factor (see getCurrFactor). The product is truncated toward zero by the
// conversion back to int.
func (s *VllmSimulator) GetPrefillOverhead() int {
	base := float64(s.config.PrefillOverhead)
	scaled := base * s.getCurrFactor()
	return int(scaled)
}

// GetPrefillTimePerToken returns config.PrefillTimePerToken multiplied by the
// current load factor (see getCurrFactor). The product is truncated toward zero
// by the conversion back to int.
func (s *VllmSimulator) GetPrefillTimePerToken() int {
	base := float64(s.config.PrefillTimePerToken)
	scaled := base * s.getCurrFactor()
	return int(scaled)
}

// GetInterTokenLatency returns config.InterTokenLatency multiplied by the
// current load factor (see getCurrFactor). The product is truncated toward zero
// by the conversion back to int.
func (s *VllmSimulator) GetInterTokenLatency() int {
	base := float64(s.config.InterTokenLatency)
	scaled := base * s.getCurrFactor()
	return int(scaled)
}
114 changes: 113 additions & 1 deletion pkg/llm-d-inference-sim/simulator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -777,7 +777,14 @@ var _ = Describe("Simulator", func() {
func(interTokenLatency int, stddev int, numberOfTokens int) {
simulator.config.InterTokenLatency = interTokenLatency
simulator.config.InterTokenLatencyStdDev = stddev
latency := simulator.getTotalInterTokenLatency(numberOfTokens)
simulator.config.MaxNumSeqs = 1
simulator.config.TimeFactorUnderLoad = 1.0

latency := 0
for range numberOfTokens - 1 {
latency += simulator.getInterTokenLatency()
}

Expect(latency).To(BeNumerically(">=", int(float32(interTokenLatency)*0.3*float32(numberOfTokens))))
Expect(latency).To(BeNumerically("<=", int(float32(interTokenLatency)*1.7*float32(numberOfTokens))))
},
Expand Down Expand Up @@ -945,5 +952,110 @@ var _ = Describe("Simulator", func() {
Entry("very long prompt", 150, 100, 20000),
)

It("when time-factor-under-load is 1, the time to first token should be equal to time-to-first-token", func() {
simulator.config.TimeToFirstToken = 42
simulator.config.TimeToFirstTokenStdDev = 0
simulator.config.TimeFactorUnderLoad = 1.0

simulator.runReqChan <- 100

ttft := simulator.getTimeToFirstToken(128, 0, false)
Expect(ttft).To(Equal(42))
})

It("when time-factor-under-load is > 1, but max-num-seqs is 1, the factor will not take effect", func() {
simulator.config.TimeToFirstToken = 42
simulator.config.TimeToFirstTokenStdDev = 0
simulator.config.TimeFactorUnderLoad = 100.0
simulator.config.MaxNumSeqs = 1

for len(simulator.runReqChan) > 0 {
<-simulator.runReqChan
}

simulator.runReqChan <- 1

ttft := simulator.getTimeToFirstToken(128, 0, false)
Expect(ttft).To(Equal(42))
})

DescribeTable("when time-factor-under-load is > 1, and the sim is fully loaded, the time to first token should be time-factor-under-load * time-to-first-token",
func(timeFactorUnderLoad float64, maxNumOfReq int) {
simulator.config.TimeToFirstToken = 42
simulator.config.TimeToFirstTokenStdDev = 0
simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad
simulator.config.MaxNumSeqs = maxNumOfReq
simulator.nRunningReqs = int64(maxNumOfReq)

ttft := simulator.getTimeToFirstToken(128, 0, false)
Expect(ttft).To(Equal(int(float64(42) * timeFactorUnderLoad)))

},
func(timeFactorUnderLoad float64, maxNumOfReq int64) string {
return fmt.Sprintf("timeFactorUnderLoad: %f maxNumOfReq: %d",
timeFactorUnderLoad, maxNumOfReq)
},

Entry("factor: 1.5", 1.5, 70),
Entry("factor: 2.0", 2.0, 2),
Entry("factor: 100.0", 100.0, 150),
Entry("factor: 20000.0", 20000.0, 310),
)

DescribeTable("when time-factor-under-load is > 1, and the sim is partially loaded, the time to first token should be linear interpolation between time-to-first-token and time-factor-under-load * time-to-first-token",
func(timeFactorUnderLoad float64, maxNumOfReq int, nCurrNumOfReq int) {
simulator.config.TimeToFirstToken = 42
simulator.config.TimeToFirstTokenStdDev = 0
simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad
simulator.config.MaxNumSeqs = maxNumOfReq
simulator.nRunningReqs = int64(nCurrNumOfReq)

ttft := simulator.getTimeToFirstToken(128, 0, false)
max := timeFactorUnderLoad * float64(42)
Expect(ttft).To(BeNumerically(">=", 42))
Expect(ttft).To(BeNumerically("<=", max))

},
func(timeFactorUnderLoad float64, maxNumOfReq int, nCurrNumOfReq int) string {
return fmt.Sprintf("timeFactorUnderLoad: %f maxNumOfReq: %d nCurrNumOfReq: %d",
timeFactorUnderLoad, maxNumOfReq, nCurrNumOfReq)
},

Entry("factor: 1.5", 1.5, 70, 35),
Entry("factor: 2.0", 2.0, 2, 1),
Entry("factor: 100.0", 100.0, 150, 75),
Entry("factor: 20000.0", 20000.0, 310, 155),
)

It("when TimeFactorUnderLoad is 1.0, calcLoadFactor should give 1", func() {
simulator.config.TimeFactorUnderLoad = 1.0
simulator.config.MaxNumSeqs = 11
simulator.nRunningReqs = 3

factor := simulator.getCurrFactor()
Expect(factor).To(BeNumerically("==", 1.0))
})

It("when TimeFactorUnderLoad is > 1.0, and sim is fully loaded, calcLoadFactor should give TimeFactorUnderLoad", func() {
simulator.config.TimeFactorUnderLoad = 2.0
simulator.config.MaxNumSeqs = 11
simulator.nRunningReqs = 11

factor := simulator.getCurrFactor()
Expect(factor).To(BeNumerically("==", simulator.config.TimeFactorUnderLoad))

})

It("when TimeFactorUnderLoad is > 1.0, and sim is partially loaded, calcLoadFactor should give a value between 1 and TimeFactorUnderLoad", func() {
simulator.config.TimeFactorUnderLoad = 2.0
simulator.config.MaxNumSeqs = 11
simulator.nRunningReqs = 6

factor := simulator.getCurrFactor()
Expect(factor).To(BeNumerically(">", 1.0))
Expect(factor).To(BeNumerically("<", simulator.config.TimeFactorUnderLoad))
})

})

})
Loading