diff --git a/.gitignore b/.gitignore index 3906cfb9..950b0cb4 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ vendor .devcontainer # MacOSX .DS_Store -*.test \ No newline at end of file +*.test +manifests/dev-config.yaml diff --git a/README.md b/README.md index 535e77fc..29d4d405 100644 --- a/README.md +++ b/README.md @@ -115,6 +115,7 @@ For more details see the = 1.0, with a default of 1.0. + // - If this factor is 1.0, no extra time is added. + // - When the factor is x (where x > 1.0) and there are MaxNumSeqs requests, the total time will be multiplied by x. + // - The extra time then decreases multiplicatively to 1.0 when the number of requests is less than MaxNumSeqs. + TimeFactorUnderLoad float64 `yaml:"time-factor-under-load" json:"time-factor-under-load"` + // Mode defines the simulator response generation mode, valid values: echo, random Mode string `yaml:"mode" json:"mode"` // Seed defines random seed for operations @@ -259,6 +267,7 @@ func newConfig() *Configuration { MaxModelLen: 1024, Mode: ModeRandom, Seed: time.Now().UnixNano(), + TimeFactorUnderLoad: 1.0, MaxToolCallIntegerParam: 100, MaxToolCallNumberParam: 100, MaxToolCallArrayParamLength: 5, @@ -338,6 +347,9 @@ func (c *Configuration) validate() error { if c.PrefillTimeStdDev < 0 { return errors.New("prefill time standard deviation cannot be negative") } + if float32(c.PrefillTimeStdDev) > 0.3*float32(c.PrefillTimePerToken) { + return errors.New("prefill time standard deviation cannot be more than 30% of prefill time per token") + } if c.KVCacheTransferTimePerToken < 0 { return errors.New("kv-cache tranfer time per token cannot be negative") @@ -359,6 +371,10 @@ func (c *Configuration) validate() error { return errors.New("kv-cache tranfer standard deviation cannot be more than 30% of kv-cache tranfer") } + if c.TimeFactorUnderLoad < 1.0 { + return errors.New("time factor under load cannot be less than 1.0") + } + if c.MaxLoras < 1 { return errors.New("max LoRAs cannot be less than 1") } @@ -373,6 +389,10 @@ func (c *Configuration) validate() error { return errors.New("max model len cannot be less than 1") } + if c.MaxNumSeqs < 1 { + return errors.New("max num seqs cannot be less than 1") + } + for _, lora := range c.LoraModules { if lora.Name == "" { return errors.New("empty LoRA name") @@ -502,6 +522,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) { f.IntVar(&config.TimeToFirstTokenStdDev, "time-to-first-token-std-dev", config.TimeToFirstTokenStdDev, "Standard deviation for time before the first token will be returned (in milliseconds)") f.IntVar(&config.KVCacheTransferLatencyStdDev, "kv-cache-transfer-latency-std-dev", config.KVCacheTransferLatencyStdDev, "Standard deviation for time for KV-cache transfer from a remote vLLM (in milliseconds)") f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)") + f.Float64Var(&config.TimeFactorUnderLoad, "time-factor-under-load", config.TimeFactorUnderLoad, "Time factor under load (must be >= 1.0)") f.IntVar(&config.MaxToolCallIntegerParam, "max-tool-call-integer-param", config.MaxToolCallIntegerParam, "Maximum possible value of integer parameters in a tool call") f.IntVar(&config.MinToolCallIntegerParam, "min-tool-call-integer-param", config.MinToolCallIntegerParam, "Minimum possible value of integer parameters in a tool call") diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go index 20aba9a4..1c0353ed 100644 --- a/pkg/common/config_test.go +++ b/pkg/common/config_test.go @@ -431,6 +431,26 @@ var _ = Describe("Simulator configuration", func() { args: []string{"cmd", "--data-parallel-size", "15", "--config", "../../manifests/config.yaml"}, }, + { + name: "invalid max-num-seqs", + args: []string{"cmd", "--max-num-seqs", "0", + "--config", "../../manifests/config.yaml"}, + }, + { + name: "invalid max-num-seqs", + args: []string{"cmd", "--max-num-seqs", "-1", + "--config", "../../manifests/config.yaml"}, + }, + { + name: "invalid time-factor-under-load", + args: []string{"cmd", "--time-factor-under-load", "0", + "--config", "../../manifests/config.yaml"}, + }, + { + name: "invalid time-factor-under-load", + args: []string{"cmd", "--time-factor-under-load", "-1", + "--config", "../../manifests/config.yaml"}, + }, } for _, test := range invalidTests { diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index 24446685..5ba1b7e7 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -674,12 +674,13 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r } // calculate how long to wait before returning the response, time is based on number of tokens - nPromptTokens := usageData.PromptTokens nCachedPromptTokens := reqCtx.CompletionReq.GetNumberOfCachedPromptTokens() - nGenTokens := usageData.CompletionTokens - ttft := s.getTimeToFirstToken(nPromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill()) - totalMillisToWait := ttft + s.getTotalInterTokenLatency(nGenTokens) - time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond) + ttft := s.getTimeToFirstToken(usageData.PromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill()) + time.Sleep(time.Duration(ttft) * time.Millisecond) + for range usageData.CompletionTokens - 1 { + perTokenLatency := s.getInterTokenLatency() + time.Sleep(time.Duration(perTokenLatency) * time.Millisecond) + } ctx.Response.Header.SetContentType("application/json") ctx.Response.Header.SetStatusCode(fasthttp.StatusOK) @@ -708,25 +709,16 @@ func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptToke } if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 { // is aggregated PD and ttft is calculated using number of prompt tokens that are not in kv cache - prefillTime := s.config.PrefillOverhead + (nPromptTokens-nCachedPromptTokens)*s.config.PrefillTimePerToken + prefillTime := s.GetPrefillOverhead() + (nPromptTokens-nCachedPromptTokens)*s.GetPrefillTimePerToken() return common.RandomNorm(prefillTime, s.config.PrefillTimeStdDev) } // is aggregated PD and *not* using number of prompt tokens - return common.RandomNorm(s.config.TimeToFirstToken, s.config.TimeToFirstTokenStdDev) + return common.RandomNorm(s.GetTimeToFirstToken(), s.config.TimeToFirstTokenStdDev) } // returns inter token latency func (s *VllmSimulator) getInterTokenLatency() int { - return common.RandomNorm(s.config.InterTokenLatency, s.config.InterTokenLatencyStdDev) -} - -// returns total inter token latency for the given number of tokens -func (s *VllmSimulator) getTotalInterTokenLatency(numOfTokens int) int { - total := 0 - for range numOfTokens - 1 { - total += s.getInterTokenLatency() - } - return total + return common.RandomNorm(s.GetInterTokenLatency(), s.config.InterTokenLatencyStdDev) } // createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist @@ -820,3 +812,26 @@ func (s *VllmSimulator) showConfig(dp bool) error { s.logger.Info("Configuration:", "", string(cfgJSON)) return nil } + +func (s *VllmSimulator) getCurrFactor() float64 { + if s.config.MaxNumSeqs <= 1 { + return 1.0 + } + return 1 + (s.config.TimeFactorUnderLoad-1)*float64(s.nRunningReqs-1)/float64(s.config.MaxNumSeqs-1) +} + +func (s *VllmSimulator) GetTimeToFirstToken() int { + return int(float64(s.config.TimeToFirstToken) * s.getCurrFactor()) +} + +func (s *VllmSimulator) GetPrefillOverhead() int { + return int(float64(s.config.PrefillOverhead) * s.getCurrFactor()) +} + +func (s *VllmSimulator) GetPrefillTimePerToken() int { + return int(float64(s.config.PrefillTimePerToken) * s.getCurrFactor()) +} + +func (s *VllmSimulator) GetInterTokenLatency() int { + return int(float64(s.config.InterTokenLatency) * s.getCurrFactor()) +} diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index df43ff57..17485001 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -777,7 +777,14 @@ var _ = Describe("Simulator", func() { func(interTokenLatency int, stddev int, numberOfTokens int) { simulator.config.InterTokenLatency = interTokenLatency simulator.config.InterTokenLatencyStdDev = stddev - latency := simulator.getTotalInterTokenLatency(numberOfTokens) + simulator.config.MaxNumSeqs = 1 + simulator.config.TimeFactorUnderLoad = 1.0 + + latency := 0 + for range numberOfTokens - 1 { + latency += simulator.getInterTokenLatency() + } + Expect(latency).To(BeNumerically(">=", int(float32(interTokenLatency)*0.3*float32(numberOfTokens)))) Expect(latency).To(BeNumerically("<=", int(float32(interTokenLatency)*1.7*float32(numberOfTokens)))) }, @@ -945,5 +952,110 @@ var _ = Describe("Simulator", func() { Entry("very long prompt", 150, 100, 20000), ) + It("when time-factor-under-load is 1, the time to first token should be equal to time-to-first-token", func() { + simulator.config.TimeToFirstToken = 42 + simulator.config.TimeToFirstTokenStdDev = 0 + simulator.config.TimeFactorUnderLoad = 1.0 + + simulator.runReqChan <- 100 + + ttft := simulator.getTimeToFirstToken(128, 0, false) + Expect(ttft).To(Equal(42)) + }) + + It("when time-factor-under-load is > 1, but max-num-seqs is 1, the factor will not take effect", func() { + simulator.config.TimeToFirstToken = 42 + simulator.config.TimeToFirstTokenStdDev = 0 + simulator.config.TimeFactorUnderLoad = 100.0 + simulator.config.MaxNumSeqs = 1 + + for len(simulator.runReqChan) > 0 { + <-simulator.runReqChan + } + + simulator.runReqChan <- 1 + + ttft := simulator.getTimeToFirstToken(128, 0, false) + Expect(ttft).To(Equal(42)) + }) + + DescribeTable("when time-factor-under-load is > 1, and the sim is fully loaded, the time to first token should be time-factor-under-load * time-to-first-token", + func(timeFactorUnderLoad float64, maxNumOfReq int) { + simulator.config.TimeToFirstToken = 42 + simulator.config.TimeToFirstTokenStdDev = 0 + simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad + simulator.config.MaxNumSeqs = maxNumOfReq + simulator.nRunningReqs = int64(maxNumOfReq) + + ttft := simulator.getTimeToFirstToken(128, 0, false) + Expect(ttft).To(Equal(int(float64(42) * timeFactorUnderLoad))) + + }, + func(timeFactorUnderLoad float64, maxNumOfReq int64) string { + return fmt.Sprintf("timeFactorUnderLoad: %f maxNumOfReq: %d", + timeFactorUnderLoad, maxNumOfReq) + }, + + Entry("factor: 1.5", 1.5, 70), + Entry("factor: 2.0", 2.0, 2), + Entry("factor: 100.0", 100.0, 150), + Entry("factor: 20000.0", 20000.0, 310), + ) + + DescribeTable("when time-factor-under-load is > 1, and the sim is partially loaded, the time to first token should be linear interpolation between time-to-first-token and time-factor-under-load * time-to-first-token", + func(timeFactorUnderLoad float64, maxNumOfReq int, nCurrNumOfReq int) { + simulator.config.TimeToFirstToken = 42 + simulator.config.TimeToFirstTokenStdDev = 0 + simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad + simulator.config.MaxNumSeqs = maxNumOfReq + simulator.nRunningReqs = int64(nCurrNumOfReq) + + ttft := simulator.getTimeToFirstToken(128, 0, false) + max := timeFactorUnderLoad * float64(42) + Expect(ttft).To(BeNumerically(">=", 42)) + Expect(ttft).To(BeNumerically("<=", max)) + + }, + func(timeFactorUnderLoad float64, maxNumOfReq int, nCurrNumOfReq int) string { + return fmt.Sprintf("timeFactorUnderLoad: %f maxNumOfReq: %d nCurrNumOfReq: %d", + timeFactorUnderLoad, maxNumOfReq, nCurrNumOfReq) + }, + + Entry("factor: 1.5", 1.5, 70, 35), + Entry("factor: 2.0", 2.0, 2, 1), + Entry("factor: 100.0", 100.0, 150, 75), + Entry("factor: 20000.0", 20000.0, 310, 155), + ) + + It("when TimeFactorUnderLoad is 1.0, calcLoadFactor should give 1", func() { + simulator.config.TimeFactorUnderLoad = 1.0 + simulator.config.MaxNumSeqs = 11 + simulator.nRunningReqs = 3 + + factor := simulator.getCurrFactor() + Expect(factor).To(BeNumerically("==", 1.0)) + }) + + It("when TimeFactorUnderLoad is > 1.0, and sim is fully loaded, calcLoadFactor should give TimeFactorUnderLoad", func() { + simulator.config.TimeFactorUnderLoad = 2.0 + simulator.config.MaxNumSeqs = 11 + simulator.nRunningReqs = 11 + + factor := simulator.getCurrFactor() + Expect(factor).To(BeNumerically("==", simulator.config.TimeFactorUnderLoad)) + + }) + + It("when TimeFactorUnderLoad is > 1.0, and sim is partially loaded, calcLoadFactor should give a value between 1 and TimeFactorUnderLoad", func() { + simulator.config.TimeFactorUnderLoad = 2.0 + simulator.config.MaxNumSeqs = 11 + simulator.nRunningReqs = 6 + + factor := simulator.getCurrFactor() + Expect(factor).To(BeNumerically(">", 1.0)) + Expect(factor).To(BeNumerically("<", simulator.config.TimeFactorUnderLoad)) + }) + }) + })