Adjust request "processing time" to current load (#189)

pancak3 · web-flow · commit 2bcfedba8f9b · 2025-09-09T09:24:14.000Z
* Validate max-num-seqs

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>

* Validate PrefillTimeStdDev

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>

* Add param time-factor-under-load

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>

* The factor applies on time-to-first-token

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>

* Test TTFT when partially loaded

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>

* Apply time factor under load to prefill and inter token latency

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>

* Improve param desc

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>

* Use nRunningReqs instead of runReqChan

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>

* unstage manifests/dev-config.yaml

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>

* Update readme

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>

* Restore changes for inter token latency (lost due to conflicts resolve)

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>

* Calc inter token latency based on load instead of one-calc-for-whole request

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>

* Calc inter token latency based on load instead of one-calc-for-whole request

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>

* Move methods to simulator

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>

* Rename helper func

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>

* Rename helper func

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>

* Fix inter token latency test

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>

---------

Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
diff --git a/.gitignore b/.gitignore
@@ -5,4 +5,5 @@ vendor
 .devcontainer
 # MacOSX
 .DS_Store
-*.test
+*.test
+manifests/dev-config.yaml
diff --git a/README.md b/README.md
@@ -115,6 +115,7 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 - `kv-cache-transfer-time-per-token`: time taken to transfer cache for each token in case P/D is enabled (in milliseconds), optional, by default zero, this will be ignored if `kv-cache-transfer-latency` is not `0`
 - `kv-cache-transfer-time-std-dev`: similar to `time-to-first-token-std-dev`, but is applied on the final kv cache transfer time in case P/D is enabled (in milliseconds), which is calculated by `kv-cache-transfer-time-per-token` and number of prompt tokens, this will be ignored if `kv-cache-transfer-latency` is not `0`
 ---
+- `time-factor-under-load`: a multiplicative factor that affects the overall time taken for requests when parallel requests are being processed. The value of this factor must be >= 1.0, with a default of 1.0. If this factor is 1.0, no extra time is added. When the factor is x (where x > 1.0) and there are `max-num-seqs` running requests, the total time will be multiplied by x. When fewer than `max-num-seqs` requests are running, the applied factor is interpolated linearly between 1.0 (a single running request) and x.
 - `seed`: random seed for operations (if not set, current Unix time in nanoseconds is used)
 ---
 - `max-tool-call-integer-param`: the maximum possible value of integer parameters in a tool call, optional, defaults to 100
diff --git a/pkg/common/config.go b/pkg/common/config.go
@@ -104,6 +104,14 @@ type Configuration struct {
 	// KVCacheTransferOverheadStdDev similar to TimeToFirstTokenStdDev
 	KVCacheTransferTimeStdDev int `yaml:"kv-cache-transfer-time-std-dev" json:"kv-cache-transfer-time-std-dev"`
 
+	// TimeFactorUnderLoad is a multiplicative factor that affects the overall time taken for requests when parallel
+	// requests are being processed.
+	// The value of this factor must be >= 1.0, with a default of 1.0.
+	// - If this factor is 1.0, no extra time is added.
+	// - When the factor is x (where x > 1.0) and there are MaxNumSeqs requests, the total time will be multiplied by x.
+	// - With fewer than MaxNumSeqs running requests, the applied factor is interpolated linearly between 1.0 (a single running request) and x.
+	TimeFactorUnderLoad float64 `yaml:"time-factor-under-load" json:"time-factor-under-load"`
+
 	// Mode defines the simulator response generation mode, valid values: echo, random
 	Mode string `yaml:"mode" json:"mode"`
 	// Seed defines random seed for operations
@@ -259,6 +267,7 @@ func newConfig() *Configuration {
 		MaxModelLen:                         1024,
 		Mode:                                ModeRandom,
 		Seed:                                time.Now().UnixNano(),
+		TimeFactorUnderLoad:                 1.0,
 		MaxToolCallIntegerParam:             100,
 		MaxToolCallNumberParam:              100,
 		MaxToolCallArrayParamLength:         5,
@@ -338,6 +347,9 @@ func (c *Configuration) validate() error {
 	if c.PrefillTimeStdDev < 0 {
 		return errors.New("prefill time standard deviation cannot be negative")
 	}
+	if float32(c.PrefillTimeStdDev) > 0.3*float32(c.PrefillTimePerToken) {
+		return errors.New("prefill time standard deviation cannot be more than 30% of prefill time per token")
+	}
 
 	if c.KVCacheTransferTimePerToken < 0 {
 		return errors.New("kv-cache tranfer time per token cannot be negative")
@@ -359,6 +371,10 @@ func (c *Configuration) validate() error {
 		return errors.New("kv-cache tranfer standard deviation cannot be more than 30% of kv-cache tranfer")
 	}
 
+	if c.TimeFactorUnderLoad < 1.0 {
+		return errors.New("time factor under load cannot be less than 1.0")
+	}
+
 	if c.MaxLoras < 1 {
 		return errors.New("max LoRAs cannot be less than 1")
 	}
@@ -373,6 +389,10 @@ func (c *Configuration) validate() error {
 		return errors.New("max model len cannot be less than 1")
 	}
 
+	if c.MaxNumSeqs < 1 {
+		return errors.New("max num seqs cannot be less than 1")
+	}
+
 	for _, lora := range c.LoraModules {
 		if lora.Name == "" {
 			return errors.New("empty LoRA name")
@@ -502,6 +522,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 	f.IntVar(&config.TimeToFirstTokenStdDev, "time-to-first-token-std-dev", config.TimeToFirstTokenStdDev, "Standard deviation for time before the first token will be returned (in milliseconds)")
 	f.IntVar(&config.KVCacheTransferLatencyStdDev, "kv-cache-transfer-latency-std-dev", config.KVCacheTransferLatencyStdDev, "Standard deviation for time for KV-cache transfer from a remote vLLM (in milliseconds)")
 	f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)")
+	f.Float64Var(&config.TimeFactorUnderLoad, "time-factor-under-load", config.TimeFactorUnderLoad, "Time factor under load (must be >= 1.0)")
 
 	f.IntVar(&config.MaxToolCallIntegerParam, "max-tool-call-integer-param", config.MaxToolCallIntegerParam, "Maximum possible value of integer parameters in a tool call")
 	f.IntVar(&config.MinToolCallIntegerParam, "min-tool-call-integer-param", config.MinToolCallIntegerParam, "Minimum possible value of integer parameters in a tool call")
diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go
@@ -431,6 +431,26 @@ var _ = Describe("Simulator configuration", func() {
 			args: []string{"cmd", "--data-parallel-size", "15",
 				"--config", "../../manifests/config.yaml"},
 		},
+		{
+			name: "invalid max-num-seqs",
+			args: []string{"cmd", "--max-num-seqs", "0",
+				"--config", "../../manifests/config.yaml"},
+		},
+		{
+			name: "invalid max-num-seqs",
+			args: []string{"cmd", "--max-num-seqs", "-1",
+				"--config", "../../manifests/config.yaml"},
+		},
+		{
+			name: "invalid time-factor-under-load",
+			args: []string{"cmd", "--time-factor-under-load", "0",
+				"--config", "../../manifests/config.yaml"},
+		},
+		{
+			name: "invalid time-factor-under-load",
+			args: []string{"cmd", "--time-factor-under-load", "-1",
+				"--config", "../../manifests/config.yaml"},
+		},
 	}
 
 	for _, test := range invalidTests {
diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go
@@ -672,12 +672,13 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r
 	}
 
 	// calculate how long to wait before returning the response, time is based on number of tokens
-	nPromptTokens := usageData.PromptTokens
 	nCachedPromptTokens := reqCtx.CompletionReq.GetNumberOfCachedPromptTokens()
-	nGenTokens := usageData.CompletionTokens
-	ttft := s.getTimeToFirstToken(nPromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill())
-	totalMillisToWait := ttft + s.getTotalInterTokenLatency(nGenTokens)
-	time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond)
+	ttft := s.getTimeToFirstToken(usageData.PromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill())
+	time.Sleep(time.Duration(ttft) * time.Millisecond)
+	for range usageData.CompletionTokens - 1 {
+		perTokenLatency := s.getInterTokenLatency()
+		time.Sleep(time.Duration(perTokenLatency) * time.Millisecond)
+	}
 
 	ctx.Response.Header.SetContentType("application/json")
 	ctx.Response.Header.SetStatusCode(fasthttp.StatusOK)
@@ -706,25 +707,16 @@ func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptToke
 	}
 	if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 {
 		// is aggregated PD and ttft is calculated using number of prompt tokens that are not in kv cache
-		prefillTime := s.config.PrefillOverhead + (nPromptTokens-nCachedPromptTokens)*s.config.PrefillTimePerToken
+		prefillTime := s.GetPrefillOverhead() + (nPromptTokens-nCachedPromptTokens)*s.GetPrefillTimePerToken()
 		return common.RandomNorm(prefillTime, s.config.PrefillTimeStdDev)
 	}
 	// is aggregated PD and *not* using number of prompt tokens
-	return common.RandomNorm(s.config.TimeToFirstToken, s.config.TimeToFirstTokenStdDev)
+	return common.RandomNorm(s.GetTimeToFirstToken(), s.config.TimeToFirstTokenStdDev)
 }
 
 // returns inter token latency
 func (s *VllmSimulator) getInterTokenLatency() int {
-	return common.RandomNorm(s.config.InterTokenLatency, s.config.InterTokenLatencyStdDev)
-}
-
-// returns total inter token latency for the given number of tokens
-func (s *VllmSimulator) getTotalInterTokenLatency(numOfTokens int) int {
-	total := 0
-	for range numOfTokens - 1 {
-		total += s.getInterTokenLatency()
-	}
-	return total
+	return common.RandomNorm(s.GetInterTokenLatency(), s.config.InterTokenLatencyStdDev)
 }
 
 // createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist
@@ -818,3 +810,26 @@ func (s *VllmSimulator) showConfig(dp bool) error {
 	s.logger.Info("Configuration:", "", string(cfgJSON))
 	return nil
 }
+
+func (s *VllmSimulator) getCurrFactor() float64 {
+	if s.config.MaxNumSeqs <= 1 {
+		return 1.0
+	}
+	return 1 + (s.config.TimeFactorUnderLoad-1)*float64(s.nRunningReqs-1)/float64(s.config.MaxNumSeqs-1)
+}
+
+func (s *VllmSimulator) GetTimeToFirstToken() int {
+	return int(float64(s.config.TimeToFirstToken) * s.getCurrFactor())
+}
+
+func (s *VllmSimulator) GetPrefillOverhead() int {
+	return int(float64(s.config.PrefillOverhead) * s.getCurrFactor())
+}
+
+func (s *VllmSimulator) GetPrefillTimePerToken() int {
+	return int(float64(s.config.PrefillTimePerToken) * s.getCurrFactor())
+}
+
+func (s *VllmSimulator) GetInterTokenLatency() int {
+	return int(float64(s.config.InterTokenLatency) * s.getCurrFactor())
+}
diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go
@@ -787,7 +787,14 @@ var _ = Describe("Simulator", func() {
 			func(interTokenLatency int, stddev int, numberOfTokens int) {
 				simulator.config.InterTokenLatency = interTokenLatency
 				simulator.config.InterTokenLatencyStdDev = stddev
-				latency := simulator.getTotalInterTokenLatency(numberOfTokens)
+				simulator.config.MaxNumSeqs = 1
+				simulator.config.TimeFactorUnderLoad = 1.0
+
+				latency := 0
+				for range numberOfTokens - 1 {
+					latency += simulator.getInterTokenLatency()
+				}
+
 				Expect(latency).To(BeNumerically(">=", int(float32(interTokenLatency)*0.3*float32(numberOfTokens))))
 				Expect(latency).To(BeNumerically("<=", int(float32(interTokenLatency)*1.7*float32(numberOfTokens))))
 			},
@@ -955,5 +962,110 @@ var _ = Describe("Simulator", func() {
 			Entry("very long prompt", 150, 100, 20000),
 		)
 
+		It("when time-factor-under-load is 1, the time to first token should be equal to time-to-first-token", func() {
+			simulator.config.TimeToFirstToken = 42
+			simulator.config.TimeToFirstTokenStdDev = 0
+			simulator.config.TimeFactorUnderLoad = 1.0
+
+			simulator.runReqChan <- 100
+
+			ttft := simulator.getTimeToFirstToken(128, 0, false)
+			Expect(ttft).To(Equal(42))
+		})
+
+		It("when time-factor-under-load is > 1, but max-num-seqs is 1, the factor will not take effect", func() {
+			simulator.config.TimeToFirstToken = 42
+			simulator.config.TimeToFirstTokenStdDev = 0
+			simulator.config.TimeFactorUnderLoad = 100.0
+			simulator.config.MaxNumSeqs = 1
+
+			for len(simulator.runReqChan) > 0 {
+				<-simulator.runReqChan
+			}
+
+			simulator.runReqChan <- 1
+
+			ttft := simulator.getTimeToFirstToken(128, 0, false)
+			Expect(ttft).To(Equal(42))
+		})
+
+		DescribeTable("when time-factor-under-load is > 1, and the sim is fully loaded, the time to first token should be time-factor-under-load * time-to-first-token",
+			func(timeFactorUnderLoad float64, maxNumOfReq int) {
+				simulator.config.TimeToFirstToken = 42
+				simulator.config.TimeToFirstTokenStdDev = 0
+				simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad
+				simulator.config.MaxNumSeqs = maxNumOfReq
+				simulator.nRunningReqs = int64(maxNumOfReq)
+
+				ttft := simulator.getTimeToFirstToken(128, 0, false)
+				Expect(ttft).To(Equal(int(float64(42) * timeFactorUnderLoad)))
+
+			},
+			func(timeFactorUnderLoad float64, maxNumOfReq int64) string {
+				return fmt.Sprintf("timeFactorUnderLoad: %f maxNumOfReq: %d",
+					timeFactorUnderLoad, maxNumOfReq)
+			},
+
+			Entry("factor: 1.5", 1.5, 70),
+			Entry("factor: 2.0", 2.0, 2),
+			Entry("factor: 100.0", 100.0, 150),
+			Entry("factor: 20000.0", 20000.0, 310),
+		)
+
+		DescribeTable("when time-factor-under-load is > 1, and the sim is partially loaded, the time to first token should be linear interpolation between time-to-first-token and time-factor-under-load * time-to-first-token",
+			func(timeFactorUnderLoad float64, maxNumOfReq int, nCurrNumOfReq int) {
+				simulator.config.TimeToFirstToken = 42
+				simulator.config.TimeToFirstTokenStdDev = 0
+				simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad
+				simulator.config.MaxNumSeqs = maxNumOfReq
+				simulator.nRunningReqs = int64(nCurrNumOfReq)
+
+				ttft := simulator.getTimeToFirstToken(128, 0, false)
+				max := timeFactorUnderLoad * float64(42)
+				Expect(ttft).To(BeNumerically(">=", 42))
+				Expect(ttft).To(BeNumerically("<=", max))
+
+			},
+			func(timeFactorUnderLoad float64, maxNumOfReq int, nCurrNumOfReq int) string {
+				return fmt.Sprintf("timeFactorUnderLoad: %f maxNumOfReq: %d nCurrNumOfReq: %d",
+					timeFactorUnderLoad, maxNumOfReq, nCurrNumOfReq)
+			},
+
+			Entry("factor: 1.5", 1.5, 70, 35),
+			Entry("factor: 2.0", 2.0, 2, 1),
+			Entry("factor: 100.0", 100.0, 150, 75),
+			Entry("factor: 20000.0", 20000.0, 310, 155),
+		)
+
+		It("when TimeFactorUnderLoad is 1.0, calcLoadFactor should give 1", func() {
+			simulator.config.TimeFactorUnderLoad = 1.0
+			simulator.config.MaxNumSeqs = 11
+			simulator.nRunningReqs = 3
+
+			factor := simulator.getCurrFactor()
+			Expect(factor).To(BeNumerically("==", 1.0))
+		})
+
+		It("when TimeFactorUnderLoad is > 1.0, and sim is fully loaded, calcLoadFactor should give TimeFactorUnderLoad", func() {
+			simulator.config.TimeFactorUnderLoad = 2.0
+			simulator.config.MaxNumSeqs = 11
+			simulator.nRunningReqs = 11
+
+			factor := simulator.getCurrFactor()
+			Expect(factor).To(BeNumerically("==", simulator.config.TimeFactorUnderLoad))
+
+		})
+
+		It("when TimeFactorUnderLoad is > 1.0, and sim is partially loaded, calcLoadFactor should give a value between 1 and TimeFactorUnderLoad", func() {
+			simulator.config.TimeFactorUnderLoad = 2.0
+			simulator.config.MaxNumSeqs = 11
+			simulator.nRunningReqs = 6
+
+			factor := simulator.getCurrFactor()
+			Expect(factor).To(BeNumerically(">", 1.0))
+			Expect(factor).To(BeNumerically("<", simulator.config.TimeFactorUnderLoad))
+		})
+
 	})
+
 })