Change time-to-first-token parameter to be based on number of request tokens #137 (#165)

pancak3 · web-flow · commit 08d4613a9a49 · 2025-09-01T09:39:41.000+03:00
* Fix comments on prefill arg in completion request interface Signed-off-by: Qifan Deng <dev.llmd@qifand.com> * Add feature of calc ttft by prefill overhead. TODO: kvcache transfer overhead Signed-off-by: Qifan Deng <dev.llmd@qifand.com> * Rename prefill-overhead-complexity to prefill-complexity Signed-off-by: Qifan Deng <dev.llmd@qifand.com> * Calc kv cache transfer overhead based on prompt length Signed-off-by: Qifan Deng <dev.llmd@qifand.com> * Add invalid test cases for args prefill-overhead and kv-cache-transfer-overhead Signed-off-by: Qifan Deng <dev.llmd@qifand.com> * Add standard deviation in utils Signed-off-by: Qifan Deng <dev.llmd@qifand.com> * Add stddev for prefill overhead and kvcache trans overhead Signed-off-by: Qifan Deng <dev.llmd@qifand.com> * Fix test condition when remove p/d is enabled and in-place policy is used Signed-off-by: Qifan Deng <dev.llmd@qifand.com> * Use simplfied implementation of ttft Signed-off-by: Qifan Deng <dev.llmd@qifand.com> * Add sep lines in readme params Signed-off-by: Qifan Deng <dev.llmd@qifand.com> * Update readme with explanation of new ttft Signed-off-by: Qifan Deng <dev.llmd@qifand.com> * Fix ttft new params tests Signed-off-by: Qifan Deng <dev.llmd@qifand.com> * Fix kv cache trasfer tests and impl Signed-off-by: Qifan Deng <dev.llmd@qifand.com> * Fix invalid config test of new ttft params Signed-off-by: Qifan Deng <dev.llmd@qifand.com> * Revert "Add standard deviation in utils" This reverts commit 18d3075. 
Signed-off-by: Qifan Deng <dev.llmd@qifand.com> * Remove additional variables in prefill time calculation Signed-off-by: Qifan Deng <dev.llmd@qifand.com> * Improve is remote prefill/decode interface doc Signed-off-by: Qifan Deng <dev.llmd@qifand.com> * Improve implementation of ttft calc Signed-off-by: Qifan Deng <dev.llmd@qifand.com> * Remove unnecessary variable Signed-off-by: Qifan Deng <dev.llmd@qifand.com> --------- Signed-off-by: Qifan Deng <dev.llmd@qifand.com> Signed-off-by: Qifan Deng <20884468+pancak3@users.noreply.github.com>
diff --git a/README.md b/README.md
@@ -101,13 +101,22 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 - `mode`: the simulator mode, optional, by default `random`
     - `echo`: returns the same text that was sent in the request
     - `random`: returns a sentence chosen at random from a set of pre-defined sentences
+---
 - `time-to-first-token`: the time to the first token (in milliseconds), optional, by default zero
 - `time-to-first-token-std-dev`: standard deviation for time before the first token will be returned, in milliseconds, optional, default is 0, can't be more than 30% of `time-to-first-token`, will not cause the actual time to first token to differ by more than 70% from `time-to-first-token`
 - `inter-token-latency`: the time to 'generate' each additional token (in milliseconds), optional, by default zero
 - `inter-token-latency-std-dev`: standard deviation for time between generated tokens, in milliseconds, optional, default is 0, can't be more than 30% of `inter-token-latency`, will not cause the actual inter token latency to differ by more than 70% from `inter-token-latency`
 - `kv-cache-transfer-latency`: time for KV-cache transfer from a remote vLLM (in milliseconds), by default zero. Usually much shorter than `time-to-first-token`
 - `kv-cache-transfer-latency-std-dev`: standard deviation for time to "transfer" kv-cache from another vLLM instance in case P/D is activated, in milliseconds, optional, default is 0, can't be more than 30% of `kv-cache-transfer-latency`, will not cause the actual latency to differ by more than 70% from `kv-cache-transfer-latency`
+---
+- `prefill-overhead`: constant overhead time for prefill (in milliseconds), optional, by default zero, used in calculating time to first token, this will be ignored if `time-to-first-token` is not `0`
+- `prefill-time-per-token`: time taken to process each prompt token during prefill (in milliseconds), optional, by default zero, used in calculating time to first token, this will be ignored if `time-to-first-token` is not `0`
+- `prefill-time-std-dev`: similar to `time-to-first-token-std-dev`, but is applied on the final prefill time, which is calculated by `prefill-overhead`, `prefill-time-per-token`, and number of prompt tokens, this will be ignored if `time-to-first-token` is not `0`
+- `kv-cache-transfer-time-per-token`: time taken to transfer cache for each token in case P/D is enabled (in milliseconds), optional, by default zero, this will be ignored if `kv-cache-transfer-latency` is not `0`
+- `kv-cache-transfer-time-std-dev`: similar to `time-to-first-token-std-dev`, but is applied on the final kv cache transfer time in case P/D is enabled (in milliseconds), which is calculated by `kv-cache-transfer-time-per-token` and number of prompt tokens, this will be ignored if `kv-cache-transfer-latency` is not `0`
+---
 - `seed`: random seed for operations (if not set, current Unix time in nanoseconds is used)
+---
 - `max-tool-call-integer-param`: the maximum possible value of integer parameters in a tool call, optional, defaults to 100
 - `min-tool-call-integer-param`: the minimum possible value of integer parameters in a tool call, optional, defaults to 0
 - `max-tool-call-number-param`: the maximum possible value of number (float) parameters in a tool call, optional, defaults to 100
@@ -116,6 +125,7 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 - `min-tool-call-array-param-length`: the minimum possible length of array parameters in a tool call, optional, defaults to 1
 - `tool-call-not-required-param-probability`: the probability to add a parameter, that is not required, in a tool call, optional, defaults to 50
 - `object-tool-call-not-required-field-probability`: the probability to add a field, that is not required, in an object in a tool call, optional, defaults to 50
+---
 - `enable-kvcache`: if true, the KV cache support will be enabled in the simulator. In this case, the KV cache will be simulated, and ZMQ events will be published when a KV cache block is added or evicted. 
 - `kv-cache-size`: the maximum number of token blocks in kv cache
 - `block-size`: token block size for contiguous chunks of tokens, possible values: 8,16,32,64,128
@@ -124,8 +134,10 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 - `zmq-endpoint`: ZMQ address to publish events
 - `zmq-max-connect-attempts`: the maximum number of ZMQ connection attempts, defaults to 0, maximum: 10
 - `event-batch-size`: the maximum number of kv-cache events to be sent together, defaults to 16
+---
 - `failure-injection-rate`: probability (0-100) of injecting failures, optional, default is 0
 - `failure-types`: list of specific failure types to inject (rate_limit, invalid_api_key, context_length, server_error, invalid_request, model_not_found), optional, if empty all types are used
+---
 - `fake-metrics`: represents a predefined set of metrics to be sent to Prometheus as a substitute for the real metrics. When specified, only these fake metrics will be reported — real metrics and fake metrics will never be reported together. The set should include values for 
     - `running-requests`
     - `waiting-requests`
diff --git a/pkg/common/config.go b/pkg/common/config.go
@@ -72,6 +72,7 @@ type Configuration struct {
 	// in milliseconds, optional, default is 0, can't be more than 30% of TimeToFirstToken, will not
 	// cause the actual time to first token to differ by more than 70% from TimeToFirstToken
 	TimeToFirstTokenStdDev int `yaml:"time-to-first-token-std-dev" json:"time-to-first-token-std-dev"`
+
 	// InterTokenLatency time between generated tokens, in milliseconds
 	InterTokenLatency int `yaml:"inter-token-latency" json:"inter-token-latency"`
 	// InterTokenLatencyStdDev standard deviation for time between generated tokens, in milliseconds,
@@ -87,6 +88,21 @@ type Configuration struct {
 	// KVCacheTransferLatency
 	KVCacheTransferLatencyStdDev int `yaml:"kv-cache-transfer-latency-std-dev" json:"kv-cache-transfer-latency-std-dev"`
 
+	// $Total Prefill Time = PrefillOverhead + n * PrefillTimePerToken$, where n is the number of prompt tokens
+	// the assumption is that n is less than k, where k is the number of parallelism units of the GPU
+	// PrefillOverhead constant overhead time for prefill, in milliseconds
+	PrefillOverhead     int `yaml:"prefill-overhead" json:"prefill-overhead"`
+	PrefillTimePerToken int `yaml:"prefill-time-per-token" json:"prefill-time-per-token"`
+	// PrefillTimeStdDev standard deviation of the total prefill time, similar to TimeToFirstTokenStdDev
+	PrefillTimeStdDev int `yaml:"prefill-time-std-dev" json:"prefill-time-std-dev"`
+	// $Total KV Cache Transfer Time = n * KVCacheTransferTimePerToken$
+	// the assumption is that the cache blocks are all missed at the remote pod
+	// KVCacheTransferTimePerToken time taken to transfer kv-cache for each prompt token from another vLLM instance
+	// in case P/D is activated, in milliseconds.
+	KVCacheTransferTimePerToken int `yaml:"kv-cache-transfer-time-per-token" json:"kv-cache-transfer-time-per-token"`
+	// KVCacheTransferTimeStdDev standard deviation of the total kv-cache transfer time, similar to TimeToFirstTokenStdDev
+	KVCacheTransferTimeStdDev int `yaml:"kv-cache-transfer-time-std-dev" json:"kv-cache-transfer-time-std-dev"`
+
 	// Mode defines the simulator response generation mode, valid values: echo, random
 	Mode string `yaml:"mode" json:"mode"`
 	// Seed defines random seed for operations
@@ -307,6 +323,24 @@ func (c *Configuration) validate() error {
 	if float32(c.TimeToFirstTokenStdDev) > 0.3*float32(c.TimeToFirstToken) {
 		return errors.New("time to first token standard deviation cannot be more than 30% of time to first token")
 	}
+
+	if c.PrefillOverhead < 0 {
+		return errors.New("prefill overhead cannot be negative")
+	}
+	if c.PrefillTimePerToken < 0 {
+		return errors.New("prefill time per token cannot be negative")
+	}
+	if c.PrefillTimeStdDev < 0 {
+		return errors.New("prefill time standard deviation cannot be negative")
+	}
+
+	if c.KVCacheTransferTimePerToken < 0 {
+		return errors.New("kv-cache tranfer time per token cannot be negative")
+	}
+	if c.KVCacheTransferTimeStdDev < 0 {
+		return errors.New("kv-cache tranfer time standard deviation cannot be negative")
+	}
+
 	if c.KVCacheTransferLatency < 0 {
 		return errors.New("kv-cache tranfer time cannot be negative")
 	}
@@ -316,6 +350,7 @@ func (c *Configuration) validate() error {
 	if float32(c.KVCacheTransferLatencyStdDev) > 0.3*float32(c.KVCacheTransferLatency) {
 		return errors.New("kv-cache tranfer standard deviation cannot be more than 30% of kv-cache tranfer")
 	}
+
 	if c.MaxLoras < 1 {
 		return errors.New("max LoRAs cannot be less than 1")
 	}
@@ -433,6 +468,13 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 	f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode: echo - returns the same text that was sent in the request, for chat completion returns the last message; random - returns random sentence from a bank of pre-defined sentences")
 	f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
 	f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)")
+
+	f.IntVar(&config.PrefillOverhead, "prefill-overhead", config.PrefillOverhead, "Time to prefill in milliseconds. This argument is ignored if <time-to-first-token> is not 0.")
+	f.IntVar(&config.PrefillTimePerToken, "prefill-time-per-token", config.PrefillTimePerToken, "Time to prefill per token (in milliseconds)")
+	f.IntVar(&config.PrefillTimeStdDev, "prefill-time-std-dev", config.PrefillTimeStdDev, "Standard deviation for time to prefill (in milliseconds)")
+	f.IntVar(&config.KVCacheTransferTimePerToken, "kv-cache-transfer-time-per-token", config.KVCacheTransferTimePerToken, "Time for KV-cache transfer per token from a remote vLLM (in milliseconds)")
+	f.IntVar(&config.KVCacheTransferTimeStdDev, "kv-cache-transfer-time-std-dev", config.KVCacheTransferTimeStdDev, "Standard deviation for time for KV-cache transfer per token from a remote vLLM (in milliseconds)")
+
 	f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")
 	f.IntVar(&config.InterTokenLatencyStdDev, "inter-token-latency-std-dev", config.InterTokenLatencyStdDev, "Standard deviation for time between generated tokens (in milliseconds)")
 	f.IntVar(&config.TimeToFirstTokenStdDev, "time-to-first-token-std-dev", config.TimeToFirstTokenStdDev, "Standard deviation for time before the first token will be returned (in milliseconds)")
diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go
@@ -401,6 +401,31 @@ var _ = Describe("Simulator configuration", func() {
 			name: "invalid (negative) zmq-max-connect-attempts for config file",
 			args: []string{"cmd", "--config", "../../manifests/invalid-config.yaml"},
 		},
+		{
+			name: "invalid (negative) prefill-overhead",
+			args: []string{"cmd", "--prefill-overhead", "-1",
+				"--config", "../../manifests/config.yaml"},
+		},
+		{
+			name: "invalid (negative) prefill-time-per-token",
+			args: []string{"cmd", "--prefill-time-per-token", "-1",
+				"--config", "../../manifests/config.yaml"},
+		},
+		{
+			name: "invalid (negative) prefill-time-std-dev",
+			args: []string{"cmd", "--prefill-time-std-dev", "-1",
+				"--config", "../../manifests/config.yaml"},
+		},
+		{
+			name: "invalid (negative) kv-cache-transfer-time-per-token",
+			args: []string{"cmd", "--kv-cache-transfer-time-per-token", "-1",
+				"--config", "../../manifests/config.yaml"},
+		},
+		{
+			name: "invalid (negative) kv-cache-transfer-time-std-dev",
+			args: []string{"cmd", "--kv-cache-transfer-time-std-dev", "-1",
+				"--config", "../../manifests/config.yaml"},
+		},
 	}
 
 	for _, test := range invalidTests {
diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go
@@ -495,7 +495,7 @@ func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) {
 							model:            displayModel,
 							doRemotePrefill:  req.IsDoRemotePrefill(),
 						},
-						responseTokens, toolCalls, finishReason, usageDataToSend,
+						usageDataToSend.PromptTokens, responseTokens, toolCalls, finishReason, usageDataToSend,
 					)
 				} else {
 					if req.IsDoRemoteDecode() {
@@ -646,8 +646,9 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques
 	}
 
 	// calculate how long to wait before returning the response, time is based on number of tokens
-	numOfTokens := usageData.CompletionTokens
-	totalMillisToWait := s.getTimeToFirstToken(doRemotePrefill) + s.getTotalInterTokenLatency(numOfTokens)
+	nPromptTokens := usageData.PromptTokens
+	nGenTokens := usageData.CompletionTokens
+	totalMillisToWait := s.getTimeToFirstToken(nPromptTokens, doRemotePrefill) + s.getTotalInterTokenLatency(nGenTokens)
 	time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond)
 
 	ctx.Response.Header.SetContentType("application/json")
@@ -665,14 +666,23 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques
 }
 
 // returns time to first token based on the number of prompt tokens and the current request's doRemotePrefill
-func (s *VllmSimulator) getTimeToFirstToken(doRemotePrefill bool) int {
-	mean := float64(s.config.TimeToFirstToken)
-	stddev := float64(s.config.TimeToFirstTokenStdDev)
+func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, doRemotePrefill bool) int {
 	if doRemotePrefill {
-		mean = float64(s.config.KVCacheTransferLatency)
-		stddev = float64(s.config.KVCacheTransferLatencyStdDev)
+		if s.config.KVCacheTransferLatency == 0 && s.config.KVCacheTransferLatencyStdDev == 0 {
+			// is disaggregated PD and ttft is calculated using number of prompt tokens
+			kvCacheTransT := s.config.KVCacheTransferTimePerToken * nPromptTokens
+			return int(common.RandomNorm(float64(kvCacheTransT), float64(s.config.KVCacheTransferTimeStdDev)))
+		}
+		// is disaggregated PD and *not* using number of prompt tokens
+		return int(common.RandomNorm(float64(s.config.KVCacheTransferLatency), float64(s.config.KVCacheTransferLatencyStdDev)))
 	}
-	return int(common.RandomNorm(mean, stddev))
+	if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 {
+		// is aggregated PD and ttft is calculated using number of prompt tokens
+		prefillTime := s.config.PrefillOverhead + nPromptTokens*s.config.PrefillTimePerToken
+		return int(common.RandomNorm(float64(prefillTime), float64(s.config.PrefillTimeStdDev)))
+	}
+	// is aggregated PD and *not* using number of prompt tokens
+	return int(common.RandomNorm(float64(s.config.TimeToFirstToken), float64(s.config.TimeToFirstTokenStdDev)))
 }
 
 // returns inter token latency
diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go
@@ -807,7 +807,7 @@ var _ = Describe("Simulator", func() {
 				simulator.config.TimeToFirstTokenStdDev = timeToFirstTokenStdDev
 				simulator.config.KVCacheTransferLatency = kvCacheLatency
 				simulator.config.KVCacheTransferLatencyStdDev = kvCacheLatencyStdDev
-				timeToFirst := simulator.getTimeToFirstToken(doREmotePrefill)
+				timeToFirst := simulator.getTimeToFirstToken(1, doREmotePrefill)
 				if doREmotePrefill {
 					Expect(timeToFirst).To(BeNumerically(">=", int(float32(kvCacheLatency)*0.3)))
 					Expect(timeToFirst).To(BeNumerically("<=", int(float32(kvCacheLatency)*1.7)))
@@ -828,5 +828,104 @@ var _ = Describe("Simulator", func() {
 			Entry(nil, 10000, 0, 1000, 0, true),
 			Entry(nil, 10000, 0, 1000, 0, false),
 		)
+
+		It("when <time-to-first-token> is not 0, ignore <prefill-overhead>", func() {
+			timeToFirstToken := 1000
+			simulator.config.TimeToFirstToken = timeToFirstToken
+			simulator.config.TimeToFirstTokenStdDev = 0
+
+			simulator.config.PrefillOverhead = 100
+			simulator.config.PrefillTimePerToken = 200
+			simulator.config.PrefillTimeStdDev = 80
+
+			ttft := simulator.getTimeToFirstToken(128, false)
+
+			Expect(ttft).To(BeNumerically("==", timeToFirstToken))
+		})
+
+		It("when <time-to-first-token> is 0, and <prefill-overhead> is not 0, use <prefill-overhead>", func() {
+			simulator.config.TimeToFirstToken = 0
+			simulator.config.TimeToFirstTokenStdDev = 0
+
+			simulator.config.PrefillOverhead = 100
+			simulator.config.PrefillTimePerToken = 200
+			simulator.config.PrefillTimeStdDev = 80
+
+			ttft := simulator.getTimeToFirstToken(128, false)
+			Expect(ttft).NotTo(BeNumerically("==", 0))
+		})
+
+		DescribeTable("time to first token is against number of prompt tokens",
+			func(prefillOverhead int, prefillTimePerToken int, stdDev int, nTokens int) {
+				simulator.config.TimeToFirstToken = 0
+				simulator.config.PrefillOverhead = prefillOverhead
+				simulator.config.PrefillTimePerToken = prefillTimePerToken
+				simulator.config.PrefillTimeStdDev = stdDev
+
+				ttft := simulator.getTimeToFirstToken(nTokens, false)
+
+				expectedTTFT := prefillOverhead + prefillTimePerToken*nTokens
+				Expect(ttft).To(BeNumerically(">=", int(float64(expectedTTFT)*0.3)))
+				Expect(ttft).To(BeNumerically("<=", int(float64(expectedTTFT)*1.7)))
+
+			},
+			func(prefillOverhead int, prefillTimePerToken, stdDev int, nTokens int) string {
+				return fmt.Sprintf("prefillOverhead: %d, prefillTimePerToken: %d, stdDev: %d, nTokens: %d",
+					prefillOverhead, prefillTimePerToken, stdDev, nTokens)
+			},
+			Entry("single token", 100, 50, 70, 1),
+			Entry("stddev is 0", 100, 50, 0, 1),
+			Entry("medium overhead, 512 tokens", 200, 1000, 150, 512),
+			Entry("large overhead, 1024 tokens", 2000, 3000, 1800, 1024),
+			Entry("very long prompt", 150, 200, 100, 20000),
+		)
+
+		It("when <kv-cache-transfer-latency> not 0, ignore <kv-cache-transfer-overhead>", func() {
+			simulator.config.KVCacheTransferLatency = 200
+			simulator.config.KVCacheTransferLatencyStdDev = 0
+
+			simulator.config.KVCacheTransferTimePerToken = 100
+			simulator.config.KVCacheTransferTimeStdDev = 0
+
+			ttft := simulator.getTimeToFirstToken(128, true)
+			Expect(ttft).To(BeNumerically("==", 200))
+		})
+
+		It("when <kv-cache-transfer-latency> is 0, and <kv-cache-transfer-overhead> is not 0, use <kv-cache-transfer-overhead>", func() {
+			simulator.config.KVCacheTransferLatency = 0
+			simulator.config.KVCacheTransferLatencyStdDev = 0
+
+			simulator.config.KVCacheTransferTimePerToken = 100
+			simulator.config.KVCacheTransferTimeStdDev = 0
+
+			ttft := simulator.getTimeToFirstToken(128, true)
+			Expect(ttft).To(BeNumerically("==", 12800))
+		})
+
+		DescribeTable("kv cache transfer time against number of prompt tokens",
+			func(kvCacheTransTPT int, stddev int, nTokens int) {
+				simulator.config.TimeToFirstToken = 0
+				simulator.config.PrefillOverhead = 1
+				simulator.config.KVCacheTransferTimePerToken = kvCacheTransTPT
+				simulator.config.KVCacheTransferTimeStdDev = stddev
+
+				ttft := simulator.getTimeToFirstToken(nTokens, true)
+
+				expectedTTFT := kvCacheTransTPT * nTokens
+				Expect(ttft).To(BeNumerically(">=", int(float64(expectedTTFT)*0.3)))
+				Expect(ttft).To(BeNumerically("<=", int(float64(expectedTTFT)*1.7)))
+
+			},
+			func(kvCacheTransferTimePerToken int, stddev int, nTokens int) string {
+				return fmt.Sprintf("kvCacheTransferTimePerToken: %d stddev: %d nTokens: %d",
+					kvCacheTransferTimePerToken, stddev, nTokens)
+			},
+			Entry("single token", 100, 70, 1),
+			Entry("stddev is 0", 100, 0, 1),
+			Entry("medium overhead, 512 tokens", 200, 150, 512),
+			Entry("large overhead, 1024 tokens", 2000, 1800, 1024),
+			Entry("very long prompt", 150, 100, 20000),
+		)
+
 	})
 })
diff --git a/pkg/llm-d-inference-sim/streaming.go b/pkg/llm-d-inference-sim/streaming.go
diff --git a/pkg/openai-server-api/request.go b/pkg/openai-server-api/request.go