Commit 04542f2

The factor applies to time-to-first-token

Signed-off-by: Qifan Deng <[email protected]>

Parent: 1d30ea0

File tree

  manifests/dev-config.yaml
  pkg/common/config.go
  pkg/llm-d-inference-sim/simulator.go
  pkg/llm-d-inference-sim/simulator_test.go
  pkg/llm-d-inference-sim/streaming.go

5 files changed (+128, −16 lines)

manifests/dev-config.yaml

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
+block-size: 16
+data-parallel-size: 1
+enable-kvcache: false
+event-batch-size: 16
+failure-injection-rate: 0
+failure-types: null
+fake-metrics:
+  kv-cache-usage: 0.4
+  running-requests: 10
+  waiting-requests: 30
+hash-seed: 'hashseed'
+inter-token-latency: 50
+inter-token-latency-std-dev: 15
+kv-cache-size: 1024
+kv-cache-transfer-latency: 0
+kv-cache-transfer-latency-std-dev: 0
+kv-cache-transfer-time-per-token: 100
+kv-cache-transfer-time-std-dev: 30
+lora-modules: null
+max-cpu-loras: 1
+max-loras: 1
+max-model-len: 1024
+max-num-seqs: 7
+max-tool-call-array-param-length: 5
+max-tool-call-integer-param: 100
+max-tool-call-number-param: 100
+min-tool-call-array-param-length: 1
+min-tool-call-integer-param: 0
+min-tool-call-number-param: 0
+mode: random
+model: Qwen/Qwen2.5-1.5B-Instruct
+object-tool-call-not-required-field-probability: 50
+port: 8000
+prefill-overhead: 80
+prefill-time-per-token: 20
+prefill-time-std-dev: 3
+seed: 1757050700239757600
+served-model-name:
+  - Qwen/Qwen2.5-1.5B-Instruct
+time-factor-under-load: 5
+time-to-first-token: 0
+time-to-first-token-std-dev: 0
+tokenizers-cache-dir: ''
+tool-call-not-required-param-probability: 50
+zmq-endpoint: tcp://localhost:5557
+zmq-max-connect-attempts: 0

pkg/common/config.go

Lines changed: 11 additions & 0 deletions
@@ -175,6 +175,17 @@ type Configuration struct {
 	DPSize int `yaml:"data-parallel-size" json:"data-parallel-size"`
 }
 
+func (c *Configuration) calcLoadFactor(runReqChan *chan int64) float64 {
+	if c.MaxNumSeqs <= 1 {
+		return 1.0
+	}
+	return 1 + (c.TimeFactorUnderLoad-1)*float64(len(*runReqChan)-1)/float64(c.MaxNumSeqs-1)
+}
+
+func (c *Configuration) GetTimeToFirstToken(runReqChan *chan int64) int {
+	return int(float64(c.TimeToFirstToken) * c.calcLoadFactor(runReqChan))
+}
+
 type Metrics struct {
 	// LoraMetrics
 	LoraMetrics []LorasMetrics `json:"loras"`
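
Note on the new load factor: calcLoadFactor interpolates linearly between 1.0 (a single running request) and TimeFactorUnderLoad (when len(*runReqChan) reaches MaxNumSeqs), and GetTimeToFirstToken multiplies the configured TTFT by it. A minimal standalone sketch of the arithmetic (loadFactor below is an illustrative copy of the method, not the simulator's API), exercised with the time-factor-under-load: 5 and max-num-seqs: 7 values from dev-config.yaml:

package main

import "fmt"

// loadFactor mirrors Configuration.calcLoadFactor above: a linear ramp
// from 1.0 at one running request up to timeFactorUnderLoad at
// maxNumSeqs running requests.
func loadFactor(timeFactorUnderLoad float64, running, maxNumSeqs int) float64 {
	if maxNumSeqs <= 1 {
		return 1.0
	}
	return 1 + (timeFactorUnderLoad-1)*float64(running-1)/float64(maxNumSeqs-1)
}

func main() {
	// time-factor-under-load: 5, max-num-seqs: 7 (as in dev-config.yaml).
	for _, running := range []int{1, 4, 7} {
		fmt.Printf("running=%d -> factor=%.2f\n", running, loadFactor(5, running, 7))
	}
	// Prints factors 1.00, 3.00, 5.00: a 42ms base TTFT becomes 42, 126, 210ms.
}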

pkg/llm-d-inference-sim/simulator.go

Lines changed: 4 additions & 4 deletions
@@ -537,7 +537,7 @@ func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) {
 					finishReason = common.RemoteDecodeFinishReason
 				}
 
-				s.sendResponse(reqCtx, responseTokens, toolCalls, displayModel, finishReason, &usageData)
+				s.sendResponse(reqCtx, responseTokens, toolCalls, displayModel, finishReason, &s.runReqChan, &usageData)
 			}
 		}
 		reqCtx.Wg.Done()
@@ -662,7 +662,7 @@ func (s *VllmSimulator) createCompletionResponse(isChatCompletion bool, respToke
 // finishReason - a pointer to string that represents finish reason, can be nil, stop, length, or tools
 // usageData - usage (tokens statistics) for this response
 func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, respTokens []string, toolCalls []openaiserverapi.ToolCall,
-	modelName string, finishReason string, usageData *openaiserverapi.Usage) {
+	modelName string, finishReason string, runReqChan *chan int64, usageData *openaiserverapi.Usage) {
 	resp := s.createCompletionResponse(reqCtx.IsChatCompletion, respTokens, toolCalls, &finishReason, usageData, modelName,
 		reqCtx.CompletionReq.IsDoRemoteDecode())
 
@@ -677,7 +677,7 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r
 	nPromptTokens := usageData.PromptTokens
 	nCachedPromptTokens := reqCtx.CompletionReq.GetNumberOfCachedPromptTokens()
 	nGenTokens := usageData.CompletionTokens
-	ttft := s.getTimeToFirstToken(nPromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill())
+	ttft := s.getTimeToFirstToken(nPromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill(), runReqChan)
 	totalMillisToWait := ttft + s.getTotalInterTokenLatency(nGenTokens)
 	time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond)
 
@@ -696,7 +696,7 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r
 }
 
 // returns time to first token based on the current request's doRemotePrefill
-func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptTokens int, doRemotePrefill bool) int {
+func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptTokens int, doRemotePrefill bool, runReqChan *chan int64) int {
 	if doRemotePrefill {
 		if s.config.KVCacheTransferLatency == 0 && s.config.KVCacheTransferLatencyStdDev == 0 {
 			// is disaggregated PD and ttft is calculated using number of prompt tokens
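
Note on the plumbing: the extra parameter threads the simulator's runReqChan (a buffered channel holding one entry per in-flight request) down to getTimeToFirstToken, because len() on a buffered channel is a cheap point-in-time count of its queued elements. A minimal sketch of that idiom (illustrative only, not the simulator's code):

package main

import "fmt"

func main() {
	// A buffered channel doubles as a load gauge: its capacity plays
	// the role of max-num-seqs, its current length the running requests.
	running := make(chan int64, 7)

	running <- 1 // a request starts
	running <- 1 // another request starts
	fmt.Println("in-flight:", len(running)) // in-flight: 2

	<-running // a request completes
	fmt.Println("in-flight:", len(running)) // in-flight: 1
}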

pkg/llm-d-inference-sim/simulator_test.go

Lines changed: 63 additions & 8 deletions
@@ -798,7 +798,7 @@ var _ = Describe("Simulator", func() {
 			simulator.config.TimeToFirstTokenStdDev = timeToFirstTokenStdDev
 			simulator.config.KVCacheTransferLatency = kvCacheLatency
 			simulator.config.KVCacheTransferLatencyStdDev = kvCacheLatencyStdDev
-			timeToFirst := simulator.getTimeToFirstToken(1, 0, doREmotePrefill)
+			timeToFirst := simulator.getTimeToFirstToken(1, 0, doREmotePrefill, &simulator.runReqChan)
 			if doREmotePrefill {
 				Expect(timeToFirst).To(BeNumerically(">=", int(float32(kvCacheLatency)*0.3)))
 				Expect(timeToFirst).To(BeNumerically("<=", int(float32(kvCacheLatency)*1.7)))
@@ -829,7 +829,7 @@
 			simulator.config.PrefillTimePerToken = 200
 			simulator.config.PrefillTimeStdDev = 80
 
-			ttft := simulator.getTimeToFirstToken(128, 0, false)
+			ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan)
 
 			Expect(ttft).To(BeNumerically("==", timeToFirstToken))
 		})
@@ -842,7 +842,7 @@
 			simulator.config.PrefillTimePerToken = 200
 			simulator.config.PrefillTimeStdDev = 80
 
-			ttft := simulator.getTimeToFirstToken(128, 0, false)
+			ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan)
 			Expect(ttft).NotTo(BeNumerically("==", 0))
 		})
 
@@ -853,7 +853,7 @@
 			simulator.config.PrefillTimePerToken = prefillTimePerToken
 			simulator.config.PrefillTimeStdDev = stdDev
 
-			ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false)
+			ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false, &simulator.runReqChan)
 
 			expectedTTFT := prefillOverhead + prefillTimePerToken*(nTokens-nCachedTokens)
 			Expect(ttft).To(BeNumerically(">=", int(float64(expectedTTFT)*0.3)))
@@ -881,7 +881,7 @@
 			simulator.config.PrefillTimePerToken = prefillTimePerToken
 			simulator.config.PrefillTimeStdDev = 0
 
-			ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false)
+			ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false, &simulator.runReqChan)
 			expectedTTFT := prefillOverhead + prefillTimePerToken*(nTokens-nCachedTokens)
 			Expect(ttft).To(Equal(expectedTTFT))
 		},
@@ -905,7 +905,7 @@
 			simulator.config.KVCacheTransferTimePerToken = 100
 			simulator.config.KVCacheTransferTimeStdDev = 0
 
-			ttft := simulator.getTimeToFirstToken(128, 0, true)
+			ttft := simulator.getTimeToFirstToken(128, 0, true, &simulator.runReqChan)
 			Expect(ttft).To(BeNumerically("==", 200))
 		})
 
@@ -916,7 +916,7 @@
 			simulator.config.KVCacheTransferTimePerToken = 100
 			simulator.config.KVCacheTransferTimeStdDev = 0
 
-			ttft := simulator.getTimeToFirstToken(128, 0, true)
+			ttft := simulator.getTimeToFirstToken(128, 0, true, &simulator.runReqChan)
 			Expect(ttft).To(BeNumerically("==", 12800))
 		})
 
@@ -927,7 +927,7 @@
 			simulator.config.KVCacheTransferTimePerToken = kvCacheTransTPT
 			simulator.config.KVCacheTransferTimeStdDev = stddev
 
-			ttft := simulator.getTimeToFirstToken(nTokens, 0, true)
+			ttft := simulator.getTimeToFirstToken(nTokens, 0, true, &simulator.runReqChan)
 
 			expectedTTFT := kvCacheTransTPT * nTokens
 			Expect(ttft).To(BeNumerically(">=", int(float64(expectedTTFT)*0.3)))
@@ -945,5 +945,60 @@
 			Entry("very long prompt", 150, 100, 20000),
 		)
 
+		It("when time-factor-under-load is 1, the time to first token should be equal to time-to-first-token", func() {
+			simulator.config.TimeToFirstToken = 42
+			simulator.config.TimeToFirstTokenStdDev = 0
+			simulator.config.TimeFactorUnderLoad = 1.0
+
+			simulator.runReqChan <- 100
+
+			ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan)
+			Expect(ttft).To(Equal(42))
+		})
+
+		It("when time-factor-under-load is > 1, but max-num-seqs is 1, the factor will not take effect", func() {
+			simulator.config.TimeToFirstToken = 42
+			simulator.config.TimeToFirstTokenStdDev = 0
+			simulator.config.TimeFactorUnderLoad = 100.0
+			simulator.config.MaxNumSeqs = 1
+
+			for len(simulator.runReqChan) > 0 {
+				<-simulator.runReqChan
+			}
+
+			simulator.runReqChan <- 1
+
+			ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan)
+			Expect(ttft).To(Equal(42))
+		})
+
+		DescribeTable("when time-factor-under-load is > 1, and the sim is fully loaded, the time to first token should be time-factor-under-load * time-to-first-token",
+			func(timeFactorUnderLoad float64, maxNumOfReq int) {
+				simulator.config.TimeToFirstToken = 42
+				simulator.config.TimeToFirstTokenStdDev = 0
+				simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad
+				simulator.config.MaxNumSeqs = maxNumOfReq
+				for len(simulator.runReqChan) > 0 {
+					<-simulator.runReqChan
+				}
+				for i := 0; i < maxNumOfReq; i++ {
+					simulator.runReqChan <- 1
+				}
+
+				ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan)
+				Expect(ttft).To(Equal(int(float64(42) * timeFactorUnderLoad)))
+
+			},
+			func(timeFactorUnderLoad float64, maxNumOfReq int64) string {
+				return fmt.Sprintf("timeFactorUnderLoad: %f maxNumOfReq: %d",
+					timeFactorUnderLoad, maxNumOfReq)
+			},
+
+			Entry("factor: 1.5", 1.5, 70),
+			Entry("factor: 2.0", 2.0, 2),
+			Entry("factor: 100.0", 100.0, 150),
+			Entry("factor: 20000.0", 20000.0, 310),
+		)
+
 	})
 })

pkg/llm-d-inference-sim/streaming.go

Lines changed: 4 additions & 4 deletions
@@ -69,11 +69,11 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons
 		if len(toolCalls) > 0 {
 			s.logger.Info("Going to send tools calls")
 			for _, tc := range toolCalls {
-				s.sendTokenChunks(context, w, tc.Function.TokenizedArguments, &tc, finishReason)
+				s.sendTokenChunks(context, w, tc.Function.TokenizedArguments, &tc, finishReason, &s.runReqChan)
 			}
 		} else {
 			s.logger.Info("Going to send text", "number of tokens", len(responseTokens))
-			s.sendTokenChunks(context, w, responseTokens, nil, finishReason)
+			s.sendTokenChunks(context, w, responseTokens, nil, finishReason, &s.runReqChan)
 		}
 	}
 
@@ -97,9 +97,9 @@
 
 // sendTokenChunks creates and sends response chunks
 func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, genTokens []string,
-	tc *openaiserverapi.ToolCall, finishReason string) {
+	tc *openaiserverapi.ToolCall, finishReason string, runReqChan *chan int64) {
 	// time to first token delay
-	ttft := s.getTimeToFirstToken(context.nPromptTokens, context.nCachedPromptTokens, context.doRemotePrefill)
+	ttft := s.getTimeToFirstToken(context.nPromptTokens, context.nCachedPromptTokens, context.doRemotePrefill, runReqChan)
 	time.Sleep(time.Duration(ttft) * time.Millisecond)
 
 	for i, token := range genTokens {
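
Note on streaming: sendTokenChunks sleeps the load-scaled TTFT once, before the first chunk, and later tokens then follow at the inter-token cadence. A simplified standalone sketch of that timing model (streamTokens is hypothetical, and the fixed per-token delay stands in for the simulator's inter-token latency logic):

package main

import (
	"fmt"
	"time"
)

// streamTokens waits the (already load-scaled) TTFT once, then emits each
// remaining token after a fixed inter-token delay.
func streamTokens(tokens []string, ttftMs, interTokenMs int) {
	time.Sleep(time.Duration(ttftMs) * time.Millisecond)
	for i, token := range tokens {
		if i > 0 {
			time.Sleep(time.Duration(interTokenMs) * time.Millisecond)
		}
		fmt.Println("chunk:", token)
	}
}

func main() {
	// e.g. a 42ms base TTFT under full load with factor 5 -> 210ms.
	streamTokens([]string{"hello", "world"}, 210, 50)
}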
