
Commit ee02357

Apply time factor under load to prefill and inter token latency

Signed-off-by: Qifan Deng <[email protected]>
1 parent 063683b · commit ee02357

This commit extends the TimeFactorUnderLoad scaling, previously applied only to TimeToFirstToken, to the prefill overhead, the prefill time per token, and the inter-token latency. It also simplifies call sites: instead of threading a *chan int64 through sendResponse, getTimeToFirstToken, and sendTokenChunks, the simulator now reads its own runReqChan directly.

File tree

5 files changed (+79 −23 lines):

- pkg/common/config.go
- pkg/common/config_test.go
- pkg/llm-d-inference-sim/simulator.go
- pkg/llm-d-inference-sim/simulator_test.go
- pkg/llm-d-inference-sim/streaming.go

pkg/common/config.go
Lines changed: 12 additions & 0 deletions

@@ -186,6 +186,18 @@ func (c *Configuration) GetTimeToFirstToken(runReqChan *chan int64) int {
 	return int(float64(c.TimeToFirstToken) * c.calcLoadFactor(runReqChan))
 }
 
+func (c *Configuration) GetPrefillOverhead(runReqChan *chan int64) int {
+	return int(float64(c.PrefillOverhead) * c.calcLoadFactor(runReqChan))
+}
+
+func (c *Configuration) GetPrefillTimePerToken(runReqChan *chan int64) int {
+	return int(float64(c.PrefillTimePerToken) * c.calcLoadFactor(runReqChan))
+}
+
+func (c *Configuration) GetInterTokenLatency(runReqChan *chan int64) int {
+	return int(float64(c.InterTokenLatency) * c.calcLoadFactor(runReqChan))
+}
+
 type Metrics struct {
 	// LoraMetrics
 	LoraMetrics []LorasMetrics `json:"loras"`
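
All four getters delegate to calcLoadFactor, whose body is not part of this diff. Judging from the tests below (the factor is exactly 1.0 when TimeFactorUnderLoad is 1.0, exactly TimeFactorUnderLoad when the running-request channel is full, and strictly in between at partial load), a minimal sketch consistent with that behavior is a linear interpolation over channel occupancy. This is an assumption, not the repository's actual implementation:

// Hypothetical sketch of calcLoadFactor, assuming linear interpolation
// between 1.0 (idle) and TimeFactorUnderLoad (fully loaded); the real
// body is not shown in this commit.
func (c *Configuration) calcLoadFactor(runReqChan *chan int64) float64 {
	if c.TimeFactorUnderLoad <= 1.0 || c.MaxNumSeqs == 0 {
		return 1.0
	}
	load := float64(len(*runReqChan)) / float64(c.MaxNumSeqs) // occupancy in [0, 1]
	return 1.0 + (c.TimeFactorUnderLoad-1.0)*load
}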

pkg/common/config_test.go
Lines changed: 44 additions & 0 deletions

@@ -461,4 +461,48 @@ var _ = Describe("Simulator configuration", func() {
 		})
 	})
 }
+
+	It("when TimeFactorUnderLoad is 1.0, calcLoadFactor should give 1", func() {
+		c := newConfig()
+		c.TimeFactorUnderLoad = 1.0
+		c.MaxNumSeqs = 11
+		reqChan := make(chan int64, 3)
+		for i := 0; i < 3; i++ {
+			reqChan <- 1
+		}
+
+		factor := c.calcLoadFactor(&reqChan)
+		Expect(factor).To(BeNumerically("==", 1.0))
+		close(reqChan)
+	})
+
+	It("when TimeFactorUnderLoad is > 1.0, and sim is fully loaded, calcLoadFactor should give TimeFactorUnderLoad", func() {
+		c := newConfig()
+		c.TimeFactorUnderLoad = 2.0
+		c.MaxNumSeqs = 11
+		reqChan := make(chan int64, c.MaxNumSeqs)
+		for i := 0; i < c.MaxNumSeqs; i++ {
+			reqChan <- 1
+		}
+
+		factor := c.calcLoadFactor(&reqChan)
+		Expect(factor).To(BeNumerically("==", c.TimeFactorUnderLoad))
+		close(reqChan)
+
+	})
+
+	It("when TimeFactorUnderLoad is > 1.0, and sim is partially loaded, calcLoadFactor should give a value between 1 and TimeFactorUnderLoad", func() {
+		c := newConfig()
+		c.TimeFactorUnderLoad = 2.0
+		c.MaxNumSeqs = 11
+		reqChan := make(chan int64, c.MaxNumSeqs)
+		for i := 0; i < c.MaxNumSeqs/2; i++ {
+			reqChan <- 1
+		}
+		factor := c.calcLoadFactor(&reqChan)
+		Expect(factor).To(BeNumerically(">", 1.0))
+		Expect(factor).To(BeNumerically("<", c.TimeFactorUnderLoad))
+		close(reqChan)
+
+	})
 })
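
Under the linear-interpolation assumption sketched above, the partial-load test's expectation follows from the numbers it sets up: 5 of 11 slots occupied (MaxNumSeqs/2 is integer division) with TimeFactorUnderLoad = 2.0 gives a factor strictly between 1.0 and 2.0. A quick standalone check (hypothetical, since calcLoadFactor's body is not in this diff):

package main

import "fmt"

func main() {
	// 5 of 11 running requests, TimeFactorUnderLoad = 2.0,
	// assuming factor = 1 + (TimeFactorUnderLoad-1) * occupancy.
	factor := 1.0 + (2.0-1.0)*float64(5)/float64(11)
	fmt.Println(factor) // ≈1.4545 — strictly between 1.0 and 2.0
}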

pkg/llm-d-inference-sim/simulator.go
Lines changed: 7 additions & 7 deletions

@@ -537,7 +537,7 @@ func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) {
 					finishReason = common.RemoteDecodeFinishReason
 				}
 
-				s.sendResponse(reqCtx, responseTokens, toolCalls, displayModel, finishReason, &s.runReqChan, &usageData)
+				s.sendResponse(reqCtx, responseTokens, toolCalls, displayModel, finishReason, &usageData)
 			}
 		}
 		reqCtx.Wg.Done()

@@ -662,7 +662,7 @@ func (s *VllmSimulator) createCompletionResponse(isChatCompletion bool, respToke
 // finishReason - a pointer to string that represents finish reason, can be nil, stop, length, or tools
 // usageData - usage (tokens statistics) for this response
 func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, respTokens []string, toolCalls []openaiserverapi.ToolCall,
-	modelName string, finishReason string, runReqChan *chan int64, usageData *openaiserverapi.Usage) {
+	modelName string, finishReason string, usageData *openaiserverapi.Usage) {
 	resp := s.createCompletionResponse(reqCtx.IsChatCompletion, respTokens, toolCalls, &finishReason, usageData, modelName,
 		reqCtx.CompletionReq.IsDoRemoteDecode())
 
@@ -677,7 +677,7 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r
 	nPromptTokens := usageData.PromptTokens
 	nCachedPromptTokens := reqCtx.CompletionReq.GetNumberOfCachedPromptTokens()
 	nGenTokens := usageData.CompletionTokens
-	ttft := s.getTimeToFirstToken(nPromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill(), runReqChan)
+	ttft := s.getTimeToFirstToken(nPromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill())
 	totalMillisToWait := ttft + s.getTotalInterTokenLatency(nGenTokens)
 	time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond)
 
@@ -696,7 +696,7 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r
 }
 
 // returns time to first token based on the current request's doRemotePrefill
-func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptTokens int, doRemotePrefill bool, runReqChan *chan int64) int {
+func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptTokens int, doRemotePrefill bool) int {
 	if doRemotePrefill {
 		if s.config.KVCacheTransferLatency == 0 && s.config.KVCacheTransferLatencyStdDev == 0 {
 			// is disaggregated PD and ttft is calculated using number of prompt tokens

@@ -708,16 +708,16 @@ func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptToke
 	}
 	if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 {
 		// is aggregated PD and ttft is calculated using number of prompt tokens that are not in kv cache
-		prefillTime := s.config.PrefillOverhead + (nPromptTokens-nCachedPromptTokens)*s.config.PrefillTimePerToken
+		prefillTime := s.config.GetPrefillOverhead(&s.runReqChan) + (nPromptTokens-nCachedPromptTokens)*s.config.GetPrefillTimePerToken(&s.runReqChan)
 		return int(common.RandomNorm(float64(prefillTime), float64(s.config.PrefillTimeStdDev)))
 	}
 	// is aggregated PD and *not* using number of prompt tokens
-	return int(common.RandomNorm(float64(s.config.GetTimeToFirstToken(runReqChan)), float64(s.config.TimeToFirstTokenStdDev)))
+	return int(common.RandomNorm(float64(s.config.GetTimeToFirstToken(&s.runReqChan)), float64(s.config.TimeToFirstTokenStdDev)))
 }
 
 // returns inter token latency
 func (s *VllmSimulator) getInterTokenLatency() int {
-	mean := float64(s.config.InterTokenLatency)
+	mean := float64(s.config.GetInterTokenLatency(&s.runReqChan))
 	stddev := float64(s.config.InterTokenLatencyStdDev)
 	return int(common.RandomNorm(mean, stddev))
 }
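
With the new getters in place, the prefill-based TTFT path stretches end to end under load: both the fixed overhead and the per-token cost are multiplied by the load factor before the random-noise step. A rough standalone illustration of the arithmetic (hypothetical numbers, not the repository's defaults):

package main

import "fmt"

// applyLoadFactor mirrors the pattern the commit introduces: a base
// latency in milliseconds stretched by a factor in [1.0, TimeFactorUnderLoad].
func applyLoadFactor(baseMillis int, factor float64) int {
	return int(float64(baseMillis) * factor)
}

func main() {
	// Hypothetical: 100 ms prefill overhead, 10 ms per prompt token,
	// 128 uncached prompt tokens, zero standard deviation.
	overhead, perToken, nTokens := 100, 10, 128
	for _, factor := range []float64{1.0, 1.5, 2.0} {
		ttft := applyLoadFactor(overhead, factor) + nTokens*applyLoadFactor(perToken, factor)
		fmt.Printf("factor %.1f -> ttft %d ms\n", factor, ttft)
	}
}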

pkg/llm-d-inference-sim/simulator_test.go
Lines changed: 12 additions & 12 deletions

@@ -798,7 +798,7 @@ var _ = Describe("Simulator", func() {
 			simulator.config.TimeToFirstTokenStdDev = timeToFirstTokenStdDev
 			simulator.config.KVCacheTransferLatency = kvCacheLatency
 			simulator.config.KVCacheTransferLatencyStdDev = kvCacheLatencyStdDev
-			timeToFirst := simulator.getTimeToFirstToken(1, 0, doREmotePrefill, &simulator.runReqChan)
+			timeToFirst := simulator.getTimeToFirstToken(1, 0, doREmotePrefill)
 			if doREmotePrefill {
 				Expect(timeToFirst).To(BeNumerically(">=", int(float32(kvCacheLatency)*0.3)))
 				Expect(timeToFirst).To(BeNumerically("<=", int(float32(kvCacheLatency)*1.7)))

@@ -829,7 +829,7 @@ var _ = Describe("Simulator", func() {
 			simulator.config.PrefillTimePerToken = 200
 			simulator.config.PrefillTimeStdDev = 80
 
-			ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan)
+			ttft := simulator.getTimeToFirstToken(128, 0, false)
 
 			Expect(ttft).To(BeNumerically("==", timeToFirstToken))
 		})

@@ -842,7 +842,7 @@ var _ = Describe("Simulator", func() {
 			simulator.config.PrefillTimePerToken = 200
 			simulator.config.PrefillTimeStdDev = 80
 
-			ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan)
+			ttft := simulator.getTimeToFirstToken(128, 0, false)
 			Expect(ttft).NotTo(BeNumerically("==", 0))
 		})
 
@@ -853,7 +853,7 @@ var _ = Describe("Simulator", func() {
 			simulator.config.PrefillTimePerToken = prefillTimePerToken
 			simulator.config.PrefillTimeStdDev = stdDev
 
-			ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false, &simulator.runReqChan)
+			ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false)
 
 			expectedTTFT := prefillOverhead + prefillTimePerToken*(nTokens-nCachedTokens)
 			Expect(ttft).To(BeNumerically(">=", int(float64(expectedTTFT)*0.3)))

@@ -881,7 +881,7 @@ var _ = Describe("Simulator", func() {
 			simulator.config.PrefillTimePerToken = prefillTimePerToken
 			simulator.config.PrefillTimeStdDev = 0
 
-			ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false, &simulator.runReqChan)
+			ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false)
 			expectedTTFT := prefillOverhead + prefillTimePerToken*(nTokens-nCachedTokens)
 			Expect(ttft).To(Equal(expectedTTFT))
 		},

@@ -905,7 +905,7 @@ var _ = Describe("Simulator", func() {
 			simulator.config.KVCacheTransferTimePerToken = 100
 			simulator.config.KVCacheTransferTimeStdDev = 0
 
-			ttft := simulator.getTimeToFirstToken(128, 0, true, &simulator.runReqChan)
+			ttft := simulator.getTimeToFirstToken(128, 0, true)
 			Expect(ttft).To(BeNumerically("==", 200))
 		})
 
@@ -916,7 +916,7 @@ var _ = Describe("Simulator", func() {
 			simulator.config.KVCacheTransferTimePerToken = 100
 			simulator.config.KVCacheTransferTimeStdDev = 0
 
-			ttft := simulator.getTimeToFirstToken(128, 0, true, &simulator.runReqChan)
+			ttft := simulator.getTimeToFirstToken(128, 0, true)
 			Expect(ttft).To(BeNumerically("==", 12800))
 		})
 
@@ -927,7 +927,7 @@ var _ = Describe("Simulator", func() {
 			simulator.config.KVCacheTransferTimePerToken = kvCacheTransTPT
 			simulator.config.KVCacheTransferTimeStdDev = stddev
 
-			ttft := simulator.getTimeToFirstToken(nTokens, 0, true, &simulator.runReqChan)
+			ttft := simulator.getTimeToFirstToken(nTokens, 0, true)
 
 			expectedTTFT := kvCacheTransTPT * nTokens
 			Expect(ttft).To(BeNumerically(">=", int(float64(expectedTTFT)*0.3)))

@@ -952,7 +952,7 @@ var _ = Describe("Simulator", func() {
 
 			simulator.runReqChan <- 100
 
-			ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan)
+			ttft := simulator.getTimeToFirstToken(128, 0, false)
 			Expect(ttft).To(Equal(42))
 		})
 
@@ -968,7 +968,7 @@ var _ = Describe("Simulator", func() {
 
 			simulator.runReqChan <- 1
 
-			ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan)
+			ttft := simulator.getTimeToFirstToken(128, 0, false)
 			Expect(ttft).To(Equal(42))
 		})
 
@@ -985,7 +985,7 @@ var _ = Describe("Simulator", func() {
 				simulator.runReqChan <- 1
 			}
 
-			ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan)
+			ttft := simulator.getTimeToFirstToken(128, 0, false)
 			Expect(ttft).To(Equal(int(float64(42) * timeFactorUnderLoad)))
 
 		},

@@ -1014,7 +1014,7 @@ var _ = Describe("Simulator", func() {
 				simulator.runReqChan <- 1
 			}
 
-			ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan)
+			ttft := simulator.getTimeToFirstToken(128, 0, false)
 			max := timeFactorUnderLoad * float64(42)
 			Expect(ttft).To(BeNumerically(">=", 42))
 			Expect(ttft).To(BeNumerically("<=", max))

pkg/llm-d-inference-sim/streaming.go
Lines changed: 4 additions & 4 deletions

@@ -69,11 +69,11 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons
 		if len(toolCalls) > 0 {
 			s.logger.Info("Going to send tools calls")
 			for _, tc := range toolCalls {
-				s.sendTokenChunks(context, w, tc.Function.TokenizedArguments, &tc, finishReason, &s.runReqChan)
+				s.sendTokenChunks(context, w, tc.Function.TokenizedArguments, &tc, finishReason)
 			}
 		} else {
 			s.logger.Info("Going to send text", "number of tokens", len(responseTokens))
-			s.sendTokenChunks(context, w, responseTokens, nil, finishReason, &s.runReqChan)
+			s.sendTokenChunks(context, w, responseTokens, nil, finishReason)
 		}
 	}
 
@@ -97,9 +97,9 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons
 
 // sendTokenChunks creates and sends response chunks
 func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, genTokens []string,
-	tc *openaiserverapi.ToolCall, finishReason string, runReqChan *chan int64) {
+	tc *openaiserverapi.ToolCall, finishReason string) {
 	// time to first token delay
-	ttft := s.getTimeToFirstToken(context.nPromptTokens, context.nCachedPromptTokens, context.doRemotePrefill, runReqChan)
+	ttft := s.getTimeToFirstToken(context.nPromptTokens, context.nCachedPromptTokens, context.doRemotePrefill)
 	time.Sleep(time.Duration(ttft) * time.Millisecond)
 
 	for i, token := range genTokens {
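
Dropping the runReqChan parameter means the blocking path (sendResponse) and the streaming path (sendTokenChunks) now read the same load signal, s.runReqChan, directly from the simulator. With both TTFT and inter-token latency scaled, a streamed response stretches end to end under load. A rough estimate of total stream time, assuming one inter-token gap per generated token after the first (hypothetical numbers, zero standard deviation; the loop body above is not shown in this diff):

package main

import "fmt"

func main() {
	// Hypothetical: 42 ms TTFT, 10 ms inter-token latency, 100 tokens.
	ttft, interToken, nTokens := 42.0, 10.0, 100.0
	for _, factor := range []float64{1.0, 2.0} {
		total := ttft*factor + (nTokens-1)*interToken*factor
		fmt.Printf("factor %.1f -> ~%.0f ms end to end\n", factor, total)
	}
}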
