Commit 04542f2

The factor applies to time-to-first-token

Signed-off-by: Qifan Deng <[email protected]>

Parent: 1d30ea0

File tree

  manifests/dev-config.yaml
  pkg/common/config.go
  pkg/llm-d-inference-sim/simulator.go
  pkg/llm-d-inference-sim/simulator_test.go
  pkg/llm-d-inference-sim/streaming.go

5 files changed (+128, −16 lines)

manifests/dev-config.yaml

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
+block-size: 16
+data-parallel-size: 1
+enable-kvcache: false
+event-batch-size: 16
+failure-injection-rate: 0
+failure-types: null
+fake-metrics:
+  kv-cache-usage: 0.4
+  running-requests: 10
+  waiting-requests: 30
+hash-seed: 'hashseed'
+inter-token-latency: 50
+inter-token-latency-std-dev: 15
+kv-cache-size: 1024
+kv-cache-transfer-latency: 0
+kv-cache-transfer-latency-std-dev: 0
+kv-cache-transfer-time-per-token: 100
+kv-cache-transfer-time-std-dev: 30
+lora-modules: null
+max-cpu-loras: 1
+max-loras: 1
+max-model-len: 1024
+max-num-seqs: 7
+max-tool-call-array-param-length: 5
+max-tool-call-integer-param: 100
+max-tool-call-number-param: 100
+min-tool-call-array-param-length: 1
+min-tool-call-integer-param: 0
+min-tool-call-number-param: 0
+mode: random
+model: Qwen/Qwen2.5-1.5B-Instruct
+object-tool-call-not-required-field-probability: 50
+port: 8000
+prefill-overhead: 80
+prefill-time-per-token: 20
+prefill-time-std-dev: 3
+seed: 1757050700239757600
+served-model-name:
+  - Qwen/Qwen2.5-1.5B-Instruct
+time-factor-under-load: 5
+time-to-first-token: 0
+time-to-first-token-std-dev: 0
+tokenizers-cache-dir: ''
+tool-call-not-required-param-probability: 50
+zmq-endpoint: tcp://localhost:5557
+zmq-max-connect-attempts: 0

pkg/common/config.go

Lines changed: 11 additions & 0 deletions
@@ -175,6 +175,17 @@ type Configuration struct {
 	DPSize int `yaml:"data-parallel-size" json:"data-parallel-size"`
 }
 
+func (c *Configuration) calcLoadFactor(runReqChan *chan int64) float64 {
+	if c.MaxNumSeqs <= 1 {
+		return 1.0
+	}
+	return 1 + (c.TimeFactorUnderLoad-1)*float64(len(*runReqChan)-1)/float64(c.MaxNumSeqs-1)
+}
+
+func (c *Configuration) GetTimeToFirstToken(runReqChan *chan int64) int {
+	return int(float64(c.TimeToFirstToken) * c.calcLoadFactor(runReqChan))
+}
+
 type Metrics struct {
 	// LoraMetrics
 	LoraMetrics []LorasMetrics `json:"loras"`
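
Note on the new load factor: calcLoadFactor interpolates linearly between 1.0 (a single running request) and TimeFactorUnderLoad (when len(*runReqChan) reaches MaxNumSeqs), and GetTimeToFirstToken multiplies the configured TTFT by it. A minimal standalone sketch of the arithmetic (loadFactor below is an illustrative copy of the method, not the simulator's API), exercised with the time-factor-under-load: 5 and max-num-seqs: 7 values from dev-config.yaml:

package main

import "fmt"

// loadFactor mirrors Configuration.calcLoadFactor above: a linear ramp
// from 1.0 at one running request up to timeFactorUnderLoad at
// maxNumSeqs running requests.
func loadFactor(timeFactorUnderLoad float64, running, maxNumSeqs int) float64 {
	if maxNumSeqs <= 1 {
		return 1.0
	}
	return 1 + (timeFactorUnderLoad-1)*float64(running-1)/float64(maxNumSeqs-1)
}

func main() {
	// time-factor-under-load: 5, max-num-seqs: 7 (as in dev-config.yaml).
	for _, running := range []int{1, 4, 7} {
		fmt.Printf("running=%d -> factor=%.2f\n", running, loadFactor(5, running, 7))
	}
	// Prints factors 1.00, 3.00, 5.00: a 42ms base TTFT becomes 42, 126, 210ms.
}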

pkg/llm-d-inference-sim/simulator.go

Lines changed: 4 additions & 4 deletions
@@ -537,7 +537,7 @@ func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) {
 					finishReason = common.RemoteDecodeFinishReason
 				}
 
-				s.sendResponse(reqCtx, responseTokens, toolCalls, displayModel, finishReason, &usageData)
+				s.sendResponse(reqCtx, responseTokens, toolCalls, displayModel, finishReason, &s.runReqChan, &usageData)
 			}
 		}
 		reqCtx.Wg.Done()
@@ -662,7 +662,7 @@ func (s *VllmSimulator) createCompletionResponse(isChatCompletion bool, respToke
 // finishReason - a pointer to string that represents finish reason, can be nil, stop, length, or tools
 // usageData - usage (tokens statistics) for this response
 func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, respTokens []string, toolCalls []openaiserverapi.ToolCall,
-	modelName string, finishReason string, usageData *openaiserverapi.Usage) {
+	modelName string, finishReason string, runReqChan *chan int64, usageData *openaiserverapi.Usage) {
 	resp := s.createCompletionResponse(reqCtx.IsChatCompletion, respTokens, toolCalls, &finishReason, usageData, modelName,
 		reqCtx.CompletionReq.IsDoRemoteDecode())
 
@@ -677,7 +677,7 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r
 	nPromptTokens := usageData.PromptTokens
 	nCachedPromptTokens := reqCtx.CompletionReq.GetNumberOfCachedPromptTokens()
 	nGenTokens := usageData.CompletionTokens
-	ttft := s.getTimeToFirstToken(nPromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill())
+	ttft := s.getTimeToFirstToken(nPromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill(), runReqChan)
 	totalMillisToWait := ttft + s.getTotalInterTokenLatency(nGenTokens)
 	time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond)
 
@@ -696,7 +696,7 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r
 }
 
 // returns time to first token based on the current request's doRemotePrefill
-func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptTokens int, doRemotePrefill bool) int {
+func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptTokens int, doRemotePrefill bool, runReqChan *chan int64) int {
 	if doRemotePrefill {
 		if s.config.KVCacheTransferLatency == 0 && s.config.KVCacheTransferLatencyStdDev == 0 {
 			// is disaggregated PD and ttft is calculated using number of prompt tokens
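
Note on the plumbing: the extra parameter threads the simulator's runReqChan (a buffered channel holding one entry per in-flight request) down to getTimeToFirstToken, because len() on a buffered channel is a cheap point-in-time count of its queued elements. A minimal sketch of that idiom (illustrative only, not the simulator's code):

package main

import "fmt"

func main() {
	// A buffered channel doubles as a load gauge: its capacity plays
	// the role of max-num-seqs, its current length the running requests.
	running := make(chan int64, 7)

	running <- 1 // a request starts
	running <- 1 // another request starts
	fmt.Println("in-flight:", len(running)) // in-flight: 2

	<-running // a request completes
	fmt.Println("in-flight:", len(running)) // in-flight: 1
}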

pkg/llm-d-inference-sim/simulator_test.go

Lines changed: 63 additions & 8 deletions
@@ -798,7 +798,7 @@ var _ = Describe("Simulator", func() {
 			simulator.config.TimeToFirstTokenStdDev = timeToFirstTokenStdDev
 			simulator.config.KVCacheTransferLatency = kvCacheLatency
 			simulator.config.KVCacheTransferLatencyStdDev = kvCacheLatencyStdDev
-			timeToFirst := simulator.getTimeToFirstToken(1, 0, doREmotePrefill)
+			timeToFirst := simulator.getTimeToFirstToken(1, 0, doREmotePrefill, &simulator.runReqChan)
 			if doREmotePrefill {
 				Expect(timeToFirst).To(BeNumerically(">=", int(float32(kvCacheLatency)*0.3)))
 				Expect(timeToFirst).To(BeNumerically("<=", int(float32(kvCacheLatency)*1.7)))
@@ -829,7 +829,7 @@
 			simulator.config.PrefillTimePerToken = 200
 			simulator.config.PrefillTimeStdDev = 80
 
-			ttft := simulator.getTimeToFirstToken(128, 0, false)
+			ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan)
 
 			Expect(ttft).To(BeNumerically("==", timeToFirstToken))
 		})
@@ -842,7 +842,7 @@
 			simulator.config.PrefillTimePerToken = 200
 			simulator.config.PrefillTimeStdDev = 80
 
-			ttft := simulator.getTimeToFirstToken(128, 0, false)
+			ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan)
 			Expect(ttft).NotTo(BeNumerically("==", 0))
 		})
 
@@ -853,7 +853,7 @@
 			simulator.config.PrefillTimePerToken = prefillTimePerToken
 			simulator.config.PrefillTimeStdDev = stdDev
 
-			ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false)
+			ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false, &simulator.runReqChan)
 
 			expectedTTFT := prefillOverhead + prefillTimePerToken*(nTokens-nCachedTokens)
 			Expect(ttft).To(BeNumerically(">=", int(float64(expectedTTFT)*0.3)))
@@ -881,7 +881,7 @@
 			simulator.config.PrefillTimePerToken = prefillTimePerToken
 			simulator.config.PrefillTimeStdDev = 0
 
-			ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false)
+			ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false, &simulator.runReqChan)
 			expectedTTFT := prefillOverhead + prefillTimePerToken*(nTokens-nCachedTokens)
 			Expect(ttft).To(Equal(expectedTTFT))
 		},
@@ -905,7 +905,7 @@
 			simulator.config.KVCacheTransferTimePerToken = 100
 			simulator.config.KVCacheTransferTimeStdDev = 0
 
-			ttft := simulator.getTimeToFirstToken(128, 0, true)
+			ttft := simulator.getTimeToFirstToken(128, 0, true, &simulator.runReqChan)
 			Expect(ttft).To(BeNumerically("==", 200))
 		})
 
@@ -916,7 +916,7 @@
 			simulator.config.KVCacheTransferTimePerToken = 100
 			simulator.config.KVCacheTransferTimeStdDev = 0
 
-			ttft := simulator.getTimeToFirstToken(128, 0, true)
+			ttft := simulator.getTimeToFirstToken(128, 0, true, &simulator.runReqChan)
 			Expect(ttft).To(BeNumerically("==", 12800))
 		})
 
@@ -927,7 +927,7 @@
 			simulator.config.KVCacheTransferTimePerToken = kvCacheTransTPT
 			simulator.config.KVCacheTransferTimeStdDev = stddev
 
-			ttft := simulator.getTimeToFirstToken(nTokens, 0, true)
+			ttft := simulator.getTimeToFirstToken(nTokens, 0, true, &simulator.runReqChan)
 
 			expectedTTFT := kvCacheTransTPT * nTokens
 			Expect(ttft).To(BeNumerically(">=", int(float64(expectedTTFT)*0.3)))
@@ -945,5 +945,60 @@
 			Entry("very long prompt", 150, 100, 20000),
 		)
 
+		It("when time-factor-under-load is 1, the time to first token should be equal to time-to-first-token", func() {
+			simulator.config.TimeToFirstToken = 42
+			simulator.config.TimeToFirstTokenStdDev = 0
+			simulator.config.TimeFactorUnderLoad = 1.0
+
+			simulator.runReqChan <- 100
+
+			ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan)
+			Expect(ttft).To(Equal(42))
+		})
+
+		It("when time-factor-under-load is > 1, but max-num-seqs is 1, the factor will not take effect", func() {
+			simulator.config.TimeToFirstToken = 42
+			simulator.config.TimeToFirstTokenStdDev = 0
+			simulator.config.TimeFactorUnderLoad = 100.0
+			simulator.config.MaxNumSeqs = 1
+
+			for len(simulator.runReqChan) > 0 {
+				<-simulator.runReqChan
+			}
+
+			simulator.runReqChan <- 1
+
+			ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan)
+			Expect(ttft).To(Equal(42))
+		})
+
+		DescribeTable("when time-factor-under-load is > 1, and the sim is fully loaded, the time to first token should be time-factor-under-load * time-to-first-token",
+			func(timeFactorUnderLoad float64, maxNumOfReq int) {
+				simulator.config.TimeToFirstToken = 42
+				simulator.config.TimeToFirstTokenStdDev = 0
+				simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad
+				simulator.config.MaxNumSeqs = maxNumOfReq
+				for len(simulator.runReqChan) > 0 {
+					<-simulator.runReqChan
+				}
+				for i := 0; i < maxNumOfReq; i++ {
+					simulator.runReqChan <- 1
+				}
+
+				ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan)
+				Expect(ttft).To(Equal(int(float64(42) * timeFactorUnderLoad)))
+
+			},
+			func(timeFactorUnderLoad float64, maxNumOfReq int64) string {
+				return fmt.Sprintf("timeFactorUnderLoad: %f maxNumOfReq: %d",
+					timeFactorUnderLoad, maxNumOfReq)
+			},
+
+			Entry("factor: 1.5", 1.5, 70),
+			Entry("factor: 2.0", 2.0, 2),
+			Entry("factor: 100.0", 100.0, 150),
+			Entry("factor: 20000.0", 20000.0, 310),
+		)
+
 	})
 })

pkg/llm-d-inference-sim/streaming.go

Lines changed: 4 additions & 4 deletions
@@ -69,11 +69,11 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons
 		if len(toolCalls) > 0 {
 			s.logger.Info("Going to send tools calls")
 			for _, tc := range toolCalls {
-				s.sendTokenChunks(context, w, tc.Function.TokenizedArguments, &tc, finishReason)
+				s.sendTokenChunks(context, w, tc.Function.TokenizedArguments, &tc, finishReason, &s.runReqChan)
 			}
 		} else {
 			s.logger.Info("Going to send text", "number of tokens", len(responseTokens))
-			s.sendTokenChunks(context, w, responseTokens, nil, finishReason)
+			s.sendTokenChunks(context, w, responseTokens, nil, finishReason, &s.runReqChan)
 		}
 	}
 
@@ -97,9 +97,9 @@
 
 // sendTokenChunks creates and sends response chunks
 func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, genTokens []string,
-	tc *openaiserverapi.ToolCall, finishReason string) {
+	tc *openaiserverapi.ToolCall, finishReason string, runReqChan *chan int64) {
 	// time to first token delay
-	ttft := s.getTimeToFirstToken(context.nPromptTokens, context.nCachedPromptTokens, context.doRemotePrefill)
+	ttft := s.getTimeToFirstToken(context.nPromptTokens, context.nCachedPromptTokens, context.doRemotePrefill, runReqChan)
 	time.Sleep(time.Duration(ttft) * time.Millisecond)
 
 	for i, token := range genTokens {
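
Note on streaming: sendTokenChunks sleeps the load-scaled TTFT once, before the first chunk, and later tokens then follow at the inter-token cadence. A simplified standalone sketch of that timing model (streamTokens is hypothetical, and the fixed per-token delay stands in for the simulator's inter-token latency logic):

package main

import (
	"fmt"
	"time"
)

// streamTokens waits the (already load-scaled) TTFT once, then emits each
// remaining token after a fixed inter-token delay.
func streamTokens(tokens []string, ttftMs, interTokenMs int) {
	time.Sleep(time.Duration(ttftMs) * time.Millisecond)
	for i, token := range tokens {
		if i > 0 {
			time.Sleep(time.Duration(interTokenMs) * time.Millisecond)
		}
		fmt.Println("chunk:", token)
	}
}

func main() {
	// e.g. a 42ms base TTFT under full load with factor 5 -> 210ms.
	streamTokens([]string{"hello", "world"}, 210, 50)
}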
