Skip to content

Commit 6595261

Browse files
committed
Use nRunningReqs instead of runReqChan
Signed-off-by: Qifan Deng <[email protected]>
1 parent c12ead2 commit 6595261

File tree

5 files changed

+21
-47
lines changed

5 files changed

+21
-47
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@ vendor
55
.devcontainer
66
# MacOSX
77
.DS_Store
8-
*.test
8+
*.test
9+
manifests/dev-config.yaml

pkg/common/config.go

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ type Configuration struct {
104104
// KVCacheTransferTimeStdDev similar to TimeToFirstTokenStdDev
105105
KVCacheTransferTimeStdDev int `yaml:"kv-cache-transfer-time-std-dev" json:"kv-cache-transfer-time-std-dev"`
106106

107-
// TimeFactorUnderLoad is a multiplicative factor that affects the overall time taken for requests when parallel
107+
// TimeFactorUnderLoad is a multiplicative factor that affects the overall time taken for requests when parallel
108108
// requests are being processed.
109109
// The value of this factor must be >= 1.0, with a default of 1.0.
110110
// - If this factor is 1.0, no extra time is added.
@@ -176,27 +176,27 @@ type Configuration struct {
176176
DPSize int `yaml:"data-parallel-size" json:"data-parallel-size"`
177177
}
178178

179-
func (c *Configuration) calcLoadFactor(runReqChan *chan int64) float64 {
179+
func (c *Configuration) calcLoadFactor(nRunningReqs int64) float64 {
180180
if c.MaxNumSeqs <= 1 {
181181
return 1.0
182182
}
183-
return 1 + (c.TimeFactorUnderLoad-1)*float64(len(*runReqChan)-1)/float64(c.MaxNumSeqs-1)
183+
return 1 + (c.TimeFactorUnderLoad-1)*float64(nRunningReqs-1)/float64(c.MaxNumSeqs-1)
184184
}
185185

186-
func (c *Configuration) GetTimeToFirstToken(runReqChan *chan int64) int {
187-
return int(float64(c.TimeToFirstToken) * c.calcLoadFactor(runReqChan))
186+
func (c *Configuration) GetTimeToFirstToken(nRunningReqs int64) int {
187+
return int(float64(c.TimeToFirstToken) * c.calcLoadFactor(nRunningReqs))
188188
}
189189

190-
func (c *Configuration) GetPrefillOverhead(runReqChan *chan int64) int {
191-
return int(float64(c.PrefillOverhead) * c.calcLoadFactor(runReqChan))
190+
func (c *Configuration) GetPrefillOverhead(nRunningReqs int64) int {
191+
return int(float64(c.PrefillOverhead) * c.calcLoadFactor(nRunningReqs))
192192
}
193193

194-
func (c *Configuration) GetPrefillTimePerToken(runReqChan *chan int64) int {
195-
return int(float64(c.PrefillTimePerToken) * c.calcLoadFactor(runReqChan))
194+
func (c *Configuration) GetPrefillTimePerToken(nRunningReqs int64) int {
195+
return int(float64(c.PrefillTimePerToken) * c.calcLoadFactor(nRunningReqs))
196196
}
197197

198-
func (c *Configuration) GetInterTokenLatency(runReqChan *chan int64) int {
199-
return int(float64(c.InterTokenLatency) * c.calcLoadFactor(runReqChan))
198+
func (c *Configuration) GetInterTokenLatency(nRunningReqs int64) int {
199+
return int(float64(c.InterTokenLatency) * c.calcLoadFactor(nRunningReqs))
200200
}
201201

202202
type Metrics struct {

pkg/common/config_test.go

Lines changed: 3 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -466,43 +466,27 @@ var _ = Describe("Simulator configuration", func() {
466466
c := newConfig()
467467
c.TimeFactorUnderLoad = 1.0
468468
c.MaxNumSeqs = 11
469-
reqChan := make(chan int64, 3)
470-
for i := 0; i < 3; i++ {
471-
reqChan <- 1
472-
}
473469

474-
factor := c.calcLoadFactor(&reqChan)
470+
factor := c.calcLoadFactor(3)
475471
Expect(factor).To(BeNumerically("==", 1.0))
476-
close(reqChan)
477472
})
478473

479474
It("when TimeFactorUnderLoad is > 1.0, and sim is fully loaded, calcLoadFactor should give TimeFactorUnderLoad", func() {
480475
c := newConfig()
481476
c.TimeFactorUnderLoad = 2.0
482477
c.MaxNumSeqs = 11
483-
reqChan := make(chan int64, c.MaxNumSeqs)
484-
for i := 0; i < c.MaxNumSeqs; i++ {
485-
reqChan <- 1
486-
}
487478

488-
factor := c.calcLoadFactor(&reqChan)
479+
factor := c.calcLoadFactor(11)
489480
Expect(factor).To(BeNumerically("==", c.TimeFactorUnderLoad))
490-
close(reqChan)
491481

492482
})
493483

494484
It("when TimeFactorUnderLoad is > 1.0, and sim is partially loaded, calcLoadFactor should give a value between 1 and TimeFactorUnderLoad", func() {
495485
c := newConfig()
496486
c.TimeFactorUnderLoad = 2.0
497487
c.MaxNumSeqs = 11
498-
reqChan := make(chan int64, c.MaxNumSeqs)
499-
for i := 0; i < c.MaxNumSeqs/2; i++ {
500-
reqChan <- 1
501-
}
502-
factor := c.calcLoadFactor(&reqChan)
488+
factor := c.calcLoadFactor(6)
503489
Expect(factor).To(BeNumerically(">", 1.0))
504490
Expect(factor).To(BeNumerically("<", c.TimeFactorUnderLoad))
505-
close(reqChan)
506-
507491
})
508492
})

pkg/llm-d-inference-sim/simulator.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -708,16 +708,16 @@ func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptToke
708708
}
709709
if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 {
710710
// is aggregated PD and ttft is calculated using number of prompt tokens that are not in kv cache
711-
prefillTime := s.config.GetPrefillOverhead(&s.runReqChan) + (nPromptTokens-nCachedPromptTokens)*s.config.GetPrefillTimePerToken(&s.runReqChan)
711+
prefillTime := s.config.GetPrefillOverhead(s.nRunningReqs) + (nPromptTokens-nCachedPromptTokens)*s.config.GetPrefillTimePerToken(s.nRunningReqs)
712712
return int(common.RandomNorm(float64(prefillTime), float64(s.config.PrefillTimeStdDev)))
713713
}
714714
// is aggregated PD and *not* using number of prompt tokens
715-
return int(common.RandomNorm(float64(s.config.GetTimeToFirstToken(&s.runReqChan)), float64(s.config.TimeToFirstTokenStdDev)))
715+
return int(common.RandomNorm(float64(s.config.GetTimeToFirstToken(s.nRunningReqs)), float64(s.config.TimeToFirstTokenStdDev)))
716716
}
717717

718718
// returns inter token latency
719719
func (s *VllmSimulator) getInterTokenLatency() int {
720-
mean := float64(s.config.GetInterTokenLatency(&s.runReqChan))
720+
mean := float64(s.config.GetInterTokenLatency(s.nRunningReqs))
721721
stddev := float64(s.config.InterTokenLatencyStdDev)
722722
return int(common.RandomNorm(mean, stddev))
723723
}

pkg/llm-d-inference-sim/simulator_test.go

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -978,12 +978,7 @@ var _ = Describe("Simulator", func() {
978978
simulator.config.TimeToFirstTokenStdDev = 0
979979
simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad
980980
simulator.config.MaxNumSeqs = maxNumOfReq
981-
for len(simulator.runReqChan) > 0 {
982-
<-simulator.runReqChan
983-
}
984-
for range maxNumOfReq {
985-
simulator.runReqChan <- 1
986-
}
981+
simulator.nRunningReqs = int64(maxNumOfReq)
987982

988983
ttft := simulator.getTimeToFirstToken(128, 0, false)
989984
Expect(ttft).To(Equal(int(float64(42) * timeFactorUnderLoad)))
@@ -1006,13 +1001,7 @@ var _ = Describe("Simulator", func() {
10061001
simulator.config.TimeToFirstTokenStdDev = 0
10071002
simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad
10081003
simulator.config.MaxNumSeqs = maxNumOfReq
1009-
1010-
for len(simulator.runReqChan) > 0 {
1011-
<-simulator.runReqChan
1012-
}
1013-
for range nCurrNumOfReq {
1014-
simulator.runReqChan <- 1
1015-
}
1004+
simulator.nRunningReqs = int64(nCurrNumOfReq)
10161005

10171006
ttft := simulator.getTimeToFirstToken(128, 0, false)
10181007
max := timeFactorUnderLoad * float64(42)

0 commit comments

Comments
 (0)