Use nRunningReqs instead of runReqChan

pancak3 · pancak3 · commit 6595261c6a53 · 2025-09-05T17:17:33.000+10:00
Signed-off-by: Qifan Deng <dev.llmd@qifand.com>
diff --git a/.gitignore b/.gitignore
@@ -5,4 +5,5 @@ vendor
 .devcontainer
 # MacOSX
 .DS_Store
-*.test
+*.test
+manifests/dev-config.yaml
diff --git a/pkg/common/config.go b/pkg/common/config.go
@@ -104,7 +104,7 @@ type Configuration struct {
 	// KVCacheTransferOverheadStdDev similar to TimeToFirstTokenStdDev
 	KVCacheTransferTimeStdDev int `yaml:"kv-cache-transfer-time-std-dev" json:"kv-cache-transfer-time-std-dev"`
 
-	// TimeFactorUnderLoad is a multiplicative factor that affects the overall time taken for requests when parallel 
+	// TimeFactorUnderLoad is a multiplicative factor that affects the overall time taken for requests when parallel
 	// requests are being processed.
 	// The value of this factor must be >= 1.0, with a default of 1.0.
 	// - If this factor is 1.0, no extra time is added.
@@ -176,27 +176,27 @@ type Configuration struct {
 	DPSize int `yaml:"data-parallel-size" json:"data-parallel-size"`
 }
 
-func (c *Configuration) calcLoadFactor(runReqChan *chan int64) float64 {
+func (c *Configuration) calcLoadFactor(nRunningReqs int64) float64 {
 	if c.MaxNumSeqs <= 1 {
 		return 1.0
 	}
-	return 1 + (c.TimeFactorUnderLoad-1)*float64(len(*runReqChan)-1)/float64(c.MaxNumSeqs-1)
+	return 1 + (c.TimeFactorUnderLoad-1)*float64(nRunningReqs-1)/float64(c.MaxNumSeqs-1)
 }
 
-func (c *Configuration) GetTimeToFirstToken(runReqChan *chan int64) int {
-	return int(float64(c.TimeToFirstToken) * c.calcLoadFactor(runReqChan))
+func (c *Configuration) GetTimeToFirstToken(nRunningReqs int64) int {
+	return int(float64(c.TimeToFirstToken) * c.calcLoadFactor(nRunningReqs))
 }
 
-func (c *Configuration) GetPrefillOverhead(runReqChan *chan int64) int {
-	return int(float64(c.PrefillOverhead) * c.calcLoadFactor(runReqChan))
+func (c *Configuration) GetPrefillOverhead(nRunningReqs int64) int {
+	return int(float64(c.PrefillOverhead) * c.calcLoadFactor(nRunningReqs))
 }
 
-func (c *Configuration) GetPrefillTimePerToken(runReqChan *chan int64) int {
-	return int(float64(c.PrefillTimePerToken) * c.calcLoadFactor(runReqChan))
+func (c *Configuration) GetPrefillTimePerToken(nRunningReqs int64) int {
+	return int(float64(c.PrefillTimePerToken) * c.calcLoadFactor(nRunningReqs))
 }
 
-func (c *Configuration) GetInterTokenLatency(runReqChan *chan int64) int {
-	return int(float64(c.InterTokenLatency) * c.calcLoadFactor(runReqChan))
+func (c *Configuration) GetInterTokenLatency(nRunningReqs int64) int {
+	return int(float64(c.InterTokenLatency) * c.calcLoadFactor(nRunningReqs))
 }
 
 type Metrics struct {
diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go
@@ -466,43 +466,27 @@ var _ = Describe("Simulator configuration", func() {
 		c := newConfig()
 		c.TimeFactorUnderLoad = 1.0
 		c.MaxNumSeqs = 11
-		reqChan := make(chan int64, 3)
-		for i := 0; i < 3; i++ {
-			reqChan <- 1
-		}
 
-		factor := c.calcLoadFactor(&reqChan)
+		factor := c.calcLoadFactor(3)
 		Expect(factor).To(BeNumerically("==", 1.0))
-		close(reqChan)
 	})
 
 	It("when TimeFactorUnderLoad is > 1.0, and sim is fully loaded, calcLoadFactor should give TimeFactorUnderLoad", func() {
 		c := newConfig()
 		c.TimeFactorUnderLoad = 2.0
 		c.MaxNumSeqs = 11
-		reqChan := make(chan int64, c.MaxNumSeqs)
-		for i := 0; i < c.MaxNumSeqs; i++ {
-			reqChan <- 1
-		}
 
-		factor := c.calcLoadFactor(&reqChan)
+		factor := c.calcLoadFactor(11)
 		Expect(factor).To(BeNumerically("==", c.TimeFactorUnderLoad))
-		close(reqChan)
 
 	})
 
 	It("when TimeFactorUnderLoad is > 1.0, and sim is partially loaded, calcLoadFactor should give a value between 1 and TimeFactorUnderLoad", func() {
 		c := newConfig()
 		c.TimeFactorUnderLoad = 2.0
 		c.MaxNumSeqs = 11
-		reqChan := make(chan int64, c.MaxNumSeqs)
-		for i := 0; i < c.MaxNumSeqs/2; i++ {
-			reqChan <- 1
-		}
-		factor := c.calcLoadFactor(&reqChan)
+		factor := c.calcLoadFactor(6)
 		Expect(factor).To(BeNumerically(">", 1.0))
 		Expect(factor).To(BeNumerically("<", c.TimeFactorUnderLoad))
-		close(reqChan)
-
 	})
 })
diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go
@@ -708,16 +708,16 @@ func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptToke
 	}
 	if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 {
 		// is aggregated PD and ttft is calculated using number of prompt tokens that are not in kv cache
-		prefillTime := s.config.GetPrefillOverhead(&s.runReqChan) + (nPromptTokens-nCachedPromptTokens)*s.config.GetPrefillTimePerToken(&s.runReqChan)
+		prefillTime := s.config.GetPrefillOverhead(s.nRunningReqs) + (nPromptTokens-nCachedPromptTokens)*s.config.GetPrefillTimePerToken(s.nRunningReqs)
 		return int(common.RandomNorm(float64(prefillTime), float64(s.config.PrefillTimeStdDev)))
 	}
 	// is aggregated PD and *not* using number of prompt tokens
-	return int(common.RandomNorm(float64(s.config.GetTimeToFirstToken(&s.runReqChan)), float64(s.config.TimeToFirstTokenStdDev)))
+	return int(common.RandomNorm(float64(s.config.GetTimeToFirstToken(s.nRunningReqs)), float64(s.config.TimeToFirstTokenStdDev)))
 }
 
 // returns inter token latency
 func (s *VllmSimulator) getInterTokenLatency() int {
-	mean := float64(s.config.GetInterTokenLatency(&s.runReqChan))
+	mean := float64(s.config.GetInterTokenLatency(s.nRunningReqs))
 	stddev := float64(s.config.InterTokenLatencyStdDev)
 	return int(common.RandomNorm(mean, stddev))
 }
diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go
@@ -978,12 +978,7 @@ var _ = Describe("Simulator", func() {
 				simulator.config.TimeToFirstTokenStdDev = 0
 				simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad
 				simulator.config.MaxNumSeqs = maxNumOfReq
-				for len(simulator.runReqChan) > 0 {
-					<-simulator.runReqChan
-				}
-				for range maxNumOfReq {
-					simulator.runReqChan <- 1
-				}
+				simulator.nRunningReqs = int64(maxNumOfReq)
 
 				ttft := simulator.getTimeToFirstToken(128, 0, false)
 				Expect(ttft).To(Equal(int(float64(42) * timeFactorUnderLoad)))
@@ -1006,13 +1001,7 @@ var _ = Describe("Simulator", func() {
 				simulator.config.TimeToFirstTokenStdDev = 0
 				simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad
 				simulator.config.MaxNumSeqs = maxNumOfReq
-
-				for len(simulator.runReqChan) > 0 {
-					<-simulator.runReqChan
-				}
-				for range nCurrNumOfReq {
-					simulator.runReqChan <- 1
-				}
+				simulator.nRunningReqs = int64(nCurrNumOfReq)
 
 				ttft := simulator.getTimeToFirstToken(128, 0, false)
 				max := timeFactorUnderLoad * float64(42)

Original file line number	Diff line number	Diff line change
`@@ -104,7 +104,7 @@ type Configuration struct {`
`104`	`104`	`// KVCacheTransferOverheadStdDev similar to TimeToFirstTokenStdDev`
`105`	`105`	KVCacheTransferTimeStdDev int `yaml:"kv-cache-transfer-time-std-dev" json:"kv-cache-transfer-time-std-dev"`
`106`	`106`
`107`		`- // TimeFactorUnderLoad is a multiplicative factor that affects the overall time taken for requests when parallel`
	`107`	`+ // TimeFactorUnderLoad is a multiplicative factor that affects the overall time taken for requests when parallel`
`108`	`108`	`// requests are being processed.`
`109`	`109`	`// The value of this factor must be >= 1.0, with a default of 1.0.`
`110`	`110`	`// - If this factor is 1.0, no extra time is added.`
`@@ -176,27 +176,27 @@ type Configuration struct {`
`176`	`176`	DPSize int `yaml:"data-parallel-size" json:"data-parallel-size"`
`177`	`177`	`}`
`178`	`178`
`179`		`-func (c *Configuration) calcLoadFactor(runReqChan *chan int64) float64 {`
	`179`	`+func (c *Configuration) calcLoadFactor(nRunningReqs int64) float64 {`
`180`	`180`	`if c.MaxNumSeqs <= 1 {`
`181`	`181`	`return 1.0`
`182`	`182`	`}`
`183`		`- return 1 + (c.TimeFactorUnderLoad-1)*float64(len(*runReqChan)-1)/float64(c.MaxNumSeqs-1)`
	`183`	`+ return 1 + (c.TimeFactorUnderLoad-1)*float64(nRunningReqs-1)/float64(c.MaxNumSeqs-1)`
`184`	`184`	`}`
`185`	`185`
`186`		`-func (c *Configuration) GetTimeToFirstToken(runReqChan *chan int64) int {`
`187`		`- return int(float64(c.TimeToFirstToken) * c.calcLoadFactor(runReqChan))`
	`186`	`+func (c *Configuration) GetTimeToFirstToken(nRunningReqs int64) int {`
	`187`	`+ return int(float64(c.TimeToFirstToken) * c.calcLoadFactor(nRunningReqs))`
`188`	`188`	`}`
`189`	`189`
`190`		`-func (c *Configuration) GetPrefillOverhead(runReqChan *chan int64) int {`
`191`		`- return int(float64(c.PrefillOverhead) * c.calcLoadFactor(runReqChan))`
	`190`	`+func (c *Configuration) GetPrefillOverhead(nRunningReqs int64) int {`
	`191`	`+ return int(float64(c.PrefillOverhead) * c.calcLoadFactor(nRunningReqs))`
`192`	`192`	`}`
`193`	`193`
`194`		`-func (c *Configuration) GetPrefillTimePerToken(runReqChan *chan int64) int {`
`195`		`- return int(float64(c.PrefillTimePerToken) * c.calcLoadFactor(runReqChan))`
	`194`	`+func (c *Configuration) GetPrefillTimePerToken(nRunningReqs int64) int {`
	`195`	`+ return int(float64(c.PrefillTimePerToken) * c.calcLoadFactor(nRunningReqs))`
`196`	`196`	`}`
`197`	`197`
`198`		`-func (c *Configuration) GetInterTokenLatency(runReqChan *chan int64) int {`
`199`		`- return int(float64(c.InterTokenLatency) * c.calcLoadFactor(runReqChan))`
	`198`	`+func (c *Configuration) GetInterTokenLatency(nRunningReqs int64) int {`
	`199`	`+ return int(float64(c.InterTokenLatency) * c.calcLoadFactor(nRunningReqs))`
`200`	`200`	`}`
`201`	`201`
`202`	`202`	`type Metrics struct {`
Original file line number	Diff line number	Diff line change
`@@ -708,16 +708,16 @@ func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptToke`
`708`	`708`	`}`
`709`	`709`	`if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 {`
`710`	`710`	`// is aggregated PD and ttft is calculated using number of prompt tokens that are not in kv cache`
`711`		`- prefillTime := s.config.GetPrefillOverhead(&s.runReqChan) + (nPromptTokens-nCachedPromptTokens)*s.config.GetPrefillTimePerToken(&s.runReqChan)`
	`711`	`+ prefillTime := s.config.GetPrefillOverhead(s.nRunningReqs) + (nPromptTokens-nCachedPromptTokens)*s.config.GetPrefillTimePerToken(s.nRunningReqs)`
`712`	`712`	`return int(common.RandomNorm(float64(prefillTime), float64(s.config.PrefillTimeStdDev)))`
`713`	`713`	`}`
`714`	`714`	`// is aggregated PD and *not* using number of prompt tokens`
`715`		`- return int(common.RandomNorm(float64(s.config.GetTimeToFirstToken(&s.runReqChan)), float64(s.config.TimeToFirstTokenStdDev)))`
	`715`	`+ return int(common.RandomNorm(float64(s.config.GetTimeToFirstToken(s.nRunningReqs)), float64(s.config.TimeToFirstTokenStdDev)))`
`716`	`716`	`}`
`717`	`717`
`718`	`718`	`// returns inter token latency`
`719`	`719`	`func (s *VllmSimulator) getInterTokenLatency() int {`
`720`		`- mean := float64(s.config.GetInterTokenLatency(&s.runReqChan))`
	`720`	`+ mean := float64(s.config.GetInterTokenLatency(s.nRunningReqs))`
`721`	`721`	`stddev := float64(s.config.InterTokenLatencyStdDev)`
`722`	`722`	`return int(common.RandomNorm(mean, stddev))`
`723`	`723`	`}`