From 210fa8a7ccae3daf07a72a1153942a21e4a47100 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Fri, 5 Sep 2025 00:18:50 +1000 Subject: [PATCH 01/17] Validate max-num-seqs Signed-off-by: Qifan Deng --- pkg/common/config.go | 4 ++++ pkg/common/config_test.go | 10 ++++++++++ 2 files changed, 14 insertions(+) diff --git a/pkg/common/config.go b/pkg/common/config.go index 442f53b0..ca4be087 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ -373,6 +373,10 @@ func (c *Configuration) validate() error { return errors.New("max model len cannot be less than 1") } + if c.MaxNumSeqs < 1 { + return errors.New("max num seqs cannot be less than 1") + } + for _, lora := range c.LoraModules { if lora.Name == "" { return errors.New("empty LoRA name") diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go index 20aba9a4..815037f0 100644 --- a/pkg/common/config_test.go +++ b/pkg/common/config_test.go @@ -431,6 +431,16 @@ var _ = Describe("Simulator configuration", func() { args: []string{"cmd", "--data-parallel-size", "15", "--config", "../../manifests/config.yaml"}, }, + { + name: "invalid max-num-seqs", + args: []string{"cmd", "--max-num-seqs", "0", + "--config", "../../manifests/config.yaml"}, + }, + { + name: "invalid max-num-seqs", + args: []string{"cmd", "--max-num-seqs", "-1", + "--config", "../../manifests/config.yaml"}, + }, } for _, test := range invalidTests { From abdb2fa2a587ed4d8ff4f589417ea209eec894f0 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Fri, 5 Sep 2025 00:53:19 +1000 Subject: [PATCH 02/17] Validate PrefillTimeStdDev Signed-off-by: Qifan Deng --- pkg/common/config.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/common/config.go b/pkg/common/config.go index ca4be087..f63871f8 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ -338,6 +338,9 @@ func (c *Configuration) validate() error { if c.PrefillTimeStdDev < 0 { return errors.New("prefill time standard deviation cannot be negative") } + if float32(c.PrefillTimeStdDev) > 0.3*float32(c.PrefillTimePerToken) { + return errors.New("prefill time standard deviation cannot be more than 30% of prefill time per token") + } if c.KVCacheTransferTimePerToken < 0 { return errors.New("kv-cache tranfer time per token cannot be negative") From 1d30ea03aedc74ad46c7236f7ac0ac4e706f00aa Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Fri, 5 Sep 2025 01:14:09 +1000 Subject: [PATCH 03/17] Add param time-factor-under-load Signed-off-by: Qifan Deng --- pkg/common/config.go | 13 +++++++++++++ pkg/common/config_test.go | 10 ++++++++++ 2 files changed, 23 insertions(+) diff --git a/pkg/common/config.go b/pkg/common/config.go index f63871f8..7e9200a5 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ -104,6 +104,13 @@ type Configuration struct { // KVCacheTransferOverheadStdDev similar to TimeToFirstTokenStdDev KVCacheTransferTimeStdDev int `yaml:"kv-cache-transfer-time-std-dev" json:"kv-cache-transfer-time-std-dev"` + // TimeFactorUnderLoad is a multiplicative factor that affects the overall time taken for requests + // when parallel requests are being processed. The value of this factor must be >= 1.0, with a default of 1.0. + // If this factor is 1.0, no extra time is added. When the factor is x (where x > 1.0) and there are MaxNumSeqs + // requests, the total time will be multiplied by x. + // The extra time then decreases multiplicatively to 1.0 when the number of requests is less than MaxNumSeqs. 
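+	// Illustrative example: with a factor of 2.0 and MaxNumSeqs of 11, a fully loaded simulator doubles request times, while 6 running requests scale them by 1.5, the linear interpolation between the two endpoints.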
+	TimeFactorUnderLoad float64 `yaml:"time-factor-under-load" json:"time-factor-under-load"` + // Mode defines the simulator response generation mode, valid values: echo, random Mode string `yaml:"mode" json:"mode"` // Seed defines random seed for operations @@ -259,6 +266,7 @@ func newConfig() *Configuration { MaxModelLen: 1024, Mode: ModeRandom, Seed: time.Now().UnixNano(), + TimeFactorUnderLoad: 1.0, MaxToolCallIntegerParam: 100, MaxToolCallNumberParam: 100, MaxToolCallArrayParamLength: 5, @@ -362,6 +370,10 @@ func (c *Configuration) validate() error { return errors.New("kv-cache tranfer standard deviation cannot be more than 30% of kv-cache tranfer") } + if c.TimeFactorUnderLoad < 1.0 { + return errors.New("time factor under load cannot be less than 1.0") + } + if c.MaxLoras < 1 { return errors.New("max LoRAs cannot be less than 1") } @@ -509,6 +521,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) { f.IntVar(&config.TimeToFirstTokenStdDev, "time-to-first-token-std-dev", config.TimeToFirstTokenStdDev, "Standard deviation for time before the first token will be returned (in milliseconds)") f.IntVar(&config.KVCacheTransferLatencyStdDev, "kv-cache-transfer-latency-std-dev", config.KVCacheTransferLatencyStdDev, "Standard deviation for time for KV-cache transfer from a remote vLLM (in milliseconds)") f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)") + f.Float64Var(&config.TimeFactorUnderLoad, "time-factor-under-load", config.TimeFactorUnderLoad, "Time factor under load (must be >= 1.0)") f.IntVar(&config.MaxToolCallIntegerParam, "max-tool-call-integer-param", config.MaxToolCallIntegerParam, "Maximum possible value of integer parameters in a tool call") f.IntVar(&config.MinToolCallIntegerParam, "min-tool-call-integer-param", config.MinToolCallIntegerParam, "Minimum possible value of integer parameters in a tool call") diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go index 815037f0..1c0353ed 100644 --- a/pkg/common/config_test.go +++ b/pkg/common/config_test.go @@ -441,6 +441,16 @@ var _ = Describe("Simulator configuration", func() { args: []string{"cmd", "--max-num-seqs", "-1", "--config", "../../manifests/config.yaml"}, }, + { + name: "invalid time-factor-under-load", + args: []string{"cmd", "--time-factor-under-load", "0", + "--config", "../../manifests/config.yaml"}, + }, + { + name: "invalid time-factor-under-load", + args: []string{"cmd", "--time-factor-under-load", "-1", + "--config", "../../manifests/config.yaml"}, + }, } for _, test := range invalidTests { From 04542f27baf93878fc5b15da79a58fab6ff5a6ff Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Fri, 5 Sep 2025 03:33:30 +1000 Subject: [PATCH 04/17] The factor applies to time-to-first-token Signed-off-by: Qifan Deng --- manifests/dev-config.yaml | 46 +++++++++++++++ pkg/common/config.go | 11 ++++ pkg/llm-d-inference-sim/simulator.go | 8 +-- pkg/llm-d-inference-sim/simulator_test.go | 71 ++++++++++++++++++++--- pkg/llm-d-inference-sim/streaming.go | 8 +-- 5 files changed, 128 insertions(+), 16 deletions(-) create mode 100644 manifests/dev-config.yaml diff --git a/manifests/dev-config.yaml b/manifests/dev-config.yaml new file mode 100644 index 00000000..a3497c16 --- /dev/null +++ b/manifests/dev-config.yaml @@ -0,0 +1,46 @@ +block-size: 16 +data-parallel-size: 1 +enable-kvcache: false +event-batch-size: 16 +failure-injection-rate: 0 +failure-types: null +fake-metrics: + kv-cache-usage: 0.4 + running-requests: 10 +
waiting-requests: 30 +hash-seed: 'hashseed' +inter-token-latency: 50 +inter-token-latency-std-dev: 15 +kv-cache-size: 1024 +kv-cache-transfer-latency: 0 +kv-cache-transfer-latency-std-dev: 0 +kv-cache-transfer-time-per-token: 100 +kv-cache-transfer-time-std-dev: 30 +lora-modules: null +max-cpu-loras: 1 +max-loras: 1 +max-model-len: 1024 +max-num-seqs: 7 +max-tool-call-array-param-length: 5 +max-tool-call-integer-param: 100 +max-tool-call-number-param: 100 +min-tool-call-array-param-length: 1 +min-tool-call-integer-param: 0 +min-tool-call-number-param: 0 +mode: random +model: Qwen/Qwen2.5-1.5B-Instruct +object-tool-call-not-required-field-probability: 50 +port: 8000 +prefill-overhead: 80 +prefill-time-per-token: 20 +prefill-time-std-dev: 3 +seed: 1757050700239757600 +served-model-name: + - Qwen/Qwen2.5-1.5B-Instruct +time-factor-under-load: 5 +time-to-first-token: 0 +time-to-first-token-std-dev: 0 +tokenizers-cache-dir: '' +tool-call-not-required-param-probability: 50 +zmq-endpoint: tcp://localhost:5557 +zmq-max-connect-attempts: 0 \ No newline at end of file diff --git a/pkg/common/config.go b/pkg/common/config.go index 7e9200a5..6dc71023 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ -175,6 +175,17 @@ type Configuration struct { DPSize int `yaml:"data-parallel-size" json:"data-parallel-size"` } +func (c *Configuration) calcLoadFactor(runReqChan *chan int64) float64 { + if c.MaxNumSeqs <= 1 { + return 1.0 + } + return 1 + (c.TimeFactorUnderLoad-1)*float64(len(*runReqChan)-1)/float64(c.MaxNumSeqs-1) +} + +func (c *Configuration) GetTimeToFirstToken(runReqChan *chan int64) int { + return int(float64(c.TimeToFirstToken) * c.calcLoadFactor(runReqChan)) +} + type Metrics struct { // LoraMetrics LoraMetrics []LorasMetrics `json:"loras"` diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index 24446685..002c871b 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -537,7 +537,7 @@ func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) { finishReason = common.RemoteDecodeFinishReason } - s.sendResponse(reqCtx, responseTokens, toolCalls, displayModel, finishReason, &usageData) + s.sendResponse(reqCtx, responseTokens, toolCalls, displayModel, finishReason, &s.runReqChan, &usageData) } } reqCtx.Wg.Done() @@ -662,7 +662,7 @@ func (s *VllmSimulator) createCompletionResponse(isChatCompletion bool, respToke // finishReason - a pointer to string that represents finish reason, can be nil, stop, length, or tools // usageData - usage (tokens statistics) for this response func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, respTokens []string, toolCalls []openaiserverapi.ToolCall, - modelName string, finishReason string, usageData *openaiserverapi.Usage) { + modelName string, finishReason string, runReqChan *chan int64, usageData *openaiserverapi.Usage) { resp := s.createCompletionResponse(reqCtx.IsChatCompletion, respTokens, toolCalls, &finishReason, usageData, modelName, reqCtx.CompletionReq.IsDoRemoteDecode()) @@ -677,7 +677,7 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r nPromptTokens := usageData.PromptTokens nCachedPromptTokens := reqCtx.CompletionReq.GetNumberOfCachedPromptTokens() nGenTokens := usageData.CompletionTokens - ttft := s.getTimeToFirstToken(nPromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill()) + ttft := s.getTimeToFirstToken(nPromptTokens, nCachedPromptTokens, 
reqCtx.CompletionReq.IsDoRemotePrefill(), runReqChan) totalMillisToWait := ttft + s.getTotalInterTokenLatency(nGenTokens) time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond) @@ -696,7 +696,7 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r } // returns time to first token based on the current request's doRemotePrefill -func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptTokens int, doRemotePrefill bool) int { +func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptTokens int, doRemotePrefill bool, runReqChan *chan int64) int { if doRemotePrefill { if s.config.KVCacheTransferLatency == 0 && s.config.KVCacheTransferLatencyStdDev == 0 { // is disaggregated PD and ttft is calculated using number of prompt tokens diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index df43ff57..b6af37d4 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -798,7 +798,7 @@ var _ = Describe("Simulator", func() { simulator.config.TimeToFirstTokenStdDev = timeToFirstTokenStdDev simulator.config.KVCacheTransferLatency = kvCacheLatency simulator.config.KVCacheTransferLatencyStdDev = kvCacheLatencyStdDev - timeToFirst := simulator.getTimeToFirstToken(1, 0, doREmotePrefill) + timeToFirst := simulator.getTimeToFirstToken(1, 0, doREmotePrefill, &simulator.runReqChan) if doREmotePrefill { Expect(timeToFirst).To(BeNumerically(">=", int(float32(kvCacheLatency)*0.3))) Expect(timeToFirst).To(BeNumerically("<=", int(float32(kvCacheLatency)*1.7))) @@ -829,7 +829,7 @@ var _ = Describe("Simulator", func() { simulator.config.PrefillTimePerToken = 200 simulator.config.PrefillTimeStdDev = 80 - ttft := simulator.getTimeToFirstToken(128, 0, false) + ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) Expect(ttft).To(BeNumerically("==", timeToFirstToken)) }) @@ -842,7 +842,7 @@ var _ = Describe("Simulator", func() { simulator.config.PrefillTimePerToken = 200 simulator.config.PrefillTimeStdDev = 80 - ttft := simulator.getTimeToFirstToken(128, 0, false) + ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) Expect(ttft).NotTo(BeNumerically("==", 0)) }) @@ -853,7 +853,7 @@ var _ = Describe("Simulator", func() { simulator.config.PrefillTimePerToken = prefillTimePerToken simulator.config.PrefillTimeStdDev = stdDev - ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false) + ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false, &simulator.runReqChan) expectedTTFT := prefillOverhead + prefillTimePerToken*(nTokens-nCachedTokens) Expect(ttft).To(BeNumerically(">=", int(float64(expectedTTFT)*0.3))) @@ -881,7 +881,7 @@ var _ = Describe("Simulator", func() { simulator.config.PrefillTimePerToken = prefillTimePerToken simulator.config.PrefillTimeStdDev = 0 - ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false) + ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false, &simulator.runReqChan) expectedTTFT := prefillOverhead + prefillTimePerToken*(nTokens-nCachedTokens) Expect(ttft).To(Equal(expectedTTFT)) }, @@ -905,7 +905,7 @@ var _ = Describe("Simulator", func() { simulator.config.KVCacheTransferTimePerToken = 100 simulator.config.KVCacheTransferTimeStdDev = 0 - ttft := simulator.getTimeToFirstToken(128, 0, true) + ttft := simulator.getTimeToFirstToken(128, 0, true, &simulator.runReqChan) Expect(ttft).To(BeNumerically("==", 200)) }) @@ -916,7 +916,7 @@ var _ 
= Describe("Simulator", func() { simulator.config.KVCacheTransferTimePerToken = 100 simulator.config.KVCacheTransferTimeStdDev = 0 - ttft := simulator.getTimeToFirstToken(128, 0, true) + ttft := simulator.getTimeToFirstToken(128, 0, true, &simulator.runReqChan) Expect(ttft).To(BeNumerically("==", 12800)) }) @@ -927,7 +927,7 @@ var _ = Describe("Simulator", func() { simulator.config.KVCacheTransferTimePerToken = kvCacheTransTPT simulator.config.KVCacheTransferTimeStdDev = stddev - ttft := simulator.getTimeToFirstToken(nTokens, 0, true) + ttft := simulator.getTimeToFirstToken(nTokens, 0, true, &simulator.runReqChan) expectedTTFT := kvCacheTransTPT * nTokens Expect(ttft).To(BeNumerically(">=", int(float64(expectedTTFT)*0.3))) @@ -945,5 +945,60 @@ var _ = Describe("Simulator", func() { Entry("very long prompt", 150, 100, 20000), ) + It("when time-factor-under-load is 1, the time to first token should be equal to time-to-first-token", func() { + simulator.config.TimeToFirstToken = 42 + simulator.config.TimeToFirstTokenStdDev = 0 + simulator.config.TimeFactorUnderLoad = 1.0 + + simulator.runReqChan <- 100 + + ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) + Expect(ttft).To(Equal(42)) + }) + + It("when time-factor-under-load is > 1, but max-num-seqs is 1, the factor will not take effect", func() { + simulator.config.TimeToFirstToken = 42 + simulator.config.TimeToFirstTokenStdDev = 0 + simulator.config.TimeFactorUnderLoad = 100.0 + simulator.config.MaxNumSeqs = 1 + + for len(simulator.runReqChan) > 0 { + <-simulator.runReqChan + } + + simulator.runReqChan <- 1 + + ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) + Expect(ttft).To(Equal(42)) + }) + + DescribeTable("when time-factor-under-load is > 1, and the sim is fully loaded, the time to first token should be time-factor-under-load * time-to-first-token", + func(timeFactorUnderLoad float64, maxNumOfReq int) { + simulator.config.TimeToFirstToken = 42 + simulator.config.TimeToFirstTokenStdDev = 0 + simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad + simulator.config.MaxNumSeqs = maxNumOfReq + for len(simulator.runReqChan) > 0 { + <-simulator.runReqChan + } + for i := 0; i < maxNumOfReq; i++ { + simulator.runReqChan <- 1 + } + + ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) + Expect(ttft).To(Equal(int(float64(42) * timeFactorUnderLoad))) + + }, + func(timeFactorUnderLoad float64, maxNumOfReq int64) string { + return fmt.Sprintf("timeFactorUnderLoad: %f maxNumOfReq: %d", + timeFactorUnderLoad, maxNumOfReq) + }, + + Entry("factor: 1.5", 1.5, 70), + Entry("factor: 2.0", 2.0, 2), + Entry("factor: 100.0", 100.0, 150), + Entry("factor: 20000.0", 20000.0, 310), + ) + }) }) diff --git a/pkg/llm-d-inference-sim/streaming.go b/pkg/llm-d-inference-sim/streaming.go index 5ff1e240..a503af6d 100644 --- a/pkg/llm-d-inference-sim/streaming.go +++ b/pkg/llm-d-inference-sim/streaming.go @@ -69,11 +69,11 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons if len(toolCalls) > 0 { s.logger.Info("Going to send tools calls") for _, tc := range toolCalls { - s.sendTokenChunks(context, w, tc.Function.TokenizedArguments, &tc, finishReason) + s.sendTokenChunks(context, w, tc.Function.TokenizedArguments, &tc, finishReason, &s.runReqChan) } } else { s.logger.Info("Going to send text", "number of tokens", len(responseTokens)) - s.sendTokenChunks(context, w, responseTokens, nil, finishReason) + s.sendTokenChunks(context, w, responseTokens, nil, 
finishReason, &s.runReqChan) } } @@ -97,9 +97,9 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons // sendTokenChunks creates and sends response chunks func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, genTokens []string, - tc *openaiserverapi.ToolCall, finishReason string) { + tc *openaiserverapi.ToolCall, finishReason string, runReqChan *chan int64) { // time to first token delay - ttft := s.getTimeToFirstToken(context.nPromptTokens, context.nCachedPromptTokens, context.doRemotePrefill) + ttft := s.getTimeToFirstToken(context.nPromptTokens, context.nCachedPromptTokens, context.doRemotePrefill, runReqChan) time.Sleep(time.Duration(ttft) * time.Millisecond) for i, token := range genTokens { From 9ccbe9532ecc5e366a52f86c5b533d11acd0459b Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Fri, 5 Sep 2025 12:53:48 +1000 Subject: [PATCH 05/17] Test TTFT when partially loaded Signed-off-by: Qifan Deng --- pkg/llm-d-inference-sim/simulator_test.go | 33 ++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index b6af37d4..73912534 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -981,7 +981,7 @@ var _ = Describe("Simulator", func() { for len(simulator.runReqChan) > 0 { <-simulator.runReqChan } - for i := 0; i < maxNumOfReq; i++ { + for range maxNumOfReq { simulator.runReqChan <- 1 } @@ -1000,5 +1000,36 @@ var _ = Describe("Simulator", func() { Entry("factor: 20000.0", 20000.0, 310), ) + DescribeTable("when time-factor-under-load is > 1, and the sim is partially loaded, the time to first token should be linear interpolation between time-to-first-token and time-factor-under-load * time-to-first-token", + func(timeFactorUnderLoad float64, maxNumOfReq int, nCurrNumOfReq int) { + simulator.config.TimeToFirstToken = 42 + simulator.config.TimeToFirstTokenStdDev = 0 + simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad + simulator.config.MaxNumSeqs = maxNumOfReq + + for len(simulator.runReqChan) > 0 { + <-simulator.runReqChan + } + for range nCurrNumOfReq { + simulator.runReqChan <- 1 + } + + ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) + max := timeFactorUnderLoad * float64(42) + Expect(ttft).To(BeNumerically(">=", 42)) + Expect(ttft).To(BeNumerically("<=", max)) + + }, + func(timeFactorUnderLoad float64, maxNumOfReq int, nCurrNumOfReq int) string { + return fmt.Sprintf("timeFactorUnderLoad: %f maxNumOfReq: %d nCurrNumOfReq: %d", + timeFactorUnderLoad, maxNumOfReq, nCurrNumOfReq) + }, + + Entry("factor: 1.5", 1.5, 70, 35), + Entry("factor: 2.0", 2.0, 2, 1), + Entry("factor: 100.0", 100.0, 150, 75), + Entry("factor: 20000.0", 20000.0, 310, 155), + ) + }) }) From d21f9c7ef1c6cca796ab5152dd30ed7d7423ebf3 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Fri, 5 Sep 2025 13:49:48 +1000 Subject: [PATCH 06/17] Apply time factor under load to prefill and inter token latency Signed-off-by: Qifan Deng --- pkg/common/config.go | 12 +++++++ pkg/common/config_test.go | 44 +++++++++++++++++++++++ pkg/llm-d-inference-sim/simulator.go | 12 +++---- pkg/llm-d-inference-sim/simulator_test.go | 24 ++++++------- pkg/llm-d-inference-sim/streaming.go | 8 ++--- 5 files changed, 78 insertions(+), 22 deletions(-) diff --git a/pkg/common/config.go b/pkg/common/config.go index 6dc71023..d822daa5 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ 
-186,6 +186,18 @@ func (c *Configuration) GetTimeToFirstToken(runReqChan *chan int64) int { return int(float64(c.TimeToFirstToken) * c.calcLoadFactor(runReqChan)) } +func (c *Configuration) GetPrefillOverhead(runReqChan *chan int64) int { + return int(float64(c.PrefillOverhead) * c.calcLoadFactor(runReqChan)) +} + +func (c *Configuration) GetPrefillTimePerToken(runReqChan *chan int64) int { + return int(float64(c.PrefillTimePerToken) * c.calcLoadFactor(runReqChan)) +} + +func (c *Configuration) GetInterTokenLatency(runReqChan *chan int64) int { + return int(float64(c.InterTokenLatency) * c.calcLoadFactor(runReqChan)) +} + type Metrics struct { // LoraMetrics LoraMetrics []LorasMetrics `json:"loras"` diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go index 1c0353ed..4cf59136 100644 --- a/pkg/common/config_test.go +++ b/pkg/common/config_test.go @@ -461,4 +461,48 @@ var _ = Describe("Simulator configuration", func() { }) }) } + + It("when TimeFactorUnderLoad is 1.0, calcLoadFactor should give 1", func() { + c := newConfig() + c.TimeFactorUnderLoad = 1.0 + c.MaxNumSeqs = 11 + reqChan := make(chan int64, 3) + for i := 0; i < 3; i++ { + reqChan <- 1 + } + + factor := c.calcLoadFactor(&reqChan) + Expect(factor).To(BeNumerically("==", 1.0)) + close(reqChan) + }) + + It("when TimeFactorUnderLoad is > 1.0, and sim is fully loaded, calcLoadFactor should give TimeFactorUnderLoad", func() { + c := newConfig() + c.TimeFactorUnderLoad = 2.0 + c.MaxNumSeqs = 11 + reqChan := make(chan int64, c.MaxNumSeqs) + for i := 0; i < c.MaxNumSeqs; i++ { + reqChan <- 1 + } + + factor := c.calcLoadFactor(&reqChan) + Expect(factor).To(BeNumerically("==", c.TimeFactorUnderLoad)) + close(reqChan) + + }) + + It("when TimeFactorUnderLoad is > 1.0, and sim is partially loaded, calcLoadFactor should give a value between 1 and TimeFactorUnderLoad", func() { + c := newConfig() + c.TimeFactorUnderLoad = 2.0 + c.MaxNumSeqs = 11 + reqChan := make(chan int64, c.MaxNumSeqs) + for i := 0; i < c.MaxNumSeqs/2; i++ { + reqChan <- 1 + } + factor := c.calcLoadFactor(&reqChan) + Expect(factor).To(BeNumerically(">", 1.0)) + Expect(factor).To(BeNumerically("<", c.TimeFactorUnderLoad)) + close(reqChan) + + }) }) diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index 002c871b..9076c48b 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -537,7 +537,7 @@ func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) { finishReason = common.RemoteDecodeFinishReason } - s.sendResponse(reqCtx, responseTokens, toolCalls, displayModel, finishReason, &s.runReqChan, &usageData) + s.sendResponse(reqCtx, responseTokens, toolCalls, displayModel, finishReason, &usageData) } } reqCtx.Wg.Done() @@ -662,7 +662,7 @@ func (s *VllmSimulator) createCompletionResponse(isChatCompletion bool, respToke // finishReason - a pointer to string that represents finish reason, can be nil, stop, length, or tools // usageData - usage (tokens statistics) for this response func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, respTokens []string, toolCalls []openaiserverapi.ToolCall, - modelName string, finishReason string, runReqChan *chan int64, usageData *openaiserverapi.Usage) { + modelName string, finishReason string, usageData *openaiserverapi.Usage) { resp := s.createCompletionResponse(reqCtx.IsChatCompletion, respTokens, toolCalls, &finishReason, usageData, modelName, reqCtx.CompletionReq.IsDoRemoteDecode()) @@ -677,7 +677,7 @@ func (s 
*VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r nPromptTokens := usageData.PromptTokens nCachedPromptTokens := reqCtx.CompletionReq.GetNumberOfCachedPromptTokens() nGenTokens := usageData.CompletionTokens - ttft := s.getTimeToFirstToken(nPromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill(), runReqChan) + ttft := s.getTimeToFirstToken(nPromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill()) totalMillisToWait := ttft + s.getTotalInterTokenLatency(nGenTokens) time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond) @@ -696,7 +696,7 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r } // returns time to first token based on the current request's doRemotePrefill -func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptTokens int, doRemotePrefill bool, runReqChan *chan int64) int { +func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptTokens int, doRemotePrefill bool) int { if doRemotePrefill { if s.config.KVCacheTransferLatency == 0 && s.config.KVCacheTransferLatencyStdDev == 0 { // is disaggregated PD and ttft is calculated using number of prompt tokens @@ -708,11 +708,11 @@ func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptToke } if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 { // is aggregated PD and ttft is calculated using number of prompt tokens that are not in kv cache - prefillTime := s.config.PrefillOverhead + (nPromptTokens-nCachedPromptTokens)*s.config.PrefillTimePerToken + prefillTime := s.config.GetPrefillOverhead(&s.runReqChan) + (nPromptTokens-nCachedPromptTokens)*s.config.GetPrefillTimePerToken(&s.runReqChan) return common.RandomNorm(prefillTime, s.config.PrefillTimeStdDev) } // is aggregated PD and *not* using number of prompt tokens - return common.RandomNorm(s.config.TimeToFirstToken, s.config.TimeToFirstTokenStdDev) + return common.RandomNorm(s.config.GetTimeToFirstToken(&s.runReqChan), s.config.TimeToFirstTokenStdDev) } // returns inter token latency diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index 73912534..fa06830d 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -798,7 +798,7 @@ var _ = Describe("Simulator", func() { simulator.config.TimeToFirstTokenStdDev = timeToFirstTokenStdDev simulator.config.KVCacheTransferLatency = kvCacheLatency simulator.config.KVCacheTransferLatencyStdDev = kvCacheLatencyStdDev - timeToFirst := simulator.getTimeToFirstToken(1, 0, doREmotePrefill, &simulator.runReqChan) + timeToFirst := simulator.getTimeToFirstToken(1, 0, doREmotePrefill) if doREmotePrefill { Expect(timeToFirst).To(BeNumerically(">=", int(float32(kvCacheLatency)*0.3))) Expect(timeToFirst).To(BeNumerically("<=", int(float32(kvCacheLatency)*1.7))) @@ -829,7 +829,7 @@ var _ = Describe("Simulator", func() { simulator.config.PrefillTimePerToken = 200 simulator.config.PrefillTimeStdDev = 80 - ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) + ttft := simulator.getTimeToFirstToken(128, 0, false) Expect(ttft).To(BeNumerically("==", timeToFirstToken)) }) @@ -842,7 +842,7 @@ var _ = Describe("Simulator", func() { simulator.config.PrefillTimePerToken = 200 simulator.config.PrefillTimeStdDev = 80 - ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) + ttft := simulator.getTimeToFirstToken(128, 0, false) 
Expect(ttft).NotTo(BeNumerically("==", 0)) }) @@ -853,7 +853,7 @@ var _ = Describe("Simulator", func() { simulator.config.PrefillTimePerToken = prefillTimePerToken simulator.config.PrefillTimeStdDev = stdDev - ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false, &simulator.runReqChan) + ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false) expectedTTFT := prefillOverhead + prefillTimePerToken*(nTokens-nCachedTokens) Expect(ttft).To(BeNumerically(">=", int(float64(expectedTTFT)*0.3))) @@ -881,7 +881,7 @@ var _ = Describe("Simulator", func() { simulator.config.PrefillTimePerToken = prefillTimePerToken simulator.config.PrefillTimeStdDev = 0 - ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false, &simulator.runReqChan) + ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false) expectedTTFT := prefillOverhead + prefillTimePerToken*(nTokens-nCachedTokens) Expect(ttft).To(Equal(expectedTTFT)) }, @@ -905,7 +905,7 @@ var _ = Describe("Simulator", func() { simulator.config.KVCacheTransferTimePerToken = 100 simulator.config.KVCacheTransferTimeStdDev = 0 - ttft := simulator.getTimeToFirstToken(128, 0, true, &simulator.runReqChan) + ttft := simulator.getTimeToFirstToken(128, 0, true) Expect(ttft).To(BeNumerically("==", 200)) }) @@ -916,7 +916,7 @@ var _ = Describe("Simulator", func() { simulator.config.KVCacheTransferTimePerToken = 100 simulator.config.KVCacheTransferTimeStdDev = 0 - ttft := simulator.getTimeToFirstToken(128, 0, true, &simulator.runReqChan) + ttft := simulator.getTimeToFirstToken(128, 0, true) Expect(ttft).To(BeNumerically("==", 12800)) }) @@ -927,7 +927,7 @@ var _ = Describe("Simulator", func() { simulator.config.KVCacheTransferTimePerToken = kvCacheTransTPT simulator.config.KVCacheTransferTimeStdDev = stddev - ttft := simulator.getTimeToFirstToken(nTokens, 0, true, &simulator.runReqChan) + ttft := simulator.getTimeToFirstToken(nTokens, 0, true) expectedTTFT := kvCacheTransTPT * nTokens Expect(ttft).To(BeNumerically(">=", int(float64(expectedTTFT)*0.3))) @@ -952,7 +952,7 @@ var _ = Describe("Simulator", func() { simulator.runReqChan <- 100 - ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) + ttft := simulator.getTimeToFirstToken(128, 0, false) Expect(ttft).To(Equal(42)) }) @@ -968,7 +968,7 @@ var _ = Describe("Simulator", func() { simulator.runReqChan <- 1 - ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) + ttft := simulator.getTimeToFirstToken(128, 0, false) Expect(ttft).To(Equal(42)) }) @@ -985,7 +985,7 @@ var _ = Describe("Simulator", func() { simulator.runReqChan <- 1 } - ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) + ttft := simulator.getTimeToFirstToken(128, 0, false) Expect(ttft).To(Equal(int(float64(42) * timeFactorUnderLoad))) }, @@ -1014,7 +1014,7 @@ var _ = Describe("Simulator", func() { simulator.runReqChan <- 1 } - ttft := simulator.getTimeToFirstToken(128, 0, false, &simulator.runReqChan) + ttft := simulator.getTimeToFirstToken(128, 0, false) max := timeFactorUnderLoad * float64(42) Expect(ttft).To(BeNumerically(">=", 42)) Expect(ttft).To(BeNumerically("<=", max)) diff --git a/pkg/llm-d-inference-sim/streaming.go b/pkg/llm-d-inference-sim/streaming.go index a503af6d..5ff1e240 100644 --- a/pkg/llm-d-inference-sim/streaming.go +++ b/pkg/llm-d-inference-sim/streaming.go @@ -69,11 +69,11 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons if len(toolCalls) > 0 { s.logger.Info("Going to send tools 
calls") for _, tc := range toolCalls { - s.sendTokenChunks(context, w, tc.Function.TokenizedArguments, &tc, finishReason, &s.runReqChan) + s.sendTokenChunks(context, w, tc.Function.TokenizedArguments, &tc, finishReason) } } else { s.logger.Info("Going to send text", "number of tokens", len(responseTokens)) - s.sendTokenChunks(context, w, responseTokens, nil, finishReason, &s.runReqChan) + s.sendTokenChunks(context, w, responseTokens, nil, finishReason) } } @@ -97,9 +97,9 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons // sendTokenChunks creates and sends response chunks func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, genTokens []string, - tc *openaiserverapi.ToolCall, finishReason string, runReqChan *chan int64) { + tc *openaiserverapi.ToolCall, finishReason string) { // time to first token delay - ttft := s.getTimeToFirstToken(context.nPromptTokens, context.nCachedPromptTokens, context.doRemotePrefill, runReqChan) + ttft := s.getTimeToFirstToken(context.nPromptTokens, context.nCachedPromptTokens, context.doRemotePrefill) time.Sleep(time.Duration(ttft) * time.Millisecond) for i, token := range genTokens { From bbfcbe8dd1b144c898c732b563b2a552fde0d89f Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Fri, 5 Sep 2025 13:54:03 +1000 Subject: [PATCH 07/17] Improve param desc Signed-off-by: Qifan Deng --- pkg/common/config.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pkg/common/config.go b/pkg/common/config.go index d822daa5..05a1316a 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ -104,11 +104,12 @@ type Configuration struct { // KVCacheTransferOverheadStdDev similar to TimeToFirstTokenStdDev KVCacheTransferTimeStdDev int `yaml:"kv-cache-transfer-time-std-dev" json:"kv-cache-transfer-time-std-dev"` - // TimeFactorUnderLoad is a multiplicative factor that affects the overall time taken for requests - // when parallel requests are being processed. The value of this factor must be >= 1.0, with a default of 1.0. - // If this factor is 1.0, no extra time is added. When the factor is x (where x > 1.0) and there are MaxNumSeqs - // requests, the total time will be multiplied by x. - // The extra time then decreases multiplicatively to 1.0 when the number of requests is less than MaxNumSeqs. + // TimeFactorUnderLoad is a multiplicative factor that affects the overall time taken for requests when parallel + // requests are being processed. + // The value of this factor must be >= 1.0, with a default of 1.0. + // - If this factor is 1.0, no extra time is added. + // - When the factor is x (where x > 1.0) and there are MaxNumSeqs requests, the total time will be multiplied by x. + // - The extra time then decreases multiplicatively to 1.0 when the number of requests is less than MaxNumSeqs. 
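+	// Concretely: factor = 1+(TimeFactorUnderLoad-1)*(nRunningReqs-1)/(MaxNumSeqs-1), i.e. linear interpolation from 1.0 for a single running request up to TimeFactorUnderLoad at MaxNumSeqs (see calcLoadFactor).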
TimeFactorUnderLoad float64 `yaml:"time-factor-under-load" json:"time-factor-under-load"` // Mode defines the simulator response generation mode, valid values: echo, random From dd31c34a181946ea574dd6d0a9a8f2d53239be08 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Fri, 5 Sep 2025 17:17:33 +1000 Subject: [PATCH 08/17] Use nRunningReqs instead of runReqChan Signed-off-by: Qifan Deng --- .gitignore | 3 ++- pkg/common/config.go | 22 +++++++++++----------- pkg/common/config_test.go | 22 +++------------------- pkg/llm-d-inference-sim/simulator.go | 4 ++-- pkg/llm-d-inference-sim/simulator_test.go | 15 ++------------- 5 files changed, 20 insertions(+), 46 deletions(-) diff --git a/.gitignore b/.gitignore index 3906cfb9..950b0cb4 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ vendor .devcontainer # MacOSX .DS_Store -*.test \ No newline at end of file +*.test +manifests/dev-config.yaml diff --git a/pkg/common/config.go b/pkg/common/config.go index 05a1316a..9a54fe86 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ -104,7 +104,7 @@ type Configuration struct { // KVCacheTransferOverheadStdDev similar to TimeToFirstTokenStdDev KVCacheTransferTimeStdDev int `yaml:"kv-cache-transfer-time-std-dev" json:"kv-cache-transfer-time-std-dev"` - // TimeFactorUnderLoad is a multiplicative factor that affects the overall time taken for requests when parallel + // TimeFactorUnderLoad is a multiplicative factor that affects the overall time taken for requests when parallel // requests are being processed. // The value of this factor must be >= 1.0, with a default of 1.0. // - If this factor is 1.0, no extra time is added. @@ -176,27 +176,27 @@ type Configuration struct { DPSize int `yaml:"data-parallel-size" json:"data-parallel-size"` } -func (c *Configuration) calcLoadFactor(runReqChan *chan int64) float64 { +func (c *Configuration) calcLoadFactor(nRunningReqs int64) float64 { if c.MaxNumSeqs <= 1 { return 1.0 } - return 1 + (c.TimeFactorUnderLoad-1)*float64(len(*runReqChan)-1)/float64(c.MaxNumSeqs-1) + return 1 + (c.TimeFactorUnderLoad-1)*float64(nRunningReqs-1)/float64(c.MaxNumSeqs-1) } -func (c *Configuration) GetTimeToFirstToken(runReqChan *chan int64) int { - return int(float64(c.TimeToFirstToken) * c.calcLoadFactor(runReqChan)) +func (c *Configuration) GetTimeToFirstToken(nRunningReqs int64) int { + return int(float64(c.TimeToFirstToken) * c.calcLoadFactor(nRunningReqs)) } -func (c *Configuration) GetPrefillOverhead(runReqChan *chan int64) int { - return int(float64(c.PrefillOverhead) * c.calcLoadFactor(runReqChan)) +func (c *Configuration) GetPrefillOverhead(nRunningReqs int64) int { + return int(float64(c.PrefillOverhead) * c.calcLoadFactor(nRunningReqs)) } -func (c *Configuration) GetPrefillTimePerToken(runReqChan *chan int64) int { - return int(float64(c.PrefillTimePerToken) * c.calcLoadFactor(runReqChan)) +func (c *Configuration) GetPrefillTimePerToken(nRunningReqs int64) int { + return int(float64(c.PrefillTimePerToken) * c.calcLoadFactor(nRunningReqs)) } -func (c *Configuration) GetInterTokenLatency(runReqChan *chan int64) int { - return int(float64(c.InterTokenLatency) * c.calcLoadFactor(runReqChan)) +func (c *Configuration) GetInterTokenLatency(nRunningReqs int64) int { + return int(float64(c.InterTokenLatency) * c.calcLoadFactor(nRunningReqs)) } type Metrics struct { diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go index 4cf59136..1edbea07 100644 --- a/pkg/common/config_test.go +++ b/pkg/common/config_test.go @@ -466,28 +466,18 @@ var _ = 
Describe("Simulator configuration", func() { c := newConfig() c.TimeFactorUnderLoad = 1.0 c.MaxNumSeqs = 11 - reqChan := make(chan int64, 3) - for i := 0; i < 3; i++ { - reqChan <- 1 - } - factor := c.calcLoadFactor(&reqChan) + factor := c.calcLoadFactor(3) Expect(factor).To(BeNumerically("==", 1.0)) - close(reqChan) }) It("when TimeFactorUnderLoad is > 1.0, and sim is fully loaded, calcLoadFactor should give TimeFactorUnderLoad", func() { c := newConfig() c.TimeFactorUnderLoad = 2.0 c.MaxNumSeqs = 11 - reqChan := make(chan int64, c.MaxNumSeqs) - for i := 0; i < c.MaxNumSeqs; i++ { - reqChan <- 1 - } - factor := c.calcLoadFactor(&reqChan) + factor := c.calcLoadFactor(11) Expect(factor).To(BeNumerically("==", c.TimeFactorUnderLoad)) - close(reqChan) }) @@ -495,14 +485,8 @@ var _ = Describe("Simulator configuration", func() { c := newConfig() c.TimeFactorUnderLoad = 2.0 c.MaxNumSeqs = 11 - reqChan := make(chan int64, c.MaxNumSeqs) - for i := 0; i < c.MaxNumSeqs/2; i++ { - reqChan <- 1 - } - factor := c.calcLoadFactor(&reqChan) + factor := c.calcLoadFactor(6) Expect(factor).To(BeNumerically(">", 1.0)) Expect(factor).To(BeNumerically("<", c.TimeFactorUnderLoad)) - close(reqChan) - }) }) diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index 9076c48b..d9d691bc 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -708,11 +708,11 @@ func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptToke } if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 { // is aggregated PD and ttft is calculated using number of prompt tokens that are not in kv cache - prefillTime := s.config.GetPrefillOverhead(&s.runReqChan) + (nPromptTokens-nCachedPromptTokens)*s.config.GetPrefillTimePerToken(&s.runReqChan) + prefillTime := s.config.GetPrefillOverhead(s.nRunningReqs) + (nPromptTokens-nCachedPromptTokens)*s.config.GetPrefillTimePerToken(s.nRunningReqs) return common.RandomNorm(prefillTime, s.config.PrefillTimeStdDev) } // is aggregated PD and *not* using number of prompt tokens - return common.RandomNorm(s.config.GetTimeToFirstToken(&s.runReqChan), s.config.TimeToFirstTokenStdDev) + return common.RandomNorm(s.config.GetTimeToFirstToken(s.nRunningReqs), s.config.TimeToFirstTokenStdDev) } // returns inter token latency diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index fa06830d..9c9f8647 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -978,12 +978,7 @@ var _ = Describe("Simulator", func() { simulator.config.TimeToFirstTokenStdDev = 0 simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad simulator.config.MaxNumSeqs = maxNumOfReq - for len(simulator.runReqChan) > 0 { - <-simulator.runReqChan - } - for range maxNumOfReq { - simulator.runReqChan <- 1 - } + simulator.nRunningReqs = int64(maxNumOfReq) ttft := simulator.getTimeToFirstToken(128, 0, false) Expect(ttft).To(Equal(int(float64(42) * timeFactorUnderLoad))) @@ -1006,13 +1001,7 @@ var _ = Describe("Simulator", func() { simulator.config.TimeToFirstTokenStdDev = 0 simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad simulator.config.MaxNumSeqs = maxNumOfReq - - for len(simulator.runReqChan) > 0 { - <-simulator.runReqChan - } - for range nCurrNumOfReq { - simulator.runReqChan <- 1 - } + simulator.nRunningReqs = int64(nCurrNumOfReq) ttft := simulator.getTimeToFirstToken(128, 0, false) max := timeFactorUnderLoad * float64(42) From 
bb10ce7351add18bf0a41150278780becad80b80 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Sun, 7 Sep 2025 16:43:15 +1000 Subject: [PATCH 09/17] unstage manifests/dev-config.yaml Signed-off-by: Qifan Deng --- manifests/dev-config.yaml | 46 --------------------------------------- 1 file changed, 46 deletions(-) delete mode 100644 manifests/dev-config.yaml diff --git a/manifests/dev-config.yaml b/manifests/dev-config.yaml deleted file mode 100644 index a3497c16..00000000 --- a/manifests/dev-config.yaml +++ /dev/null @@ -1,46 +0,0 @@ -block-size: 16 -data-parallel-size: 1 -enable-kvcache: false -event-batch-size: 16 -failure-injection-rate: 0 -failure-types: null -fake-metrics: - kv-cache-usage: 0.4 - running-requests: 10 - waiting-requests: 30 -hash-seed: 'hashseed' -inter-token-latency: 50 -inter-token-latency-std-dev: 15 -kv-cache-size: 1024 -kv-cache-transfer-latency: 0 -kv-cache-transfer-latency-std-dev: 0 -kv-cache-transfer-time-per-token: 100 -kv-cache-transfer-time-std-dev: 30 -lora-modules: null -max-cpu-loras: 1 -max-loras: 1 -max-model-len: 1024 -max-num-seqs: 7 -max-tool-call-array-param-length: 5 -max-tool-call-integer-param: 100 -max-tool-call-number-param: 100 -min-tool-call-array-param-length: 1 -min-tool-call-integer-param: 0 -min-tool-call-number-param: 0 -mode: random -model: Qwen/Qwen2.5-1.5B-Instruct -object-tool-call-not-required-field-probability: 50 -port: 8000 -prefill-overhead: 80 -prefill-time-per-token: 20 -prefill-time-std-dev: 3 -seed: 1757050700239757600 -served-model-name: - - Qwen/Qwen2.5-1.5B-Instruct -time-factor-under-load: 5 -time-to-first-token: 0 -time-to-first-token-std-dev: 0 -tokenizers-cache-dir: '' -tool-call-not-required-param-probability: 50 -zmq-endpoint: tcp://localhost:5557 -zmq-max-connect-attempts: 0 \ No newline at end of file From ed59319b695328e9fbf3b8d4c180db6d397cc84e Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Sun, 7 Sep 2025 16:48:49 +1000 Subject: [PATCH 10/17] Update readme Signed-off-by: Qifan Deng --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 535e77fc..29d4d405 100644 --- a/README.md +++ b/README.md @@ -115,6 +115,7 @@ For more details see the [README hunk body not preserved; patches 11 and 12 of the series are missing here] From e3e3ffd51c0460ae68695092947b1412a8db8fb2 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Sun, 7 Sep 2025 21:13:13 +1000 Subject: [PATCH 13/17] Calc inter token latency based on load instead of one calc for the whole request Signed-off-by: Qifan Deng --- pkg/llm-d-inference-sim/simulator.go | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index 72cb357e..ad653c7c 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -674,15 +674,12 @@ func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, r } // calculate how long to wait before returning the response, time is based on number of tokens - nPromptTokens := usageData.PromptTokens nCachedPromptTokens := reqCtx.CompletionReq.GetNumberOfCachedPromptTokens() - nGenTokens := usageData.CompletionTokens - ttft := s.getTimeToFirstToken(nPromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill()) - - time.Sleep(time.Duration(ttft) * time.Microsecond) - for range nGenTokens - 1 { + ttft := s.getTimeToFirstToken(usageData.PromptTokens, nCachedPromptTokens,
reqCtx.CompletionReq.IsDoRemotePrefill()) + time.Sleep(time.Duration(ttft) * time.Millisecond) + for range usageData.CompletionTokens - 1 { perTokenLatency := s.getInterTokenLatency() - time.Sleep(time.Duration(perTokenLatency) * time.Microsecond) + time.Sleep(time.Duration(perTokenLatency) * time.Millisecond) } ctx.Response.Header.SetContentType("application/json") From 75415e30719452f035d041508de5fd716578626d Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Tue, 9 Sep 2025 16:22:13 +1000 Subject: [PATCH 14/17] Move methods to simulator Signed-off-by: Qifan Deng --- pkg/common/config.go | 23 ----------------- pkg/common/config_test.go | 28 --------------------- pkg/llm-d-inference-sim/simulator.go | 29 +++++++++++++++++++--- pkg/llm-d-inference-sim/simulator_test.go | 30 +++++++++++++++++++++++ 4 files changed, 56 insertions(+), 54 deletions(-) diff --git a/pkg/common/config.go b/pkg/common/config.go index 9a54fe86..c367c029 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ -176,29 +176,6 @@ type Configuration struct { DPSize int `yaml:"data-parallel-size" json:"data-parallel-size"` } -func (c *Configuration) calcLoadFactor(nRunningReqs int64) float64 { - if c.MaxNumSeqs <= 1 { - return 1.0 - } - return 1 + (c.TimeFactorUnderLoad-1)*float64(nRunningReqs-1)/float64(c.MaxNumSeqs-1) -} - -func (c *Configuration) GetTimeToFirstToken(nRunningReqs int64) int { - return int(float64(c.TimeToFirstToken) * c.calcLoadFactor(nRunningReqs)) -} - -func (c *Configuration) GetPrefillOverhead(nRunningReqs int64) int { - return int(float64(c.PrefillOverhead) * c.calcLoadFactor(nRunningReqs)) -} - -func (c *Configuration) GetPrefillTimePerToken(nRunningReqs int64) int { - return int(float64(c.PrefillTimePerToken) * c.calcLoadFactor(nRunningReqs)) -} - -func (c *Configuration) GetInterTokenLatency(nRunningReqs int64) int { - return int(float64(c.InterTokenLatency) * c.calcLoadFactor(nRunningReqs)) -} - type Metrics struct { // LoraMetrics LoraMetrics []LorasMetrics `json:"loras"` diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go index 1edbea07..1c0353ed 100644 --- a/pkg/common/config_test.go +++ b/pkg/common/config_test.go @@ -461,32 +461,4 @@ var _ = Describe("Simulator configuration", func() { }) }) } - - It("when TimeFactorUnderLoad is 1.0, calcLoadFactor should give 1", func() { - c := newConfig() - c.TimeFactorUnderLoad = 1.0 - c.MaxNumSeqs = 11 - - factor := c.calcLoadFactor(3) - Expect(factor).To(BeNumerically("==", 1.0)) - }) - - It("when TimeFactorUnderLoad is > 1.0, and sim is fully loaded, calcLoadFactor should give TimeFactorUnderLoad", func() { - c := newConfig() - c.TimeFactorUnderLoad = 2.0 - c.MaxNumSeqs = 11 - - factor := c.calcLoadFactor(11) - Expect(factor).To(BeNumerically("==", c.TimeFactorUnderLoad)) - - }) - - It("when TimeFactorUnderLoad is > 1.0, and sim is partially loaded, calcLoadFactor should give a value between 1 and TimeFactorUnderLoad", func() { - c := newConfig() - c.TimeFactorUnderLoad = 2.0 - c.MaxNumSeqs = 11 - factor := c.calcLoadFactor(6) - Expect(factor).To(BeNumerically(">", 1.0)) - Expect(factor).To(BeNumerically("<", c.TimeFactorUnderLoad)) - }) }) diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index ad653c7c..dcf85873 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -709,16 +709,16 @@ func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptToke } if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 { // is 
aggregated PD and ttft is calculated using number of prompt tokens that are not in kv cache - prefillTime := s.config.GetPrefillOverhead(s.nRunningReqs) + (nPromptTokens-nCachedPromptTokens)*s.config.GetPrefillTimePerToken(s.nRunningReqs) + prefillTime := s.GetPrefillOverhead() + (nPromptTokens-nCachedPromptTokens)*s.GetPrefillTimePerToken() return common.RandomNorm(prefillTime, s.config.PrefillTimeStdDev) } // is aggregated PD and *not* using number of prompt tokens - return common.RandomNorm(s.config.GetTimeToFirstToken(s.nRunningReqs), s.config.TimeToFirstTokenStdDev) + return common.RandomNorm(s.GetTimeToFirstToken(), s.config.TimeToFirstTokenStdDev) } // returns inter token latency func (s *VllmSimulator) getInterTokenLatency() int { - return common.RandomNorm(s.config.GetInterTokenLatency(s.nRunningReqs), s.config.InterTokenLatencyStdDev) + return common.RandomNorm(s.GetInterTokenLatency(), s.config.InterTokenLatencyStdDev) } // createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist @@ -812,3 +812,26 @@ func (s *VllmSimulator) showConfig(dp bool) error { s.logger.Info("Configuration:", "", string(cfgJSON)) return nil } + +func (s *VllmSimulator) getRealtimeFactor() float64 { + if s.config.MaxNumSeqs <= 1 { + return 1.0 + } + return 1 + (s.config.TimeFactorUnderLoad-1)*float64(s.nRunningReqs-1)/float64(s.config.MaxNumSeqs-1) +} + +func (s *VllmSimulator) GetTimeToFirstToken() int { + return int(float64(s.config.TimeToFirstToken) * s.getRealtimeFactor()) +} + +func (s *VllmSimulator) GetPrefillOverhead() int { + return int(float64(s.config.PrefillOverhead) * s.getRealtimeFactor()) +} + +func (s *VllmSimulator) GetPrefillTimePerToken() int { + return int(float64(s.config.PrefillTimePerToken) * s.getRealtimeFactor()) +} + +func (s *VllmSimulator) GetInterTokenLatency() int { + return int(float64(s.config.InterTokenLatency) * s.getRealtimeFactor()) +} diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index 008b31ad..0b3b2212 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -1025,5 +1025,35 @@ var _ = Describe("Simulator", func() { Entry("factor: 20000.0", 20000.0, 310, 155), ) + It("when TimeFactorUnderLoad is 1.0, calcLoadFactor should give 1", func() { + simulator.config.TimeFactorUnderLoad = 1.0 + simulator.config.MaxNumSeqs = 11 + simulator.nRunningReqs = 3 + + factor := simulator.getRealtimeFactor() + Expect(factor).To(BeNumerically("==", 1.0)) + }) + + It("when TimeFactorUnderLoad is > 1.0, and sim is fully loaded, calcLoadFactor should give TimeFactorUnderLoad", func() { + simulator.config.TimeFactorUnderLoad = 2.0 + simulator.config.MaxNumSeqs = 11 + simulator.nRunningReqs = 11 + + factor := simulator.getRealtimeFactor() + Expect(factor).To(BeNumerically("==", simulator.config.TimeFactorUnderLoad)) + + }) + + It("when TimeFactorUnderLoad is > 1.0, and sim is partially loaded, calcLoadFactor should give a value between 1 and TimeFactorUnderLoad", func() { + simulator.config.TimeFactorUnderLoad = 2.0 + simulator.config.MaxNumSeqs = 11 + simulator.nRunningReqs = 6 + + factor := simulator.getRealtimeFactor() + Expect(factor).To(BeNumerically(">", 1.0)) + Expect(factor).To(BeNumerically("<", simulator.config.TimeFactorUnderLoad)) + }) + }) + }) From 00c172455c05eaecc46a8563cd22d2fceabf9152 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Tue, 9 Sep 2025 16:26:47 +1000 Subject: [PATCH 
15/17] Rename helper func Signed-off-by: Qifan Deng --- pkg/llm-d-inference-sim/simulator.go | 10 +++++----- pkg/llm-d-inference-sim/simulator_test.go | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index dcf85873..f042a6fb 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -813,7 +813,7 @@ func (s *VllmSimulator) showConfig(dp bool) error { return nil } -func (s *VllmSimulator) getRealtimeFactor() float64 { +func (s *VllmSimulator) getCurrTimeFactorUnderLoad() float64 { if s.config.MaxNumSeqs <= 1 { return 1.0 } @@ -821,17 +821,17 @@ func (s *VllmSimulator) getRealtimeFactor() float64 { } func (s *VllmSimulator) GetTimeToFirstToken() int { - return int(float64(s.config.TimeToFirstToken) * s.getRealtimeFactor()) + return int(float64(s.config.TimeToFirstToken) * s.getCurrTimeFactorUnderLoad()) } func (s *VllmSimulator) GetPrefillOverhead() int { - return int(float64(s.config.PrefillOverhead) * s.getRealtimeFactor()) + return int(float64(s.config.PrefillOverhead) * s.getCurrTimeFactorUnderLoad()) } func (s *VllmSimulator) GetPrefillTimePerToken() int { - return int(float64(s.config.PrefillTimePerToken) * s.getRealtimeFactor()) + return int(float64(s.config.PrefillTimePerToken) * s.getCurrTimeFactorUnderLoad()) } func (s *VllmSimulator) GetInterTokenLatency() int { - return int(float64(s.config.InterTokenLatency) * s.getRealtimeFactor()) + return int(float64(s.config.InterTokenLatency) * s.getCurrTimeFactorUnderLoad()) } diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index 0b3b2212..9fa75c4e 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -1030,7 +1030,7 @@ var _ = Describe("Simulator", func() { simulator.config.MaxNumSeqs = 11 simulator.nRunningReqs = 3 - factor := simulator.getRealtimeFactor() + factor := simulator.getCurrTimeFactorUnderLoad() Expect(factor).To(BeNumerically("==", 1.0)) }) @@ -1039,7 +1039,7 @@ var _ = Describe("Simulator", func() { simulator.config.MaxNumSeqs = 11 simulator.nRunningReqs = 11 - factor := simulator.getRealtimeFactor() + factor := simulator.getCurrTimeFactorUnderLoad() Expect(factor).To(BeNumerically("==", simulator.config.TimeFactorUnderLoad)) }) @@ -1049,7 +1049,7 @@ var _ = Describe("Simulator", func() { simulator.config.MaxNumSeqs = 11 simulator.nRunningReqs = 6 - factor := simulator.getRealtimeFactor() + factor := simulator.getCurrTimeFactorUnderLoad() Expect(factor).To(BeNumerically(">", 1.0)) Expect(factor).To(BeNumerically("<", simulator.config.TimeFactorUnderLoad)) }) From f22745bb119e21e246baf5b51236160af2ec8a43 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Tue, 9 Sep 2025 16:29:44 +1000 Subject: [PATCH 16/17] Rename helper func Signed-off-by: Qifan Deng --- pkg/llm-d-inference-sim/simulator.go | 10 +++++----- pkg/llm-d-inference-sim/simulator_test.go | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index f042a6fb..5ba1b7e7 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -813,7 +813,7 @@ func (s *VllmSimulator) showConfig(dp bool) error { return nil } -func (s *VllmSimulator) getCurrTimeFactorUnderLoad() float64 { +func (s *VllmSimulator) getCurrFactor() float64 { if s.config.MaxNumSeqs <= 1 { return 1.0 } @@ -821,17 +821,17 @@ func (s *VllmSimulator) 
getCurrTimeFactorUnderLoad() float64 { } func (s *VllmSimulator) GetTimeToFirstToken() int { - return int(float64(s.config.TimeToFirstToken) * s.getCurrTimeFactorUnderLoad()) + return int(float64(s.config.TimeToFirstToken) * s.getCurrFactor()) } func (s *VllmSimulator) GetPrefillOverhead() int { - return int(float64(s.config.PrefillOverhead) * s.getCurrTimeFactorUnderLoad()) + return int(float64(s.config.PrefillOverhead) * s.getCurrFactor()) } func (s *VllmSimulator) GetPrefillTimePerToken() int { - return int(float64(s.config.PrefillTimePerToken) * s.getCurrTimeFactorUnderLoad()) + return int(float64(s.config.PrefillTimePerToken) * s.getCurrFactor()) } func (s *VllmSimulator) GetInterTokenLatency() int { - return int(float64(s.config.InterTokenLatency) * s.getCurrTimeFactorUnderLoad()) + return int(float64(s.config.InterTokenLatency) * s.getCurrFactor()) } diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index 9fa75c4e..fd0c3ed9 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -1030,7 +1030,7 @@ var _ = Describe("Simulator", func() { simulator.config.MaxNumSeqs = 11 simulator.nRunningReqs = 3 - factor := simulator.getCurrTimeFactorUnderLoad() + factor := simulator.getCurrFactor() Expect(factor).To(BeNumerically("==", 1.0)) }) @@ -1039,7 +1039,7 @@ var _ = Describe("Simulator", func() { simulator.config.MaxNumSeqs = 11 simulator.nRunningReqs = 11 - factor := simulator.getCurrTimeFactorUnderLoad() + factor := simulator.getCurrFactor() Expect(factor).To(BeNumerically("==", simulator.config.TimeFactorUnderLoad)) }) @@ -1049,7 +1049,7 @@ var _ = Describe("Simulator", func() { simulator.config.MaxNumSeqs = 11 simulator.nRunningReqs = 6 - factor := simulator.getCurrTimeFactorUnderLoad() + factor := simulator.getCurrFactor() Expect(factor).To(BeNumerically(">", 1.0)) Expect(factor).To(BeNumerically("<", simulator.config.TimeFactorUnderLoad)) }) From a624b0b120c0b4ff6e0c788de8b0211435834f3d Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Tue, 9 Sep 2025 17:14:31 +1000 Subject: [PATCH 17/17] Fix inter token latency test Signed-off-by: Qifan Deng --- pkg/llm-d-inference-sim/simulator_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index fd0c3ed9..17485001 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -777,6 +777,8 @@ var _ = Describe("Simulator", func() { func(interTokenLatency int, stddev int, numberOfTokens int) { simulator.config.InterTokenLatency = interTokenLatency simulator.config.InterTokenLatencyStdDev = stddev + simulator.config.MaxNumSeqs = 1 + simulator.config.TimeFactorUnderLoad = 1.0 latency := 0 for range numberOfTokens - 1 {
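
The final helper can be exercised in isolation. Below is a minimal standalone sketch of the load scaling the series converges on; currFactor mirrors the final getCurrFactor, while the package wrapper and the sample values (factor 2.0, MaxNumSeqs 11, a 42ms base TTFT) are illustrative only:

package main

import "fmt"

// currFactor mirrors VllmSimulator.getCurrFactor: linear interpolation from
// 1.0 (a single running request) up to timeFactorUnderLoad when nRunningReqs
// reaches maxNumSeqs.
func currFactor(timeFactorUnderLoad float64, maxNumSeqs int, nRunningReqs int64) float64 {
	if maxNumSeqs <= 1 {
		return 1.0
	}
	return 1 + (timeFactorUnderLoad-1)*float64(nRunningReqs-1)/float64(maxNumSeqs-1)
}

func main() {
	for _, n := range []int64{1, 6, 11} {
		f := currFactor(2.0, 11, n)
		// Scale a 42ms base time-to-first-token by the current factor.
		fmt.Printf("running=%2d factor=%.2f ttft=%dms\n", n, f, int(42*f))
	}
}

With these inputs it prints factors 1.00, 1.50, and 2.00 (TTFT 42ms, 63ms, 84ms), matching the expectations exercised in simulator_test.go.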