From 0b7c39e2cb6bc5ac78c0f7631c065d5bd5b1d3de Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Sun, 24 Aug 2025 22:15:00 +1000 Subject: [PATCH 01/19] Fix comments on prefill arg in completion request interface Signed-off-by: Qifan Deng --- pkg/openai-server-api/request.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/openai-server-api/request.go b/pkg/openai-server-api/request.go index d368a211..afab801d 100644 --- a/pkg/openai-server-api/request.go +++ b/pkg/openai-server-api/request.go @@ -53,9 +53,9 @@ type CompletionRequest interface { GetToolChoice() string // GetMaxCompletionTokens returns the maximum completion tokens requested GetMaxCompletionTokens() *int64 - // IsDoRemoteDecode() returns true if do_remote_decode field is true in the request, this means that this is prefill request + // IsDoRemoteDecode() returns true if do_remote_decode field is true in the request, this means that this is decode request IsDoRemoteDecode() bool - // IsDoRemotePrefill() returns true if do_remote_prefill field is true in the request, this means that this is decode request + // IsDoRemotePrefill() returns true if do_remote_prefill field is true in the request, this means that this is prefill request IsDoRemotePrefill() bool } From e0d61de4392aed8924ba2929799c7857ccee2784 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Mon, 25 Aug 2025 00:45:07 +1000 Subject: [PATCH 02/19] Add feature of calc ttft by prefill overhead. TODO: kvcache transfer overhead Signed-off-by: Qifan Deng --- pkg/common/config.go | 17 +++++++ pkg/common/config_test.go | 4 ++ pkg/llm-d-inference-sim/simulator.go | 33 +++++++++++-- pkg/llm-d-inference-sim/simulator_test.go | 60 ++++++++++++++++++++++- pkg/llm-d-inference-sim/streaming.go | 14 +++--- 5 files changed, 116 insertions(+), 12 deletions(-) diff --git a/pkg/common/config.go b/pkg/common/config.go index 3d5f6ac1..439fc038 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ -65,6 +65,11 @@ type Configuration struct { // in milliseconds, optional, default is 0, can't be more than 30% of TimeToFirstToken, will not // cause the actual time to first token to differ by more than 70% from TimeToFirstToken TimeToFirstTokenStdDev int `yaml:"time-to-first-token-std-dev" json:"time-to-first-token-std-dev"` + + // PrefillOverhead time taken to prefill the context, in milliseconds + PrefillOverhead int `yaml:"prefill-overhead" json:"prefill-overhead"` + PrefillOverheadComplexity string `yaml:"prefill-overhead-complexity" json:"prefill-overhead-complexity"` + // InterTokenLatency time between generated tokens, in milliseconds InterTokenLatency int `yaml:"inter-token-latency" json:"inter-token-latency"` // InterTokenLatencyStdDev standard deviation for time between generated tokens, in milliseconds, @@ -295,6 +300,16 @@ func (c *Configuration) validate() error { if float32(c.TimeToFirstTokenStdDev) > 0.3*float32(c.TimeToFirstToken) { return errors.New("time to first token standard deviation cannot be more than 30% of time to first token") } + if c.PrefillOverhead < 0 { + return errors.New("prefill overhead cannot be negative") + } else if c.PrefillOverhead == 0 { + if c.PrefillOverheadComplexity != "" { + return errors.New("prefill overhead complexity is set, but prefill overhead is 0") + } + } + if c.PrefillOverheadComplexity != "" && c.PrefillOverheadComplexity != "n^2" && c.PrefillOverheadComplexity != "nlog(n)" { + return errors.New("prefill overhead complexity should be either \"n^2\" or \"nlog(n)\"") + } if c.KVCacheTransferLatency < 0 { return errors.New("kv-cache tranfer time cannot be negative") } @@ -400,6 +415,8 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) { f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode, echo - returns the same text that was sent in the request, for chat completion returns the last message, random - returns random sentence from a bank of pre-defined sentences") f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)") f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)") + f.IntVar(&config.PrefillOverhead, "prefill-overhead", config.PrefillOverhead, "Time to prefill in milliseconds. This argument is ignored if is not 0.") + f.StringVar(&config.PrefillOverheadComplexity, "prefill-overhead-complexity", config.PrefillOverheadComplexity, "Complexity of prefill based on token length. Options are \"n^2\" and \"nlog(n)\". Default is \"n^2\".") f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)") f.IntVar(&config.InterTokenLatencyStdDev, "inter-token-latency-std-dev", config.InterTokenLatencyStdDev, "Standard deviation for time between generated tokens (in milliseconds)") f.IntVar(&config.TimeToFirstTokenStdDev, "time-to-first-token-std-dev", config.TimeToFirstTokenStdDev, "Standard deviation for time before the first token will be returned (in milliseconds)") diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go index f50c40a9..f7cf2e16 100644 --- a/pkg/common/config_test.go +++ b/pkg/common/config_test.go @@ -388,6 +388,10 @@ var _ = Describe("Simulator configuration", func() { name: "invalid (negative) zmq-max-connect-attempts for config file", args: []string{"cmd", "--config", "../../manifests/invalid-config.yaml"}, }, + { + name: " must be set when is set", + args: []string{"cmd", "--prefill-overhead-complexity", "n^2", "--config", "../../manifests/config.yaml"}, + }, } for _, test := range invalidTests { diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index d9813996..93154712 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -22,6 +22,7 @@ import ( "encoding/json" "errors" "fmt" + "math" "net" "os" "strings" @@ -465,7 +466,7 @@ func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) { model: displayModel, doRemotePrefill: req.IsDoRemotePrefill(), }, - responseTokens, toolCalls, finishReason, usageDataToSend, + usageDataToSend.PromptTokens, responseTokens, toolCalls, finishReason, usageDataToSend, ) } else { if req.IsDoRemoteDecode() { @@ -633,8 +634,9 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques } // calculate how long to wait before returning the response, time is based on number of tokens - numOfTokens := usageData.CompletionTokens - totalMillisToWait := s.getTimeToFirstToken(doRemotePrefill) + s.getTotalInterTokenLatency(numOfTokens) + nPromptTokens := usageData.PromptTokens + nGenTokens := usageData.CompletionTokens + totalMillisToWait := s.getTimeToFirstToken(doRemotePrefill, nPromptTokens) + s.getTotalInterTokenLatency(nGenTokens) time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond) ctx.Response.Header.SetContentType("application/json") @@ -652,7 +654,14 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques } // returns time to first token based on the current request's doRemotePrefill -func (s *VllmSimulator) getTimeToFirstToken(doRemotePrefill bool) int { +func (s *VllmSimulator) getTimeToFirstToken(doRemotePrefill bool, nPromptTokens int) int { + if s.config.TimeToFirstToken == 0 && s.config.PrefillOverhead != 0 { + if nPromptTokens <= 1 { + return s.config.PrefillOverhead + } + return s.calcPrefillOverhead(nPromptTokens) + } + mean := float64(s.config.TimeToFirstToken) stddev := float64(s.config.TimeToFirstTokenStdDev) if doRemotePrefill { @@ -678,6 +687,22 @@ func (s *VllmSimulator) getTotalInterTokenLatency(numOfTokens int) int { return total } +// calc the prefill overhead against number of tokens +func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int) int { + pfOverhead := s.config.PrefillOverhead + complexity := s.config.PrefillOverheadComplexity + // policies of different complexities of prefill implementation + switch complexity { + case "n^2", "": + // this is simple implementation of n^2 + return pfOverhead * nPromptTokens * nPromptTokens + case "nlog(n)": + return int(float64(pfOverhead) * (float64(nPromptTokens) * math.Log2(float64(nPromptTokens)))) + } + + return 0 +} + // createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist func (s *VllmSimulator) createModelsResponse() *vllmapi.ModelsResponse { modelsResp := vllmapi.ModelsResponse{Object: "list", Data: []vllmapi.ModelsResponseModelInfo{}} diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index 88d87759..5fc462a0 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -21,6 +21,7 @@ import ( "errors" "fmt" "io" + "math" "net" "net/http" "os" @@ -801,7 +802,7 @@ var _ = Describe("Simulator", func() { simulator.config.TimeToFirstTokenStdDev = timeToFirstTokenStdDev simulator.config.KVCacheTransferLatency = kvCacheLatency simulator.config.KVCacheTransferLatencyStdDev = kvCacheLatencyStdDev - timeToFirst := simulator.getTimeToFirstToken(doREmotePrefill) + timeToFirst := simulator.getTimeToFirstToken(doREmotePrefill, 1) if doREmotePrefill { Expect(timeToFirst).To(BeNumerically(">=", int(float32(kvCacheLatency)*0.3))) Expect(timeToFirst).To(BeNumerically("<=", int(float32(kvCacheLatency)*1.7))) @@ -822,6 +823,63 @@ var _ = Describe("Simulator", func() { Entry(nil, 10000, 0, 1000, 0, true), Entry(nil, 10000, 0, 1000, 0, false), ) + + It("when is not 0, ignore ", func() { + timeToFirstToken := 10000 + prefillOverhead := 100 + simulator.config.TimeToFirstToken = timeToFirstToken + simulator.config.PrefillOverhead = prefillOverhead + timeToFirst := simulator.getTimeToFirstToken(false, 1) + Expect(timeToFirst).To(BeNumerically(">=", int(float32(timeToFirstToken)*0.3))) + Expect(timeToFirst).To(BeNumerically("<=", int(float32(timeToFirstToken)*1.7))) + }) + + It("when is 0, use ", func() { + simulator.config.TimeToFirstToken = 0 + simulator.config.PrefillOverhead = 100 + timeToFirst := simulator.getTimeToFirstToken(false, 1) + Expect(timeToFirst).To(BeNumerically(">=", 100)) + }) + + DescribeTable("time to first token is super linear of prefill against number of prompt tokens", + func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) { + for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ { + square := prefillOverhead * nTokens * nTokens + simulator.config.PrefillOverhead = prefillOverhead + timeToFirst := simulator.getTimeToFirstToken(false, nTokens) + diffRatio := math.Abs(float64(timeToFirst-square)) / float64(square) + Expect(diffRatio).To(BeNumerically("<", tolerance)) + } + }, + func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) string { + return fmt.Sprintf("prefillOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d", + prefillOverhead, tolerance, minNTokens, maxNTokens) + }, + Entry("small numbers", 100, 0.1, 1, 10), + Entry("medium numbers, larger range", 200, 0.1, 50, 100), + Entry("large numbers", 150, 0.05, 20000, 20010), + ) + + DescribeTable("time to first token is log-linear of prefill against number of prompt tokens", + func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) { + simulator.config.PrefillOverheadComplexity = "nlog(n)" + + for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ { + nlogn := int(float64(prefillOverhead) * float64(nTokens) * math.Log2(float64(nTokens))) + simulator.config.PrefillOverhead = prefillOverhead + timeToFirst := simulator.getTimeToFirstToken(false, nTokens) + diffRatio := math.Abs(float64(timeToFirst-nlogn)) / float64(nlogn) + Expect(diffRatio).To(BeNumerically("<", tolerance)) + } + }, + func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) string { + return fmt.Sprintf("prefillOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d", + prefillOverhead, tolerance, minNTokens, maxNTokens) + }, + Entry("small numbers", 100, 0.1, 2, 10), + Entry("medium numbers, larger range", 200, 0.1, 50, 100), + Entry("large numbers", 150, 0.05, 20000, 20010), + ) }) Context("fake metrics", func() { diff --git a/pkg/llm-d-inference-sim/streaming.go b/pkg/llm-d-inference-sim/streaming.go index 969f29af..e2295244 100644 --- a/pkg/llm-d-inference-sim/streaming.go +++ b/pkg/llm-d-inference-sim/streaming.go @@ -39,7 +39,7 @@ type streamingContext struct { // as defined by isChatCompletion // response content is wrapped according SSE format // First token is send after timeToFirstToken milliseconds, every other token is sent after interTokenLatency milliseconds -func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, responseTokens []string, toolCalls []openaiserverapi.ToolCall, +func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, nPromptTokens int, responseTokens []string, toolCalls []openaiserverapi.ToolCall, finishReason string, usageData *openaiserverapi.Usage) { context.ctx.SetContentType("text/event-stream") context.ctx.SetStatusCode(fasthttp.StatusOK) @@ -67,11 +67,11 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons if len(toolCalls) > 0 { s.logger.Info("Going to send tools calls") for _, tc := range toolCalls { - s.sendTokenChunks(context, w, tc.Function.TokenizedArguments, &tc, finishReason) + s.sendTokenChunks(context, w, nPromptTokens, tc.Function.TokenizedArguments, &tc, finishReason) } } else { s.logger.Info("Going to send text", "number of tokens", len(responseTokens)) - s.sendTokenChunks(context, w, responseTokens, nil, finishReason) + s.sendTokenChunks(context, w, nPromptTokens, responseTokens, nil, finishReason) } } @@ -94,11 +94,11 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons } // sendTokenChunks creates and sends response chunks -func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, tokens []string, tc *openaiserverapi.ToolCall, finishReason string) { +func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, nPromptTokens int, genTokens []string, tc *openaiserverapi.ToolCall, finishReason string) { // time to first token delay - time.Sleep(time.Duration(s.getTimeToFirstToken(context.doRemotePrefill)) * time.Millisecond) + time.Sleep(time.Duration(s.getTimeToFirstToken(context.doRemotePrefill, nPromptTokens)) * time.Millisecond) - for i, token := range tokens { + for i, token := range genTokens { if i != 0 { time.Sleep(time.Duration(s.getInterTokenLatency()) * time.Millisecond) } @@ -119,7 +119,7 @@ func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writ var chunk openaiserverapi.CompletionRespChunk var finishReasonToSend *string - if i == len(tokens)-1 && (finishReason == common.LengthFinishReason || finishReason == common.ToolsFinishReason) { + if i == len(genTokens)-1 && (finishReason == common.LengthFinishReason || finishReason == common.ToolsFinishReason) { finishReasonToSend = &finishReason } if context.isChatCompletion { From a199aeaa214c72b369bd5647a1d2c577a03a0973 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Mon, 25 Aug 2025 12:02:04 +1000 Subject: [PATCH 03/19] Rename prefill-overhead-complexity to prefill-complexity Signed-off-by: Qifan Deng --- pkg/common/config.go | 10 +++++----- pkg/common/config_test.go | 4 ++-- pkg/llm-d-inference-sim/simulator.go | 2 +- pkg/llm-d-inference-sim/simulator_test.go | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pkg/common/config.go b/pkg/common/config.go index 439fc038..c8e956f9 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ -67,8 +67,8 @@ type Configuration struct { TimeToFirstTokenStdDev int `yaml:"time-to-first-token-std-dev" json:"time-to-first-token-std-dev"` // PrefillOverhead time taken to prefill the context, in milliseconds - PrefillOverhead int `yaml:"prefill-overhead" json:"prefill-overhead"` - PrefillOverheadComplexity string `yaml:"prefill-overhead-complexity" json:"prefill-overhead-complexity"` + PrefillOverhead int `yaml:"prefill-overhead" json:"prefill-overhead"` + PrefillComplexity string `yaml:"prefill-complexity" json:"prefill-complexity"` // InterTokenLatency time between generated tokens, in milliseconds InterTokenLatency int `yaml:"inter-token-latency" json:"inter-token-latency"` @@ -303,11 +303,11 @@ func (c *Configuration) validate() error { if c.PrefillOverhead < 0 { return errors.New("prefill overhead cannot be negative") } else if c.PrefillOverhead == 0 { - if c.PrefillOverheadComplexity != "" { + if c.PrefillComplexity != "" { return errors.New("prefill overhead complexity is set, but prefill overhead is 0") } } - if c.PrefillOverheadComplexity != "" && c.PrefillOverheadComplexity != "n^2" && c.PrefillOverheadComplexity != "nlog(n)" { + if c.PrefillComplexity != "" && c.PrefillComplexity != "n^2" && c.PrefillComplexity != "nlog(n)" { return errors.New("prefill overhead complexity should be either \"n^2\" or \"nlog(n)\"") } if c.KVCacheTransferLatency < 0 { @@ -416,7 +416,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) { f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)") f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)") f.IntVar(&config.PrefillOverhead, "prefill-overhead", config.PrefillOverhead, "Time to prefill in milliseconds. This argument is ignored if is not 0.") - f.StringVar(&config.PrefillOverheadComplexity, "prefill-overhead-complexity", config.PrefillOverheadComplexity, "Complexity of prefill based on token length. Options are \"n^2\" and \"nlog(n)\". Default is \"n^2\".") + f.StringVar(&config.PrefillComplexity, "prefill-complexity", config.PrefillComplexity, "Complexity of prefill based on token length. Options are \"n^2\" and \"nlog(n)\". Default is \"n^2\".") f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)") f.IntVar(&config.InterTokenLatencyStdDev, "inter-token-latency-std-dev", config.InterTokenLatencyStdDev, "Standard deviation for time between generated tokens (in milliseconds)") f.IntVar(&config.TimeToFirstTokenStdDev, "time-to-first-token-std-dev", config.TimeToFirstTokenStdDev, "Standard deviation for time before the first token will be returned (in milliseconds)") diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go index f7cf2e16..830e55a0 100644 --- a/pkg/common/config_test.go +++ b/pkg/common/config_test.go @@ -389,8 +389,8 @@ var _ = Describe("Simulator configuration", func() { args: []string{"cmd", "--config", "../../manifests/invalid-config.yaml"}, }, { - name: " must be set when is set", - args: []string{"cmd", "--prefill-overhead-complexity", "n^2", "--config", "../../manifests/config.yaml"}, + name: " must be set when is set", + args: []string{"cmd", "--prefill-complexity", "n^2", "--config", "../../manifests/config.yaml"}, }, } diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index 93154712..7603ada2 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -690,7 +690,7 @@ func (s *VllmSimulator) getTotalInterTokenLatency(numOfTokens int) int { // calc the prefill overhead against number of tokens func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int) int { pfOverhead := s.config.PrefillOverhead - complexity := s.config.PrefillOverheadComplexity + complexity := s.config.PrefillComplexity // policies of different complexities of prefill implementation switch complexity { case "n^2", "": diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index 5fc462a0..3584165a 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -862,7 +862,7 @@ var _ = Describe("Simulator", func() { DescribeTable("time to first token is log-linear of prefill against number of prompt tokens", func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) { - simulator.config.PrefillOverheadComplexity = "nlog(n)" + simulator.config.PrefillComplexity = "nlog(n)" for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ { nlogn := int(float64(prefillOverhead) * float64(nTokens) * math.Log2(float64(nTokens))) From cecb32c8fb1215c3813b564a343b4db8f0d6462e Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Mon, 25 Aug 2025 15:18:04 +1000 Subject: [PATCH 04/19] Calc kv cache transfer overhead based on prompt length Signed-off-by: Qifan Deng --- pkg/common/config.go | 24 +++++- pkg/common/config_test.go | 12 +++ pkg/llm-d-inference-sim/simulator.go | 34 +++++++-- pkg/llm-d-inference-sim/simulator_test.go | 89 ++++++++++++++++++++--- pkg/llm-d-inference-sim/streaming.go | 2 +- 5 files changed, 142 insertions(+), 19 deletions(-) diff --git a/pkg/common/config.go b/pkg/common/config.go index c8e956f9..f4ea6198 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ -67,7 +67,9 @@ type Configuration struct { TimeToFirstTokenStdDev int `yaml:"time-to-first-token-std-dev" json:"time-to-first-token-std-dev"` // PrefillOverhead time taken to prefill the context, in milliseconds - PrefillOverhead int `yaml:"prefill-overhead" json:"prefill-overhead"` + // PrefillOverhead along with PrefillComplexity defines the time taken to prefill the context + PrefillOverhead int `yaml:"prefill-overhead" json:"prefill-overhead"` + // options are "n^2" and "nlog(n)" PrefillComplexity string `yaml:"prefill-complexity" json:"prefill-complexity"` // InterTokenLatency time between generated tokens, in milliseconds @@ -85,6 +87,13 @@ type Configuration struct { // KVCacheTransferLatency KVCacheTransferLatencyStdDev int `yaml:"kv-cache-transfer-latency-std-dev" json:"kv-cache-transfer-latency-std-dev"` + // KVCacheTransfer overhead time taken to transfer kv-cache from another vLLM instance in case P/D is activated, + // in milliseconds. + // KVCacheTransferOverhead along with KVCacheTransferComplexity defines the time taken to transfer kv-cache. + KVCacheTransferOverhead int `yaml:"kv-cache-transfer-overhead" json:"kv-cache-transfer-overhead"` + // options are "linear" and "in-place", default is "linear" + KVCacheTransferComplexity string `yaml:"kv-cache-transfer-complexity" json:"kv-cache-transfer-complexity"` + // Mode defines the simulator response generation mode, valid values: echo, random Mode string `yaml:"mode" json:"mode"` // Seed defines random seed for operations @@ -319,6 +328,17 @@ func (c *Configuration) validate() error { if float32(c.KVCacheTransferLatencyStdDev) > 0.3*float32(c.KVCacheTransferLatency) { return errors.New("kv-cache tranfer standard deviation cannot be more than 30% of kv-cache tranfer") } + if c.KVCacheTransferOverhead < 0 { + return errors.New("kv-cache transfer overhead cannot be negative") + } else if c.KVCacheTransferOverhead == 0 { + if c.KVCacheTransferComplexity != "" { + return errors.New("kv-cache transfer complexity is set, but kv-cache transfer overhead is 0") + } + } + if c.KVCacheTransferComplexity != "" && c.KVCacheTransferComplexity != "linear" && c.KVCacheTransferComplexity != "in-place" { + return errors.New("kv-cache transfer complexity should be either \"linear\" or \"in-place\"") + } + if c.MaxLoras < 1 { return errors.New("max LoRAs cannot be less than 1") } @@ -422,6 +442,8 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) { f.IntVar(&config.TimeToFirstTokenStdDev, "time-to-first-token-std-dev", config.TimeToFirstTokenStdDev, "Standard deviation for time before the first token will be returned (in milliseconds)") f.IntVar(&config.KVCacheTransferLatencyStdDev, "kv-cache-transfer-latency-std-dev", config.KVCacheTransferLatencyStdDev, "Standard deviation for time for KV-cache transfer from a remote vLLM (in milliseconds)") f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)") + f.IntVar(&config.KVCacheTransferOverhead, "kv-cache-transfer-overhead", config.KVCacheTransferOverhead, "Time to transfer kv-cache in milliseconds. This argument is ignored if is not set.") + f.StringVar(&config.KVCacheTransferComplexity, "kv-cache-transfer-complexity", config.KVCacheTransferComplexity, "Complexity of kv-cache transfer based on token length. Options are \"linear\" and \"in-place\". Default is \"linear\".") f.IntVar(&config.MaxToolCallIntegerParam, "max-tool-call-integer-param", config.MaxToolCallIntegerParam, "Maximum possible value of integer parameters in a tool call") f.IntVar(&config.MinToolCallIntegerParam, "min-tool-call-integer-param", config.MinToolCallIntegerParam, "Minimum possible value of integer parameters in a tool call") diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go index 830e55a0..373b8b80 100644 --- a/pkg/common/config_test.go +++ b/pkg/common/config_test.go @@ -392,6 +392,18 @@ var _ = Describe("Simulator configuration", func() { name: " must be set when is set", args: []string{"cmd", "--prefill-complexity", "n^2", "--config", "../../manifests/config.yaml"}, }, + { + name: " should not be 'xxx'", + args: []string{"cmd", "--prefill-complexity", "xxx", "--config", "../../manifests/config.yaml"}, + }, + { + name: " must be set when is set", + args: []string{"cmd", "--kv-cache-transfer-complexity", "linear", "--config", "../../manifests/config.yaml"}, + }, + { + name: " should not be 'xxx'", + args: []string{"cmd", "--kv-cache-transfer-complexity", "xxx", "--config", "../../manifests/config.yaml"}, + }, } for _, test := range invalidTests { diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index 7603ada2..93797291 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -636,7 +636,7 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques // calculate how long to wait before returning the response, time is based on number of tokens nPromptTokens := usageData.PromptTokens nGenTokens := usageData.CompletionTokens - totalMillisToWait := s.getTimeToFirstToken(doRemotePrefill, nPromptTokens) + s.getTotalInterTokenLatency(nGenTokens) + totalMillisToWait := s.getTimeToFirstToken(nPromptTokens, doRemotePrefill) + s.getTotalInterTokenLatency(nGenTokens) time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond) ctx.Response.Header.SetContentType("application/json") @@ -654,13 +654,17 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques } // returns time to first token based on the current request's doRemotePrefill -func (s *VllmSimulator) getTimeToFirstToken(doRemotePrefill bool, nPromptTokens int) int { +func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, doRemotePrefill bool) int { if s.config.TimeToFirstToken == 0 && s.config.PrefillOverhead != 0 { if nPromptTokens <= 1 { - return s.config.PrefillOverhead + if !doRemotePrefill { + return s.config.PrefillOverhead + } + return s.config.KVCacheTransferOverhead } - return s.calcPrefillOverhead(nPromptTokens) + return s.calcPrefillOverhead(nPromptTokens, doRemotePrefill) } + fmt.Printf("get time to first token %d, nPromptTokens %d, doRemotePrefill %v\n", s.config.TimeToFirstToken, nPromptTokens, doRemotePrefill) mean := float64(s.config.TimeToFirstToken) stddev := float64(s.config.TimeToFirstTokenStdDev) @@ -688,7 +692,10 @@ func (s *VllmSimulator) getTotalInterTokenLatency(numOfTokens int) int { } // calc the prefill overhead against number of tokens -func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int) int { +func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int, doRemotePrefill bool) int { + if doRemotePrefill { + return s.calcRemotePrefillOverhead(nPromptTokens) + } pfOverhead := s.config.PrefillOverhead complexity := s.config.PrefillComplexity // policies of different complexities of prefill implementation @@ -699,7 +706,24 @@ func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int) int { case "nlog(n)": return int(float64(pfOverhead) * (float64(nPromptTokens) * math.Log2(float64(nPromptTokens)))) } + // should never reach here + return 0 +} +// calc the remote prefill overhead against number of tokens +func (s *VllmSimulator) calcRemotePrefillOverhead(nPromptTokens int) int { + overhead := s.config.KVCacheTransferOverhead + complexity := s.config.KVCacheTransferComplexity + switch complexity { + case "linear", "": + fmt.Printf("linear complexity, overhead %d, nPromptTokens %d\n", overhead, nPromptTokens) + return overhead * nPromptTokens + case "in-place": + // when the context is already filled + // this is a simple implementation which return a defined overhead + return overhead + } + // should never reach here return 0 } diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index 3584165a..11853564 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -802,7 +802,7 @@ var _ = Describe("Simulator", func() { simulator.config.TimeToFirstTokenStdDev = timeToFirstTokenStdDev simulator.config.KVCacheTransferLatency = kvCacheLatency simulator.config.KVCacheTransferLatencyStdDev = kvCacheLatencyStdDev - timeToFirst := simulator.getTimeToFirstToken(doREmotePrefill, 1) + timeToFirst := simulator.getTimeToFirstToken(1, doREmotePrefill) if doREmotePrefill { Expect(timeToFirst).To(BeNumerically(">=", int(float32(kvCacheLatency)*0.3))) Expect(timeToFirst).To(BeNumerically("<=", int(float32(kvCacheLatency)*1.7))) @@ -826,29 +826,30 @@ var _ = Describe("Simulator", func() { It("when is not 0, ignore ", func() { timeToFirstToken := 10000 - prefillOverhead := 100 simulator.config.TimeToFirstToken = timeToFirstToken - simulator.config.PrefillOverhead = prefillOverhead - timeToFirst := simulator.getTimeToFirstToken(false, 1) + simulator.config.PrefillOverhead = 100 + timeToFirst := simulator.getTimeToFirstToken(1, false) Expect(timeToFirst).To(BeNumerically(">=", int(float32(timeToFirstToken)*0.3))) Expect(timeToFirst).To(BeNumerically("<=", int(float32(timeToFirstToken)*1.7))) }) - It("when is 0, use ", func() { + It("when is 0, and is not 0, use ", func() { simulator.config.TimeToFirstToken = 0 simulator.config.PrefillOverhead = 100 - timeToFirst := simulator.getTimeToFirstToken(false, 1) + timeToFirst := simulator.getTimeToFirstToken(1, false) Expect(timeToFirst).To(BeNumerically(">=", 100)) }) DescribeTable("time to first token is super linear of prefill against number of prompt tokens", func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) { + simulator.config.PrefillComplexity = "n^2" for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ { - square := prefillOverhead * nTokens * nTokens simulator.config.PrefillOverhead = prefillOverhead - timeToFirst := simulator.getTimeToFirstToken(false, nTokens) + timeToFirst := simulator.getTimeToFirstToken(nTokens, false) + + square := prefillOverhead * nTokens * nTokens diffRatio := math.Abs(float64(timeToFirst-square)) / float64(square) - Expect(diffRatio).To(BeNumerically("<", tolerance)) + Expect(diffRatio).To(BeNumerically("<=", tolerance)) } }, func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) string { @@ -865,11 +866,12 @@ var _ = Describe("Simulator", func() { simulator.config.PrefillComplexity = "nlog(n)" for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ { - nlogn := int(float64(prefillOverhead) * float64(nTokens) * math.Log2(float64(nTokens))) simulator.config.PrefillOverhead = prefillOverhead - timeToFirst := simulator.getTimeToFirstToken(false, nTokens) + timeToFirst := simulator.getTimeToFirstToken(nTokens, false) + + nlogn := int(float64(prefillOverhead) * float64(nTokens) * math.Log2(float64(nTokens))) diffRatio := math.Abs(float64(timeToFirst-nlogn)) / float64(nlogn) - Expect(diffRatio).To(BeNumerically("<", tolerance)) + Expect(diffRatio).To(BeNumerically("<=", tolerance)) } }, func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) string { @@ -880,6 +882,69 @@ var _ = Describe("Simulator", func() { Entry("medium numbers, larger range", 200, 0.1, 50, 100), Entry("large numbers", 150, 0.05, 20000, 20010), ) + + It("when not 0, ignore ", func() { + overhead := 100 + simulator.config.KVCacheTransferLatency = 1000 + simulator.config.KVCacheTransferOverhead = overhead + timeToFirst := simulator.getTimeToFirstToken(1, false) + Expect(timeToFirst).To(BeNumerically(">=", overhead)) + }) + + It("when is 0, and is not 0, use ", func() { + overhead := 100 + simulator.config.KVCacheTransferLatency = 0 + simulator.config.KVCacheTransferOverhead = overhead + timeToFirst := simulator.getTimeToFirstToken(1, false) + Expect(timeToFirst).To(BeNumerically(">", 0)) + }) + + DescribeTable("When remote kv cache transfer is enabled with \"linear\" policy, time to first token is linear of kv cache transfer against number of prompt tokens", + func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) { + simulator.config.TimeToFirstToken = 0 + simulator.config.PrefillOverhead = 1 + simulator.config.KVCacheTransferComplexity = "linear" + + for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ { + simulator.config.KVCacheTransferOverhead = kvCacheOverhead + timeToFirst := simulator.getTimeToFirstToken(nTokens, true) + + linear := kvCacheOverhead * nTokens + diffRatio := math.Abs(float64(timeToFirst-linear)) / float64(linear) + Expect(diffRatio).To(BeNumerically("<=", tolerance)) + } + }, + func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) string { + return fmt.Sprintf("kvCacheOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d", + kvCacheOverhead, tolerance, minNTokens, maxNTokens) + }, + Entry("small numbers", 100, 0.1, 1, 10), + Entry("medium numbers, larger range", 200, 0.1, 50, 100), + Entry("large numbers", 150, 0.05, 20000, 20010), + ) + + DescribeTable("When remote kv cache transfer is enabled with \"in-place\" policy, time to first token should not be impacted by number of prompt tokens", + func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) { + simulator.config.TimeToFirstToken = 0 + simulator.config.PrefillOverhead = 1 + simulator.config.KVCacheTransferComplexity = "in-place" + for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ { + simulator.config.KVCacheTransferOverhead = kvCacheOverhead + timeToFirst := simulator.getTimeToFirstToken(nTokens, true) + + inPlace := kvCacheOverhead + diffRatio := math.Abs(float64(timeToFirst-inPlace)) / float64(inPlace) + Expect(diffRatio).To(BeNumerically("<=", tolerance)) + } + }, + func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) string { + return fmt.Sprintf("kvCacheOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d", + kvCacheOverhead, tolerance, minNTokens, maxNTokens) + }, + Entry("small numbers", 100, 0.1, 1, 10), + Entry("medium numbers, larger range", 200, 0.1, 50, 100), + Entry("large numbers", 150, 0.05, 20000, 20010), + ) }) Context("fake metrics", func() { diff --git a/pkg/llm-d-inference-sim/streaming.go b/pkg/llm-d-inference-sim/streaming.go index e2295244..d234114a 100644 --- a/pkg/llm-d-inference-sim/streaming.go +++ b/pkg/llm-d-inference-sim/streaming.go @@ -96,7 +96,7 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, nPrompt // sendTokenChunks creates and sends response chunks func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, nPromptTokens int, genTokens []string, tc *openaiserverapi.ToolCall, finishReason string) { // time to first token delay - time.Sleep(time.Duration(s.getTimeToFirstToken(context.doRemotePrefill, nPromptTokens)) * time.Millisecond) + time.Sleep(time.Duration(s.getTimeToFirstToken(nPromptTokens, context.doRemotePrefill)) * time.Millisecond) for i, token := range genTokens { if i != 0 { From 0c80d58382cbccdaaca8c559b7f20e7f42aca54c Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Mon, 25 Aug 2025 15:29:08 +1000 Subject: [PATCH 05/19] Add invalid test cases for args prefill-overhead and kv-cache-transfer-overhead Signed-off-by: Qifan Deng --- pkg/common/config_test.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go index 373b8b80..cae17fd5 100644 --- a/pkg/common/config_test.go +++ b/pkg/common/config_test.go @@ -388,6 +388,11 @@ var _ = Describe("Simulator configuration", func() { name: "invalid (negative) zmq-max-connect-attempts for config file", args: []string{"cmd", "--config", "../../manifests/invalid-config.yaml"}, }, + { + name: "invalid (negative) prefill-overhead", + args: []string{"cmd", "--prefill-overhead", "-1", + "--config", "../../manifests/config.yaml"}, + }, { name: " must be set when is set", args: []string{"cmd", "--prefill-complexity", "n^2", "--config", "../../manifests/config.yaml"}, @@ -396,6 +401,11 @@ var _ = Describe("Simulator configuration", func() { name: " should not be 'xxx'", args: []string{"cmd", "--prefill-complexity", "xxx", "--config", "../../manifests/config.yaml"}, }, + { + name: "invalid (negative) kv-cache-transfer-overhead", + args: []string{"cmd", "--kv-cache-transfer-overhead", "-1", + "--config", "../../manifests/config.yaml"}, + }, { name: " must be set when is set", args: []string{"cmd", "--kv-cache-transfer-complexity", "linear", "--config", "../../manifests/config.yaml"}, From 18d30756a7173798e9bf0ed010478829b6b14d9a Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Mon, 25 Aug 2025 17:12:17 +1000 Subject: [PATCH 06/19] Add standard deviation in utils Signed-off-by: Qifan Deng --- pkg/common/utils.go | 19 +++++++++++++++++++ pkg/common/utils_test.go | 12 ++++++++++++ 2 files changed, 31 insertions(+) diff --git a/pkg/common/utils.go b/pkg/common/utils.go index 2cb4ad66..39555c2d 100644 --- a/pkg/common/utils.go +++ b/pkg/common/utils.go @@ -261,3 +261,22 @@ func init() { func Tokenize(text string) []string { return re.FindAllString(text, -1) } + +// Calculate standard deviation of an int array +func StdDevInt(data []int) float64 { + var sum int + for _, value := range data { + sum += value + } + mean := sum / len(data) + + var sumSquares int + for _, value := range data { + diff := value - mean + sumSquares += diff * diff + } + + variance := sumSquares / len(data) + + return math.Sqrt(float64(variance)) +} diff --git a/pkg/common/utils_test.go b/pkg/common/utils_test.go index dd6cadab..81341078 100644 --- a/pkg/common/utils_test.go +++ b/pkg/common/utils_test.go @@ -156,4 +156,16 @@ var _ = Describe("Utils", Ordered, func() { } }) + Context("Standard Deviation", func() { + It("should return 0 for a single element", func() { + data := []int{42} + Expect(StdDevInt(data)).To(Equal(0.0)) + }) + + It("should return the correct standard deviation for multiple elements", func() { + data := []int{1, 2, 3, 4, 5} + Expect(StdDevInt(data)).To(Equal(1.4142135623730951)) + }) + }) + }) From 1fd0a9af12e6be85ba8cde29314be1e8ba39a8e0 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Mon, 25 Aug 2025 20:59:30 +1000 Subject: [PATCH 07/19] Add stddev for prefill overhead and kvcache trans overhead Signed-off-by: Qifan Deng --- pkg/common/config.go | 12 +++ pkg/common/config_test.go | 10 ++ pkg/llm-d-inference-sim/simulator.go | 18 ++-- pkg/llm-d-inference-sim/simulator_test.go | 109 +++++++++++++--------- 4 files changed, 94 insertions(+), 55 deletions(-) diff --git a/pkg/common/config.go b/pkg/common/config.go index f4ea6198..10367a49 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ -69,6 +69,8 @@ type Configuration struct { // PrefillOverhead time taken to prefill the context, in milliseconds // PrefillOverhead along with PrefillComplexity defines the time taken to prefill the context PrefillOverhead int `yaml:"prefill-overhead" json:"prefill-overhead"` + // PrefillOverheadStdDev similar to TimeToFirstTokenStdDev + PrefillOverheadStdDev int `yaml:"prefill-overhead-std-dev" json:"prefill-overhead-std-dev"` // options are "n^2" and "nlog(n)" PrefillComplexity string `yaml:"prefill-complexity" json:"prefill-complexity"` @@ -91,6 +93,8 @@ type Configuration struct { // in milliseconds. // KVCacheTransferOverhead along with KVCacheTransferComplexity defines the time taken to transfer kv-cache. KVCacheTransferOverhead int `yaml:"kv-cache-transfer-overhead" json:"kv-cache-transfer-overhead"` + // KVCacheTransferOverheadStdDev similar to TimeToFirstTokenStdDev + KVCacheTransferOverheadStdDev int `yaml:"kv-cache-transfer-overhead-std-dev" json:"kv-cache-transfer-overhead-std-dev"` // options are "linear" and "in-place", default is "linear" KVCacheTransferComplexity string `yaml:"kv-cache-transfer-complexity" json:"kv-cache-transfer-complexity"` @@ -316,6 +320,9 @@ func (c *Configuration) validate() error { return errors.New("prefill overhead complexity is set, but prefill overhead is 0") } } + if c.PrefillOverheadStdDev < 0 { + return errors.New("prefill overhead standard deviation cannot be negative") + } if c.PrefillComplexity != "" && c.PrefillComplexity != "n^2" && c.PrefillComplexity != "nlog(n)" { return errors.New("prefill overhead complexity should be either \"n^2\" or \"nlog(n)\"") } @@ -335,6 +342,9 @@ func (c *Configuration) validate() error { return errors.New("kv-cache transfer complexity is set, but kv-cache transfer overhead is 0") } } + if c.KVCacheTransferOverheadStdDev < 0 { + return errors.New("kv-cache transfer overhead standard deviation cannot be negative") + } if c.KVCacheTransferComplexity != "" && c.KVCacheTransferComplexity != "linear" && c.KVCacheTransferComplexity != "in-place" { return errors.New("kv-cache transfer complexity should be either \"linear\" or \"in-place\"") } @@ -436,6 +446,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) { f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)") f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)") f.IntVar(&config.PrefillOverhead, "prefill-overhead", config.PrefillOverhead, "Time to prefill in milliseconds. This argument is ignored if is not 0.") + f.IntVar(&config.PrefillOverheadStdDev, "prefill-overhead-std-dev", config.PrefillOverheadStdDev, "Standard deviation for time to prefill (in milliseconds)") f.StringVar(&config.PrefillComplexity, "prefill-complexity", config.PrefillComplexity, "Complexity of prefill based on token length. Options are \"n^2\" and \"nlog(n)\". Default is \"n^2\".") f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)") f.IntVar(&config.InterTokenLatencyStdDev, "inter-token-latency-std-dev", config.InterTokenLatencyStdDev, "Standard deviation for time between generated tokens (in milliseconds)") @@ -443,6 +454,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) { f.IntVar(&config.KVCacheTransferLatencyStdDev, "kv-cache-transfer-latency-std-dev", config.KVCacheTransferLatencyStdDev, "Standard deviation for time for KV-cache transfer from a remote vLLM (in milliseconds)") f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)") f.IntVar(&config.KVCacheTransferOverhead, "kv-cache-transfer-overhead", config.KVCacheTransferOverhead, "Time to transfer kv-cache in milliseconds. This argument is ignored if is not set.") + f.IntVar(&config.KVCacheTransferOverheadStdDev, "kv-cache-transfer-overhead-std-dev", config.KVCacheTransferOverheadStdDev, "Standard deviation for time to transfer kv-cache (in milliseconds)") f.StringVar(&config.KVCacheTransferComplexity, "kv-cache-transfer-complexity", config.KVCacheTransferComplexity, "Complexity of kv-cache transfer based on token length. Options are \"linear\" and \"in-place\". Default is \"linear\".") f.IntVar(&config.MaxToolCallIntegerParam, "max-tool-call-integer-param", config.MaxToolCallIntegerParam, "Maximum possible value of integer parameters in a tool call") diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go index cae17fd5..54651c97 100644 --- a/pkg/common/config_test.go +++ b/pkg/common/config_test.go @@ -393,6 +393,11 @@ var _ = Describe("Simulator configuration", func() { args: []string{"cmd", "--prefill-overhead", "-1", "--config", "../../manifests/config.yaml"}, }, + { + name: "invalid (negative) prefill-overhead-std-dev", + args: []string{"cmd", "--prefill-overhead-std-dev", "-1", + "--config", "../../manifests/config.yaml"}, + }, { name: " must be set when is set", args: []string{"cmd", "--prefill-complexity", "n^2", "--config", "../../manifests/config.yaml"}, @@ -406,6 +411,11 @@ var _ = Describe("Simulator configuration", func() { args: []string{"cmd", "--kv-cache-transfer-overhead", "-1", "--config", "../../manifests/config.yaml"}, }, + { + name: "invalid (negative) kv-cache-transfer-overhead-std-dev", + args: []string{"cmd", "--kv-cache-transfer-overhead-std-dev", "-1", + "--config", "../../manifests/config.yaml"}, + }, { name: " must be set when is set", args: []string{"cmd", "--kv-cache-transfer-complexity", "linear", "--config", "../../manifests/config.yaml"}, diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index 93797291..5f628d33 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -664,7 +664,6 @@ func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, doRemotePrefill b } return s.calcPrefillOverhead(nPromptTokens, doRemotePrefill) } - fmt.Printf("get time to first token %d, nPromptTokens %d, doRemotePrefill %v\n", s.config.TimeToFirstToken, nPromptTokens, doRemotePrefill) mean := float64(s.config.TimeToFirstToken) stddev := float64(s.config.TimeToFirstTokenStdDev) @@ -699,32 +698,31 @@ func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int, doRemotePrefill b pfOverhead := s.config.PrefillOverhead complexity := s.config.PrefillComplexity // policies of different complexities of prefill implementation + overhead := 0 switch complexity { case "n^2", "": // this is simple implementation of n^2 - return pfOverhead * nPromptTokens * nPromptTokens + overhead = pfOverhead * nPromptTokens * nPromptTokens case "nlog(n)": - return int(float64(pfOverhead) * (float64(nPromptTokens) * math.Log2(float64(nPromptTokens)))) + overhead = int(float64(pfOverhead) * (float64(nPromptTokens) * math.Log2(float64(nPromptTokens)))) } - // should never reach here - return 0 + return int(common.RandomNorm(float64(overhead), float64(s.config.PrefillOverheadStdDev))) } // calc the remote prefill overhead against number of tokens func (s *VllmSimulator) calcRemotePrefillOverhead(nPromptTokens int) int { overhead := s.config.KVCacheTransferOverhead complexity := s.config.KVCacheTransferComplexity + total := 0 switch complexity { case "linear", "": - fmt.Printf("linear complexity, overhead %d, nPromptTokens %d\n", overhead, nPromptTokens) - return overhead * nPromptTokens + total = overhead * nPromptTokens case "in-place": // when the context is already filled // this is a simple implementation which return a defined overhead - return overhead + total = overhead } - // should never reach here - return 0 + return int(common.RandomNorm(float64(total), float64(s.config.KVCacheTransferOverheadStdDev))) } // createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index 11853564..8cc21b9e 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -841,46 +841,57 @@ var _ = Describe("Simulator", func() { }) DescribeTable("time to first token is super linear of prefill against number of prompt tokens", - func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) { + func(prefillOverhead int, PrefillOverheadStdDev int, minNTokens int, maxNTokens int) { + simulator.config.TimeToFirstToken = 0 simulator.config.PrefillComplexity = "n^2" + simulator.config.PrefillOverhead = prefillOverhead + simulator.config.PrefillOverheadStdDev = PrefillOverheadStdDev + for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ { - simulator.config.PrefillOverhead = prefillOverhead timeToFirst := simulator.getTimeToFirstToken(nTokens, false) - square := prefillOverhead * nTokens * nTokens - diffRatio := math.Abs(float64(timeToFirst-square)) / float64(square) - Expect(diffRatio).To(BeNumerically("<=", tolerance)) + n2 := prefillOverhead * nTokens * nTokens + n2logn := n2 * int(math.Log2(float64(nTokens))) + nlogn := prefillOverhead * nTokens * int(math.Log2(float64(nTokens))) + + Expect(timeToFirst).To(BeNumerically(">", int(float64(nlogn)*0.3))) + Expect(timeToFirst).To(BeNumerically("<", int(float64(n2logn)*1.7))) } }, - func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) string { - return fmt.Sprintf("prefillOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d", - prefillOverhead, tolerance, minNTokens, maxNTokens) + func(prefillOverhead int, PrefillOverheadStdDev int, minNTokens int, maxNTokens int) string { + return fmt.Sprintf("prefillOverhead: %d stddev: %d minNTokens: %d maxNTokens: %d", + prefillOverhead, PrefillOverheadStdDev, minNTokens, maxNTokens) }, - Entry("small numbers", 100, 0.1, 1, 10), - Entry("medium numbers, larger range", 200, 0.1, 50, 100), - Entry("large numbers", 150, 0.05, 20000, 20010), + Entry("small numbers", 100, 50, 2, 10), + Entry("medium numbers, larger range", 200, 100, 50, 100), + Entry("large numbers", 150, 125, 20000, 20010), + Entry("stddev is 0", 150, 0, 20000, 20010), ) DescribeTable("time to first token is log-linear of prefill against number of prompt tokens", - func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) { + func(prefillOverhead int, prefillOverheadStdDev int, minNTokens int, maxNTokens int) { + simulator.config.TimeToFirstToken = 0 simulator.config.PrefillComplexity = "nlog(n)" + simulator.config.PrefillOverhead = prefillOverhead + simulator.config.PrefillOverheadStdDev = prefillOverheadStdDev for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ { - simulator.config.PrefillOverhead = prefillOverhead timeToFirst := simulator.getTimeToFirstToken(nTokens, false) - nlogn := int(float64(prefillOverhead) * float64(nTokens) * math.Log2(float64(nTokens))) - diffRatio := math.Abs(float64(timeToFirst-nlogn)) / float64(nlogn) - Expect(diffRatio).To(BeNumerically("<=", tolerance)) + logn := prefillOverhead * int(math.Log2(float64(nTokens))) + n2 := prefillOverhead * nTokens * nTokens + Expect(timeToFirst).To(BeNumerically(">", int(float64(logn)*0.3))) + Expect(timeToFirst).To(BeNumerically("<", int(float64(n2)*1.7))) } }, - func(prefillOverhead int, tolerance float64, minNTokens int, maxNTokens int) string { - return fmt.Sprintf("prefillOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d", - prefillOverhead, tolerance, minNTokens, maxNTokens) + func(prefillOverhead int, prefillOverheadStdDev int, minNTokens int, maxNTokens int) string { + return fmt.Sprintf("prefillOverhead: %d stddev: %d minNTokens: %d maxNTokens: %d", + prefillOverhead, prefillOverheadStdDev, minNTokens, maxNTokens) }, - Entry("small numbers", 100, 0.1, 2, 10), - Entry("medium numbers, larger range", 200, 0.1, 50, 100), - Entry("large numbers", 150, 0.05, 20000, 20010), + Entry("small numbers", 100, 50, 2, 10), + Entry("medium numbers, larger range", 200, 100, 50, 100), + Entry("large numbers", 150, 125, 20000, 20010), + Entry("stddev is 0", 150, 0, 20000, 20010), ) It("when not 0, ignore ", func() { @@ -900,50 +911,58 @@ var _ = Describe("Simulator", func() { }) DescribeTable("When remote kv cache transfer is enabled with \"linear\" policy, time to first token is linear of kv cache transfer against number of prompt tokens", - func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) { + func(kvCacheOverhead int, stddev int, minNTokens int, maxNTokens int) { simulator.config.TimeToFirstToken = 0 simulator.config.PrefillOverhead = 1 simulator.config.KVCacheTransferComplexity = "linear" + simulator.config.KVCacheTransferOverheadStdDev = stddev + simulator.config.KVCacheTransferOverhead = kvCacheOverhead for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ { - simulator.config.KVCacheTransferOverhead = kvCacheOverhead timeToFirst := simulator.getTimeToFirstToken(nTokens, true) - linear := kvCacheOverhead * nTokens - diffRatio := math.Abs(float64(timeToFirst-linear)) / float64(linear) - Expect(diffRatio).To(BeNumerically("<=", tolerance)) + n2 := kvCacheOverhead * nTokens * nTokens + logn := kvCacheOverhead * int(math.Log2(float64(nTokens))) + Expect(timeToFirst).To(BeNumerically(">", int(float64(logn)*0.3))) + Expect(timeToFirst).To(BeNumerically("<", int(float64(n2)*1.7))) } }, - func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) string { - return fmt.Sprintf("kvCacheOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d", - kvCacheOverhead, tolerance, minNTokens, maxNTokens) + func(kvCacheOverhead int, stddev int, minNTokens int, maxNTokens int) string { + return fmt.Sprintf("kvCacheOverhead: %d stddev: %d minNTokens: %d maxNTokens: %d", + kvCacheOverhead, stddev, minNTokens, maxNTokens) }, - Entry("small numbers", 100, 0.1, 1, 10), - Entry("medium numbers, larger range", 200, 0.1, 50, 100), - Entry("large numbers", 150, 0.05, 20000, 20010), + Entry("small numbers", 100, 50, 2, 10), + Entry("medium numbers, larger range", 200, 180, 50, 100), + Entry("large numbers", 150, 70, 20000, 20010), + Entry("stddev is 0", 150, 0, 20000, 20010), ) DescribeTable("When remote kv cache transfer is enabled with \"in-place\" policy, time to first token should not be impacted by number of prompt tokens", - func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) { + func(kvCacheTransOverhead int, kvCacheTransOverheadStdDev int, minNTokens int, maxNTokens int) { simulator.config.TimeToFirstToken = 0 simulator.config.PrefillOverhead = 1 simulator.config.KVCacheTransferComplexity = "in-place" + simulator.config.KVCacheTransferOverheadStdDev = kvCacheTransOverheadStdDev + simulator.config.KVCacheTransferOverhead = kvCacheTransOverhead + + var ttfts []int for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ { - simulator.config.KVCacheTransferOverhead = kvCacheOverhead timeToFirst := simulator.getTimeToFirstToken(nTokens, true) - - inPlace := kvCacheOverhead - diffRatio := math.Abs(float64(timeToFirst-inPlace)) / float64(inPlace) - Expect(diffRatio).To(BeNumerically("<=", tolerance)) + ttfts = append(ttfts, timeToFirst) } + // get stdv of ttfts + stdv := common.StdDevInt(ttfts) + fmt.Printf("ttfts: %v, stdv: %f\n", ttfts, stdv) + Expect(stdv).To(BeNumerically("<=", kvCacheTransOverheadStdDev)) }, - func(kvCacheOverhead int, tolerance float64, minNTokens int, maxNTokens int) string { - return fmt.Sprintf("kvCacheOverhead: %d tolerance: %f minNTokens: %d maxNTokens: %d", - kvCacheOverhead, tolerance, minNTokens, maxNTokens) + func(kvCacheTransOverhead int, kvCacheTransOverheadStdDev int, minNTokens int, maxNTokens int) string { + return fmt.Sprintf("kvCacheTransferOverhead: %d kvCacheTransferOverheadStdDev: %d minNTokens: %d maxNTokens: %d", + kvCacheTransOverhead, kvCacheTransOverheadStdDev, minNTokens, maxNTokens) }, - Entry("small numbers", 100, 0.1, 1, 10), - Entry("medium numbers, larger range", 200, 0.1, 50, 100), - Entry("large numbers", 150, 0.05, 20000, 20010), + Entry("small numbers", 100, 50, 2, 10), + Entry("medium numbers, larger range", 200, 150, 50, 100), + Entry("large numbers", 150, 200, 20000, 20010), + Entry("stddev is 0", 150, 0, 20000, 20010), ) }) From 1e8f33d6e55422730d423002597aeb011ec4bc2d Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Mon, 25 Aug 2025 21:58:35 +1000 Subject: [PATCH 08/19] Fix test condition when remove p/d is enabled and in-place policy is used Signed-off-by: Qifan Deng --- pkg/llm-d-inference-sim/simulator_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index 8cc21b9e..df023cf1 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -953,7 +953,7 @@ var _ = Describe("Simulator", func() { // get stdv of ttfts stdv := common.StdDevInt(ttfts) fmt.Printf("ttfts: %v, stdv: %f\n", ttfts, stdv) - Expect(stdv).To(BeNumerically("<=", kvCacheTransOverheadStdDev)) + Expect(stdv).To(BeNumerically("<=", int(float64(kvCacheTransOverheadStdDev)*1.7))) }, func(kvCacheTransOverhead int, kvCacheTransOverheadStdDev int, minNTokens int, maxNTokens int) string { return fmt.Sprintf("kvCacheTransferOverhead: %d kvCacheTransferOverheadStdDev: %d minNTokens: %d maxNTokens: %d", From dff8d3ddcd1846266eba3c10aa18697eae02f858 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Fri, 29 Aug 2025 21:47:59 +0800 Subject: [PATCH 09/19] Use simplfied implementation of ttft Signed-off-by: Qifan Deng --- pkg/common/config.go | 69 +++++----- pkg/llm-d-inference-sim/simulator.go | 44 ++----- pkg/llm-d-inference-sim/simulator_test.go | 147 +++++++--------------- 3 files changed, 86 insertions(+), 174 deletions(-) diff --git a/pkg/common/config.go b/pkg/common/config.go index 10367a49..45aa7acf 100644 --- a/pkg/common/config.go +++ b/pkg/common/config.go @@ -66,14 +66,6 @@ type Configuration struct { // cause the actual time to first token to differ by more than 70% from TimeToFirstToken TimeToFirstTokenStdDev int `yaml:"time-to-first-token-std-dev" json:"time-to-first-token-std-dev"` - // PrefillOverhead time taken to prefill the context, in milliseconds - // PrefillOverhead along with PrefillComplexity defines the time taken to prefill the context - PrefillOverhead int `yaml:"prefill-overhead" json:"prefill-overhead"` - // PrefillOverheadStdDev similar to TimeToFirstTokenStdDev - PrefillOverheadStdDev int `yaml:"prefill-overhead-std-dev" json:"prefill-overhead-std-dev"` - // options are "n^2" and "nlog(n)" - PrefillComplexity string `yaml:"prefill-complexity" json:"prefill-complexity"` - // InterTokenLatency time between generated tokens, in milliseconds InterTokenLatency int `yaml:"inter-token-latency" json:"inter-token-latency"` // InterTokenLatencyStdDev standard deviation for time between generated tokens, in milliseconds, @@ -89,14 +81,20 @@ type Configuration struct { // KVCacheTransferLatency KVCacheTransferLatencyStdDev int `yaml:"kv-cache-transfer-latency-std-dev" json:"kv-cache-transfer-latency-std-dev"` + // $Total Prefill Time = PrefillOverhead + n * PrefillTimePerToken$ + // the assumption is that n is less than k, where k is the number of prallelism units of GPU + // PrefillOverhead time taken to prefill the context, in milliseconds + PrefillOverhead int `yaml:"prefill-overhead" json:"prefill-overhead"` + PrefillTimePerToken int `yaml:"prefill-time-per-token" json:"prefill-time-per-token"` + // PrefillOverheadStdDev similar to TimeToFirstTokenStdDev + PrefillTimeStdDev int `yaml:"prefill-time-std-dev" json:"prefill-time-std-dev"` + // $Total KV Cache Transfer Time = n * KVCacheTransferTimePerToken$ + // the assumption is that the cache blocks are all missed at the remote pod // KVCacheTransfer overhead time taken to transfer kv-cache from another vLLM instance in case P/D is activated, // in milliseconds. - // KVCacheTransferOverhead along with KVCacheTransferComplexity defines the time taken to transfer kv-cache. - KVCacheTransferOverhead int `yaml:"kv-cache-transfer-overhead" json:"kv-cache-transfer-overhead"` + KVCacheTransferTimePerToken int `yaml:"kv-cache-transfer-time-per-token" json:"kv-cache-transfer-time-per-token"` // KVCacheTransferOverheadStdDev similar to TimeToFirstTokenStdDev - KVCacheTransferOverheadStdDev int `yaml:"kv-cache-transfer-overhead-std-dev" json:"kv-cache-transfer-overhead-std-dev"` - // options are "linear" and "in-place", default is "linear" - KVCacheTransferComplexity string `yaml:"kv-cache-transfer-complexity" json:"kv-cache-transfer-complexity"` + KVCacheTransferTimeStdDev int `yaml:"kv-cache-transfer-time-std-dev" json:"kv-cache-transfer-time-std-dev"` // Mode defines the simulator response generation mode, valid values: echo, random Mode string `yaml:"mode" json:"mode"` @@ -313,19 +311,24 @@ func (c *Configuration) validate() error { if float32(c.TimeToFirstTokenStdDev) > 0.3*float32(c.TimeToFirstToken) { return errors.New("time to first token standard deviation cannot be more than 30% of time to first token") } + if c.PrefillOverhead < 0 { return errors.New("prefill overhead cannot be negative") - } else if c.PrefillOverhead == 0 { - if c.PrefillComplexity != "" { - return errors.New("prefill overhead complexity is set, but prefill overhead is 0") - } } - if c.PrefillOverheadStdDev < 0 { - return errors.New("prefill overhead standard deviation cannot be negative") + if c.PrefillTimePerToken < 0 { + return errors.New("prefill time per token cannot be negative") + } + if c.PrefillTimeStdDev < 0 { + return errors.New("prefill time standard deviation cannot be negative") + } + + if c.KVCacheTransferTimePerToken < 0 { + return errors.New("kv-cache tranfer time per token cannot be negative") } - if c.PrefillComplexity != "" && c.PrefillComplexity != "n^2" && c.PrefillComplexity != "nlog(n)" { - return errors.New("prefill overhead complexity should be either \"n^2\" or \"nlog(n)\"") + if c.KVCacheTransferTimeStdDev < 0 { + return errors.New("kv-cache tranfer time standard deviation cannot be negative") } + if c.KVCacheTransferLatency < 0 { return errors.New("kv-cache tranfer time cannot be negative") } @@ -335,19 +338,6 @@ func (c *Configuration) validate() error { if float32(c.KVCacheTransferLatencyStdDev) > 0.3*float32(c.KVCacheTransferLatency) { return errors.New("kv-cache tranfer standard deviation cannot be more than 30% of kv-cache tranfer") } - if c.KVCacheTransferOverhead < 0 { - return errors.New("kv-cache transfer overhead cannot be negative") - } else if c.KVCacheTransferOverhead == 0 { - if c.KVCacheTransferComplexity != "" { - return errors.New("kv-cache transfer complexity is set, but kv-cache transfer overhead is 0") - } - } - if c.KVCacheTransferOverheadStdDev < 0 { - return errors.New("kv-cache transfer overhead standard deviation cannot be negative") - } - if c.KVCacheTransferComplexity != "" && c.KVCacheTransferComplexity != "linear" && c.KVCacheTransferComplexity != "in-place" { - return errors.New("kv-cache transfer complexity should be either \"linear\" or \"in-place\"") - } if c.MaxLoras < 1 { return errors.New("max LoRAs cannot be less than 1") @@ -445,17 +435,18 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) { f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode, echo - returns the same text that was sent in the request, for chat completion returns the last message, random - returns random sentence from a bank of pre-defined sentences") f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)") f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)") + f.IntVar(&config.PrefillOverhead, "prefill-overhead", config.PrefillOverhead, "Time to prefill in milliseconds. This argument is ignored if is not 0.") - f.IntVar(&config.PrefillOverheadStdDev, "prefill-overhead-std-dev", config.PrefillOverheadStdDev, "Standard deviation for time to prefill (in milliseconds)") - f.StringVar(&config.PrefillComplexity, "prefill-complexity", config.PrefillComplexity, "Complexity of prefill based on token length. Options are \"n^2\" and \"nlog(n)\". Default is \"n^2\".") + f.IntVar(&config.PrefillTimePerToken, "prefill-time-per-token", config.PrefillTimePerToken, "Time to prefill per token (in milliseconds)") + f.IntVar(&config.PrefillTimeStdDev, "prefill-time-std-dev", config.PrefillTimeStdDev, "Standard deviation for time to prefill (in milliseconds)") + f.IntVar(&config.KVCacheTransferTimePerToken, "kv-cache-transfer-time-per-token", config.KVCacheTransferTimePerToken, "Time for KV-cache transfer per token from a remote vLLM (in milliseconds)") + f.IntVar(&config.KVCacheTransferTimeStdDev, "kv-cache-transfer-time-std-dev", config.KVCacheTransferTimeStdDev, "Standard deviation for time for KV-cache transfer per token from a remote vLLM (in milliseconds)") + f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)") f.IntVar(&config.InterTokenLatencyStdDev, "inter-token-latency-std-dev", config.InterTokenLatencyStdDev, "Standard deviation for time between generated tokens (in milliseconds)") f.IntVar(&config.TimeToFirstTokenStdDev, "time-to-first-token-std-dev", config.TimeToFirstTokenStdDev, "Standard deviation for time before the first token will be returned (in milliseconds)") f.IntVar(&config.KVCacheTransferLatencyStdDev, "kv-cache-transfer-latency-std-dev", config.KVCacheTransferLatencyStdDev, "Standard deviation for time for KV-cache transfer from a remote vLLM (in milliseconds)") f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)") - f.IntVar(&config.KVCacheTransferOverhead, "kv-cache-transfer-overhead", config.KVCacheTransferOverhead, "Time to transfer kv-cache in milliseconds. This argument is ignored if is not set.") - f.IntVar(&config.KVCacheTransferOverheadStdDev, "kv-cache-transfer-overhead-std-dev", config.KVCacheTransferOverheadStdDev, "Standard deviation for time to transfer kv-cache (in milliseconds)") - f.StringVar(&config.KVCacheTransferComplexity, "kv-cache-transfer-complexity", config.KVCacheTransferComplexity, "Complexity of kv-cache transfer based on token length. Options are \"linear\" and \"in-place\". Default is \"linear\".") f.IntVar(&config.MaxToolCallIntegerParam, "max-tool-call-integer-param", config.MaxToolCallIntegerParam, "Maximum possible value of integer parameters in a tool call") f.IntVar(&config.MinToolCallIntegerParam, "min-tool-call-integer-param", config.MinToolCallIntegerParam, "Minimum possible value of integer parameters in a tool call") diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index 5f628d33..d446db19 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -22,7 +22,6 @@ import ( "encoding/json" "errors" "fmt" - "math" "net" "os" "strings" @@ -655,13 +654,7 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques // returns time to first token based on the current request's doRemotePrefill func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, doRemotePrefill bool) int { - if s.config.TimeToFirstToken == 0 && s.config.PrefillOverhead != 0 { - if nPromptTokens <= 1 { - if !doRemotePrefill { - return s.config.PrefillOverhead - } - return s.config.KVCacheTransferOverhead - } + if s.config.TimeToFirstToken == 0 { return s.calcPrefillOverhead(nPromptTokens, doRemotePrefill) } @@ -695,34 +688,21 @@ func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int, doRemotePrefill b if doRemotePrefill { return s.calcRemotePrefillOverhead(nPromptTokens) } - pfOverhead := s.config.PrefillOverhead - complexity := s.config.PrefillComplexity - // policies of different complexities of prefill implementation - overhead := 0 - switch complexity { - case "n^2", "": - // this is simple implementation of n^2 - overhead = pfOverhead * nPromptTokens * nPromptTokens - case "nlog(n)": - overhead = int(float64(pfOverhead) * (float64(nPromptTokens) * math.Log2(float64(nPromptTokens)))) - } - return int(common.RandomNorm(float64(overhead), float64(s.config.PrefillOverheadStdDev))) + + constOverhead := s.config.PrefillOverhead + ptpt := s.config.PrefillTimePerToken + prefillTime := constOverhead + nPromptTokens*ptpt + + stdDev := s.config.PrefillTimeStdDev + return int(common.RandomNorm(float64(prefillTime), float64(stdDev))) } // calc the remote prefill overhead against number of tokens func (s *VllmSimulator) calcRemotePrefillOverhead(nPromptTokens int) int { - overhead := s.config.KVCacheTransferOverhead - complexity := s.config.KVCacheTransferComplexity - total := 0 - switch complexity { - case "linear", "": - total = overhead * nPromptTokens - case "in-place": - // when the context is already filled - // this is a simple implementation which return a defined overhead - total = overhead - } - return int(common.RandomNorm(float64(total), float64(s.config.KVCacheTransferOverheadStdDev))) + kvCacheTransTPT := s.config.KVCacheTransferTimePerToken + kvCacheTransT := kvCacheTransTPT * nPromptTokens + stdDev := s.config.KVCacheTransferTimeStdDev + return int(common.RandomNorm(float64(kvCacheTransT), float64(stdDev))) } // createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index df023cf1..bcf78266 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -21,7 +21,6 @@ import ( "errors" "fmt" "io" - "math" "net" "net/http" "os" @@ -828,142 +827,84 @@ var _ = Describe("Simulator", func() { timeToFirstToken := 10000 simulator.config.TimeToFirstToken = timeToFirstToken simulator.config.PrefillOverhead = 100 - timeToFirst := simulator.getTimeToFirstToken(1, false) - Expect(timeToFirst).To(BeNumerically(">=", int(float32(timeToFirstToken)*0.3))) - Expect(timeToFirst).To(BeNumerically("<=", int(float32(timeToFirstToken)*1.7))) + ttft := simulator.getTimeToFirstToken(1, false) + Expect(ttft).To(BeNumerically(">=", int(float32(timeToFirstToken)*0.3))) + Expect(ttft).To(BeNumerically("<=", int(float32(timeToFirstToken)*1.7))) }) It("when is 0, and is not 0, use ", func() { simulator.config.TimeToFirstToken = 0 simulator.config.PrefillOverhead = 100 - timeToFirst := simulator.getTimeToFirstToken(1, false) - Expect(timeToFirst).To(BeNumerically(">=", 100)) + ttft := simulator.getTimeToFirstToken(1, false) + Expect(ttft).To(BeNumerically(">=", 100)) }) - DescribeTable("time to first token is super linear of prefill against number of prompt tokens", - func(prefillOverhead int, PrefillOverheadStdDev int, minNTokens int, maxNTokens int) { + DescribeTable("time to first token is against number of prompt tokens", + func(prefillOverhead int, prefillTimePerToken int, stdDev int, nTokens int) { simulator.config.TimeToFirstToken = 0 - simulator.config.PrefillComplexity = "n^2" simulator.config.PrefillOverhead = prefillOverhead - simulator.config.PrefillOverheadStdDev = PrefillOverheadStdDev + simulator.config.PrefillTimePerToken = prefillTimePerToken + simulator.config.PrefillTimeStdDev = stdDev - for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ { - timeToFirst := simulator.getTimeToFirstToken(nTokens, false) + ttft := simulator.getTimeToFirstToken(nTokens, false) - n2 := prefillOverhead * nTokens * nTokens - n2logn := n2 * int(math.Log2(float64(nTokens))) - nlogn := prefillOverhead * nTokens * int(math.Log2(float64(nTokens))) + expectedTTFT := prefillOverhead + prefillTimePerToken*nTokens + Expect(ttft).To(BeNumerically(">=", int(float64(expectedTTFT)*0.3))) + Expect(ttft).To(BeNumerically("<=", int(float64(expectedTTFT)*1.7))) - Expect(timeToFirst).To(BeNumerically(">", int(float64(nlogn)*0.3))) - Expect(timeToFirst).To(BeNumerically("<", int(float64(n2logn)*1.7))) - } - }, - func(prefillOverhead int, PrefillOverheadStdDev int, minNTokens int, maxNTokens int) string { - return fmt.Sprintf("prefillOverhead: %d stddev: %d minNTokens: %d maxNTokens: %d", - prefillOverhead, PrefillOverheadStdDev, minNTokens, maxNTokens) - }, - Entry("small numbers", 100, 50, 2, 10), - Entry("medium numbers, larger range", 200, 100, 50, 100), - Entry("large numbers", 150, 125, 20000, 20010), - Entry("stddev is 0", 150, 0, 20000, 20010), - ) - - DescribeTable("time to first token is log-linear of prefill against number of prompt tokens", - func(prefillOverhead int, prefillOverheadStdDev int, minNTokens int, maxNTokens int) { - simulator.config.TimeToFirstToken = 0 - simulator.config.PrefillComplexity = "nlog(n)" - simulator.config.PrefillOverhead = prefillOverhead - simulator.config.PrefillOverheadStdDev = prefillOverheadStdDev - - for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ { - timeToFirst := simulator.getTimeToFirstToken(nTokens, false) - - logn := prefillOverhead * int(math.Log2(float64(nTokens))) - n2 := prefillOverhead * nTokens * nTokens - Expect(timeToFirst).To(BeNumerically(">", int(float64(logn)*0.3))) - Expect(timeToFirst).To(BeNumerically("<", int(float64(n2)*1.7))) - } }, - func(prefillOverhead int, prefillOverheadStdDev int, minNTokens int, maxNTokens int) string { - return fmt.Sprintf("prefillOverhead: %d stddev: %d minNTokens: %d maxNTokens: %d", - prefillOverhead, prefillOverheadStdDev, minNTokens, maxNTokens) + func(prefillOverhead int, prefillTimePerToken, stdDev int, nTokens int) string { + return fmt.Sprintf("prefillOverhead: %d, prefillTimePerToken: %d, stdDev: %d, nTokens: %d", + prefillOverhead, prefillTimePerToken, stdDev, nTokens) }, - Entry("small numbers", 100, 50, 2, 10), - Entry("medium numbers, larger range", 200, 100, 50, 100), - Entry("large numbers", 150, 125, 20000, 20010), - Entry("stddev is 0", 150, 0, 20000, 20010), + Entry("single token", 100, 50, 70, 1), + Entry("stddev is 0", 100, 50, 0, 1), + Entry("medium overhead, 512 tokens", 200, 1000, 150, 512), + Entry("large overhead, 1024 tokens", 2000, 3000, 1800, 1024), + Entry("very long prompt", 150, 200, 100, 20000), ) It("when not 0, ignore ", func() { overhead := 100 simulator.config.KVCacheTransferLatency = 1000 - simulator.config.KVCacheTransferOverhead = overhead - timeToFirst := simulator.getTimeToFirstToken(1, false) - Expect(timeToFirst).To(BeNumerically(">=", overhead)) + simulator.config.KVCacheTransferTimePerToken = overhead + ttft := simulator.getTimeToFirstToken(1, false) + Expect(ttft).To(BeNumerically(">=", overhead)) }) It("when is 0, and is not 0, use ", func() { overhead := 100 simulator.config.KVCacheTransferLatency = 0 - simulator.config.KVCacheTransferOverhead = overhead - timeToFirst := simulator.getTimeToFirstToken(1, false) - Expect(timeToFirst).To(BeNumerically(">", 0)) + simulator.config.KVCacheTransferTimePerToken = overhead + ttft := simulator.getTimeToFirstToken(1, false) + Expect(ttft).To(BeNumerically(">", 0)) }) - DescribeTable("When remote kv cache transfer is enabled with \"linear\" policy, time to first token is linear of kv cache transfer against number of prompt tokens", - func(kvCacheOverhead int, stddev int, minNTokens int, maxNTokens int) { + DescribeTable("kv cache transfer time against number of prompt tokens", + func(kvCacheTransTPT int, stddev int, nTokens int) { simulator.config.TimeToFirstToken = 0 simulator.config.PrefillOverhead = 1 - simulator.config.KVCacheTransferComplexity = "linear" - simulator.config.KVCacheTransferOverheadStdDev = stddev - simulator.config.KVCacheTransferOverhead = kvCacheOverhead + simulator.config.KVCacheTransferTimePerToken = kvCacheTransTPT + simulator.config.KVCacheTransferTimeStdDev = stddev - for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ { - timeToFirst := simulator.getTimeToFirstToken(nTokens, true) + ttft := simulator.getTimeToFirstToken(nTokens, true) - n2 := kvCacheOverhead * nTokens * nTokens - logn := kvCacheOverhead * int(math.Log2(float64(nTokens))) - Expect(timeToFirst).To(BeNumerically(">", int(float64(logn)*0.3))) - Expect(timeToFirst).To(BeNumerically("<", int(float64(n2)*1.7))) - } - }, - func(kvCacheOverhead int, stddev int, minNTokens int, maxNTokens int) string { - return fmt.Sprintf("kvCacheOverhead: %d stddev: %d minNTokens: %d maxNTokens: %d", - kvCacheOverhead, stddev, minNTokens, maxNTokens) - }, - Entry("small numbers", 100, 50, 2, 10), - Entry("medium numbers, larger range", 200, 180, 50, 100), - Entry("large numbers", 150, 70, 20000, 20010), - Entry("stddev is 0", 150, 0, 20000, 20010), - ) + expectedTTFT := kvCacheTransTPT * nTokens + Expect(ttft).To(BeNumerically(">=", int(float64(expectedTTFT)*0.3))) + Expect(ttft).To(BeNumerically("<=", int(float64(expectedTTFT)*1.7))) - DescribeTable("When remote kv cache transfer is enabled with \"in-place\" policy, time to first token should not be impacted by number of prompt tokens", - func(kvCacheTransOverhead int, kvCacheTransOverheadStdDev int, minNTokens int, maxNTokens int) { - simulator.config.TimeToFirstToken = 0 - simulator.config.PrefillOverhead = 1 - simulator.config.KVCacheTransferComplexity = "in-place" - simulator.config.KVCacheTransferOverheadStdDev = kvCacheTransOverheadStdDev - simulator.config.KVCacheTransferOverhead = kvCacheTransOverhead - - var ttfts []int - for nTokens := minNTokens; nTokens <= maxNTokens; nTokens++ { - timeToFirst := simulator.getTimeToFirstToken(nTokens, true) - ttfts = append(ttfts, timeToFirst) - } - // get stdv of ttfts - stdv := common.StdDevInt(ttfts) - fmt.Printf("ttfts: %v, stdv: %f\n", ttfts, stdv) - Expect(stdv).To(BeNumerically("<=", int(float64(kvCacheTransOverheadStdDev)*1.7))) }, - func(kvCacheTransOverhead int, kvCacheTransOverheadStdDev int, minNTokens int, maxNTokens int) string { - return fmt.Sprintf("kvCacheTransferOverhead: %d kvCacheTransferOverheadStdDev: %d minNTokens: %d maxNTokens: %d", - kvCacheTransOverhead, kvCacheTransOverheadStdDev, minNTokens, maxNTokens) + func(kvCacheTransferTimePerToken int, stddev int, nTokens int) string { + return fmt.Sprintf("kvCacheTransferTimePerToken: %d stddev: %d nTokens: %d", + kvCacheTransferTimePerToken, stddev, nTokens) }, - Entry("small numbers", 100, 50, 2, 10), - Entry("medium numbers, larger range", 200, 150, 50, 100), - Entry("large numbers", 150, 200, 20000, 20010), - Entry("stddev is 0", 150, 0, 20000, 20010), + Entry("single token", 100, 70, 1), + Entry("stddev is 0", 100, 0, 1), + Entry("medium overhead, 512 tokens", 200, 150, 512), + Entry("large overhead, 1024 tokens", 2000, 1800, 1024), + Entry("very long prompt", 150, 100, 20000), ) + }) Context("fake metrics", func() { From 0910dbf64533f4c2508cafd6b4c6580919f8b053 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Fri, 29 Aug 2025 21:50:36 +0800 Subject: [PATCH 10/19] Add sep lines in readme params Signed-off-by: Qifan Deng --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index f274b2b1..ed5ee511 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,7 @@ For more details see the Date: Fri, 29 Aug 2025 22:00:38 +0800 Subject: [PATCH 11/19] Update readme with explanation of new ttft Signed-off-by: Qifan Deng --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index ed5ee511..087a6137 100644 --- a/README.md +++ b/README.md @@ -108,6 +108,13 @@ For more details see the Date: Fri, 29 Aug 2025 22:09:36 +0800 Subject: [PATCH 12/19] Fix ttft new params tests Signed-off-by: Qifan Deng --- pkg/llm-d-inference-sim/simulator_test.go | 46 +++++++++++++++-------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index bcf78266..d0338dfd 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -824,19 +824,29 @@ var _ = Describe("Simulator", func() { ) It("when is not 0, ignore ", func() { - timeToFirstToken := 10000 + timeToFirstToken := 1000 simulator.config.TimeToFirstToken = timeToFirstToken + simulator.config.TimeToFirstTokenStdDev = 0 + simulator.config.PrefillOverhead = 100 - ttft := simulator.getTimeToFirstToken(1, false) - Expect(ttft).To(BeNumerically(">=", int(float32(timeToFirstToken)*0.3))) - Expect(ttft).To(BeNumerically("<=", int(float32(timeToFirstToken)*1.7))) + simulator.config.PrefillTimePerToken = 200 + simulator.config.PrefillTimeStdDev = 80 + + ttft := simulator.getTimeToFirstToken(128, false) + + Expect(ttft).To(BeNumerically("==", timeToFirstToken)) }) It("when is 0, and is not 0, use ", func() { simulator.config.TimeToFirstToken = 0 + simulator.config.TimeToFirstTokenStdDev = 0 + simulator.config.PrefillOverhead = 100 - ttft := simulator.getTimeToFirstToken(1, false) - Expect(ttft).To(BeNumerically(">=", 100)) + simulator.config.PrefillTimePerToken = 200 + simulator.config.PrefillTimeStdDev = 80 + + ttft := simulator.getTimeToFirstToken(128, false) + Expect(ttft).NotTo(BeNumerically("==", 0)) }) DescribeTable("time to first token is against number of prompt tokens", @@ -865,19 +875,25 @@ var _ = Describe("Simulator", func() { ) It("when not 0, ignore ", func() { - overhead := 100 - simulator.config.KVCacheTransferLatency = 1000 - simulator.config.KVCacheTransferTimePerToken = overhead - ttft := simulator.getTimeToFirstToken(1, false) - Expect(ttft).To(BeNumerically(">=", overhead)) + simulator.config.KVCacheTransferLatency = 200 + simulator.config.KVCacheTransferLatencyStdDev = 0 + + simulator.config.KVCacheTransferTimePerToken = 100 + simulator.config.KVCacheTransferTimeStdDev = 0 + + ttft := simulator.getTimeToFirstToken(128, false) + Expect(ttft).To(BeNumerically("==", 200)) }) It("when is 0, and is not 0, use ", func() { - overhead := 100 simulator.config.KVCacheTransferLatency = 0 - simulator.config.KVCacheTransferTimePerToken = overhead - ttft := simulator.getTimeToFirstToken(1, false) - Expect(ttft).To(BeNumerically(">", 0)) + simulator.config.KVCacheTransferLatencyStdDev = 0 + + simulator.config.KVCacheTransferTimePerToken = 100 + simulator.config.KVCacheTransferTimeStdDev = 0 + + ttft := simulator.getTimeToFirstToken(128, false) + Expect(ttft).To(BeNumerically("==", 12800)) }) DescribeTable("kv cache transfer time against number of prompt tokens", From 9886b94e4bc0280f6af9271762ddcf89f5187edc Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Fri, 29 Aug 2025 22:18:23 +0800 Subject: [PATCH 13/19] Fix kv cache trasfer tests and impl Signed-off-by: Qifan Deng --- pkg/llm-d-inference-sim/simulator.go | 38 +++++++++++++---------- pkg/llm-d-inference-sim/simulator_test.go | 4 +-- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index d446db19..59fb8e83 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -654,16 +654,18 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques // returns time to first token based on the current request's doRemotePrefill func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, doRemotePrefill bool) int { - if s.config.TimeToFirstToken == 0 { + if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 { return s.calcPrefillOverhead(nPromptTokens, doRemotePrefill) } - mean := float64(s.config.TimeToFirstToken) - stddev := float64(s.config.TimeToFirstTokenStdDev) - if doRemotePrefill { - mean = float64(s.config.KVCacheTransferLatency) - stddev = float64(s.config.KVCacheTransferLatencyStdDev) + if !doRemotePrefill { + mean := float64(s.config.TimeToFirstToken) + stddev := float64(s.config.TimeToFirstTokenStdDev) + return int(common.RandomNorm(mean, stddev)) } + + mean := float64(s.config.KVCacheTransferLatency) + stddev := float64(s.config.KVCacheTransferLatencyStdDev) return int(common.RandomNorm(mean, stddev)) } @@ -685,22 +687,24 @@ func (s *VllmSimulator) getTotalInterTokenLatency(numOfTokens int) int { // calc the prefill overhead against number of tokens func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int, doRemotePrefill bool) int { - if doRemotePrefill { - return s.calcRemotePrefillOverhead(nPromptTokens) - } + if !doRemotePrefill { + constOverhead := s.config.PrefillOverhead + ptpt := s.config.PrefillTimePerToken + prefillTime := constOverhead + nPromptTokens*ptpt - constOverhead := s.config.PrefillOverhead - ptpt := s.config.PrefillTimePerToken - prefillTime := constOverhead + nPromptTokens*ptpt + stdDev := s.config.PrefillTimeStdDev + return int(common.RandomNorm(float64(prefillTime), float64(stdDev))) + } - stdDev := s.config.PrefillTimeStdDev - return int(common.RandomNorm(float64(prefillTime), float64(stdDev))) -} + if s.config.KVCacheTransferLatency != 0 || s.config.KVCacheTransferLatencyStdDev != 0 { + mean := float64(s.config.KVCacheTransferLatency) + stddev := float64(s.config.KVCacheTransferLatencyStdDev) + return int(common.RandomNorm(mean, stddev)) + } -// calc the remote prefill overhead against number of tokens -func (s *VllmSimulator) calcRemotePrefillOverhead(nPromptTokens int) int { kvCacheTransTPT := s.config.KVCacheTransferTimePerToken kvCacheTransT := kvCacheTransTPT * nPromptTokens + stdDev := s.config.KVCacheTransferTimeStdDev return int(common.RandomNorm(float64(kvCacheTransT), float64(stdDev))) } diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go index d0338dfd..c06b57fa 100644 --- a/pkg/llm-d-inference-sim/simulator_test.go +++ b/pkg/llm-d-inference-sim/simulator_test.go @@ -881,7 +881,7 @@ var _ = Describe("Simulator", func() { simulator.config.KVCacheTransferTimePerToken = 100 simulator.config.KVCacheTransferTimeStdDev = 0 - ttft := simulator.getTimeToFirstToken(128, false) + ttft := simulator.getTimeToFirstToken(128, true) Expect(ttft).To(BeNumerically("==", 200)) }) @@ -892,7 +892,7 @@ var _ = Describe("Simulator", func() { simulator.config.KVCacheTransferTimePerToken = 100 simulator.config.KVCacheTransferTimeStdDev = 0 - ttft := simulator.getTimeToFirstToken(128, false) + ttft := simulator.getTimeToFirstToken(128, true) Expect(ttft).To(BeNumerically("==", 12800)) }) From 904e18d0f88a2c7e95de548691db01f210e3bb35 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Fri, 29 Aug 2025 22:28:28 +0800 Subject: [PATCH 14/19] Fix invalid config test of new ttft params Signed-off-by: Qifan Deng --- pkg/common/config_test.go | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go index 33b07fea..7d5fae13 100644 --- a/pkg/common/config_test.go +++ b/pkg/common/config_test.go @@ -407,35 +407,24 @@ var _ = Describe("Simulator configuration", func() { "--config", "../../manifests/config.yaml"}, }, { - name: "invalid (negative) prefill-overhead-std-dev", - args: []string{"cmd", "--prefill-overhead-std-dev", "-1", + name: "invalid (negative) prefill-time-per-token", + args: []string{"cmd", "--prefill-time-per-token", "-1", "--config", "../../manifests/config.yaml"}, }, { - name: " must be set when is set", - args: []string{"cmd", "--prefill-complexity", "n^2", "--config", "../../manifests/config.yaml"}, - }, - { - name: " should not be 'xxx'", - args: []string{"cmd", "--prefill-complexity", "xxx", "--config", "../../manifests/config.yaml"}, - }, - { - name: "invalid (negative) kv-cache-transfer-overhead", - args: []string{"cmd", "--kv-cache-transfer-overhead", "-1", + name: "invalid (negative) prefill-time-std-dev", + args: []string{"cmd", "--prefill-time-std-dev", "-1", "--config", "../../manifests/config.yaml"}, }, { - name: "invalid (negative) kv-cache-transfer-overhead-std-dev", - args: []string{"cmd", "--kv-cache-transfer-overhead-std-dev", "-1", + name: "invalid (negative) kv-cache-transfer-time-per-token", + args: []string{"cmd", "--kv-cache-transfer-time-per-token", "-1", "--config", "../../manifests/config.yaml"}, }, { - name: " must be set when is set", - args: []string{"cmd", "--kv-cache-transfer-complexity", "linear", "--config", "../../manifests/config.yaml"}, - }, - { - name: " should not be 'xxx'", - args: []string{"cmd", "--kv-cache-transfer-complexity", "xxx", "--config", "../../manifests/config.yaml"}, + name: "invalid (negative) kv-cache-transfer-time-std-dev", + args: []string{"cmd", "--kv-cache-transfer-time-std-dev", "-1", + "--config", "../../manifests/config.yaml"}, }, } From 4078dbd5c25940069988775d8b8f245dc9a4840a Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Fri, 29 Aug 2025 22:34:05 +0800 Subject: [PATCH 15/19] Revert "Add standard deviation in utils" This reverts commit 18d30756a7173798e9bf0ed010478829b6b14d9a. Signed-off-by: Qifan Deng --- pkg/common/utils.go | 19 ------------------- pkg/common/utils_test.go | 12 ------------ 2 files changed, 31 deletions(-) diff --git a/pkg/common/utils.go b/pkg/common/utils.go index 5295b3dd..d3ea5b44 100644 --- a/pkg/common/utils.go +++ b/pkg/common/utils.go @@ -328,22 +328,3 @@ func init() { func Tokenize(text string) []string { return re.FindAllString(text, -1) } - -// Calculate standard deviation of an int array -func StdDevInt(data []int) float64 { - var sum int - for _, value := range data { - sum += value - } - mean := sum / len(data) - - var sumSquares int - for _, value := range data { - diff := value - mean - sumSquares += diff * diff - } - - variance := sumSquares / len(data) - - return math.Sqrt(float64(variance)) -} diff --git a/pkg/common/utils_test.go b/pkg/common/utils_test.go index 4dac4a4c..b8f3285e 100644 --- a/pkg/common/utils_test.go +++ b/pkg/common/utils_test.go @@ -168,16 +168,4 @@ var _ = Describe("Utils", Ordered, func() { } }) - Context("Standard Deviation", func() { - It("should return 0 for a single element", func() { - data := []int{42} - Expect(StdDevInt(data)).To(Equal(0.0)) - }) - - It("should return the correct standard deviation for multiple elements", func() { - data := []int{1, 2, 3, 4, 5} - Expect(StdDevInt(data)).To(Equal(1.4142135623730951)) - }) - }) - }) From 91e702f82c43b9258df039d43cd718422da2deb8 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Sun, 31 Aug 2025 22:14:22 +0800 Subject: [PATCH 16/19] Remove additional variables in prefill time calculation Signed-off-by: Qifan Deng --- pkg/llm-d-inference-sim/simulator.go | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index 3d555e63..f142de1c 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -701,22 +701,15 @@ func (s *VllmSimulator) getTotalInterTokenLatency(numOfTokens int) int { // calc the prefill overhead against number of tokens func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int, doRemotePrefill bool) int { if !doRemotePrefill { - constOverhead := s.config.PrefillOverhead - ptpt := s.config.PrefillTimePerToken - prefillTime := constOverhead + nPromptTokens*ptpt - - stdDev := s.config.PrefillTimeStdDev - return int(common.RandomNorm(float64(prefillTime), float64(stdDev))) + prefillTime := s.config.PrefillOverhead + nPromptTokens*s.config.PrefillTimePerToken + return int(common.RandomNorm(float64(prefillTime), float64(s.config.PrefillTimeStdDev))) } if s.config.KVCacheTransferLatency != 0 || s.config.KVCacheTransferLatencyStdDev != 0 { - mean := float64(s.config.KVCacheTransferLatency) - stddev := float64(s.config.KVCacheTransferLatencyStdDev) - return int(common.RandomNorm(mean, stddev)) + return int(common.RandomNorm(float64(s.config.KVCacheTransferLatency), float64(s.config.KVCacheTransferLatencyStdDev))) } - kvCacheTransTPT := s.config.KVCacheTransferTimePerToken - kvCacheTransT := kvCacheTransTPT * nPromptTokens + kvCacheTransT := s.config.KVCacheTransferTimePerToken * nPromptTokens stdDev := s.config.KVCacheTransferTimeStdDev return int(common.RandomNorm(float64(kvCacheTransT), float64(stdDev))) From 8430ea369f4781abe3838fc58793a2871e141b6e Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Sun, 31 Aug 2025 22:24:51 +0800 Subject: [PATCH 17/19] Improve is remote prefill/decode interface doc Signed-off-by: Qifan Deng --- pkg/openai-server-api/request.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pkg/openai-server-api/request.go b/pkg/openai-server-api/request.go index afab801d..b23104f8 100644 --- a/pkg/openai-server-api/request.go +++ b/pkg/openai-server-api/request.go @@ -53,9 +53,13 @@ type CompletionRequest interface { GetToolChoice() string // GetMaxCompletionTokens returns the maximum completion tokens requested GetMaxCompletionTokens() *int64 - // IsDoRemoteDecode() returns true if do_remote_decode field is true in the request, this means that this is decode request + // IsDoRemoteDecode() returns true if do_remote_decode field is true in the request, + // when the field is true, the decode phase should be done on remote pod, + // whereas prefill phase is done on local pod, thus this is a prefill request IsDoRemoteDecode() bool - // IsDoRemotePrefill() returns true if do_remote_prefill field is true in the request, this means that this is prefill request + // IsDoRemotePrefill() returns true if do_remote_prefill field is true in the request, + // when the field is true, the prefill phase should be done on remote pod, + // whereas decode phase is done on local pod, thus this is a decode request IsDoRemotePrefill() bool } From a5305c82fa8100ffe7bddc55bc79ad9d29cfaae3 Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Sun, 31 Aug 2025 22:39:39 +0800 Subject: [PATCH 18/19] Improve implementation of ttft calc Signed-off-by: Qifan Deng --- pkg/llm-d-inference-sim/simulator.go | 43 ++++++++++------------------ 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index f142de1c..948bae95 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -667,19 +667,23 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques // returns time to first token based on the current request's doRemotePrefill func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, doRemotePrefill bool) int { - if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 { - return s.calcPrefillOverhead(nPromptTokens, doRemotePrefill) + if doRemotePrefill { + if s.config.KVCacheTransferLatency == 0 && s.config.KVCacheTransferLatencyStdDev == 0 { + // is disaggregated PD and ttft is calculated using number of prompt tokens + kvCacheTransT := s.config.KVCacheTransferTimePerToken * nPromptTokens + stdDev := s.config.KVCacheTransferTimeStdDev + return int(common.RandomNorm(float64(kvCacheTransT), float64(stdDev))) + } + // is disaggregated PD and *not* using number of prompt tokens + return int(common.RandomNorm(float64(s.config.KVCacheTransferLatency), float64(s.config.KVCacheTransferLatencyStdDev))) } - - if !doRemotePrefill { - mean := float64(s.config.TimeToFirstToken) - stddev := float64(s.config.TimeToFirstTokenStdDev) - return int(common.RandomNorm(mean, stddev)) + if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 { + // is aggregated PD and ttft is calculated using number of prompt tokens + prefillTime := s.config.PrefillOverhead + nPromptTokens*s.config.PrefillTimePerToken + return int(common.RandomNorm(float64(prefillTime), float64(s.config.PrefillTimeStdDev))) } - - mean := float64(s.config.KVCacheTransferLatency) - stddev := float64(s.config.KVCacheTransferLatencyStdDev) - return int(common.RandomNorm(mean, stddev)) + // is aggregated PD and *not* using number of prompt tokens + return int(common.RandomNorm(float64(s.config.TimeToFirstToken), float64(s.config.TimeToFirstTokenStdDev))) } // returns inter token latency @@ -698,23 +702,6 @@ func (s *VllmSimulator) getTotalInterTokenLatency(numOfTokens int) int { return total } -// calc the prefill overhead against number of tokens -func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int, doRemotePrefill bool) int { - if !doRemotePrefill { - prefillTime := s.config.PrefillOverhead + nPromptTokens*s.config.PrefillTimePerToken - return int(common.RandomNorm(float64(prefillTime), float64(s.config.PrefillTimeStdDev))) - } - - if s.config.KVCacheTransferLatency != 0 || s.config.KVCacheTransferLatencyStdDev != 0 { - return int(common.RandomNorm(float64(s.config.KVCacheTransferLatency), float64(s.config.KVCacheTransferLatencyStdDev))) - } - - kvCacheTransT := s.config.KVCacheTransferTimePerToken * nPromptTokens - - stdDev := s.config.KVCacheTransferTimeStdDev - return int(common.RandomNorm(float64(kvCacheTransT), float64(stdDev))) -} - // createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist func (s *VllmSimulator) createModelsResponse() *vllmapi.ModelsResponse { modelsResp := vllmapi.ModelsResponse{Object: "list", Data: []vllmapi.ModelsResponseModelInfo{}} From b74b3aa17a807f2f04ddfab5070eab0a8335a8ac Mon Sep 17 00:00:00 2001 From: Qifan Deng Date: Sun, 31 Aug 2025 22:46:09 +0800 Subject: [PATCH 19/19] Remove unnecessary variable Signed-off-by: Qifan Deng --- pkg/llm-d-inference-sim/simulator.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go index 948bae95..2ecade9d 100644 --- a/pkg/llm-d-inference-sim/simulator.go +++ b/pkg/llm-d-inference-sim/simulator.go @@ -671,8 +671,7 @@ func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, doRemotePrefill b if s.config.KVCacheTransferLatency == 0 && s.config.KVCacheTransferLatencyStdDev == 0 { // is disaggregated PD and ttft is calculated using number of prompt tokens kvCacheTransT := s.config.KVCacheTransferTimePerToken * nPromptTokens - stdDev := s.config.KVCacheTransferTimeStdDev - return int(common.RandomNorm(float64(kvCacheTransT), float64(stdDev))) + return int(common.RandomNorm(float64(kvCacheTransT), float64(s.config.KVCacheTransferTimeStdDev))) } // is disaggregated PD and *not* using number of prompt tokens return int(common.RandomNorm(float64(s.config.KVCacheTransferLatency), float64(s.config.KVCacheTransferLatencyStdDev)))