From 0fa56774b6a85ac807fb926a7767235769b3c7cd Mon Sep 17 00:00:00 2001 From: Murphy Chen Date: Thu, 25 Sep 2025 15:01:01 +0800 Subject: [PATCH 1/3] support vLLM cache salting in prefix aware scorer --- .../framework/plugins/multi/prefix/plugin.go | 6 ++- pkg/epp/scheduling/types/types.go | 16 ++++++++ pkg/epp/util/request/body_test.go | 38 +++++++++++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go index 6bc81a44a..8267e7322 100644 --- a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go +++ b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go @@ -257,7 +257,7 @@ func (p *Plugin) matchLongestPrefix(ctx context.Context, hashes []BlockHash) map } // hashPrompt divides the prompt into blocks and calculate the prefix cache for each block. -// hash(0) is the hash of the model name, since different models generally don't share prefix cache. +// hash(0) is the hash of the model name and cache_salt(if provided), since different models generally don't share prefix cache. // For block i, hash(i) = hash(block i content, hash(i-1)). func hashPrompt(ctx context.Context, request *types.LLMRequest, cacheBlockSize int, maxPrefixBlocks int) []BlockHash { loggerDebug := log.FromContext(ctx).V(logutil.DEBUG) @@ -286,6 +286,10 @@ func hashPrompt(ctx context.Context, request *types.LLMRequest, cacheBlockSize i // Add the model to the first block hash so that different models have different hashes even with the same body. h := xxhash.New() _, _ = h.Write([]byte(request.TargetModel)) + if cacheSalt := request.Body.GetCacheSalt(); cacheSalt != "" { + _, _ = h.Write([]byte(cacheSalt)) + } + prevBlockHash := BlockHash(h.Sum64()) for i := 0; i+cacheBlockSize <= len(userInput); i += cacheBlockSize { h.Reset() diff --git a/pkg/epp/scheduling/types/types.go b/pkg/epp/scheduling/types/types.go index 2685a22d0..e542de779 100644 --- a/pkg/epp/scheduling/types/types.go +++ b/pkg/epp/scheduling/types/types.go @@ -56,6 +56,18 @@ type LLMRequestBody struct { ChatCompletions *ChatCompletionsRequest `json:"chat_completions,omitempty"` } +func (r *LLMRequestBody) GetCacheSalt() string { + if r.ChatCompletions == nil && r.Completions == nil { + return "" + } + + if r.ChatCompletions != nil { + return r.ChatCompletions.CacheSalt + } + + return r.Completions.CacheSalt +} + // CompletionsRequest is a structured representation of the fields we parse out of the // /v1/completions request body. // This struct includes fields usable for plugins and scheduling decisions - and not the entire @@ -63,6 +75,8 @@ type LLMRequestBody struct { type CompletionsRequest struct { // Prompt is the prompt that was sent in the request body. Prompt string `json:"prompt,omitempty"` + // CacheSalt is parameters from the vLLM security feature. + CacheSalt string `json:"cache_salt,omitempty"` } func (r *CompletionsRequest) String() string { @@ -88,6 +102,8 @@ type ChatCompletionsRequest struct { ContinueFinalMessage bool `json:"continue_final_message,omitempty"` AddGenerationPrompt bool `json:"add_generation_prompt,omitempty"` ChatTemplateKWArgs map[string]interface{} `json:"chat_template_kwargs,omitempty"` + /* parameters from the vLLM security feature */ + CacheSalt string `json:"cache_salt,omitempty"` } func (r *ChatCompletionsRequest) String() string { diff --git a/pkg/epp/util/request/body_test.go b/pkg/epp/util/request/body_test.go index 64ab6de11..51389e561 100644 --- a/pkg/epp/util/request/body_test.go +++ b/pkg/epp/util/request/body_test.go @@ -225,6 +225,44 @@ func TestExtractRequestData(t *testing.T) { }, wantErr: true, }, + { + name: "completions request with cache_salt", + body: map[string]any{ + "model": "test", + "prompt": "test prompt", + "cache_salt": "Z3V2bmV3aGxza3ZubGFoZ3Zud3V3ZWZ2bmd0b3V2bnZmc2xpZ3RoZ2x2aQ==", + }, + want: &types.LLMRequestBody{ + Completions: &types.CompletionsRequest{ + Prompt: "test prompt", + CacheSalt: "Z3V2bmV3aGxza3ZubGFoZ3Zud3V3ZWZ2bmd0b3V2bnZmc2xpZ3RoZ2x2aQ==", + }, + }, + }, + { + name: "chat completions request with cache_salt", + body: map[string]any{ + "model": "test", + "messages": []any{ + map[string]any{ + "role": "system", "content": "this is a system message", + }, + map[string]any{ + "role": "user", "content": "hello", + }, + }, + "cache_salt": "Z3V2bmV3aGxza3ZubGFoZ3Zud3V3ZWZ2bmd0b3V2bnZmc2xpZ3RoZ2x2aQ==", + }, + want: &types.LLMRequestBody{ + ChatCompletions: &types.ChatCompletionsRequest{ + Messages: []types.Message{ + {Role: "system", Content: "this is a system message"}, + {Role: "user", Content: "hello"}, + }, + CacheSalt: "Z3V2bmV3aGxza3ZubGFoZ3Zud3V3ZWZ2bmd0b3V2bnZmc2xpZ3RoZ2x2aQ==", + }, + }, + }, } for _, tt := range tests { From b3685de98e34d982c287150055632dc3d3e9b0ad Mon Sep 17 00:00:00 2001 From: Murphy Chen Date: Fri, 26 Sep 2025 15:31:39 +0800 Subject: [PATCH 2/3] Apply suggestions from code review Co-authored-by: Cong Liu --- pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go | 2 +- pkg/epp/scheduling/types/types.go | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go index 8267e7322..ee7ceaeef 100644 --- a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go +++ b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go @@ -257,7 +257,7 @@ func (p *Plugin) matchLongestPrefix(ctx context.Context, hashes []BlockHash) map } // hashPrompt divides the prompt into blocks and calculate the prefix cache for each block. -// hash(0) is the hash of the model name and cache_salt(if provided), since different models generally don't share prefix cache. +// hash[0] is calculated including the model name and cache_salt(if provided), since different models generally don't share prefix cache. // For block i, hash(i) = hash(block i content, hash(i-1)). func hashPrompt(ctx context.Context, request *types.LLMRequest, cacheBlockSize int, maxPrefixBlocks int) []BlockHash { loggerDebug := log.FromContext(ctx).V(logutil.DEBUG) diff --git a/pkg/epp/scheduling/types/types.go b/pkg/epp/scheduling/types/types.go index e542de779..c93f0c5ac 100644 --- a/pkg/epp/scheduling/types/types.go +++ b/pkg/epp/scheduling/types/types.go @@ -56,7 +56,7 @@ type LLMRequestBody struct { ChatCompletions *ChatCompletionsRequest `json:"chat_completions,omitempty"` } -func (r *LLMRequestBody) GetCacheSalt() string { +func (r *LLMRequestBody) CacheSalt() string { if r.ChatCompletions == nil && r.Completions == nil { return "" } @@ -75,7 +75,7 @@ func (r *LLMRequestBody) GetCacheSalt() string { type CompletionsRequest struct { // Prompt is the prompt that was sent in the request body. Prompt string `json:"prompt,omitempty"` - // CacheSalt is parameters from the vLLM security feature. + // CacheSalt is an optional request parameter to isolate prefix caches for security reasons. CacheSalt string `json:"cache_salt,omitempty"` } @@ -102,7 +102,7 @@ type ChatCompletionsRequest struct { ContinueFinalMessage bool `json:"continue_final_message,omitempty"` AddGenerationPrompt bool `json:"add_generation_prompt,omitempty"` ChatTemplateKWArgs map[string]interface{} `json:"chat_template_kwargs,omitempty"` - /* parameters from the vLLM security feature */ + // CacheSalt is an optional request parameter to isolate prefix caches for security reasons. CacheSalt string `json:"cache_salt,omitempty"` } From 2f5ac56241eabf305ea1ed4a924f874134d4ced5 Mon Sep 17 00:00:00 2001 From: Murphy Chen Date: Fri, 26 Sep 2025 15:35:15 +0800 Subject: [PATCH 3/3] fix lint --- pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go index ee7ceaeef..287028833 100644 --- a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go +++ b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go @@ -286,7 +286,7 @@ func hashPrompt(ctx context.Context, request *types.LLMRequest, cacheBlockSize i // Add the model to the first block hash so that different models have different hashes even with the same body. h := xxhash.New() _, _ = h.Write([]byte(request.TargetModel)) - if cacheSalt := request.Body.GetCacheSalt(); cacheSalt != "" { + if cacheSalt := request.Body.CacheSalt(); cacheSalt != "" { _, _ = h.Write([]byte(cacheSalt)) }