From 0fa56774b6a85ac807fb926a7767235769b3c7cd Mon Sep 17 00:00:00 2001
From: Murphy Chen <minquan.chen@daocloud.io>
Date: Thu, 25 Sep 2025 15:01:01 +0800
Subject: [PATCH 1/3] support vLLM cache salting in prefix aware scorer

---
 .../framework/plugins/multi/prefix/plugin.go  |  6 ++-
 pkg/epp/scheduling/types/types.go             | 16 ++++++++
 pkg/epp/util/request/body_test.go             | 38 +++++++++++++++++++
 3 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go
index 6bc81a44a..8267e7322 100644
--- a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go
+++ b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go
@@ -257,7 +257,7 @@ func (p *Plugin) matchLongestPrefix(ctx context.Context, hashes []BlockHash) map
 }
 
 // hashPrompt divides the prompt into blocks and calculate the prefix cache for each block.
-// hash(0) is the hash of the model name, since different models generally don't share prefix cache.
+// hash(0) is the hash of the model name and cache_salt(if provided), since different models generally don't share prefix cache.
 // For block i, hash(i) = hash(block i content, hash(i-1)).
 func hashPrompt(ctx context.Context, request *types.LLMRequest, cacheBlockSize int, maxPrefixBlocks int) []BlockHash {
 	loggerDebug := log.FromContext(ctx).V(logutil.DEBUG)
@@ -286,6 +286,10 @@ func hashPrompt(ctx context.Context, request *types.LLMRequest, cacheBlockSize i
 	// Add the model to the first block hash so that different models have different hashes even with the same body.
 	h := xxhash.New()
 	_, _ = h.Write([]byte(request.TargetModel))
+	if cacheSalt := request.Body.GetCacheSalt(); cacheSalt != "" {
+		_, _ = h.Write([]byte(cacheSalt))
+	}
+
 	prevBlockHash := BlockHash(h.Sum64())
 	for i := 0; i+cacheBlockSize <= len(userInput); i += cacheBlockSize {
 		h.Reset()
diff --git a/pkg/epp/scheduling/types/types.go b/pkg/epp/scheduling/types/types.go
index 2685a22d0..e542de779 100644
--- a/pkg/epp/scheduling/types/types.go
+++ b/pkg/epp/scheduling/types/types.go
@@ -56,6 +56,18 @@ type LLMRequestBody struct {
 	ChatCompletions *ChatCompletionsRequest `json:"chat_completions,omitempty"`
 }
 
+func (r *LLMRequestBody) GetCacheSalt() string {
+	if r.ChatCompletions == nil && r.Completions == nil {
+		return ""
+	}
+
+	if r.ChatCompletions != nil {
+		return r.ChatCompletions.CacheSalt
+	}
+
+	return r.Completions.CacheSalt
+}
+
 // CompletionsRequest is a structured representation of the fields we parse out of the
 // /v1/completions request body.
 // This struct includes fields usable for plugins and scheduling decisions - and not the entire
@@ -63,6 +75,8 @@ type LLMRequestBody struct {
 type CompletionsRequest struct {
 	// Prompt is the prompt that was sent in the request body.
 	Prompt string `json:"prompt,omitempty"`
+	// CacheSalt is parameters from the vLLM security feature.
+	CacheSalt string `json:"cache_salt,omitempty"`
 }
 
 func (r *CompletionsRequest) String() string {
@@ -88,6 +102,8 @@ type ChatCompletionsRequest struct {
 	ContinueFinalMessage      bool                   `json:"continue_final_message,omitempty"`
 	AddGenerationPrompt       bool                   `json:"add_generation_prompt,omitempty"`
 	ChatTemplateKWArgs        map[string]interface{} `json:"chat_template_kwargs,omitempty"`
+	/* parameters from the vLLM security feature */
+	CacheSalt string `json:"cache_salt,omitempty"`
 }
 
 func (r *ChatCompletionsRequest) String() string {
diff --git a/pkg/epp/util/request/body_test.go b/pkg/epp/util/request/body_test.go
index 64ab6de11..51389e561 100644
--- a/pkg/epp/util/request/body_test.go
+++ b/pkg/epp/util/request/body_test.go
@@ -225,6 +225,44 @@ func TestExtractRequestData(t *testing.T) {
 			},
 			wantErr: true,
 		},
+		{
+			name: "completions request with cache_salt",
+			body: map[string]any{
+				"model":      "test",
+				"prompt":     "test prompt",
+				"cache_salt": "Z3V2bmV3aGxza3ZubGFoZ3Zud3V3ZWZ2bmd0b3V2bnZmc2xpZ3RoZ2x2aQ==",
+			},
+			want: &types.LLMRequestBody{
+				Completions: &types.CompletionsRequest{
+					Prompt:    "test prompt",
+					CacheSalt: "Z3V2bmV3aGxza3ZubGFoZ3Zud3V3ZWZ2bmd0b3V2bnZmc2xpZ3RoZ2x2aQ==",
+				},
+			},
+		},
+		{
+			name: "chat completions request with cache_salt",
+			body: map[string]any{
+				"model": "test",
+				"messages": []any{
+					map[string]any{
+						"role": "system", "content": "this is a system message",
+					},
+					map[string]any{
+						"role": "user", "content": "hello",
+					},
+				},
+				"cache_salt": "Z3V2bmV3aGxza3ZubGFoZ3Zud3V3ZWZ2bmd0b3V2bnZmc2xpZ3RoZ2x2aQ==",
+			},
+			want: &types.LLMRequestBody{
+				ChatCompletions: &types.ChatCompletionsRequest{
+					Messages: []types.Message{
+						{Role: "system", Content: "this is a system message"},
+						{Role: "user", Content: "hello"},
+					},
+					CacheSalt: "Z3V2bmV3aGxza3ZubGFoZ3Zud3V3ZWZ2bmd0b3V2bnZmc2xpZ3RoZ2x2aQ==",
+				},
+			},
+		},
 	}
 
 	for _, tt := range tests {

From b3685de98e34d982c287150055632dc3d3e9b0ad Mon Sep 17 00:00:00 2001
From: Murphy Chen <minquan.chen@daocloud.io>
Date: Fri, 26 Sep 2025 15:31:39 +0800
Subject: [PATCH 2/3] Apply suggestions from code review

Co-authored-by: Cong Liu <conliu@google.com>
---
 pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go | 2 +-
 pkg/epp/scheduling/types/types.go                           | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go
index 8267e7322..ee7ceaeef 100644
--- a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go
+++ b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go
@@ -257,7 +257,7 @@ func (p *Plugin) matchLongestPrefix(ctx context.Context, hashes []BlockHash) map
 }
 
 // hashPrompt divides the prompt into blocks and calculate the prefix cache for each block.
-// hash(0) is the hash of the model name and cache_salt(if provided), since different models generally don't share prefix cache.
+// hash[0] is computed from the model name and the cache_salt (if provided), since different models, or requests with different cache salts, don't share prefix cache.
 // For block i, hash(i) = hash(block i content, hash(i-1)).
 func hashPrompt(ctx context.Context, request *types.LLMRequest, cacheBlockSize int, maxPrefixBlocks int) []BlockHash {
 	loggerDebug := log.FromContext(ctx).V(logutil.DEBUG)
diff --git a/pkg/epp/scheduling/types/types.go b/pkg/epp/scheduling/types/types.go
index e542de779..c93f0c5ac 100644
--- a/pkg/epp/scheduling/types/types.go
+++ b/pkg/epp/scheduling/types/types.go
@@ -56,7 +56,7 @@ type LLMRequestBody struct {
 	ChatCompletions *ChatCompletionsRequest `json:"chat_completions,omitempty"`
 }
 
-func (r *LLMRequestBody) GetCacheSalt() string {
+func (r *LLMRequestBody) CacheSalt() string {
 	if r.ChatCompletions == nil && r.Completions == nil {
 		return ""
 	}
@@ -75,7 +75,7 @@ func (r *LLMRequestBody) GetCacheSalt() string {
 type CompletionsRequest struct {
 	// Prompt is the prompt that was sent in the request body.
 	Prompt string `json:"prompt,omitempty"`
-	// CacheSalt is parameters from the vLLM security feature.
+	// CacheSalt is an optional request parameter to isolate prefix caches for security reasons.
 	CacheSalt string `json:"cache_salt,omitempty"`
 }
 
@@ -102,7 +102,7 @@ type ChatCompletionsRequest struct {
 	ContinueFinalMessage      bool                   `json:"continue_final_message,omitempty"`
 	AddGenerationPrompt       bool                   `json:"add_generation_prompt,omitempty"`
 	ChatTemplateKWArgs        map[string]interface{} `json:"chat_template_kwargs,omitempty"`
-	/* parameters from the vLLM security feature */
+	// CacheSalt is an optional request parameter to isolate prefix caches for security reasons.
 	CacheSalt string `json:"cache_salt,omitempty"`
 }
 

From 2f5ac56241eabf305ea1ed4a924f874134d4ced5 Mon Sep 17 00:00:00 2001
From: Murphy Chen <minquan.chen@daocloud.io>
Date: Fri, 26 Sep 2025 15:35:15 +0800
Subject: [PATCH 3/3] fix lint

---
 pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go
index ee7ceaeef..287028833 100644
--- a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go
+++ b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go
@@ -286,7 +286,7 @@ func hashPrompt(ctx context.Context, request *types.LLMRequest, cacheBlockSize i
 	// Add the model to the first block hash so that different models have different hashes even with the same body.
 	h := xxhash.New()
 	_, _ = h.Write([]byte(request.TargetModel))
-	if cacheSalt := request.Body.GetCacheSalt(); cacheSalt != "" {
+	if cacheSalt := request.Body.CacheSalt(); cacheSalt != "" {
 		_, _ = h.Write([]byte(cacheSalt))
 	}