Rename prefix scorer HashBlockSize to BlockSize (#1613)

Frapschen · web-flow · commit f4ab8c68750a · 2025-09-19T12:00:14.000-07:00
diff --git a/pkg/epp/config/loader/configloader_test.go b/pkg/epp/config/loader/configloader_test.go
@@ -73,7 +73,7 @@ func TestLoadRawConfiguration(t *testing.T) {
 			},
 			{
 				Type:       test2Type,
-				Parameters: json.RawMessage("{\"hashBlockSize\":32}"),
+				Parameters: json.RawMessage("{\"blockSize\":32}"),
 			},
 			{
 				Name: "testPicker",
@@ -175,7 +175,7 @@ func TestLoadRawConfigurationWithDefaults(t *testing.T) {
 			{
 				Name:       test2Type,
 				Type:       test2Type,
-				Parameters: json.RawMessage("{\"hashBlockSize\":32}"),
+				Parameters: json.RawMessage("{\"blockSize\":32}"),
 			},
 			{
 				Name: "testPicker",
@@ -464,7 +464,7 @@ plugins:
   type: test-profile-handler
 - type: test-two
   parameters:
-    hashBlockSize: 32
+    blockSize: 32
 - name: testPicker
   type: test-picker
 schedulingProfiles:
@@ -767,7 +767,7 @@ plugins:
 - name: prefixCacheScorer
   type: prefix-cache-scorer
   parameters:
-    hashBlockSize: 32
+    blockSize: 32
 - name: maxScorePicker
   type: max-score-picker
 - name: profileHandler
@@ -792,7 +792,7 @@ plugins:
 - name: prefixCacheScorer
   type: prefix-cache-scorer
   parameters:
-    hashBlockSize: 32
+    blockSize: 32
 schedulingProfiles:
 - name: default
   plugins:
@@ -826,7 +826,7 @@ plugins:
 - name: prefixCacheScorer
   type: prefix-cache-scorer
   parameters:
-    hashBlockSize: asdf
+    blockSize: asdf
 schedulingProfiles:
 - name: default
   plugins:
diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go
@@ -37,7 +37,7 @@ import (
 
 const (
 	// vLLM default token block size is 16, and a good guess of average characters per token is 4.
-	DefaultHashBlockSize = 64
+	DefaultBlockSize = 64
 	// The maximum number of blocks to match. Two long requests with the same prefix up to this
 	// limit will be indistinguishable.
 	// This parameter provides a trade-off between cache size, prefix matching speed and matching
@@ -58,15 +58,15 @@ const (
 )
 
 var DefaultConfig = Config{
-	HashBlockSize:          DefaultHashBlockSize,
+	BlockSize:              DefaultBlockSize,
 	MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks,
 	LRUCapacityPerServer:   DefaultLRUCapacityPerServer,
 }
 
 type Config struct {
-	// The input prompt is broken into sizes of HashBlockSize to calculate block hashes . Requests
+	// The input prompt is broken into blocks of size BlockSize to calculate block hashes. Requests
 	// with length shorter than the block size will be ignored.
-	HashBlockSize int `json:"hashBlockSize"`
+	BlockSize int `json:"blockSize"`
 	// MaxPrefixBlocksToMatch is the maximum number of prefix blocks to match. Input beyond this limit will
 	// be ignored.
 	MaxPrefixBlocksToMatch int `json:"maxPrefixBlocksToMatch"`
@@ -133,7 +133,7 @@ var (
 // PrefixCachePluginFactory defines the factory function for Prefix plugin.
 func PrefixCachePluginFactory(name string, rawParameters json.RawMessage, handle plugins.Handle) (plugins.Plugin, error) {
 	parameters := Config{
-		HashBlockSize:          DefaultHashBlockSize,
+		BlockSize:              DefaultBlockSize,
 		MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks,
 		LRUCapacityPerServer:   DefaultLRUCapacityPerServer,
 	}
@@ -180,7 +180,7 @@ func (p *Plugin) WithName(name string) *Plugin {
 // Score returns the scoring result for the given list of pods based on context.
 func (p *Plugin) Score(ctx context.Context, cycleState *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
 	// pre score step, hashing prompt and find longest prefix match.
-	hashes := hashPrompt(ctx, request, p.config.HashBlockSize, p.config.MaxPrefixBlocksToMatch)
+	hashes := hashPrompt(ctx, request, p.config.BlockSize, p.config.MaxPrefixBlocksToMatch)
 	state := &SchedulingContextState{
 		PrefixHashes:       hashes,
 		PrefixCacheServers: p.matchLongestPrefix(ctx, hashes),
@@ -231,7 +231,7 @@ func (p *Plugin) PreRequest(ctx context.Context, request *types.LLMRequest, sche
 
 	total := len(state.PrefixHashes)
 	matchLen := state.PrefixCacheServers[ServerID(targetPod.NamespacedName)]
-	metrics.RecordPrefixCacheMatch(matchLen*p.config.HashBlockSize, total*p.config.HashBlockSize)
+	metrics.RecordPrefixCacheMatch(matchLen*p.config.BlockSize, total*p.config.BlockSize)
 }
 
 // matchLongestPrefix returns a map of servers and length of prefix that each server caches.
diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go
@@ -35,7 +35,7 @@ import (
 
 func TestPrefixPluginCompletion(t *testing.T) {
 	config := Config{
-		HashBlockSize:          4,
+		BlockSize:              4,
 		MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks,
 		LRUCapacityPerServer:   DefaultLRUCapacityPerServer,
 	}
@@ -201,7 +201,7 @@ func TestPrefixPluginCompletion(t *testing.T) {
 
 func TestPrefixPluginChatCompletions(t *testing.T) {
 	config := Config{
-		HashBlockSize:          4,
+		BlockSize:              4,
 		MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks,
 		LRUCapacityPerServer:   DefaultLRUCapacityPerServer,
 	}
@@ -235,7 +235,7 @@ func TestPrefixPluginChatCompletions(t *testing.T) {
 
 func TestPrefixPluginChatCompletionsGrowth(t *testing.T) {
 	config := Config{
-		HashBlockSize:          8, // Use larger block size for more predictable JSON marshaling
+		BlockSize:              8, // Use larger block size for more predictable JSON marshaling
 		MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks,
 		LRUCapacityPerServer:   DefaultLRUCapacityPerServer,
 	}
@@ -349,7 +349,7 @@ func BenchmarkPrefixPluginStress(b *testing.B) {
 	blockSize := 4
 	maxPrefixBlocks := 50000
 	config := Config{
-		HashBlockSize:          blockSize,
+		BlockSize:              blockSize,
 		MaxPrefixBlocksToMatch: maxPrefixBlocks,
 		LRUCapacityPerServer:   DefaultLRUCapacityPerServer,
 	}
@@ -418,7 +418,7 @@ func BenchmarkPrefixPluginChatCompletionsStress(b *testing.B) {
 	blockSize := 8
 	maxPrefixBlocks := 50000
 	config := Config{
-		HashBlockSize:          blockSize,
+		BlockSize:              blockSize,
 		MaxPrefixBlocksToMatch: maxPrefixBlocks,
 		LRUCapacityPerServer:   DefaultLRUCapacityPerServer,
 	}
diff --git a/site-src/guides/epp-configuration/config-text.md b/site-src/guides/epp-configuration/config-text.md
@@ -85,7 +85,7 @@ kind: EndpointPickerConfig
 plugins:
 - type: prefix-cache-scorer
   parameters:
-    hashBlockSize: 5
+    blockSize: 5
     maxPrefixBlocksToMatch: 256
     lruCapacityPerServer: 31250
 schedulingProfiles:
@@ -152,7 +152,7 @@ spec:
           plugins:
           - type: prefix-cache-scorer
             parameters:
-              hashBlockSize: 5
+              blockSize: 5
               maxPrefixBlocksToMatch: 256
               lruCapacityPerServer: 31250
           schedulingProfiles:
@@ -171,7 +171,7 @@ kind: EndpointPickerConfig
 plugins:
 - type: prefix-cache-scorer
   parameters:
-    hashBlockSize: 5
+    blockSize: 5
     maxPrefixBlocksToMatch: 256
     lruCapacityPerServer: 31250
 - type: single-profile-handler
@@ -201,7 +201,7 @@ Scores pods based on the amount of the prompt is believed to be in the pod's KvC
 
 - *Type*: prefix-cache-scorer
 - *Parameters*:
-  - `hashBlockSize` specified the size of the blocks to break up the input prompt when
+  - `blockSize` specifies the size of the blocks to break up the input prompt when
     calculating the block hashes. If not specified defaults to `64`
   - `maxPrefixBlocksToMatch` specifies the maximum number of prefix blocks to match. If
    not specified defaults to `256`
diff --git a/site-src/guides/epp-configuration/prefix-aware.md b/site-src/guides/epp-configuration/prefix-aware.md
@@ -14,7 +14,7 @@ Like any other plugins, the prefix cache aware plugin can be enabled/disabled vi
 
 The prefix cache plugin exposes the following advanced configuration parameters:
 
-* `hashBlockSize`: The plugin matches prefixes in the unit of blocks. This is the size
+* `blockSize`: The plugin matches prefixes in units of blocks. This is the size
 of each block in number of bytes. vLLM default block size is 16 tokens. Assume 4 characters per token, the default
 is set to 64 in EPP. The default is recommended unless performance is critical for use cases with
 extremely long inputs.
diff --git a/test/testdata/configloader_1_test.yaml b/test/testdata/configloader_1_test.yaml
@@ -9,7 +9,7 @@ plugins:
   type: test-profile-handler
 - type: test-two
   parameters:
-    hashBlockSize: 32
+    blockSize: 32
 - name: testPicker
   type: test-picker