Skip to content

Commit aecee68

Browse files
authored
- split the prefix-cache scorers (#323)
- fixed typos install for `make lint` Signed-off-by: Maroon Ayoub <[email protected]>
1 parent b9c56d3 commit aecee68

File tree

10 files changed

+80
-216
lines changed

10 files changed

+80
-216
lines changed

Makefile

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,27 @@ EPP_TAG ?= dev
1313
IMG = $(IMAGE_TAG_BASE):$(EPP_TAG)
1414
NAMESPACE ?= hc4ai-operator
1515

16+
# Map go arch to typos arch
17+
ifeq ($(TARGETARCH),amd64)
18+
TYPOS_TARGET_ARCH = x86_64
19+
else ifeq ($(TARGETARCH),arm64)
20+
TYPOS_TARGET_ARCH = aarch64
21+
else
22+
TYPOS_TARGET_ARCH = $(TARGETARCH)
23+
endif
24+
1625
ifeq ($(TARGETOS),darwin)
1726
ifeq ($(TARGETARCH),amd64)
1827
TOKENIZER_ARCH = x86_64
1928
else
2029
TOKENIZER_ARCH = $(TARGETARCH)
2130
endif
31+
TAR_OPTS = --strip-components 1
32+
TYPOS_ARCH = $(TYPOS_TARGET_ARCH)-apple-darwin
2233
else
2334
TOKENIZER_ARCH = $(TARGETARCH)
35+
TAR_OPTS = --wildcards '*/typos'
36+
TYPOS_ARCH = $(TYPOS_TARGET_ARCH)-unknown-linux-musl
2437
endif
2538

2639
CONTAINER_TOOL := $(shell { command -v docker >/dev/null 2>&1 && echo docker; } || { command -v podman >/dev/null 2>&1 && echo podman; } || echo "")
@@ -94,6 +107,7 @@ post-deploy-test: ## Run post deployment tests
94107
lint: check-golangci-lint check-typos ## Run lint
95108
@printf "\033[33;1m==== Running linting ====\033[0m\n"
96109
golangci-lint run
110+
$(TYPOS)
97111

98112
##@ Build
99113

@@ -388,4 +402,3 @@ download-zmq: ## Install ZMQ dependencies based on OS/ARCH
388402
fi; \
389403
echo "✅ ZMQ dependencies installed."; \
390404
fi
391-

Makefile.tools.mk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,5 @@ TYPOS_VERSION ?= v1.34.0
1212
typos: $(TYPOS)
1313
$(TYPOS): | $(LOCALBIN)
1414
@echo "Downloading typos $(TYPOS_VERSION)..."
15-
curl -L https://github.com/crate-ci/typos/releases/download/$(TYPOS_VERSION)/typos-$(TYPOS_VERSION)-x86_64-unknown-linux-musl.tar.gz | tar -xz -C $(LOCALBIN) --wildcards '*/typos'
15+
curl -L https://github.com/crate-ci/typos/releases/download/$(TYPOS_VERSION)/typos-$(TYPOS_VERSION)-$(TYPOS_ARCH).tar.gz | tar -xz -C $(LOCALBIN) $(TAR_OPTS)
1616
chmod +x $(TYPOS)

deploy/config/epp-prefix-estimate-config.yaml renamed to deploy/config/epp-estimate-prefix-cache-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ schedulingProfiles:
1313
plugins:
1414
- pluginRef: decode-filter
1515
- pluginRef: prefix-cache-scorer
16-
weight: 2.0
16+
weight: 1.0
1717
- pluginRef: load-aware-scorer
1818
weight: 1.0
1919
- pluginRef: max-score-picker

deploy/config/epp-prefix-cache-tracking-config.yaml renamed to deploy/config/epp-precise-prefix-cache-config.yaml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,8 @@ kind: EndpointPickerConfig
55
plugins:
66
- type: single-profile-handler
77
- type: decode-filter
8-
- type: prefix-cache-scorer
8+
- type: precise-prefix-cache-scorer
99
parameters:
10-
mode: cache_tracking
1110
indexerConfig:
1211
tokenProcessorConfig:
1312
blockSize: 64 # must match vLLM block size
@@ -21,8 +20,8 @@ schedulingProfiles:
2120
- name: default
2221
plugins:
2322
- pluginRef: decode-filter
24-
- pluginRef: prefix-cache-scorer
25-
weight: 3.0
23+
- pluginRef: precise-prefix-cache-scorer
24+
weight: 2.0
2625
- pluginRef: kv-cache-scorer
2726
weight: 1.0
2827
- pluginRef: queue-scorer

docs/architecture.md

Lines changed: 11 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ The design enables:
1212

1313
- Support for **multiple base models** within a shared cluster [Not supported in
1414
Phase1]
15-
- Efficient routing based on **KV cache locality**, **prefix**, **session affinity**, **load**, and
15+
- Efficient routing based on **KV cache locality**, **session affinity**, **load**, and
1616
**model metadata**
1717
- Disaggregated **Prefill/Decode (P/D)** execution
1818
- Pluggable **filters**, **scorers**, and **scrapers** for extensible routing
@@ -245,29 +245,14 @@ Filters out pods that are not marked as prefill. The filter looks for the label
245245

246246
---
247247

248-
#### PrefixCacheScorer
248+
#### PrecisePrefixCacheScorer
249249

250-
The `prefix-cache-scorer` scores a request based on KV-cache localities.
251-
It supports two modes: `estimate` and `cache_tracking`.
252-
253-
##### `estimate` mode (default):
254-
255-
This mode uses the default GIE prefix scorer and scores pods based on the estimated cache locality of the prompt.
256-
The estimation is based on scheduling history.
257-
258-
- **Type**: `prefix-cache-scorer`
259-
- **Parameters**:
260-
- `hashBlockSize`: Specifies the size of the blocks used to split the input **prompt** when calculating block hashes. Defaults to `64` if not specified.
261-
- `maxPrefixBlocksToMatch`: Specifies the maximum number of prefix blocks to match. Defaults to `256` if not specified.
262-
- `lruCapacityPerServer`: Specifies the capacity of the LRU indexer, in number of entries per server (pod). Defaults to `31,250` if not specified.
263-
264-
**Note:** `mode: estimate` is not required, as it is the default.
265-
266-
##### `cache_tracking` mode:
267-
268-
This mode scores requests based on the actual KV-cache states across the vLLM instances.
269-
It is more accurate than both `SessionAffinity` and `PrefixCachePlugin` in `estimate` mode,
270-
but incurs additional computation overhead and KV-Events streaming to track the KV-cache states.
250+
The `precise-prefix-cache-scorer` scores a request based on KV-cache localities.
251+
Like the IGW `prefix-cache-scorer`, it provides a score based on the number of
252+
matching KV-cache blocks between the request's prompt and the KV-cache contents of each pod.
253+
However, unlike the IGW `prefix-cache-scorer`, which relies on estimations based on scheduling history,
254+
the `precise-prefix-cache-scorer` tracks the real-time KV-cache states across the vLLM instances to
255+
provide more accurate scoring.
271256

272257
When enabled, the scorer will use the `llm-d-kv-cache-manager` to track the KV-cache states
273258
across the vLLM instances. It will use the `kvcache.Indexer` to score the pods based on the
@@ -276,9 +261,8 @@ When enabled, the scorer will use the `llm-d-kv-cache-manager` to track the KV-c
276261

277262
Configuration:
278263

279-
- **Type**: `prefix-cache-scorer`
264+
- **Type**: `precise-prefix-cache-scorer`
280265
- **Parameters**:
281-
- `mode: cache_tracking`
282266
- `indexerConfig`: Configuration for the `kvcache.Indexer`.
283267
- `kvEventsConfig`: Configuration for the `kvevents.Pool`.
284268

@@ -294,7 +278,7 @@ Example configuration with the above parameters set:
294278

295279
```yaml
296280
plugins:
297-
- type: prefix-cache-scorer
281+
- type: precise-prefix-cache-scorer
298282
parameters:
299283
indexerConfig:
300284
tokenProcessorConfig:
@@ -310,7 +294,7 @@ Example configuration with all parameters set:
310294
311295
```yaml
312296
plugins:
313-
- type: prefix-cache-scorer
297+
- type: precise-prefix-cache-scorer
314298
parameters:
315299
kvEventsConfig:
316300
zmqEndpoint: tcp://*:5557

pkg/plugins/register.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
package plugins
22

33
import (
4-
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
5-
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix"
6-
74
"github.com/llm-d/llm-d-inference-scheduler/pkg/plugins/filter"
85
prerequest "github.com/llm-d/llm-d-inference-scheduler/pkg/plugins/pre-request"
96
"github.com/llm-d/llm-d-inference-scheduler/pkg/plugins/profile"
107
"github.com/llm-d/llm-d-inference-scheduler/pkg/plugins/scorer"
8+
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
119
)
1210

1311
// RegisterAllPlugins registers the factory functions of all plugins in this repository.
@@ -18,7 +16,7 @@ func RegisterAllPlugins() {
1816
plugins.Register(filter.PrefillRoleType, filter.PrefillRoleFactory)
1917
plugins.Register(prerequest.PrefillHeaderHandlerType, prerequest.PrefillHeaderHandlerFactory)
2018
plugins.Register(profile.PdProfileHandlerType, profile.PdProfileHandlerFactory)
21-
plugins.Register(prefix.PrefixCachePluginType, scorer.PrefixCachePluginFactory)
19+
plugins.Register(scorer.PrecisePrefixCachePluginType, scorer.PrecisePrefixCachePluginFactory)
2220
plugins.Register(scorer.LoadAwareType, scorer.LoadAwareFactory)
2321
plugins.Register(scorer.SessionAffinityType, scorer.SessionAffinityFactory)
2422
plugins.Register(scorer.ActiveRequestType, scorer.ActiveRequestFactory)

pkg/plugins/scorer/prefix_cache_tracking.go renamed to pkg/plugins/scorer/precise_prefix_cache.go

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,18 @@ import (
1111
"sigs.k8s.io/controller-runtime/pkg/log"
1212
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
1313
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
14-
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix"
1514
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
1615
logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
1716
)
1817

19-
// PrefixCacheTrackingConfig holds the configuration for the
20-
// PrefixCacheTracking.
21-
type PrefixCacheTrackingConfig struct {
18+
const (
19+
// PrecisePrefixCachePluginType is the type-name of the PrecisePrefixCacheScorer plugin.
20+
PrecisePrefixCachePluginType = "precise-prefix-cache-scorer"
21+
)
22+
23+
// PrecisePrefixCachePluginConfig holds the configuration for the
24+
// PrecisePrefixCacheScorer plugin.
25+
type PrecisePrefixCachePluginConfig struct {
2226
// IndexerConfig holds the configuration for the `kvcache.Indexer` which is
2327
// used to score pods based on the KV-cache index state.
2428
IndexerConfig *kvcache.Config `json:"indexerConfig"`
@@ -29,13 +33,13 @@ type PrefixCacheTrackingConfig struct {
2933
}
3034

3135
// compile-time type assertion
32-
var _ framework.Scorer = &PrefixCacheTracking{}
36+
var _ framework.Scorer = &PrecisePrefixCacheScorer{}
3337

34-
// PrefixCacheTrackingPluginFactory defines the factory function for creating
38+
// PrecisePrefixCachePluginFactory defines the factory function for creating
3539
// a new instance of the PrecisePrefixCacheScorer plugin.
36-
func PrefixCacheTrackingPluginFactory(name string, rawParameters json.RawMessage,
40+
func PrecisePrefixCachePluginFactory(name string, rawParameters json.RawMessage,
3741
handle plugins.Handle) (plugins.Plugin, error) {
38-
parameters := PrefixCacheTrackingConfig{
42+
parameters := PrecisePrefixCachePluginConfig{
3943
IndexerConfig: kvcache.NewDefaultConfig(),
4044
KVEventsConfig: kvevents.DefaultConfig(),
4145
}
@@ -47,13 +51,13 @@ func PrefixCacheTrackingPluginFactory(name string, rawParameters json.RawMessage
4751

4852
if rawParameters != nil {
4953
if err := json.Unmarshal(rawParameters, &parameters); err != nil {
50-
return nil, fmt.Errorf("failed to parse %s plugin config: %w", prefix.PrefixCachePluginType, err)
54+
return nil, fmt.Errorf("failed to parse %s plugin config: %w", PrecisePrefixCachePluginType, err)
5155
}
5256
}
5357

5458
scorer, err := New(handle.Context(), parameters)
5559
if err != nil {
56-
return nil, fmt.Errorf("failed to create %s plugin: %w", prefix.PrefixCachePluginType, err)
60+
return nil, fmt.Errorf("failed to create %s plugin: %w", PrecisePrefixCachePluginType, err)
5761
}
5862

5963
return scorer.WithName(name), nil
@@ -68,7 +72,7 @@ func PrefixCacheTrackingPluginFactory(name string, rawParameters json.RawMessage
6872
//
6973
// If the configuration is invalid or if the indexer fails to initialize,
7074
// an error is returned.
71-
func New(ctx context.Context, config PrefixCacheTrackingConfig) (*PrefixCacheTracking, error) {
75+
func New(ctx context.Context, config PrecisePrefixCachePluginConfig) (*PrecisePrefixCacheScorer, error) {
7276
// initialize the indexer
7377
kvCacheIndexer, err := kvcache.NewKVCacheIndexer(ctx, config.IndexerConfig)
7478
if err != nil {
@@ -81,36 +85,36 @@ func New(ctx context.Context, config PrefixCacheTrackingConfig) (*PrefixCacheTra
8185
pool := kvevents.NewPool(config.KVEventsConfig, kvCacheIndexer.KVBlockIndex())
8286
pool.Start(ctx)
8387

84-
return &PrefixCacheTracking{
85-
typedName: plugins.TypedName{Type: prefix.PrefixCachePluginType},
88+
return &PrecisePrefixCacheScorer{
89+
typedName: plugins.TypedName{Type: PrecisePrefixCachePluginType},
8690
kvCacheIndexer: kvCacheIndexer,
8791
}, nil
8892
}
8993

90-
// PrefixCacheTracking implements the framework.Scorer interface.
91-
// The scorer implements the `cache_tracking` mode of the prefix cache plugin.
94+
// PrecisePrefixCacheScorer implements the framework.Scorer interface.
95+
// The scorer implements precise prefix-cache KV-block locality scoring.
9296
// It uses the `kvcache.Indexer` to score pods based on the KV-cache index
9397
// state, and the `kvevents.Pool` to subscribe to KV-cache events
94-
// to update the internal KV-cache index state.
95-
type PrefixCacheTracking struct {
98+
// to keep the internal KV-cache index state up-to-date.
99+
type PrecisePrefixCacheScorer struct {
96100
typedName plugins.TypedName
97101
kvCacheIndexer *kvcache.Indexer
98102
}
99103

100104
// TypedName returns the typed name of the plugin.
101-
func (s *PrefixCacheTracking) TypedName() plugins.TypedName {
105+
func (s *PrecisePrefixCacheScorer) TypedName() plugins.TypedName {
102106
return s.typedName
103107
}
104108

105109
// WithName sets the name of the plugin.
106-
func (s *PrefixCacheTracking) WithName(name string) *PrefixCacheTracking {
110+
func (s *PrecisePrefixCacheScorer) WithName(name string) *PrecisePrefixCacheScorer {
107111
s.typedName.Name = name
108112
return s
109113
}
110114

111115
// Score scores the provided pods based on the KVCache index state.
112116
// The returned scores are normalized to a range of 0-1.
113-
func (s *PrefixCacheTracking) Score(ctx context.Context, _ *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
117+
func (s *PrecisePrefixCacheScorer) Score(ctx context.Context, _ *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
114118
loggerDebug := log.FromContext(ctx).WithName(s.typedName.String()).V(logutil.DEBUG)
115119
if request == nil {
116120
loggerDebug.Info("Request is nil, skipping scoring")

pkg/plugins/scorer/prefix_cache.go

Lines changed: 0 additions & 61 deletions
This file was deleted.

0 commit comments

Comments
 (0)