Commit 3dd33b7

Switch to the new default scheduler plugins in integration test (#1291)
1 parent 19dd14e commit 3dd33b7

3 files changed: +46 -73 lines

pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go
Lines changed: 6 additions & 0 deletions

@@ -55,6 +55,12 @@ const (
 	PrefixCachePluginType = "prefix-cache-scorer"
 )
 
+var DefaultConfig = Config{
+	HashBlockSize:          DefaultHashBlockSize,
+	MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks,
+	LRUCapacityPerServer:   DefaultLRUCapacityPerServer,
+}
+
 type Config struct {
 	// The input prompt is broken into sizes of HashBlockSize to calculate block hashes. Requests
 	// with length shorter than the block size will be ignored.
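
Note (not part of the commit): the new package-level DefaultConfig lets callers start from the defaults instead of spelling out every field. A minimal sketch of how a caller might consume it, using only what this diff shows (prefix.New and the three Config fields); the override value is hypothetical:

	package main

	import (
		"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix"
	)

	func main() {
		// Copy the defaults introduced in this commit, then override a
		// single field. 50_000 is a made-up capacity for illustration,
		// not a recommended value.
		cfg := prefix.DefaultConfig
		cfg.LRUCapacityPerServer = 50_000
		_ = prefix.New(cfg) // same constructor the updated tests call
	}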

pkg/epp/scheduling/scheduler_test.go
Lines changed: 18 additions & 33 deletions

@@ -27,43 +27,27 @@ import (
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
 	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" // Import config for thresholds
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/filter"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/picker"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/scorer"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
 )
 
 // Tests the default scheduler configuration and expected behavior.
 func TestSchedule(t *testing.T) {
-	loraAffinityFilter := filter.NewLoraAffinityFilter(filter.DefaultLoraAffinityThreshold)
-	leastQueueFilter := filter.NewLeastQueueFilter()
-	leastKvCacheFilter := filter.NewLeastKVCacheFilter()
-
-	lowLatencyFilter := &filter.DecisionTreeFilter{
-		Current: filter.NewLowQueueFilter(filter.DefaultQueueingThresholdLoRA),
-		NextOnSuccess: &filter.DecisionTreeFilter{
-			Current: loraAffinityFilter,
-			NextOnSuccessOrFailure: &filter.DecisionTreeFilter{
-				Current: leastQueueFilter,
-				NextOnSuccessOrFailure: &filter.DecisionTreeFilter{
-					Current: leastKvCacheFilter,
-				},
-			},
-		},
-		NextOnFailure: &filter.DecisionTreeFilter{
-			Current: leastQueueFilter,
-			NextOnSuccessOrFailure: &filter.DecisionTreeFilter{
-				Current: loraAffinityFilter,
-				NextOnSuccessOrFailure: &filter.DecisionTreeFilter{
-					Current: leastKvCacheFilter,
-				},
-			},
-		},
-	}
+	kvCacheUtilizationScorer := scorer.NewKVCacheUtilizationScorer()
+	queueingScorer := scorer.NewQueueScorer()
+	prefixCacheScorer := prefix.New(prefix.DefaultConfig)
+	loraAffinityScorer := scorer.NewLoraAffinityScorer()
 
 	defaultProfile := framework.NewSchedulerProfile().
-		WithFilters(lowLatencyFilter).
-		WithPicker(picker.NewRandomPicker(picker.DefaultMaxNumOfEndpoints))
+		WithScorers(framework.NewWeightedScorer(kvCacheUtilizationScorer, 1),
+			framework.NewWeightedScorer(queueingScorer, 1),
+			framework.NewWeightedScorer(prefixCacheScorer, 1),
+			framework.NewWeightedScorer(loraAffinityScorer, 1),
+		).
+		WithPicker(picker.NewMaxScorePicker(picker.DefaultMaxNumOfEndpoints))
 
 	profileHandler := profile.NewSingleProfileHandler()
 
@@ -110,8 +94,8 @@ func TestSchedule(t *testing.T) {
 			&types.PodMetrics{
 				Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}},
 				MetricsState: &backendmetrics.MetricsState{
-					WaitingQueueSize:    3,
-					KVCacheUsagePercent: 0.1,
+					WaitingQueueSize:    0,
+					KVCacheUsagePercent: 0.2,
 					MaxActiveModels:     2,
 					ActiveModels: map[string]int{
 						"foo": 1,
@@ -123,7 +107,7 @@ func TestSchedule(t *testing.T) {
 				Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}},
 				MetricsState: &backendmetrics.MetricsState{
 					WaitingQueueSize:    10,
-					KVCacheUsagePercent: 0.2,
+					KVCacheUsagePercent: 0.8,
 					MaxActiveModels:     2,
 					ActiveModels: map[string]int{
 						"foo": 1,
@@ -139,15 +123,16 @@ func TestSchedule(t *testing.T) {
 			Pod: &types.PodMetrics{
 				Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}},
 				MetricsState: &backendmetrics.MetricsState{
-					WaitingQueueSize:    3,
-					KVCacheUsagePercent: 0.1,
+					WaitingQueueSize:    0,
+					KVCacheUsagePercent: 0.2,
 					MaxActiveModels:     2,
 					ActiveModels: map[string]int{
 						"foo":      1,
 						"critical": 1,
 					},
 				},
 			},
+			Score: 2.8,
 		},
 	},
 },
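
Note (not part of the commit): the new expected Score: 2.8 for pod2 falls out of an equal-weight sum over the four scorers. A back-of-the-envelope check, assuming each scorer normalizes to [0, 1] roughly as commented below; these normalizations are inferred for illustration, not quoted from the plugins:

	package main

	import "fmt"

	func main() {
		// Assumed per-scorer values for pod2 (WaitingQueueSize 0,
		// KVCacheUsagePercent 0.2, target model active, cold prefix cache):
		kvCache := 1 - 0.2 // assumed: 1 - KVCacheUsagePercent
		queue := 1.0       // assumed: the emptiest queue normalizes to 1
		prefixHit := 0.0   // assumed: no cached prefix for a fresh request
		lora := 1.0        // assumed: requested model already active

		// Every scorer is registered with weight 1 in the new default profile.
		total := 1*kvCache + 1*queue + 1*prefixHit + 1*lora
		fmt.Println(total) // 2.8 — the Score the updated test asserts for pod2
	}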

test/integration/epp/hermetic_test.go
Lines changed: 22 additions & 40 deletions

@@ -67,9 +67,10 @@ import (
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/saturationdetector"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/filter"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/picker"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/scorer"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 	epptestutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing"
@@ -249,15 +250,15 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) {
 			),
 		},
 		{
-			name:     "select no lora despite active model, avoid excessive queue size",
+			name:     "select lora despite higher kv cache usage",
 			requests: integrationutils.GenerateStreamedRequestSet(logger, "test3", modelSQLLora, nil),
 			// Pod 2 will be picked despite NOT having the requested model active as it is above the affinity for queue size.
 			// Also it is critical, so we should still admit the request despite all queue sizes being greater than the queue
 			// size threshold.
 			pods: newPodStates(
 				podState{index: 0, queueSize: 10, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar"}},
-				podState{index: 1, queueSize: 200, kvCacheUsage: 0.1, activeModels: []string{"foo", modelSQLLoraTarget}},
-				podState{index: 2, queueSize: 6, kvCacheUsage: 0.2, activeModels: []string{"foo"}},
+				podState{index: 1, queueSize: 10, kvCacheUsage: 0.4, activeModels: []string{"foo", modelSQLLoraTarget}},
+				podState{index: 2, queueSize: 10, kvCacheUsage: 0.3, activeModels: []string{"foo"}},
 			),
 			wantMetrics: map[string]string{
 				"inference_model_request_total": inferenceObjectiveRequestTotal([]label{
@@ -267,7 +268,7 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) {
 			},
 			wantErr: false,
 			wantResponses: integrationutils.NewRequestBufferedResponse(
-				"192.168.1.3:8000",
+				"192.168.1.2:8000",
 				fmt.Sprintf(`{"max_tokens":100,"model":%q,"prompt":"test3","temperature":0}`, modelSQLLoraTarget),
 				&configPb.HeaderValueOption{
 					Header: &configPb.HeaderValue{
@@ -301,7 +302,7 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) {
 			// Pod 1 will be picked because it has relatively low queue size and low KV cache.
 			pods: newPodStates(
 				podState{index: 0, queueSize: 4, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar", modelSheddableTarget}},
-				podState{index: 1, queueSize: 0, kvCacheUsage: 0.85, activeModels: []string{"foo", modelSheddableTarget}},
+				podState{index: 1, queueSize: 4, kvCacheUsage: 0.85, activeModels: []string{"foo", modelSheddableTarget}},
 				podState{index: 2, queueSize: 10, kvCacheUsage: 0.9, activeModels: []string{"foo", modelSheddableTarget}},
 			),
 			wantMetrics: map[string]string{
@@ -312,7 +313,7 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) {
 			},
 			wantErr: false,
 			wantResponses: integrationutils.NewRequestBufferedResponse(
-				"192.168.1.2:8000",
+				"192.168.1.1:8000",
 				fmt.Sprintf(`{"max_tokens":100,"model":%q,"prompt":"test5","temperature":0}`, modelSheddableTarget),
 				&configPb.HeaderValueOption{
 					Header: &configPb.HeaderValue{
@@ -353,7 +354,7 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) {
 			// Pod 1 will be picked because it has relatively low queue size and low KV cache.
 			pods: newPodStates(
 				podState{index: 0, queueSize: 4, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar", modelSheddableTarget}},
-				podState{index: 1, queueSize: 0, kvCacheUsage: 0.85, activeModels: []string{"foo", modelSheddableTarget}},
+				podState{index: 1, queueSize: 4, kvCacheUsage: 0.85, activeModels: []string{"foo", modelSheddableTarget}},
 				podState{index: 2, queueSize: 10, kvCacheUsage: 0.9, activeModels: []string{"foo", modelSheddableTarget}},
 			),
 			wantMetrics: map[string]string{
@@ -364,7 +365,7 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) {
 			},
 			wantErr: false,
 			wantResponses: integrationutils.NewRequestBufferedResponse(
-				"192.168.1.2:8000",
+				"192.168.1.1:8000",
 				fmt.Sprintf(`{"max_tokens":100,"model":%q,"prompt":"test6","temperature":0}`, modelSheddableTarget),
 				&configPb.HeaderValueOption{
 					Header: &configPb.HeaderValue{
@@ -402,9 +403,7 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) {
 				},
 			},
-			// pod 0: selected
-			// pod 1: excluded; above KV cache threshold
-			// pod 2: excluded; above queue size threshold
+			// pod 0: selected due to low queue size and kv cache usage
 			pods: newPodStates(
 				podState{index: 0, queueSize: 4, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar", modelSheddableTarget}},
 				podState{index: 1, queueSize: 0, kvCacheUsage: 0.85, activeModels: []string{"foo", modelSheddableTarget}},
@@ -418,7 +417,7 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) {
 			},
 			wantErr: false,
 			wantResponses: integrationutils.NewRequestBufferedResponse(
-				"192.168.1.2:8000",
+				"192.168.1.1:8000",
 				fmt.Sprintf(`{"max_tokens":100,"model":%q,"prompt":"test6","temperature":0}`, modelDirect),
 				&configPb.HeaderValueOption{
 					Header: &configPb.HeaderValue{
@@ -1090,35 +1089,18 @@ func BeforeSuite() func() {
 	serverRunner.PoolNamespacedName = types.NamespacedName{Name: testPoolName, Namespace: testNamespace}
 	serverRunner.Datastore = datastore.NewDatastore(context.Background(), pmf)
 
-	loraAffinityFilter := filter.NewLoraAffinityFilter(filter.DefaultLoraAffinityThreshold)
-	leastQueueFilter := filter.NewLeastQueueFilter()
-	leastKvCacheFilter := filter.NewLeastKVCacheFilter()
-
-	lowLatencyFilter := &filter.DecisionTreeFilter{
-		Current: filter.NewLowQueueFilter(filter.DefaultQueueingThresholdLoRA),
-		NextOnSuccess: &filter.DecisionTreeFilter{
-			Current: loraAffinityFilter,
-			NextOnSuccessOrFailure: &filter.DecisionTreeFilter{
-				Current: leastQueueFilter,
-				NextOnSuccessOrFailure: &filter.DecisionTreeFilter{
-					Current: leastKvCacheFilter,
-				},
-			},
-		},
-		NextOnFailure: &filter.DecisionTreeFilter{
-			Current: leastQueueFilter,
-			NextOnSuccessOrFailure: &filter.DecisionTreeFilter{
-				Current: loraAffinityFilter,
-				NextOnSuccessOrFailure: &filter.DecisionTreeFilter{
-					Current: leastKvCacheFilter,
-				},
-			},
-		},
-	}
+	kvCacheUtilizationScorer := scorer.NewKVCacheUtilizationScorer()
+	queueingScorer := scorer.NewQueueScorer()
+	prefixCacheScorer := prefix.New(prefix.DefaultConfig)
+	loraAffinityScorer := scorer.NewLoraAffinityScorer()
 
 	defaultProfile := framework.NewSchedulerProfile().
-		WithFilters(lowLatencyFilter).
-		WithPicker(picker.NewRandomPicker(picker.DefaultMaxNumOfEndpoints))
+		WithScorers(framework.NewWeightedScorer(kvCacheUtilizationScorer, 1),
+			framework.NewWeightedScorer(queueingScorer, 1),
+			framework.NewWeightedScorer(prefixCacheScorer, 1),
+			framework.NewWeightedScorer(loraAffinityScorer, 1),
+		).
+		WithPicker(picker.NewMaxScorePicker(picker.DefaultMaxNumOfEndpoints))
 
 	profileHandler := profile.NewSingleProfileHandler()
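
Note (not part of the commit): with picker.NewMaxScorePicker replacing picker.NewRandomPicker, the expected endpoints become deterministic: the picker takes the pod with the highest weighted total instead of sampling among filter survivors, which is why the wantResponses addresses change. A toy re-enactment of the renamed "select lora despite higher kv cache usage" case, under the same assumed normalizations as the earlier sketch; the queue and prefix scorers tie across all three pods here and are omitted:

	package main

	import "fmt"

	type pod struct {
		addr       string
		kvUsage    float64
		loraActive bool
	}

	func main() {
		// Pod states from the updated test case; all queues are equal (10).
		pods := []pod{
			{"192.168.1.1:8000", 0.2, false},
			{"192.168.1.2:8000", 0.4, true}, // has modelSQLLoraTarget active
			{"192.168.1.3:8000", 0.3, false},
		}

		best, bestScore := "", -1.0
		for _, p := range pods {
			score := 1 - p.kvUsage // assumed KV-cache scorer: 1 - usage
			if p.loraActive {
				score++ // assumed LoRA-affinity scorer contributes 1
			}
			if score > bestScore {
				best, bestScore = p.addr, score
			}
		}
		fmt.Println(best) // 192.168.1.2:8000 — the address the test now expects
	}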
