Skip to content

Commit 6281a31

Browse files
committed
Merge branch 'main' into conformance
2 parents c84fae3 + 33cda4b commit 6281a31

File tree

37 files changed

+1401
-1527
lines changed

37 files changed

+1401
-1527
lines changed

Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ RUN go mod download
1919
# Sources
2020
COPY cmd/epp ./cmd
2121
COPY pkg/epp ./pkg/epp
22+
COPY conformance/testing-epp ./conformance/testing-epp
2223
COPY internal ./internal
2324
COPY api ./api
2425
WORKDIR /src/cmd

cmd/epp/main.go

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,15 +35,16 @@ import (
3535
"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
3636
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
3737

38+
conformance_epp "sigs.k8s.io/gateway-api-inference-extension/conformance/testing-epp"
3839
"sigs.k8s.io/gateway-api-inference-extension/internal/runnable"
3940
backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
4041
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
4142
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
4243
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics/collectors"
44+
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
4345
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/saturationdetector"
4446
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
4547
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
46-
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/filter"
4748
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix"
4849
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/picker"
4950
profilepicker "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile-picker"
@@ -111,8 +112,9 @@ var (
111112
setupLog = ctrl.Log.WithName("setup")
112113

113114
// Environment variables
114-
schedulerV2 = envutil.GetEnvBool("EXPERIMENTAL_USE_SCHEDULER_V2", false, setupLog)
115-
prefixCacheScheduling = envutil.GetEnvBool("ENABLE_PREFIX_CACHE_SCHEDULING", false, setupLog)
115+
schedulerV2 = envutil.GetEnvBool("EXPERIMENTAL_USE_SCHEDULER_V2", false, setupLog)
116+
prefixCacheScheduling = envutil.GetEnvBool("ENABLE_PREFIX_CACHE_SCHEDULING", false, setupLog)
117+
reqHeaderBasedSchedulerForTesting = envutil.GetEnvBool("ENABLE_REQ_HEADER_BASED_SCHEDULER_FOR_TESTING", false, setupLog)
116118
)
117119

118120
func loadPrefixCacheConfig() prefix.Config {
@@ -208,7 +210,6 @@ func run() error {
208210
kvCacheScorerWeight := envutil.GetEnvInt("KV_CACHE_SCORE_WEIGHT", scorer.DefaultKVCacheScorerWeight, setupLog)
209211

210212
schedulerProfile := framework.NewSchedulerProfile().
211-
WithFilters(filter.NewSheddableCapacityFilter()).
212213
WithScorers(framework.NewWeightedScorer(&scorer.QueueScorer{}, queueScorerWeight),
213214
framework.NewWeightedScorer(&scorer.KVCacheScorer{}, kvCacheScorerWeight)).
214215
WithPicker(picker.NewMaxScorePicker())
@@ -225,8 +226,14 @@ func run() error {
225226
scheduler = scheduling.NewSchedulerWithConfig(datastore, schedulerConfig)
226227
}
227228

229+
if reqHeaderBasedSchedulerForTesting {
230+
scheduler = conformance_epp.NewReqHeaderBasedScheduler(datastore)
231+
}
232+
228233
saturationDetector := saturationdetector.NewDetector(sdConfig, datastore, ctrl.Log)
229234

235+
director := requestcontrol.NewDirector(datastore, scheduler, saturationDetector) // can call "director.WithPostResponsePlugins" to add post response plugins
236+
230237
// --- Setup ExtProc Server Runner ---
231238
serverRunner := &runserver.ExtProcServerRunner{
232239
GrpcPort: *grpcPort,
@@ -237,7 +244,7 @@ func run() error {
237244
SecureServing: *secureServing,
238245
CertPath: *certPath,
239246
RefreshPrometheusMetricsInterval: *refreshPrometheusMetricsInterval,
240-
Scheduler: scheduler,
247+
Director: director,
241248
SaturationDetector: saturationDetector,
242249
}
243250
if err := serverRunner.SetupWithManager(ctx, mgr); err != nil {

config/charts/inferencepool/README.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,33 @@ $ helm install vllm-llama3-8b-instruct \
2222

2323
Note that the provider name is needed to deploy provider-specific resources. If no provider is specified, then only the InferencePool object and the EPP are deployed.
2424

25+
### Install with Custom Environment Variables
26+
27+
To set custom environment variables for the EndpointPicker deployment:
28+
29+
```txt
30+
$ helm install vllm-llama3-8b-instruct \
31+
--set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
32+
--set provider.name=[none|gke] \
33+
--set inferenceExtension.env.FEATURE_FLAG_ENABLED=true \
34+
oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
35+
```
36+
37+
Alternatively, you can define environment variables in a values file:
38+
39+
```yaml
40+
# values.yaml
41+
inferenceExtension:
42+
env:
43+
FEATURE_FLAG_ENABLED: "true"
44+
```
45+
46+
And apply it with:
47+
48+
```txt
49+
$ helm install vllm-llama3-8b-instruct ./config/charts/inferencepool -f values.yaml
50+
```
51+
2552
### Install for Triton TensorRT-LLM
2653

2754
Use `--set inferencePool.modelServerType=triton-tensorrt-llm` to install for Triton TensorRT-LLM, e.g.,
@@ -57,6 +84,7 @@ The following table lists the configurable parameters of the chart.
5784
| `inferenceExtension.image.tag` | Image tag of the endpoint picker. |
5885
| `inferenceExtension.image.pullPolicy` | Image pull policy for the container. Possible values: `Always`, `IfNotPresent`, or `Never`. Defaults to `Always`. |
5986
| `inferenceExtension.extProcPort` | Port where the endpoint picker service is served for external processing. Defaults to `9002`. |
87+
| `inferenceExtension.env` | Map of environment variables to set in the endpoint picker container. Defaults to `{}`. |
6088
| `provider.name` | Name of the Inference Gateway implementation being used. Possible values: `gke`. Defaults to `none`. |
6189

6290
## Notes

config/charts/inferencepool/templates/epp-deployment.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,3 +62,8 @@ spec:
6262
service: inference-extension
6363
initialDelaySeconds: 5
6464
periodSeconds: 10
65+
env:
66+
{{- range $key, $value := .Values.inferenceExtension.env }}
67+
- name: {{ $key }}
68+
value: {{ $value | quote }}
69+
{{- end }}

config/charts/inferencepool/values.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ inferenceExtension:
66
tag: main
77
pullPolicy: Always
88
extProcPort: 9002
9+
env: {}
10+
# Example environment variables:
11+
# env:
12+
# KV_CACHE_SCORE_WEIGHT: "1"
913

1014
inferencePool:
1115
targetPortNumber: 8000

config/manifests/vllm/sim-deployment.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ spec:
1414
spec:
1515
containers:
1616
- name: vllm-sim
17-
image: ghcr.io/llm-d/llm-d-inference-sim:v0.1.0
17+
image: ghcr.io/llm-d/llm-d-inference-sim:v0.1.1
1818
imagePullPolicy: Always
1919
args:
2020
- --model
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
/*
2+
Copyright 2025 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package filter
18+
19+
import (
20+
"context"
21+
"testing"
22+
23+
"github.com/google/go-cmp/cmp"
24+
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
25+
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
26+
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
27+
)
28+
29+
// TestFilter is a table-driven test for HeaderBasedTestingFilter. It covers three cases:
// the selection header is absent from the request (expects an empty result), the header
// is present but matches no pod address (expects an empty result), and the header matches
// a pod address (expects exactly that pod). Results are compared with cmp.Diff.
func TestFilter(t *testing.T) {
30+
tests := []struct {
31+
name string
32+
req *types.LLMRequest
33+
filter framework.Filter
34+
input []types.Pod
35+
output []types.Pod
36+
}{
37+
{
38+
name: "TestHeaderBasedFilter, header endpoint unset in request",
39+
req: &types.LLMRequest{}, // Deliberately leave the header unset.
40+
filter: &HeaderBasedTestingFilter{},
41+
input: []types.Pod{
42+
&types.PodMetrics{
43+
Pod: &backend.Pod{
44+
Address: "test-endpoint",
45+
},
46+
},
47+
},
48+
output: []types.Pod{},
49+
},
50+
{
51+
name: "TestHeaderBasedFilter, header endpoint set in request but no match",
52+
req: &types.LLMRequest{Headers: map[string]string{headerTestEppEndPointSelectionKey: "test-endpoint"}},
53+
filter: &HeaderBasedTestingFilter{},
54+
input: []types.Pod{
55+
&types.PodMetrics{
56+
Pod: &backend.Pod{
57+
Address: "test-endpoint-unmatch",
58+
},
59+
},
60+
},
61+
output: []types.Pod{},
62+
},
63+
{
64+
name: "TestHeaderBasedFilter, header endpoint set",
65+
req: &types.LLMRequest{Headers: map[string]string{headerTestEppEndPointSelectionKey: "test-endpoint"}},
66+
filter: &HeaderBasedTestingFilter{},
67+
input: []types.Pod{
68+
&types.PodMetrics{
69+
Pod: &backend.Pod{
70+
Address: "test-endpoint",
71+
},
72+
},
73+
},
74+
output: []types.Pod{
75+
&types.PodMetrics{
76+
Pod: &backend.Pod{
77+
Address: "test-endpoint",
78+
},
79+
},
80+
},
81+
},
82+
}
83+
84+
for _, test := range tests {
85+
t.Run(test.name, func(t *testing.T) {
86+
got := test.filter.Filter(context.Background(), test.req, types.NewCycleState(), test.input)
87+
88+
if diff := cmp.Diff(test.output, got); diff != "" {
89+
t.Errorf("Unexpected output (-want +got): %v", diff)
90+
}
91+
})
92+
}
93+
}
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
/*
2+
Copyright 2025 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package filter
18+
19+
import (
20+
"context"
21+
22+
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
23+
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
24+
)
25+
26+
const (
27+
// headerTestEppEndPointSelectionKey is the request header key that
// HeaderBasedTestingFilter looks up; its value is compared against pod addresses.
headerTestEppEndPointSelectionKey = "test-epp-endpoint-selection"
28+
)
29+
30+
// compile-time type assertion
31+
var _ framework.Filter = &HeaderBasedTestingFilter{}
32+
33+
// NewHeaderBasedTestingFilter returns a pointer to a new HeaderBasedTestingFilter.
34+
// It is intended for testing purposes only.
35+
func NewHeaderBasedTestingFilter() *HeaderBasedTestingFilter {
36+
return &HeaderBasedTestingFilter{}
37+
}
38+
39+
// HeaderBasedTestingFilter filters Pods based on an address specified in the "test-epp-endpoint-selection" request header.
40+
type HeaderBasedTestingFilter struct{}
41+
42+
// Name returns the name of the filter, "test-header-based".
43+
func (f *HeaderBasedTestingFilter) Name() string {
44+
return "test-header-based"
45+
}
46+
47+
// Filter filters out pods that doesn't meet the filter criteria.
48+
func (f *HeaderBasedTestingFilter) Filter(_ context.Context, request *types.LLMRequest, _ *types.CycleState, pods []types.Pod) []types.Pod {
49+
filteredPods := []types.Pod{}
50+
51+
endPointInReqeust, found := request.Headers[headerTestEppEndPointSelectionKey]
52+
if !found {
53+
return filteredPods
54+
}
55+
56+
for _, pod := range pods {
57+
if pod.GetPod().Address == endPointInReqeust {
58+
filteredPods = append(filteredPods, pod)
59+
}
60+
}
61+
return filteredPods
62+
}

conformance/testing-epp/scheduler.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/*
2+
Copyright 2025 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package scheduling
18+
19+
import (
20+
"sigs.k8s.io/gateway-api-inference-extension/conformance/testing-epp/plugins/filter"
21+
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
22+
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
23+
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/picker"
24+
profilepicker "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile-picker"
25+
)
26+
27+
// NewReqHeaderBasedScheduler creates a scheduler for conformance tests that selects
28+
// an endpoint based on the "test-epp-endpoint-selection" request header. If the
29+
// header is missing or the specified endpoint doesn't exist, no endpoint is returned.
30+
func NewReqHeaderBasedScheduler(datastore scheduling.Datastore) *scheduling.Scheduler {
31+
predicatableSchedulerProfile := framework.NewSchedulerProfile().WithFilters(filter.NewHeaderBasedTestingFilter()).WithPicker(picker.NewMaxScorePicker())
32+
return scheduling.NewSchedulerWithConfig(datastore, scheduling.NewSchedulerConfig(
33+
profilepicker.NewAllProfilesPicker(), map[string]*framework.SchedulerProfile{"req-header-based-profile": predicatableSchedulerProfile}))
34+
}

0 commit comments

Comments
 (0)