kubernetes-sigs
diff --git a/‎Dockerfile
Lines changed: 1 addition & 0 deletions b/‎Dockerfile
Lines changed: 1 addition & 0 deletions
diff --git a/‎cmd/epp/main.go
Lines changed: 12 additions & 5 deletions b/‎cmd/epp/main.go
Lines changed: 12 additions & 5 deletions
diff --git a/‎config/charts/inferencepool/README.md
Lines changed: 28 additions & 0 deletions b/‎config/charts/inferencepool/README.md
Lines changed: 28 additions & 0 deletions
diff --git a/‎config/charts/inferencepool/templates/epp-deployment.yaml
Lines changed: 5 additions & 0 deletions b/‎config/charts/inferencepool/templates/epp-deployment.yaml
Lines changed: 5 additions & 0 deletions
diff --git a/‎config/charts/inferencepool/values.yaml
Lines changed: 4 additions & 0 deletions b/‎config/charts/inferencepool/values.yaml
Lines changed: 4 additions & 0 deletions
diff --git a/‎config/manifests/vllm/sim-deployment.yaml
Lines changed: 1 addition & 1 deletion b/‎config/manifests/vllm/sim-deployment.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/‎conformance/testing-epp/plugins/filter/filter_test.go
Lines changed: 93 additions & 0 deletions b/‎conformance/testing-epp/plugins/filter/filter_test.go
Lines changed: 93 additions & 0 deletions
diff --git a/‎conformance/testing-epp/plugins/filter/request_header_based_filter.go
Lines changed: 62 additions & 0 deletions b/‎conformance/testing-epp/plugins/filter/request_header_based_filter.go
Lines changed: 62 additions & 0 deletions
diff --git a/‎conformance/testing-epp/scheduler.go
Lines changed: 34 additions & 0 deletions b/‎conformance/testing-epp/scheduler.go
Lines changed: 34 additions & 0 deletions
@@ -19,6 +19,7 @@ RUN go mod download
 # Sources
 COPY cmd/epp ./cmd
 COPY pkg/epp ./pkg/epp
+COPY conformance/testing-epp ./conformance/testing-epp
 COPY internal ./internal
 COPY api ./api
 WORKDIR /src/cmd
 
@@ -35,15 +35,16 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
 	metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
 
+	conformance_epp "sigs.k8s.io/gateway-api-inference-extension/conformance/testing-epp"
 	"sigs.k8s.io/gateway-api-inference-extension/internal/runnable"
 	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics/collectors"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/saturationdetector"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/filter"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/picker"
 	profilepicker "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile-picker"
@@ -111,8 +112,9 @@ var (
 	setupLog = ctrl.Log.WithName("setup")
 
 	// Environment variables
-	schedulerV2           = envutil.GetEnvBool("EXPERIMENTAL_USE_SCHEDULER_V2", false, setupLog)
-	prefixCacheScheduling = envutil.GetEnvBool("ENABLE_PREFIX_CACHE_SCHEDULING", false, setupLog)
+	schedulerV2                       = envutil.GetEnvBool("EXPERIMENTAL_USE_SCHEDULER_V2", false, setupLog)
+	prefixCacheScheduling             = envutil.GetEnvBool("ENABLE_PREFIX_CACHE_SCHEDULING", false, setupLog)
+	reqHeaderBasedSchedulerForTesting = envutil.GetEnvBool("ENABLE_REQ_HEADER_BASED_SCHEDULER_FOR_TESTING", false, setupLog)
 )
 
 func loadPrefixCacheConfig() prefix.Config {
@@ -208,7 +210,6 @@ func run() error {
 		kvCacheScorerWeight := envutil.GetEnvInt("KV_CACHE_SCORE_WEIGHT", scorer.DefaultKVCacheScorerWeight, setupLog)
 
 		schedulerProfile := framework.NewSchedulerProfile().
-			WithFilters(filter.NewSheddableCapacityFilter()).
 			WithScorers(framework.NewWeightedScorer(&scorer.QueueScorer{}, queueScorerWeight),
 				framework.NewWeightedScorer(&scorer.KVCacheScorer{}, kvCacheScorerWeight)).
 			WithPicker(picker.NewMaxScorePicker())
@@ -225,8 +226,14 @@ func run() error {
 		scheduler = scheduling.NewSchedulerWithConfig(datastore, schedulerConfig)
 	}
 
+	if reqHeaderBasedSchedulerForTesting {
+		scheduler = conformance_epp.NewReqHeaderBasedScheduler(datastore)
+	}
+
 	saturationDetector := saturationdetector.NewDetector(sdConfig, datastore, ctrl.Log)
 
+	director := requestcontrol.NewDirector(datastore, scheduler, saturationDetector) // can call "director.WithPostResponsePlugins" to add post response plugins
+
 	// --- Setup ExtProc Server Runner ---
 	serverRunner := &runserver.ExtProcServerRunner{
 		GrpcPort:                                 *grpcPort,
@@ -237,7 +244,7 @@ func run() error {
 		SecureServing:                            *secureServing,
 		CertPath:                                 *certPath,
 		RefreshPrometheusMetricsInterval:         *refreshPrometheusMetricsInterval,
-		Scheduler:                                scheduler,
+		Director:                                 director,
 		SaturationDetector:                       saturationDetector,
 	}
 	if err := serverRunner.SetupWithManager(ctx, mgr); err != nil {
 
@@ -22,6 +22,33 @@ $ helm install vllm-llama3-8b-instruct \
 
 Note that the provider name is needed to deploy provider-specific resources. If no provider is specified, then only the InferencePool object and the EPP are deployed.
 
+### Install with Custom Environment Variables
+
+To set custom environment variables for the EndpointPicker deployment:
+
+```txt
+$ helm install vllm-llama3-8b-instruct \
+  --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
+  --set provider.name=[none|gke] \
+  --set inferenceExtension.env.FEATURE_FLAG_ENABLED=true \
+  oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
+```
+
+Alternatively, you can define environment variables in a values file:
+
+```yaml
+# values.yaml
+inferenceExtension:
+  env:
+    FEATURE_FLAG_ENABLED: "true"
+```
+
+And apply it with:
+
+```txt
+$ helm install vllm-llama3-8b-instruct ./config/charts/inferencepool -f values.yaml
+```
+
 ### Install for Triton TensorRT-LLM
 
 Use `--set inferencePool.modelServerType=triton-tensorrt-llm` to install for Triton TensorRT-LLM, e.g.,
@@ -57,6 +84,7 @@ The following table list the configurable parameters of the chart.
 | `inferenceExtension.image.tag`              | Image tag of the endpoint picker.                                                                                      |
 | `inferenceExtension.image.pullPolicy`       | Image pull policy for the container. Possible values: `Always`, `IfNotPresent`, or `Never`. Defaults to `Always`.      |
 | `inferenceExtension.extProcPort`            | Port where the endpoint picker service is served for external processing. Defaults to `9002`.                          |
+| `inferenceExtension.env`                    | Map of environment variables to set in the endpoint picker container. Defaults to `{}`.                                |
 | `provider.name`                             | Name of the Inference Gateway implementation being used. Possible values: `gke`. Defaults to `none`.                   |
 
 ## Notes
 
@@ -62,3 +62,8 @@ spec:
             service: inference-extension
           initialDelaySeconds: 5
           periodSeconds: 10
+        env:
+        {{- range $key, $value := .Values.inferenceExtension.env }}
+        - name: {{ $key }}
+          value: {{ $value | quote }}
+        {{- end }}
@@ -6,6 +6,10 @@ inferenceExtension:
     tag: main
     pullPolicy: Always
   extProcPort: 9002
+  env: {}
+  # Example environment variables:
+  # env:
+  #   KV_CACHE_SCORE_WEIGHT: "1"
 
 inferencePool:
   targetPortNumber: 8000
 
@@ -14,7 +14,7 @@ spec:
     spec:
       containers:
       - name: vllm-sim
-        image: ghcr.io/llm-d/llm-d-inference-sim:v0.1.0
+        image: ghcr.io/llm-d/llm-d-inference-sim:v0.1.1
         imagePullPolicy: Always
         args:
         - --model
 
@@ -0,0 +1,93 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package filter
+
+import (
+	"context"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+// TestFilter runs HeaderBasedTestingFilter against requests whose endpoint header is unset, unmatched and matched.
+func TestFilter(t *testing.T) {
+	cases := []struct {
+		name    string
+		request *types.LLMRequest
+		plugin  framework.Filter
+		pods    []types.Pod
+		want    []types.Pod
+	}{
+		{
+			name:    "TestHeaderBasedFilter, header endpoint unset in request",
+			request: &types.LLMRequest{}, // Deliberately leave the header unset.
+			plugin:  &HeaderBasedTestingFilter{},
+			pods: []types.Pod{
+				&types.PodMetrics{
+					Pod: &backend.Pod{
+						Address: "test-endpoint",
+					},
+				},
+			},
+			want: []types.Pod{},
+		},
+		{
+			name:    "TestHeaderBasedFilter, header endpoint set in request but no match",
+			request: &types.LLMRequest{Headers: map[string]string{headerTestEppEndPointSelectionKey: "test-endpoint"}},
+			plugin:  &HeaderBasedTestingFilter{},
+			pods: []types.Pod{
+				&types.PodMetrics{
+					Pod: &backend.Pod{
+						Address: "test-endpoint-unmatch",
+					},
+				},
+			},
+			want: []types.Pod{},
+		},
+		{
+			name:    "TestHeaderBasedFilter, header endpoint set",
+			request: &types.LLMRequest{Headers: map[string]string{headerTestEppEndPointSelectionKey: "test-endpoint"}},
+			plugin:  &HeaderBasedTestingFilter{},
+			pods: []types.Pod{
+				&types.PodMetrics{
+					Pod: &backend.Pod{
+						Address: "test-endpoint",
+					},
+				},
+			},
+			want: []types.Pod{
+				&types.PodMetrics{
+					Pod: &backend.Pod{
+						Address: "test-endpoint",
+					},
+				},
+			},
+		},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			ctx, state := context.Background(), types.NewCycleState()
+			got := tc.plugin.Filter(ctx, tc.request, state, tc.pods)
+			if diff := cmp.Diff(tc.want, got); diff != "" {
+				t.Errorf("Unexpected output (-want +got): %v", diff)
+			}
+		})
+	}
+}
@@ -0,0 +1,62 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package filter
+
+import (
+	"context"
+
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+const (
+	headerTestEppEndPointSelectionKey = "test-epp-endpoint-selection" // request header whose value is compared against each pod's address
+)
+
+// Compile-time check that *HeaderBasedTestingFilter satisfies framework.Filter.
+var _ framework.Filter = (*HeaderBasedTestingFilter)(nil)
+
+// NewHeaderBasedTestingFilter allocates a HeaderBasedTestingFilter and returns a pointer to it.
+// It is intended for testing purposes only.
+func NewHeaderBasedTestingFilter() *HeaderBasedTestingFilter {
+	return new(HeaderBasedTestingFilter)
+}
+
+// HeaderBasedTestingFilter keeps only the Pods whose address equals the value of the "test-epp-endpoint-selection" request header.
+type HeaderBasedTestingFilter struct{}
+
+// Name returns the name of the filter, "test-header-based".
+func (f *HeaderBasedTestingFilter) Name() string {
+	return "test-header-based"
+}
+
+// Filter returns the pods whose address equals the "test-epp-endpoint-selection" request header.
+// The result is an empty, non-nil slice when the header is absent or no pod address matches it.
+func (f *HeaderBasedTestingFilter) Filter(_ context.Context, request *types.LLMRequest, _ *types.CycleState, pods []types.Pod) []types.Pod {
+	matched := []types.Pod{}
+	wantedAddress, ok := request.Headers[headerTestEppEndPointSelectionKey]
+	if !ok {
+		return matched
+	}
+	for i := range pods {
+		if pods[i].GetPod().Address != wantedAddress {
+			continue
+		}
+		matched = append(matched, pods[i])
+	}
+	return matched
+}
@@ -0,0 +1,34 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package scheduling
+
+import (
+	"sigs.k8s.io/gateway-api-inference-extension/conformance/testing-epp/plugins/filter"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/picker"
+	profilepicker "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile-picker"
+)
+
+// NewReqHeaderBasedScheduler builds the scheduler used by conformance tests. Its single profile
+// filters endpoints by the "test-epp-endpoint-selection" request header, so a request whose header
+// is missing or names an endpoint that doesn't exist is left with no endpoint to pick.
+func NewReqHeaderBasedScheduler(datastore scheduling.Datastore) *scheduling.Scheduler {
+	profile := framework.NewSchedulerProfile().WithFilters(filter.NewHeaderBasedTestingFilter()).WithPicker(picker.NewMaxScorePicker())
+	profiles := map[string]*framework.SchedulerProfile{"req-header-based-profile": profile}
+	return scheduling.NewSchedulerWithConfig(datastore, scheduling.NewSchedulerConfig(profilepicker.NewAllProfilesPicker(), profiles))
+}