@@ -24,12 +24,14 @@ import (
 	"math/rand"
 	"net"
 	"strconv"
+	"strings"
 	"time"

 	"github.com/go-logr/logr"
 	"sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
+	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
@@ -39,6 +41,11 @@ import (
 	requtil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/request"
 )

+const (
+	subsetHintNamespace = "envoy.lb.subset_hint"
+	subsetHintKey       = "x-gateway-destination-endpoint-subset"
+)
+
 // Scheduler defines the interface required by the Director for scheduling.
 type Scheduler interface {
 	Schedule(ctx context.Context, request *schedulingtypes.LLMRequest, candidatePods []schedulingtypes.Pod) (result *schedulingtypes.SchedulingResult, err error)
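For reference, a minimal sketch (not part of the diff) of how the subset hint is expected to look once the request metadata is decoded into Go types; the namespace and key match the constants added above, while the endpoint addresses are illustrative assumptions:

package main

import "fmt"

func main() {
	// Hypothetical request metadata carrying the EPP subset hint.
	requestMetadata := map[string]any{
		"envoy.lb.subset_hint": map[string]any{
			"x-gateway-destination-endpoint-subset": []any{"10.0.1.0:8080", "10.0.1.1:8080"},
		},
	}

	// The same two-step type assertion used by the Director below.
	subsetMap, _ := requestMetadata["envoy.lb.subset_hint"].(map[string]any)
	endpoints, _ := subsetMap["x-gateway-destination-endpoint-subset"].([]any)
	fmt.Println(endpoints) // [10.0.1.0:8080 10.0.1.1:8080]
}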
@@ -118,12 +125,12 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
 	}

 	// Prepare LLMRequest (needed for both saturation detection and Scheduler)
-	reqCtx.SchedulingRequest = schedulingtypes.NewLLMRequest(
-		reqCtx.Request.Headers[requtil.RequestIdHeaderKey],
-		reqCtx.ResolvedTargetModel,
-		prompt,
-		reqCtx.Request.Headers,
-		reqCtx.Request.Metadata)
+	reqCtx.SchedulingRequest = &schedulingtypes.LLMRequest{
+		RequestId:   reqCtx.Request.Headers[requtil.RequestIdHeaderKey],
+		TargetModel: reqCtx.ResolvedTargetModel,
+		Prompt:      prompt,
+		Headers:     reqCtx.Request.Headers,
+	}

 	logger = logger.WithValues("model", reqCtx.Model, "resolvedTargetModel", reqCtx.ResolvedTargetModel, "criticality", requestCriticality)
@@ -135,11 +142,11 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
 		return reqCtx, err
 	}

-	// --- 3. Call Scheduler ---
-	// Snapshot pod metrics from the datastore to:
-	// 1. Reduce concurrent access to the datastore.
-	// 2. Ensure consistent data during the scheduling operation of a request between all scheduling cycles.
-	candidatePods := schedulingtypes.ToSchedulerPodMetrics(d.datastore.PodGetAll())
+	// --- 3. Call Scheduler (with the relevant candidate pods) ---
+	candidatePods := d.getCandidatePodsForScheduling(ctx, reqCtx.Request.Metadata)
+	if len(candidatePods) == 0 {
+		return reqCtx, errutil.Error{Code: errutil.ServiceUnavailable, Msg: "failed to find candidate pods for serving the request"}
+	}
 	results, err := d.scheduler.Schedule(ctx, reqCtx.SchedulingRequest, candidatePods)
 	if err != nil {
 		return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()}
@@ -177,6 +184,52 @@ func (d *Director) admitRequest(ctx context.Context, requestCriticality v1alpha2
 	return nil
 }

+// getCandidatePodsForScheduling gets the list of relevant endpoints for the scheduling cycle from the datastore.
+// According to the EPP protocol, if "x-gateway-destination-endpoint-subset" is set in the request metadata and specifies
+// a subset of endpoints, only those endpoints are considered as candidates for the scheduler.
+// Snapshot pod metrics from the datastore to:
+// 1. Reduce concurrent access to the datastore.
+// 2. Ensure consistent data during the scheduling operation of a request between all scheduling cycles.
+func (d *Director) getCandidatePodsForScheduling(ctx context.Context, requestMetadata map[string]any) []schedulingtypes.Pod {
+	loggerTrace := log.FromContext(ctx).V(logutil.TRACE)
+
+	subsetMap, found := requestMetadata[subsetHintNamespace].(map[string]any)
+	if !found {
+		return schedulingtypes.ToSchedulerPodMetrics(d.datastore.PodGetAll())
+	}
+
+	// Check if the endpoint key is present in the subset map and ensure there is at least one value.
+	endpointSubsetList, found := subsetMap[subsetHintKey].([]any)
+	if !found {
+		return schedulingtypes.ToSchedulerPodMetrics(d.datastore.PodGetAll())
+	} else if len(endpointSubsetList) == 0 {
+		loggerTrace.Info("found empty subset filter in request metadata, filtering all pods")
+		return []schedulingtypes.Pod{}
+	}
+
+	// Create a map of endpoint addresses for easy lookup.
+	endpoints := make(map[string]bool)
+	for _, endpoint := range endpointSubsetList {
+		// Extract the address from the endpoint.
+		// The endpoint is formatted as "<address>:<port>" (e.g. "10.0.1.0:8080").
+		epStr := strings.Split(endpoint.(string), ":")[0]
+		endpoints[epStr] = true
+	}
+
+	podTotalCount := 0
+	podFilteredList := d.datastore.PodList(func(pm backendmetrics.PodMetrics) bool {
+		podTotalCount++
+		if _, found := endpoints[pm.GetPod().Address]; found {
+			return true
+		}
+		return false
+	})
+
+	loggerTrace.Info("filtered candidate pods by subset filtering", "podTotalCount", podTotalCount, "filteredCount", len(podFilteredList))
+
+	return schedulingtypes.ToSchedulerPodMetrics(podFilteredList)
+}
+
 // prepareRequest populates the RequestContext and calls the registered PreRequest plugins
 // for allowing plugging customized logic based on the scheduling results.
 func (d *Director) prepareRequest(ctx context.Context, reqCtx *handlers.RequestContext, result *schedulingtypes.SchedulingResult) (*handlers.RequestContext, error) {
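To illustrate the subset filtering above in isolation, a self-contained sketch that mirrors the address-matching approach; the helper name and the pod addresses are assumptions for the example, not part of the change:

package main

import (
	"fmt"
	"strings"
)

// filterBySubset keeps only the pod addresses named in the subset hint.
// It mirrors the map-based lookup in getCandidatePodsForScheduling: each
// hint entry is "<address>:<port>", and matching is done on the address.
func filterBySubset(podAddresses []string, endpointSubsetList []any) []string {
	allowed := make(map[string]bool)
	for _, endpoint := range endpointSubsetList {
		allowed[strings.Split(endpoint.(string), ":")[0]] = true
	}

	var filtered []string
	for _, addr := range podAddresses {
		if allowed[addr] {
			filtered = append(filtered, addr)
		}
	}
	return filtered
}

func main() {
	// Illustrative addresses only.
	pods := []string{"10.0.1.0", "10.0.1.1", "10.0.1.2"}
	hint := []any{"10.0.1.0:8080", "10.0.1.2:8080"}
	fmt.Println(filterBySubset(pods, hint)) // [10.0.1.0 10.0.1.2]
}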