@@ -24,12 +24,14 @@ import (
	"math/rand"
	"net"
	"strconv"
+	"strings"
	"time"

	"github.com/go-logr/logr"
	"sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
+	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
@@ -39,6 +41,11 @@ import (
	requtil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/request"
)

+const (
+	subsetHintNamespace = "envoy.lb.subset_hint"
+	subsetHintKey       = "x-gateway-destination-endpoint-subset"
+)
+
// Scheduler defines the interface required by the Director for scheduling.
type Scheduler interface {
	Schedule(ctx context.Context, request *schedulingtypes.LLMRequest, candidatePods []schedulingtypes.Pod) (result *schedulingtypes.SchedulingResult, err error)
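
For context, the subset hint these constants refer to arrives as per-request dynamic metadata. Below is a minimal sketch of the assumed shape, for illustration only and not part of this diff; the addresses are hypothetical, and in practice the values are supplied by Envoy under the `envoy.lb.subset_hint` namespace and the `x-gateway-destination-endpoint-subset` key.

package main

import "fmt"

func main() {
	// Hypothetical request metadata carrying a subset hint (addresses are made up).
	requestMetadata := map[string]any{
		"envoy.lb.subset_hint": map[string]any{
			"x-gateway-destination-endpoint-subset": []any{"10.0.1.0:8080", "10.0.1.1:8080"},
		},
	}
	// Look up the hint the same way the director will: namespace first, then key.
	subset, _ := requestMetadata["envoy.lb.subset_hint"].(map[string]any)
	fmt.Println(subset["x-gateway-destination-endpoint-subset"]) // [10.0.1.0:8080 10.0.1.1:8080]
}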
@@ -118,12 +125,12 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
	}

	// Prepare LLMRequest (needed for both saturation detection and Scheduler)
-	reqCtx.SchedulingRequest = schedulingtypes.NewLLMRequest(
-		reqCtx.Request.Headers[requtil.RequestIdHeaderKey],
-		reqCtx.ResolvedTargetModel,
-		prompt,
-		reqCtx.Request.Headers,
-		reqCtx.Request.Metadata)
+	reqCtx.SchedulingRequest = &schedulingtypes.LLMRequest{
+		RequestId:   reqCtx.Request.Headers[requtil.RequestIdHeaderKey],
+		TargetModel: reqCtx.ResolvedTargetModel,
+		Prompt:      prompt,
+		Headers:     reqCtx.Request.Headers,
+	}

	logger = logger.WithValues("model", reqCtx.Model, "resolvedTargetModel", reqCtx.ResolvedTargetModel, "criticality", requestCriticality)

@@ -135,11 +142,11 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
		return reqCtx, err
	}

-	// --- 3. Call Scheduler ---
-	// Snapshot pod metrics from the datastore to:
-	// 1. Reduce concurrent access to the datastore.
-	// 2. Ensure consistent data during the scheduling operation of a request between all scheduling cycles.
-	candidatePods := schedulingtypes.ToSchedulerPodMetrics(d.datastore.PodGetAll())
+	// --- 3. Call Scheduler (with the relevant candidate pods) ---
+	candidatePods := d.getCandidatePodsForScheduling(ctx, reqCtx.Request.Metadata)
+	if len(candidatePods) == 0 {
+		return reqCtx, errutil.Error{Code: errutil.ServiceUnavailable, Msg: "failed to find candidate pods for serving the request"}
+	}
	results, err := d.scheduler.Schedule(ctx, reqCtx.SchedulingRequest, candidatePods)
	if err != nil {
		return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()}
@@ -177,6 +184,52 @@ func (d *Director) admitRequest(ctx context.Context, requestCriticality v1alpha2
	return nil
}

+// getCandidatePodsForScheduling gets the list of relevant endpoints for the scheduling cycle from the datastore.
+// According to the EPP protocol, if "x-gateway-destination-endpoint-subset" is set in the request metadata and specifies
+// a subset of endpoints, only those endpoints are considered as candidates for the scheduler.
+// Snapshot pod metrics from the datastore to:
+// 1. Reduce concurrent access to the datastore.
+// 2. Ensure consistent data during the scheduling operation of a request between all scheduling cycles.
+func (d *Director) getCandidatePodsForScheduling(ctx context.Context, requestMetadata map[string]any) []schedulingtypes.Pod {
+	loggerTrace := log.FromContext(ctx).V(logutil.TRACE)
+
+	subsetMap, found := requestMetadata[subsetHintNamespace].(map[string]any)
+	if !found {
+		return schedulingtypes.ToSchedulerPodMetrics(d.datastore.PodGetAll())
+	}
+
+	// Check if the endpoint key is present in the subset map and ensure there is at least one value.
+	endpointSubsetList, found := subsetMap[subsetHintKey].([]any)
+	if !found {
+		return schedulingtypes.ToSchedulerPodMetrics(d.datastore.PodGetAll())
+	} else if len(endpointSubsetList) == 0 {
+		loggerTrace.Info("found empty subset filter in request metadata, filtering all pods")
+		return []schedulingtypes.Pod{}
+	}
+
+	// Create a map of endpoint addresses for easy lookup.
+	endpoints := make(map[string]bool)
+	for _, endpoint := range endpointSubsetList {
+		// Extract the address from the endpoint.
+		// The endpoint is formatted as "<address>:<port>" (e.g. "10.0.1.0:8080").
+		epStr := strings.Split(endpoint.(string), ":")[0]
+		endpoints[epStr] = true
+	}
+
+	podTotalCount := 0
+	podFilteredList := d.datastore.PodList(func(pm backendmetrics.PodMetrics) bool {
+		podTotalCount++
+		if _, found := endpoints[pm.GetPod().Address]; found {
+			return true
+		}
+		return false
+	})
+
+	loggerTrace.Info("filtered candidate pods by subset filtering", "podTotalCount", podTotalCount, "filteredCount", len(podFilteredList))
+
+	return schedulingtypes.ToSchedulerPodMetrics(podFilteredList)
+}
+
// prepareRequest populates the RequestContext and calls the registered PreRequest plugins
// for allowing plugging customized logic based on the scheduling results.
func (d *Director) prepareRequest(ctx context.Context, reqCtx *handlers.RequestContext, result *schedulingtypes.SchedulingResult) (*handlers.RequestContext, error) {
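
As a standalone illustration of the filtering behavior the new helper implements, here is a minimal sketch that strips the port from a hypothetical subset hint and keeps only matching pod addresses. It deliberately avoids the datastore and scheduling types, so every name and address below is illustrative rather than taken from the repository.

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Hypothetical subset hint values as they would appear in the metadata list.
	endpointSubsetList := []any{"10.0.1.0:8080", "10.0.1.2:8080"}

	// Build the address lookup the same way the helper does: keep only "<address>".
	endpoints := make(map[string]bool)
	for _, endpoint := range endpointSubsetList {
		endpoints[strings.Split(endpoint.(string), ":")[0]] = true
	}

	// Hypothetical pod addresses standing in for what the datastore would return.
	podAddresses := []string{"10.0.1.0", "10.0.1.1", "10.0.1.2"}
	var filtered []string
	for _, addr := range podAddresses {
		if endpoints[addr] {
			filtered = append(filtered, addr)
		}
	}
	fmt.Println(filtered) // [10.0.1.0 10.0.1.2]
}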