@@ -29,10 +29,10 @@ import (
2929
3030 "sigs.k8s.io/controller-runtime/pkg/log"
3131
32+ v1 "sigs.k8s.io/gateway-api-inference-extension/api/v1"
3233 "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha2"
3334 "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
3435 backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
35- "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
3636 "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers"
3737 "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metadata"
3838 "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
@@ -42,30 +42,38 @@ import (
4242 requtil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/request"
4343)
4444
45+ // Datastore defines the interface required by the Director.
46+ type Datastore interface {
47+ PoolGet () (* v1.InferencePool , error )
48+ ObjectiveGet (modelName string ) * v1alpha2.InferenceObjective
49+ PodList (predicate func (backendmetrics.PodMetrics ) bool ) []backendmetrics.PodMetrics
50+ }
51+
4552// Scheduler defines the interface required by the Director for scheduling.
4653type Scheduler interface {
4754 Schedule (ctx context.Context , request * schedulingtypes.LLMRequest , candidatePods []schedulingtypes.Pod ) (result * schedulingtypes.SchedulingResult , err error )
4855}
4956
5057// SaturationDetector provides a signal indicating whether the backends are considered saturated.
5158type SaturationDetector interface {
52- IsSaturated (ctx context.Context ) bool
59+ IsSaturated (ctx context.Context , candidatePods []backendmetrics. PodMetrics ) bool
5360}
5461
5562// NewDirectorWithConfig creates a new Director instance with all dependencies.
56- func NewDirectorWithConfig (datastore datastore. Datastore , scheduler Scheduler , saturationDetector SaturationDetector , config * Config ) * Director {
63+ func NewDirectorWithConfig (datastore Datastore , scheduler Scheduler , saturationDetector SaturationDetector , config * Config ) * Director {
5764 return & Director {
5865 datastore : datastore ,
5966 scheduler : scheduler ,
6067 saturationDetector : saturationDetector ,
6168 preRequestPlugins : config .preRequestPlugins ,
6269 postResponsePlugins : config .postResponsePlugins ,
70+ defaultPriority : 0 , // define default priority explicitly
6371 }
6472}
6573
6674// Director orchestrates the request handling flow, including scheduling.
6775type Director struct {
68- datastore datastore. Datastore
76+ datastore Datastore
6977 scheduler Scheduler
7078 saturationDetector SaturationDetector
7179 preRequestPlugins []PreRequest
@@ -76,17 +84,12 @@ type Director struct {
7684 defaultPriority int
7785}
7886
79- // HandleRequest orchestrates the request lifecycle:
80- // 1. Parses request details.
81- // 2. Calls admitRequest for admission control.
82- // 3. Calls Scheduler.Schedule if request is approved.
83- // 4. Calls prepareRequest to populate RequestContext with result and call PreRequest plugins.
84- //
87+ // HandleRequest orchestrates the request lifecycle.
8588// It always returns the requestContext even in the error case, as the request context is used in error handling.
8689func (d * Director ) HandleRequest (ctx context.Context , reqCtx * handlers.RequestContext ) (* handlers.RequestContext , error ) {
8790 logger := log .FromContext (ctx )
8891
89- // --- 1. Parse Request, Resolve Target Models, and Determine Parameters ---
92+ // Parse Request, Resolve Target Models, and Determine Parameters
9093 requestBodyMap := reqCtx .Request .Body
9194 var ok bool
9295 reqCtx .IncomingModelName , ok = requestBodyMap ["model" ].(string )
@@ -130,22 +133,23 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
130133 ctx = log .IntoContext (ctx , logger )
131134 logger .V (logutil .DEBUG ).Info ("LLM request assembled" )
132135
133- // --- 2. Admission Control check --
134- if err := d .admitRequest (ctx , * infObjective .Spec .Priority , reqCtx .FairnessID ); err != nil {
135- return reqCtx , err
136- }
137-
138- // --- 3. Call Scheduler (with the relevant candidate pods) ---
136+ // Get candidate pods for scheduling
139137 candidatePods := d .getCandidatePodsForScheduling (ctx , reqCtx .Request .Metadata )
140138 if len (candidatePods ) == 0 {
141139 return reqCtx , errutil.Error {Code : errutil .ServiceUnavailable , Msg : "failed to find candidate pods for serving the request" }
142140 }
143- result , err := d .scheduler .Schedule (ctx , reqCtx .SchedulingRequest , candidatePods )
141+
142+ // Admission Control check
143+ if err := d .admitRequest (ctx , candidatePods , * infObjective .Spec .Priority , reqCtx .FairnessID ); err != nil {
144+ return reqCtx , err
145+ }
146+
147+ result , err := d .scheduler .Schedule (ctx , reqCtx .SchedulingRequest , d .toSchedulerPodMetrics (candidatePods ))
144148 if err != nil {
145149 return reqCtx , errutil.Error {Code : errutil .InferencePoolResourceExhausted , Msg : fmt .Errorf ("failed to find target pod: %w" , err ).Error ()}
146150 }
147151
148- // --- 4. Prepare Request (Populates RequestContext and call PreRequest plugins) ---
152+ // Prepare Request (Populates RequestContext and call PreRequest plugins)
149153 // Insert target endpoint to instruct Envoy to route requests to the specified target pod and attach the port number.
150154 // Invoke PreRequest registered plugins.
151155 reqCtx , err = d .prepareRequest (ctx , reqCtx , result )
@@ -156,52 +160,27 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
156160 return reqCtx , nil
157161}
158162
159- // admitRequest handles admission control to decide whether or not to accept the request
160- // based on the request priority and system saturation state.
161- func (d * Director ) admitRequest (ctx context.Context , requestPriority int , fairnessID string ) error {
162- logger := log .FromContext (ctx )
163-
164- logger .V (logutil .TRACE ).Info ("Entering Flow Control" , "priority" , requestPriority , "fairnessID" , fairnessID )
165-
166- // This will be removed in favor of a more robust implementation (Flow Control) in the very near future.
167- // TODO: Make this a configurable value.
168- // Tracking issue https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/1347
169- if requestPriority >= 0 {
170- logger .V (logutil .TRACE ).Info ("Non-sheddable request bypassing saturation check." )
171- return nil
172- }
173-
174- if d .saturationDetector .IsSaturated (ctx ) { // Assuming non-nil Saturation Detector
175- return errutil.Error {
176- Code : errutil .InferencePoolResourceExhausted ,
177- Msg : "system saturated, sheddable request dropped" ,
178- }
179- }
180-
181- return nil
182- }
183-
184163// getCandidatePodsForScheduling gets the list of relevant endpoints for the scheduling cycle from the datastore.
185164// according to EPP protocol, if "x-gateway-destination-endpoint-subset" is set on the request metadata and specifies
186165// a subset of endpoints, only these endpoints will be considered as candidates for the scheduler.
187166// Snapshot pod metrics from the datastore to:
188167// 1. Reduce concurrent access to the datastore.
189168// 2. Ensure consistent data during the scheduling operation of a request between all scheduling cycles.
190- func (d * Director ) getCandidatePodsForScheduling (ctx context.Context , requestMetadata map [string ]any ) []schedulingtypes. Pod {
169+ func (d * Director ) getCandidatePodsForScheduling (ctx context.Context , requestMetadata map [string ]any ) []backendmetrics. PodMetrics {
191170 loggerTrace := log .FromContext (ctx ).V (logutil .TRACE )
192171
193172 subsetMap , found := requestMetadata [metadata .SubsetFilterNamespace ].(map [string ]any )
194173 if ! found {
195- return d .toSchedulerPodMetrics ( d . datastore .PodList (backendmetrics .AllPodsPredicate ) )
174+ return d .datastore .PodList (backendmetrics .AllPodsPredicate )
196175 }
197176
198177 // Check if endpoint key is present in the subset map and ensure there is at least one value
199178 endpointSubsetList , found := subsetMap [metadata .SubsetFilterKey ].([]any )
200179 if ! found {
201- return d .toSchedulerPodMetrics ( d . datastore .PodList (backendmetrics .AllPodsPredicate ) )
180+ return d .datastore .PodList (backendmetrics .AllPodsPredicate )
202181 } else if len (endpointSubsetList ) == 0 {
203182 loggerTrace .Info ("found empty subset filter in request metadata, filtering all pods" )
204- return []schedulingtypes. Pod {}
183+ return []backendmetrics. PodMetrics {}
205184 }
206185
207186 // Create a map of endpoint addresses for easy lookup
@@ -214,17 +193,42 @@ func (d *Director) getCandidatePodsForScheduling(ctx context.Context, requestMet
214193 }
215194
216195 podTotalCount := 0
217- podFitleredList := d .datastore .PodList (func (pm backendmetrics.PodMetrics ) bool {
196+ podFilteredList := d .datastore .PodList (func (pm backendmetrics.PodMetrics ) bool {
218197 podTotalCount ++
219198 if _ , found := endpoints [pm .GetPod ().Address ]; found {
220199 return true
221200 }
222201 return false
223202 })
224203
225- loggerTrace .Info ("filtered candidate pods by subset filtering" , "podTotalCount" , podTotalCount , "filteredCount" , len (podFitleredList ))
204+ loggerTrace .Info ("filtered candidate pods by subset filtering" , "podTotalCount" , podTotalCount , "filteredCount" , len (podFilteredList ))
205+
206+ return podFilteredList
207+ }
208+
209+ // admitRequest handles admission control to decide whether or not to accept the request
210+ // based on the request priority and saturation state.
211+ func (d * Director ) admitRequest (ctx context.Context , candidatePods []backendmetrics.PodMetrics , requestPriority int , fairnessID string ) error {
212+ loggerTrace := log .FromContext (ctx ).V (logutil .TRACE )
213+
214+ loggerTrace .Info ("Entering Flow Control" , "priority" , requestPriority , "fairnessID" , fairnessID )
215+
216+ // This will be removed in favor of a more robust implementation (Flow Control) in the very near future.
217+ // TODO: Make this a configurable value.
218+ // Tracking issue https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/1347
219+ if requestPriority >= 0 {
220+ loggerTrace .Info ("Non-sheddable request bypassing saturation check." )
221+ return nil
222+ }
223+
224+ if d .saturationDetector .IsSaturated (ctx , candidatePods ) {
225+ return errutil.Error {
226+ Code : errutil .InferencePoolResourceExhausted ,
227+ Msg : "system saturated, sheddable request dropped" ,
228+ }
229+ }
226230
227- return d . toSchedulerPodMetrics ( podFitleredList )
231+ return nil
228232}
229233
230234// prepareRequest populates the RequestContext and calls the registered PreRequest plugins
0 commit comments