@@ -22,6 +22,7 @@ import (
 	"context"
 	"fmt"
 	"math/rand"
+	"net"
 	"strconv"
 	"time"

@@ -54,6 +55,7 @@ func NewDirectorWithConfig(datastore datastore.Datastore, scheduler Scheduler, s
 		datastore:           datastore,
 		scheduler:           scheduler,
 		saturationDetector:  saturationDetector,
+		preRequestPlugins:   config.preRequestPlugins,
 		postResponsePlugins: config.postResponsePlugins,
 	}
 }
@@ -63,14 +65,15 @@ type Director struct {
 	datastore           datastore.Datastore
 	scheduler           Scheduler
 	saturationDetector  SaturationDetector
+	preRequestPlugins   []PreRequest
 	postResponsePlugins []PostResponse
 }

 // HandleRequest orchestrates the request lifecycle:
 // 1. Parses request details.
-// 2. Calls PreDispatch for admission control.
-// 3. Calls Dispatch (which calls Scheduler) if request is approved.
-// 4. Calls PostDispatch to populate RequestContext with results.
+// 2. Calls admitRequest for admission control.
+// 3. Calls Scheduler.Schedule if request is approved.
+// 4. Calls prepareRequest to populate RequestContext with results and call PreRequest plugins.
 //
 // It always returns the requestContext even in the error case, as the request context is used in error handling.
 func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) {
@@ -117,42 +120,39 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
 		Prompt:  prompt,
 		Headers: reqCtx.Request.Headers,
 	}
-	logger = logger.WithValues(
-		"model", reqCtx.Model,
-		"resolvedTargetModel", reqCtx.ResolvedTargetModel,
-		"criticality", requestCriticality,
-	)
+
+	logger = logger.WithValues("model", reqCtx.Model, "resolvedTargetModel", reqCtx.ResolvedTargetModel, "criticality", requestCriticality)
 	ctx = log.IntoContext(ctx, logger)
 	logger.V(logutil.DEBUG).Info("LLM request assembled")

-	// --- 2. Saturation Check ---
-	preDispatchErr := d.PreDispatch(ctx, reqCtx, requestCriticality)
-	if preDispatchErr != nil {
-		return reqCtx, preDispatchErr
+	// --- 2. Admission Control check --
+	if err := d.admitRequest(ctx, requestCriticality); err != nil {
+		return reqCtx, err
 	}

-	// --- 3. Dispatch (Calls Scheduler) ---
-	results, dispatchErr := d.Dispatch(ctx, reqCtx.SchedulingRequest)
-	if dispatchErr != nil {
-		return reqCtx, dispatchErr
+	// --- 3. Call Scheduler ---
+	results, err := d.scheduler.Schedule(ctx, reqCtx.SchedulingRequest)
+	if err != nil {
+		return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()}
 	}

-	// --- 4. PostDispatch (Populates RequestContext) ---
-	// Insert target endpoint to instruct Envoy to route requests to the specified target pod.
-	// Attach the port number.
-	reqCtx, postDispatchErr := d.PostDispatch(ctx, reqCtx, results)
-	if postDispatchErr != nil {
-		return reqCtx, postDispatchErr
+	// --- 4. Prepare Request (Populates RequestContext and call PreRequest plugins) ---
+	// Insert target endpoint to instruct Envoy to route requests to the specified target pod and attach the port number.
+	// Invoke PreRequest registered plugins.
+	reqCtx, err = d.prepareRequest(ctx, reqCtx, results)
+	if err != nil {
+		return reqCtx, err
 	}

 	return reqCtx, nil
 }

-// PreDispatch handles admission control before dispatch.
-func (d *Director) PreDispatch(ctx context.Context, reqCtx *handlers.RequestContext, reqCriticality v1alpha2.Criticality) error {
+// admitRequest handles admission control to decide whether or not to accept the request
+// based on the request criticality and system saturation state.
+func (d *Director) admitRequest(ctx context.Context, requestCriticality v1alpha2.Criticality) error {
 	logger := log.FromContext(ctx)

-	if reqCriticality == v1alpha2.Critical {
+	if requestCriticality == v1alpha2.Critical {
 		logger.V(logutil.DEBUG).Info("Critical request bypassing saturation check.")
 		return nil
 	}
@@ -164,24 +164,14 @@ func (d *Director) PreDispatch(ctx context.Context, reqCtx *handlers.RequestCont
 			Msg:  "system saturated, non-critical request dropped",
 		}
 	}
-	return nil
-}
-
-// Dispatch runs one or many scheduling cycles.
-func (d *Director) Dispatch(ctx context.Context, llmReq *schedulingtypes.LLMRequest) (*schedulingtypes.SchedulingResult, error) {
-	var err error
-	res, err := d.scheduler.Schedule(ctx, llmReq)
-	if err != nil {
-		return nil, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()}
-	}

-	return res, nil // TODO handle multi cycle result after defining the PostDispatch extension point
+	return nil
 }

-// PostDispatch populates the RequestContext based on scheduling results.
-func (d *Director) PostDispatch(ctx context.Context, reqCtx *handlers.RequestContext, result *schedulingtypes.SchedulingResult) (*handlers.RequestContext, error) {
+// prepareRequest populates the RequestContext and calls the registered PreRequest plugins
+// for allowing plugging customized logic based on the scheduling results.
+func (d *Director) prepareRequest(ctx context.Context, reqCtx *handlers.RequestContext, result *schedulingtypes.SchedulingResult) (*handlers.RequestContext, error) {
 	logger := log.FromContext(ctx)
-	// currently only get a single result. Will refactor to pluggably implement the PostSchedule
 	if result == nil || len(result.ProfileResults) == 0 {
 		return reqCtx, errutil.Error{Code: errutil.Internal, Msg: "results must be greater than zero"}
 	}
@@ -192,13 +182,16 @@ func (d *Director) PostDispatch(ctx context.Context, reqCtx *handlers.RequestCon
 	if err != nil {
 		return reqCtx, err
 	}
+	targetPort := int(pool.Spec.TargetPortNumber)

-	endpoint := targetPod.Address + ":" + strconv.Itoa(int(pool.Spec.TargetPortNumber))
+	endpoint := net.JoinHostPort(targetPod.Address, strconv.Itoa(targetPort))
 	logger.V(logutil.DEFAULT).Info("Request handled", "model", reqCtx.Model, "targetModel", reqCtx.ResolvedTargetModel, "endpoint", targetPod)

 	reqCtx.TargetPod = targetPod
 	reqCtx.TargetEndpoint = endpoint

+	d.runPreRequestPlugins(ctx, reqCtx.SchedulingRequest, result, targetPort)
+
 	return reqCtx, nil
 }

@@ -254,6 +247,16 @@ func RandomWeightedDraw(logger logr.Logger, model *v1alpha2.InferenceModel, seed
 	return ""
 }

+func (d *Director) runPreRequestPlugins(ctx context.Context, request *schedulingtypes.LLMRequest, schedulingResult *schedulingtypes.SchedulingResult,
+	targetPort int) {
+	for _, plugin := range d.preRequestPlugins {
+		log.FromContext(ctx).V(logutil.DEBUG).Info("Running pre-request plugin", "plugin", plugin.Name())
+		before := time.Now()
+		plugin.PreRequest(ctx, request, schedulingResult, targetPort)
+		metrics.RecordRequestControlPluginProcessingLatency(PreRequestPluginType, plugin.Name(), time.Since(before))
+	}
+}
+
 func (d *Director) runPostResponsePlugins(ctx context.Context, request *schedulingtypes.LLMRequest, response *Response, targetPod *backend.Pod) {
 	for _, plugin := range d.postResponsePlugins {
 		log.FromContext(ctx).V(logutil.DEBUG).Info("Running post-response plugin", "plugin", plugin.Name())
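The diff above introduces the PreRequest extension point but does not include the PreRequest interface definition, only its call site in runPreRequestPlugins. As a rough, non-authoritative sketch of how a plugin could hook into it, assuming the interface mirrors that call site (a Name() string method plus a PreRequest method taking the LLM request, the scheduling result, and the target port); the package name, plugin name, and import paths below are assumptions, not part of this change:

// example_prerequest.go — hypothetical sketch; the interface shape is inferred
// from the runPreRequestPlugins call site above, not from the real definition.
package requestcontrol // assumed package name

import (
	"context"

	"sigs.k8s.io/controller-runtime/pkg/log"

	// Assumed import path for the scheduling types referenced in the diff.
	schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
)

// ExamplePreRequest is a hypothetical plugin that runs after scheduling has
// picked a target pod but before the request is forwarded to it.
type ExamplePreRequest struct{}

// Name identifies the plugin in the debug log line and the latency metric
// emitted by runPreRequestPlugins.
func (p *ExamplePreRequest) Name() string { return "example-pre-request" }

// PreRequest mirrors the argument list used by runPreRequestPlugins: the LLM
// request, the scheduling result, and the resolved target port.
func (p *ExamplePreRequest) PreRequest(ctx context.Context, request *schedulingtypes.LLMRequest,
	schedulingResult *schedulingtypes.SchedulingResult, targetPort int) {
	// A real plugin might mutate request.Headers or record routing decisions
	// from schedulingResult; this sketch only logs the port it was handed.
	log.FromContext(ctx).Info("example pre-request plugin invoked", "targetPort", targetPort)
}

Such a plugin would reach the Director through the config's preRequestPlugins slice, which NewDirectorWithConfig copies into the new preRequestPlugins field, and its per-call latency would be reported via RecordRequestControlPluginProcessingLatency under its Name().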