@@ -23,13 +23,15 @@ import (
23
23
"fmt"
24
24
"math/rand"
25
25
"strconv"
26
+ "time"
26
27
27
28
"github.com/go-logr/logr"
28
29
"sigs.k8s.io/controller-runtime/pkg/log"
29
30
"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
30
31
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
31
32
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
32
33
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers"
34
+ "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
33
35
schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
34
36
errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error"
35
37
logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
@@ -39,24 +41,32 @@ import (
39
41
// Scheduler defines the interface required by the Director for scheduling.
40
42
type Scheduler interface {
41
43
Schedule (ctx context.Context , b * schedulingtypes.LLMRequest ) (result map [string ]* schedulingtypes.Result , err error )
42
- OnResponse (ctx context.Context , resp * schedulingtypes.LLMResponse , targetPodName string )
43
44
}
44
45
45
46
// SaturationDetector provides a signal indicating whether the backends are considered saturated.
type SaturationDetector interface {
	// IsSaturated reports whether the backends are currently considered saturated.
	IsSaturated(ctx context.Context) bool
}
49
50
51
+ // NewDirector creates a new Director instance with all dependencies.
52
+ // postResponsePlugins remains nil as this is an optional field that can be set using the "WithPostResponsePlugins" function.
53
+ func NewDirector (datastore datastore.Datastore , scheduler Scheduler , saturationDetector SaturationDetector ) * Director {
54
+ return & Director {datastore : datastore , scheduler : scheduler , saturationDetector : saturationDetector }
55
+ }
56
+
50
57
// Director orchestrates the request handling flow, including scheduling.
51
58
type Director struct {
52
- datastore datastore.Datastore
53
- scheduler Scheduler
54
- saturationDetector SaturationDetector
59
+ datastore datastore.Datastore
60
+ scheduler Scheduler
61
+ saturationDetector SaturationDetector
62
+ postResponsePlugins []PostResponsePlugin
55
63
}
56
64
57
- // NewDirector creates a new Director instance with all dependencies.
58
- func NewDirector (datastore datastore.Datastore , scheduler Scheduler , saturationDetector SaturationDetector ) * Director {
59
- return & Director {datastore , scheduler , saturationDetector }
65
+ // WithPostResponsePlugins sets the given plugins as the PostResponse plugins.
66
+ // If the Director has PostResponse plugins already, this call replaces the existing plugins with the given ones.
67
+ func (d * Director ) WithPostResponsePlugins (plugins ... PostResponsePlugin ) * Director {
68
+ d .postResponsePlugins = plugins
69
+ return d
60
70
}
61
71
62
72
// HandleRequest orchestrates the request lifecycle:
@@ -104,7 +114,7 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
104
114
}
105
115
106
116
// Prepare LLMRequest (needed for both saturation detection and Scheduler)
107
- llmReq : = & schedulingtypes.LLMRequest {
117
+ reqCtx . SchedulingRequest = & schedulingtypes.LLMRequest {
108
118
TargetModel : reqCtx .ResolvedTargetModel ,
109
119
RequestId : reqCtx .Request .Headers [requtil .RequestIdHeaderKey ],
110
120
Critical : requestCriticality == v1alpha2 .Critical ,
@@ -113,7 +123,7 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
113
123
}
114
124
logger = logger .WithValues (
115
125
"model" , reqCtx .Model ,
116
- "resolvedTargetModel" , llmReq . TargetModel ,
126
+ "resolvedTargetModel" , reqCtx . ResolvedTargetModel ,
117
127
"criticality" , requestCriticality ,
118
128
)
119
129
ctx = log .IntoContext (ctx , logger )
@@ -126,7 +136,7 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
126
136
}
127
137
128
138
// --- 3. Dispatch (Calls Scheduler) ---
129
- results , dispatchErr := d .Dispatch (ctx , llmReq )
139
+ results , dispatchErr := d .Dispatch (ctx , reqCtx . SchedulingRequest )
130
140
if dispatchErr != nil {
131
141
return reqCtx , dispatchErr
132
142
}
@@ -193,22 +203,19 @@ func (d *Director) PostDispatch(ctx context.Context, reqCtx *handlers.RequestCon
193
203
endpoint := targetPod .Address + ":" + strconv .Itoa (int (pool .Spec .TargetPortNumber ))
194
204
logger .V (logutil .DEFAULT ).Info ("Request handled" , "model" , reqCtx .Model , "targetModel" , reqCtx .ResolvedTargetModel , "endpoint" , targetPod )
195
205
196
- reqCtx .TargetPod = targetPod . NamespacedName . String ()
206
+ reqCtx .TargetPod = targetPod
197
207
reqCtx .TargetEndpoint = endpoint
198
208
199
209
return reqCtx , nil
200
210
}
201
211
202
212
func (d * Director ) HandleResponse (ctx context.Context , reqCtx * handlers.RequestContext ) (* handlers.RequestContext , error ) {
203
- logger := log .FromContext (ctx )
204
-
205
- llmResp := & schedulingtypes.LLMResponse {
213
+ response := & Response {
206
214
RequestId : reqCtx .Request .Headers [requtil .RequestIdHeaderKey ],
207
215
Headers : reqCtx .Response .Headers ,
208
216
}
209
- logger .V (logutil .DEBUG ).Info ("LLM response assembled" , "response" , llmResp )
210
217
211
- d .scheduler . OnResponse (ctx , llmResp , reqCtx .TargetPod )
218
+ d .runPostResponsePlugins (ctx , reqCtx . SchedulingRequest , response , reqCtx .TargetPod )
212
219
213
220
return reqCtx , nil
214
221
}
@@ -253,3 +260,12 @@ func RandomWeightedDraw(logger logr.Logger, model *v1alpha2.InferenceModel, seed
253
260
}
254
261
return ""
255
262
}
263
+
264
+ func (d * Director ) runPostResponsePlugins (ctx context.Context , request * schedulingtypes.LLMRequest , response * Response , targetPod * backend.Pod ) {
265
+ for _ , plugin := range d .postResponsePlugins {
266
+ log .FromContext (ctx ).V (logutil .DEBUG ).Info ("Running post-response plugin" , "plugin" , plugin .Name ())
267
+ before := time .Now ()
268
+ plugin .PostResponse (ctx , request , response , targetPod )
269
+ metrics .RecordRequestControlPluginProcessingLatency (PostResponsePluginType , plugin .Name (), time .Since (before ))
270
+ }
271
+ }
0 commit comments