-
Notifications
You must be signed in to change notification settings - Fork 176
Break PostResponse requestcontrol
plugin into 3 separate plugins to add streamed request functionality
#1661
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
f9ba10f
4adf72e
0174f0b
bbeb5b6
3565802
f53bf64
26d53d3
91e7c63
ff89686
f73ef27
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -27,6 +27,7 @@ import ( | |||
"strings" | ||||
"time" | ||||
|
||||
"github.com/go-logr/logr" | ||||
"sigs.k8s.io/controller-runtime/pkg/log" | ||||
|
||||
v1 "sigs.k8s.io/gateway-api-inference-extension/api/v1" | ||||
|
@@ -62,22 +63,20 @@ type SaturationDetector interface { | |||
// NewDirectorWithConfig creates a new Director instance with all dependencies. | ||||
func NewDirectorWithConfig(datastore Datastore, scheduler Scheduler, saturationDetector SaturationDetector, config *Config) *Director { | ||||
return &Director{ | ||||
datastore: datastore, | ||||
scheduler: scheduler, | ||||
saturationDetector: saturationDetector, | ||||
preRequestPlugins: config.preRequestPlugins, | ||||
postResponsePlugins: config.postResponsePlugins, | ||||
defaultPriority: 0, // define default priority explicitly | ||||
datastore: datastore, | ||||
scheduler: scheduler, | ||||
saturationDetector: saturationDetector, | ||||
requestControlPlugins: *config, | ||||
defaultPriority: 0, // define default priority explicitly | ||||
} | ||||
} | ||||
|
||||
// Director orchestrates the request handling flow, including scheduling. | ||||
type Director struct { | ||||
datastore Datastore | ||||
scheduler Scheduler | ||||
saturationDetector SaturationDetector | ||||
preRequestPlugins []PreRequest | ||||
postResponsePlugins []PostResponse | ||||
datastore Datastore | ||||
scheduler Scheduler | ||||
saturationDetector SaturationDetector | ||||
requestControlPlugins Config | ||||
// we just need a pointer to an int variable since priority is a pointer in InferenceObjective | ||||
// no need to set this in the constructor, since the value we want is the default int val | ||||
// and value types cannot be nil | ||||
|
@@ -278,19 +277,48 @@ func (d *Director) toSchedulerPodMetrics(pods []backendmetrics.PodMetrics) []sch | |||
return pm | ||||
} | ||||
|
||||
func (d *Director) HandleResponse(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) { | ||||
// HandleResponseReceived is called when the response headers are received. | ||||
func (d *Director) HandleResponseReceived(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) { | ||||
response := &Response{ | ||||
RequestId: reqCtx.Request.Headers[requtil.RequestIdHeaderKey], | ||||
Headers: reqCtx.Response.Headers, | ||||
} | ||||
|
||||
// TODO: to extend fallback functionality, handle cases where target pod is unavailable | ||||
// https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/1224 | ||||
d.runPostResponsePlugins(ctx, reqCtx.SchedulingRequest, response, reqCtx.TargetPod) | ||||
d.runResponseReceivedPlugins(ctx, reqCtx.SchedulingRequest, response, reqCtx.TargetPod) | ||||
|
||||
return reqCtx, nil | ||||
} | ||||
|
||||
// HandleResponseBodyStreaming is called every time a chunk of the response body is received. | ||||
func (d *Director) HandleResponseBodyStreaming(ctx context.Context, reqCtx *handlers.RequestContext, logger logr.Logger) (*handlers.RequestContext, error) { | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please remove logger from the args, we don't use logr as argument to functions, we use contextual logging. log.FromContext(ctx).V(logutil.TRACE).Info(....) see for example here:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I asked him to remove that, in the streaming case the high volume of instantiations of the logger will cause allocation/gc pressure. Luke mentioned this as an optimization during flow control benchmarking, and i think the same applies here b/c there will be multiple streaming calls per request. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does From the FromContext() in go-logr package:
|
||||
logger.V(logutil.TRACE).Info("Entering HandleResponseBodyChunk") | ||||
response := &Response{ | ||||
RequestId: reqCtx.Request.Headers[requtil.RequestIdHeaderKey], | ||||
Headers: reqCtx.Response.Headers, | ||||
} | ||||
|
||||
d.runResponseStreamingPlugins(ctx, reqCtx.SchedulingRequest, response, reqCtx.TargetPod) | ||||
logger.V(logutil.TRACE).Info("Exiting HandleResponseBodyChunk") | ||||
return reqCtx, nil | ||||
} | ||||
|
||||
// HandleResponseBodyComplete is called when the response body is fully received. | ||||
func (d *Director) HandleResponseBodyComplete(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) { | ||||
logger := log.FromContext(ctx).WithValues("stage", "bodyChunk") | ||||
logger.V(logutil.DEBUG).Info("Entering HandleResponseBodyComplete") | ||||
response := &Response{ | ||||
RequestId: reqCtx.Request.Headers[requtil.RequestIdHeaderKey], | ||||
Headers: reqCtx.Response.Headers, | ||||
} | ||||
|
||||
d.runResponseCompletePlugins(ctx, reqCtx.SchedulingRequest, response, reqCtx.TargetPod) | ||||
|
||||
logger.V(logutil.DEBUG).Info("Exiting HandleResponseBodyComplete") | ||||
return reqCtx, nil | ||||
} | ||||
|
||||
func (d *Director) GetRandomPod() *backend.Pod { | ||||
pods := d.datastore.PodList(backendmetrics.AllPodsPredicate) | ||||
if len(pods) == 0 { | ||||
|
@@ -304,22 +332,44 @@ func (d *Director) GetRandomPod() *backend.Pod { | |||
func (d *Director) runPreRequestPlugins(ctx context.Context, request *schedulingtypes.LLMRequest, | ||||
schedulingResult *schedulingtypes.SchedulingResult, targetPort int) { | ||||
loggerDebug := log.FromContext(ctx).V(logutil.DEBUG) | ||||
for _, plugin := range d.preRequestPlugins { | ||||
loggerDebug.Info("Running pre-request plugin", "plugin", plugin.TypedName()) | ||||
for _, plugin := range d.requestControlPlugins.preRequestPlugins { | ||||
loggerDebug.Info("Running PreRequest plugin", "plugin", plugin.TypedName()) | ||||
before := time.Now() | ||||
plugin.PreRequest(ctx, request, schedulingResult, targetPort) | ||||
metrics.RecordPluginProcessingLatency(PreRequestExtensionPoint, plugin.TypedName().Type, plugin.TypedName().Name, time.Since(before)) | ||||
loggerDebug.Info("Completed running pre-request plugin successfully", "plugin", plugin.TypedName()) | ||||
loggerDebug.Info("Completed running PreRequest plugin successfully", "plugin", plugin.TypedName()) | ||||
} | ||||
} | ||||
|
||||
func (d *Director) runResponseReceivedPlugins(ctx context.Context, request *schedulingtypes.LLMRequest, response *Response, targetPod *backend.Pod) { | ||||
loggerDebug := log.FromContext(ctx).V(logutil.DEBUG) | ||||
for _, plugin := range d.requestControlPlugins.responseReceivedPlugins { | ||||
loggerDebug.Info("Running ResponseReceived plugin", "plugin", plugin.TypedName()) | ||||
before := time.Now() | ||||
plugin.ResponseReceived(ctx, request, response, targetPod) | ||||
metrics.RecordPluginProcessingLatency(ResponseReceivedExtensionPoint, plugin.TypedName().Type, plugin.TypedName().Name, time.Since(before)) | ||||
loggerDebug.Info("Completed running ResponseReceived plugin successfully", "plugin", plugin.TypedName()) | ||||
} | ||||
} | ||||
|
||||
func (d *Director) runResponseStreamingPlugins(ctx context.Context, request *schedulingtypes.LLMRequest, response *Response, targetPod *backend.Pod) { | ||||
loggerTrace := log.FromContext(ctx).V(logutil.TRACE) | ||||
for _, plugin := range d.requestControlPlugins.responseStreamingPlugins { | ||||
loggerTrace.Info("Running ResponseStreaming plugin", "plugin", plugin.TypedName()) | ||||
before := time.Now() | ||||
plugin.ResponseStreaming(ctx, request, response, targetPod) | ||||
metrics.RecordPluginProcessingLatency(ResponseStreamingExtensionPoint, plugin.TypedName().Type, plugin.TypedName().Name, time.Since(before)) | ||||
loggerTrace.Info("Completed running ResponseStreaming plugin successfully", "plugin", plugin.TypedName()) | ||||
} | ||||
} | ||||
|
||||
func (d *Director) runPostResponsePlugins(ctx context.Context, request *schedulingtypes.LLMRequest, response *Response, targetPod *backend.Pod) { | ||||
func (d *Director) runResponseCompletePlugins(ctx context.Context, request *schedulingtypes.LLMRequest, response *Response, targetPod *backend.Pod) { | ||||
loggerDebug := log.FromContext(ctx).V(logutil.DEBUG) | ||||
for _, plugin := range d.postResponsePlugins { | ||||
loggerDebug.Info("Running post-response plugin", "plugin", plugin.TypedName()) | ||||
for _, plugin := range d.requestControlPlugins.responseCompletePlugins { | ||||
loggerDebug.Info("Running ResponseComplete plugin", "plugin", plugin.TypedName()) | ||||
before := time.Now() | ||||
plugin.PostResponse(ctx, request, response, targetPod) | ||||
metrics.RecordPluginProcessingLatency(PostResponseExtensionPoint, plugin.TypedName().Type, plugin.TypedName().Name, time.Since(before)) | ||||
loggerDebug.Info("Completed running post-response plugin successfully", "plugin", plugin.TypedName()) | ||||
plugin.ResponseComplete(ctx, request, response, targetPod) | ||||
metrics.RecordPluginProcessingLatency(ResponseCompleteExtensionPoint, plugin.TypedName().Type, plugin.TypedName().Name, time.Since(before)) | ||||
loggerDebug.Info("Completed running ResponseComplete plugin successfully", "plugin", plugin.TypedName()) | ||||
} | ||||
} |
Uh oh!
There was an error while loading. Please reload this page.