kubernetes-sigs
diff --git a/‎cmd/epp/main.go
Lines changed: 4 additions & 1 deletion b/‎cmd/epp/main.go
Lines changed: 4 additions & 1 deletion
diff --git a/‎pkg/epp/handlers/server.go
Lines changed: 4 additions & 1 deletion b/‎pkg/epp/handlers/server.go
Lines changed: 4 additions & 1 deletion
diff --git a/‎pkg/epp/metrics/metrics.go
Lines changed: 19 additions & 0 deletions b/‎pkg/epp/metrics/metrics.go
Lines changed: 19 additions & 0 deletions
diff --git a/‎pkg/epp/plugins/plugins.go
Lines changed: 24 additions & 0 deletions b/‎pkg/epp/plugins/plugins.go
Lines changed: 24 additions & 0 deletions
diff --git a/‎pkg/epp/requestcontrol/director.go
Lines changed: 32 additions & 16 deletions b/‎pkg/epp/requestcontrol/director.go
Lines changed: 32 additions & 16 deletions
@@ -40,6 +40,7 @@ import (
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics/collectors"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/saturationdetector"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
@@ -227,6 +228,8 @@ func run() error {
 
 	saturationDetector := saturationdetector.NewDetector(sdConfig, datastore, ctrl.Log)
 
+	director := requestcontrol.NewDirector(datastore, scheduler, saturationDetector) // can call "director.WithPostResponsePlugins" to add post response plugins
+
 	// --- Setup ExtProc Server Runner ---
 	serverRunner := &runserver.ExtProcServerRunner{
 		GrpcPort:                                 *grpcPort,
@@ -237,7 +240,7 @@ func run() error {
 		SecureServing:                            *secureServing,
 		CertPath:                                 *certPath,
 		RefreshPrometheusMetricsInterval:         *refreshPrometheusMetricsInterval,
-		Scheduler:                                scheduler,
+		Director:                                 director,
 		SaturationDetector:                       saturationDetector,
 	}
 	if err := serverRunner.SetupWithManager(ctx, mgr); err != nil {
 
@@ -32,6 +32,7 @@ import (
 	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
+	schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
 	errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 	requtil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/request"
@@ -79,7 +80,7 @@ type StreamingServer struct {
 // Specifically, there are fields related to the ext-proc protocol, and then fields related to the lifecycle of the request.
 // We should split these apart as this monolithic object exposes too much data to too many layers.
 type RequestContext struct {
-	TargetPod                 string
+	TargetPod                 *backend.Pod
 	TargetEndpoint            string
 	Model                     string
 	ResolvedTargetModel       string
@@ -93,6 +94,8 @@ type RequestContext struct {
 	RequestRunning            bool
 	Request                   *Request
 
+	SchedulingRequest *schedulingtypes.LLMRequest
+
 	RequestState         StreamRequestState
 	modelServerStreaming bool
 
 
@@ -202,6 +202,18 @@ var (
 		[]string{"plugin_type", "plugin_name"},
 	)
 
+	// RequestControlPluginProcessingLatencies is a histogram of request-control plugin
+	// processing time in seconds, partitioned by the "plugin_type" and "plugin_name" labels.
+	// The explicit bucket upper bounds run from 0.0001s (100 microseconds) to 0.1s (100 milliseconds).
+	RequestControlPluginProcessingLatencies = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: InferenceExtension,
+			Name:      "request_control_plugin_duration_seconds",
+			Help:      metricsutil.HelpMsgWithStability("RequestControl plugin processing latency distribution in seconds for each plugin type and plugin name.", compbasemetrics.ALPHA),
+			Buckets: []float64{
+				0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1,
+			},
+		},
+		[]string{"plugin_type", "plugin_name"},
+	)
+
 	// Prefix indexer Metrics
 	PrefixCacheSize = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
@@ -263,6 +275,7 @@ func Register(customCollectors ...prometheus.Collector) {
 		metrics.Registry.MustRegister(inferencePoolReadyPods)
 		metrics.Registry.MustRegister(SchedulerPluginProcessingLatencies)
 		metrics.Registry.MustRegister(SchedulerE2ELatency)
+		metrics.Registry.MustRegister(RequestControlPluginProcessingLatencies)
 		metrics.Registry.MustRegister(InferenceExtensionInfo)
 		metrics.Registry.MustRegister(PrefixCacheSize)
 		metrics.Registry.MustRegister(PrefixCacheHitRatio)
@@ -289,6 +302,7 @@ func Reset() {
 	inferencePoolReadyPods.Reset()
 	SchedulerPluginProcessingLatencies.Reset()
 	SchedulerE2ELatency.Reset()
+	RequestControlPluginProcessingLatencies.Reset()
 	InferenceExtensionInfo.Reset()
 	PrefixCacheSize.Reset()
 	PrefixCacheHitRatio.Reset()
@@ -400,6 +414,11 @@ func RecordSchedulerE2ELatency(duration time.Duration) {
 	SchedulerE2ELatency.WithLabelValues().Observe(duration.Seconds())
 }
 
+// RecordRequestControlPluginProcessingLatency adds one observation, expressed in seconds,
+// to the request-control plugin latency histogram for the given plugin type and plugin name.
+func RecordRequestControlPluginProcessingLatency(pluginType, pluginName string, duration time.Duration) {
+	observer := RequestControlPluginProcessingLatencies.WithLabelValues(pluginType, pluginName)
+	observer.Observe(duration.Seconds())
+}
+
 // RecordPrefixCacheSize records the size of the prefix indexer in megabytes.
 func RecordPrefixCacheSize(size int64) {
 	PrefixCacheSize.WithLabelValues().Set(float64(size))
 
@@ -0,0 +1,24 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package plugins
+
+// Plugin is the base interface for a plugin.
+// It should be embedded in every plugin interface across the code so that all
+// plugins expose their name in the same way.
+type Plugin interface {
+	// Name returns the name of the plugin.
+	// Callers use it to identify the plugin (for example, the Director logs it and
+	// passes it as the plugin name when recording plugin latency metrics).
+	Name() string
+}
@@ -23,13 +23,15 @@ import (
 	"fmt"
 	"math/rand"
 	"strconv"
+	"time"
 
 	"github.com/go-logr/logr"
 	"sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
 	schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
 	errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
@@ -39,24 +41,32 @@ import (
 // Scheduler defines the interface required by the Director for scheduling.
 type Scheduler interface {
 	Schedule(ctx context.Context, b *schedulingtypes.LLMRequest) (result map[string]*schedulingtypes.Result, err error)
-	OnResponse(ctx context.Context, resp *schedulingtypes.LLMResponse, targetPodName string)
 }
 
 // SaturationDetector provides a signal indicating whether the backends are considered saturated.
 type SaturationDetector interface {
 	IsSaturated(ctx context.Context) bool
 }
 
+// NewDirector builds a Director from its required dependencies: the datastore,
+// the scheduler and the saturation detector.
+// No PostResponse plugins are configured here (the field is left nil); they are
+// optional and can be set afterwards with the "WithPostResponsePlugins" function.
+func NewDirector(datastore datastore.Datastore, scheduler Scheduler, saturationDetector SaturationDetector) *Director {
+	director := &Director{
+		datastore:          datastore,
+		scheduler:          scheduler,
+		saturationDetector: saturationDetector,
+	}
+	return director
+}
+
 // Director orchestrates the request handling flow, including scheduling.
 type Director struct {
-	datastore          datastore.Datastore
-	scheduler          Scheduler
-	saturationDetector SaturationDetector
+	datastore           datastore.Datastore
+	scheduler           Scheduler
+	saturationDetector  SaturationDetector
+	postResponsePlugins []PostResponsePlugin
 }
 
-// NewDirector creates a new Director instance with all dependencies.
-func NewDirector(datastore datastore.Datastore, scheduler Scheduler, saturationDetector SaturationDetector) *Director {
-	return &Director{datastore, scheduler, saturationDetector}
+// WithPostResponsePlugins sets the given plugins as the PostResponse plugins.
+// If the Director has PostResponse plugins already, this call replaces the existing plugins with the given ones;
+// calling it with no arguments therefore leaves the Director with no PostResponse plugins.
+// The receiver itself is returned, so the call can be chained directly onto NewDirector.
+func (d *Director) WithPostResponsePlugins(plugins ...PostResponsePlugin) *Director {
+	d.postResponsePlugins = plugins
+	return d
 }
 
 // HandleRequest orchestrates the request lifecycle:
@@ -104,7 +114,7 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
 	}
 
 	// Prepare LLMRequest (needed for both saturation detection and Scheduler)
-	llmReq := &schedulingtypes.LLMRequest{
+	reqCtx.SchedulingRequest = &schedulingtypes.LLMRequest{
 		TargetModel: reqCtx.ResolvedTargetModel,
 		RequestId:   reqCtx.Request.Headers[requtil.RequestIdHeaderKey],
 		Critical:    requestCriticality == v1alpha2.Critical,
@@ -113,7 +123,7 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
 	}
 	logger = logger.WithValues(
 		"model", reqCtx.Model,
-		"resolvedTargetModel", llmReq.TargetModel,
+		"resolvedTargetModel", reqCtx.ResolvedTargetModel,
 		"criticality", requestCriticality,
 	)
 	ctx = log.IntoContext(ctx, logger)
@@ -126,7 +136,7 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
 	}
 
 	// --- 3. Dispatch (Calls Scheduler) ---
-	results, dispatchErr := d.Dispatch(ctx, llmReq)
+	results, dispatchErr := d.Dispatch(ctx, reqCtx.SchedulingRequest)
 	if dispatchErr != nil {
 		return reqCtx, dispatchErr
 	}
@@ -193,22 +203,19 @@ func (d *Director) PostDispatch(ctx context.Context, reqCtx *handlers.RequestCon
 	endpoint := targetPod.Address + ":" + strconv.Itoa(int(pool.Spec.TargetPortNumber))
 	logger.V(logutil.DEFAULT).Info("Request handled", "model", reqCtx.Model, "targetModel", reqCtx.ResolvedTargetModel, "endpoint", targetPod)
 
-	reqCtx.TargetPod = targetPod.NamespacedName.String()
+	reqCtx.TargetPod = targetPod
 	reqCtx.TargetEndpoint = endpoint
 
 	return reqCtx, nil
 }
 
 func (d *Director) HandleResponse(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) {
-	logger := log.FromContext(ctx)
-
-	llmResp := &schedulingtypes.LLMResponse{
+	response := &Response{
 		RequestId: reqCtx.Request.Headers[requtil.RequestIdHeaderKey],
 		Headers:   reqCtx.Response.Headers,
 	}
-	logger.V(logutil.DEBUG).Info("LLM response assembled", "response", llmResp)
 
-	d.scheduler.OnResponse(ctx, llmResp, reqCtx.TargetPod)
+	d.runPostResponsePlugins(ctx, reqCtx.SchedulingRequest, response, reqCtx.TargetPod)
 
 	return reqCtx, nil
 }
@@ -253,3 +260,12 @@ func RandomWeightedDraw(logger logr.Logger, model *v1alpha2.InferenceModel, seed
 	}
 	return ""
 }
+
+// runPostResponsePlugins invokes every configured PostResponse plugin, in the order
+// they were registered, passing each one the scheduling request, the response and
+// the target pod. For each plugin a debug log line is emitted before the call and
+// the processing latency is recorded afterwards under the PostResponse plugin type
+// and the plugin's name.
+func (d *Director) runPostResponsePlugins(ctx context.Context, request *schedulingtypes.LLMRequest, response *Response, targetPod *backend.Pod) {
+	for _, p := range d.postResponsePlugins {
+		debugLog := log.FromContext(ctx).V(logutil.DEBUG)
+		debugLog.Info("Running post-response plugin", "plugin", p.Name())
+
+		start := time.Now()
+		p.PostResponse(ctx, request, response, targetPod)
+
+		// Resolve the name before taking the elapsed time, matching the
+		// left-to-right argument evaluation of a single inline call.
+		name := p.Name()
+		elapsed := time.Since(start)
+		metrics.RecordRequestControlPluginProcessingLatency(PostResponsePluginType, name, elapsed)
+	}
+}