From 47f33a89670972b412aff90e2755b244f230e207 Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Thu, 6 Nov 2025 22:23:02 +0000 Subject: [PATCH 01/11] Add latency predictor plugins, deployment, and runner.go integration --- cmd/epp/runner/runner.go | 69 ++- .../manifests/inferencepool-resources-lp.yaml | 450 ++++++++++++++++++ go.mod | 1 + pkg/epp/backend/metrics/metrics.go | 9 + pkg/epp/backend/metrics/metrics_spec.go | 24 +- pkg/epp/datalayer/metrics/extractor.go | 13 +- pkg/epp/datalayer/metrics/mapping.go | 24 +- pkg/epp/datastore/datastore.go | 2 +- .../testdata/request_tpot_seconds_metric | 80 ++++ .../testdata/request_ttft_seconds_metric | 116 +++++ pkg/epp/server/runserver.go | 3 + sidecars/latencypredictorasync/types.go | 1 - 12 files changed, 769 insertions(+), 23 deletions(-) create mode 100644 config/manifests/inferencepool-resources-lp.yaml create mode 100644 pkg/epp/metrics/testdata/request_tpot_seconds_metric create mode 100644 pkg/epp/metrics/testdata/request_ttft_seconds_metric diff --git a/cmd/epp/runner/runner.go b/cmd/epp/runner/runner.go index 24a59ebf70..7127e2c946 100644 --- a/cmd/epp/runner/runner.go +++ b/cmd/epp/runner/runner.go @@ -19,6 +19,7 @@ package runner import ( "context" "crypto/tls" + "encoding/json" "errors" "flag" "fmt" @@ -69,6 +70,7 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/saturationdetector" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/picker" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/scorer" @@ -76,6 +78,7 @@ import ( runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" + latencypredictor "sigs.k8s.io/gateway-api-inference-extension/sidecars/latencypredictorasync" "sigs.k8s.io/gateway-api-inference-extension/version" ) @@ -126,6 +129,7 @@ var ( "then a self-signed certificate is used.") // metric flags totalQueuedRequestsMetric = flag.String("total-queued-requests-metric", runserver.DefaultTotalQueuedRequestsMetric, "Prometheus metric for the number of queued requests.") + totalRunningRequestsMetric = flag.String("total-running-requests-metric", runserver.DefaultTotalRunningRequestsMetric, "Prometheus metric for the number of running requests.") kvCacheUsagePercentageMetric = flag.String("kv-cache-usage-percentage-metric", runserver.DefaultKvCacheUsagePercentageMetric, "Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).") // LoRA metrics loraInfoMetric = flag.String("lora-info-metric", runserver.DefaultLoraInfoMetric, "Prometheus metric for the LoRA info metrics (must be in vLLM label format).") @@ -145,7 +149,10 @@ var ( modelServerMetricsScheme = flag.String("model-server-metrics-scheme", "http", "Scheme to scrape metrics from pods") modelServerMetricsHttpsInsecureSkipVerify = flag.Bool("model-server-metrics-https-insecure-skip-verify", true, "When using 'https' scheme for 'model-server-metrics-scheme', configure 'InsecureSkipVerify' (default to true)") haEnableLeaderElection = 
flag.Bool("ha-enable-leader-election", false, "Enables leader election for high availability. When enabled, readiness probes will only pass on the leader.") - tracing = flag.Bool("tracing", true, "Enables emitting traces") + + // Latency Predictor Flag + enableLatencyPredictor = flag.Bool("enable-latency-predictor", false, "Enable the regression-based latency predictor and scheduler scorer.") + tracing = flag.Bool("tracing", true, "Enables emitting traces") setupLog = ctrl.Log.WithName("setup") ) @@ -308,6 +315,32 @@ func (r *Runner) Run(ctx context.Context) error { runtime.SetBlockProfileRate(1) } + // =================================================================== + // == Latency Predictor Integration + // =================================================================== + var predictor latencypredictor.PredictorInterface // Use the interface type + if *enableLatencyPredictor { + setupLog.Info("Latency predictor is enabled. Initializing...") + predictor = latencypredictor.New(latencypredictor.ConfigFromEnv(), ctrl.Log.WithName("latency-predictor")) + + // For the runnable, you'll need to type assert back to the concrete type + concretePredictor := predictor.(*latencypredictor.Predictor) + if err := mgr.Add(runnable.NoLeaderElection(&predictorRunnable{predictor: concretePredictor})); err != nil { + setupLog.Error(err, "Failed to register latency predictor runnable") + return err + } + } else { + setupLog.Info("Latency predictor is disabled.") + predictor = nil // This will be a true nil interface + } + // =================================================================== + + err = r.parsePluginsConfiguration(ctx, predictor, datastore) + if err != nil { + setupLog.Error(err, "Failed to parse the configuration") + return err + } + // --- Initialize Core EPP Components --- if r.schedulerConfig == nil { err := errors.New("scheduler config must be set either by config api or through code") @@ -366,6 +399,7 @@ func (r *Runner) Run(ctx context.Context) error { Director: director, SaturationDetector: saturationDetector, UseExperimentalDatalayerV2: r.featureGates[datalayer.FeatureGate], // pluggable data layer feature flag + LatencyPredictor: predictor, } if err := serverRunner.SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "Failed to setup EPP controllers") @@ -432,6 +466,13 @@ func (r *Runner) registerInTreePlugins() { plugins.Register(testresponsereceived.DestinationEndpointServedVerifierType, testresponsereceived.DestinationEndpointServedVerifierFactory) } +func (r *Runner) registerLatencyPredictorPlugins(predictor latencypredictor.PredictorInterface) { + plugins.Register(slo_aware_router.SLOAwareRouterPluginType, func(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) { + return slo_aware_router.NewSLOAwareRouter(predictor, slo_aware_router.HeadroomSelectionStrategy).WithName(name), nil + }) + plugins.Register(profile.SLOAwareProfileHandlerType, profile.SLOAwareProfileHandlerFactory) +} + func (r *Runner) parseConfigurationPhaseOne(ctx context.Context) (*configapi.EndpointPickerConfig, error) { if *configText == "" && *configFile == "" { return nil, nil // configuring through code, not through file @@ -454,6 +495,12 @@ func (r *Runner) parseConfigurationPhaseOne(ctx context.Context) (*configapi.End loader.RegisterFeatureGate(flowcontrol.FeatureGate) r.registerInTreePlugins() + // If we have a latency predictor enabled and predictor and datastore are not nil, + // register the latency predictor plugins (currently just the SLO scorer). 
+ if *enableLatencyPredictor && predictor != nil { + setupLog.Info("Registering latency predictor plugins") + r.registerLatencyPredictorPlugins(predictor) + } rawConfig, featureGates, err := loader.LoadConfigPhaseOne(configBytes, logger) if err != nil { @@ -538,6 +585,7 @@ func (r *Runner) setupMetricsCollection(setupLog logr.Logger, useExperimentalDat func setupMetricsV1(setupLog logr.Logger) (datalayer.EndpointFactory, error) { mapping, err := backendmetrics.NewMetricMapping( *totalQueuedRequestsMetric, + *totalRunningRequestsMetric, *kvCacheUsagePercentageMetric, *loraInfoMetric, *cacheInfoMetric, @@ -586,6 +634,7 @@ func setupDatalayer(logger logr.Logger) (datalayer.EndpointFactory, error) { *modelServerMetricsHttpsInsecureSkipVerify, nil) extractor, err := dlmetrics.NewExtractor(*totalQueuedRequestsMetric, + *totalRunningRequestsMetric, *kvCacheUsagePercentageMetric, *loraInfoMetric, *cacheInfoMetric) @@ -799,3 +848,21 @@ func resolvePoolNamespace(poolNamespace string) string { } return runserver.DefaultPoolNamespace } + +// =================================================================== +// == Latency Predictor Plugin and Helpers +// =================================================================== + +// predictorRunnable implements controller-runtime's Runnable interface to manage the predictor's lifecycle. +type predictorRunnable struct { + predictor *latencypredictor.Predictor +} + +func (p *predictorRunnable) Start(ctx context.Context) error { + setupLog.Info("Starting latency predictor...") + p.predictor.Start(ctx) + <-ctx.Done() + setupLog.Info("Stopping latency predictor...") + p.predictor.Stop() + return nil +} diff --git a/config/manifests/inferencepool-resources-lp.yaml b/config/manifests/inferencepool-resources-lp.yaml new file mode 100644 index 0000000000..c2a3528deb --- /dev/null +++ b/config/manifests/inferencepool-resources-lp.yaml @@ -0,0 +1,450 @@ +# Note: If you change this file, please also change the file used for e2e tests! 
+#
+# https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/test/testdata/inferencepool-e2e.yaml
+
+# --- ConfigMaps ---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: latency-predictor-config
+  namespace: default
+data:
+  LATENCY_RETRAINING_INTERVAL_SEC: "1"
+  LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100"
+  LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib"
+  LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib"
+  LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib"
+  LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib"
+  LATENCY_MODEL_TYPE: "xgboost"
+  LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000"
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: prediction-server-config
+  namespace: default
+data:
+  LATENCY_MODEL_TYPE: "xgboost"
+  PREDICT_HOST: "0.0.0.0"
+  LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib" # Use individual storage
+  LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib"
+  LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib"
+  LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib"
+---
+# --- InferencePool ---
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferencePool
+metadata:
+  name: vllm-llama3-8b-instruct
+spec:
+  targetPortNumber: 8000
+  selector:
+    app: vllm-llama3-8b-instruct
+  extensionRef:
+    name: vllm-llama3-8b-instruct-epp
+---
+# --- EPP Service ---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-llama3-8b-instruct-epp
+  namespace: default
+spec:
+  selector:
+    app: vllm-llama3-8b-instruct-epp
+  ports:
+  - name: epp-grpc
+    protocol: TCP
+    port: 9002
+    targetPort: 9002
+    appProtocol: http2
+  - name: latency-predictor-training
+    protocol: TCP
+    port: 8000
+    targetPort: 8000
+  - name: latency-predictor-1
+    protocol: TCP
+    port: 8001
+    targetPort: 8001
+  - name: latency-predictor-2
+    protocol: TCP
+    port: 8002
+    targetPort: 8002
+  - name: latency-predictor-3
+    protocol: TCP
+    port: 8003
+    targetPort: 8003
+  - name: prometheus
+    protocol: TCP
+    port: 9090
+    targetPort: 9090
+  type: LoadBalancer
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: vllm-llama3-8b-instruct-epp
+  namespace: default
+---
+# --- EPP Deployment with Individual Container Volumes ---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-llama3-8b-instruct-epp
+  namespace: default
+  labels:
+    app: vllm-llama3-8b-instruct-epp
+spec:
+  replicas: 1 # Increase to run multiple EPP pods for scaling
+  selector:
+    matchLabels:
+      app: vllm-llama3-8b-instruct-epp
+  template:
+    metadata:
+      labels:
+        app: vllm-llama3-8b-instruct-epp
+    spec:
+      serviceAccountName: vllm-llama3-8b-instruct-epp
+      # Conservatively, this timeout should mirror the longest grace period of the pods within the pool
+      terminationGracePeriodSeconds: 130
+      containers:
+      # EPP Container
+      - name: epp
+        image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/slo-routing-epp-exp
+        imagePullPolicy: Always
+        args:
+        - -pool-name
+        - "vllm-llama3-8b-instruct"
+        - "-pool-namespace"
+        - "default"
+        - --pool-group
+        - "inference.networking.x-k8s.io"
+        - -v
+        - "4"
+        - --zap-encoder
+        - "json"
+        - -grpc-port
+        - "9002"
+        - -grpc-health-port
+        - "9003"
+        - "--config-file"
+        - "/config/default-plugins.yaml"
+        - "-enable-latency-predictor"
+        env:
+        - name: PREDICTION_SERVER_URL
+          value: "http://localhost:8001,http://localhost:8002,http://localhost:8003" # Multiple prediction servers
+        - name: TRAINING_SERVER_URL
+          value: "http://localhost:8000" # Single training server for sending training data
+        - name: LATENCY_MAX_SAMPLE_SIZE
+          value: "10000" # Maximum sample size for
latency prediction + - name: NEG_HEADROOM_TPOT_WEIGHT + value: "0.2" # Weight for TPOT in negative headroom calculation + - name: NEG_HEADROOM_TTFT_WEIGHT + value: "0.8" # Weight for TTFT in negative headroom calculation + ports: + - containerPort: 9002 + - containerPort: 9003 + - name: metrics + containerPort: 9090 + livenessProbe: + grpc: + port: 9003 + service: inference-extension + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + grpc: + port: 9003 + service: inference-extension + initialDelaySeconds: 5 + periodSeconds: 10 + volumeMounts: + - name: plugins-config-volume + mountPath: "/config" + # Training Server Sidecar Container + - name: training-server + image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_training:latest + imagePullPolicy: Always + ports: + - containerPort: 8000 + name: training-port + livenessProbe: + httpGet: + path: /healthz + port: 8000 + initialDelaySeconds: 30 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /readyz + port: 8000 + initialDelaySeconds: 45 + periodSeconds: 10 + resources: + requests: + cpu: "2000m" + memory: "4Gi" + limits: + cpu: "4000m" + memory: "8Gi" + envFrom: + - configMapRef: + name: latency-predictor-config + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SERVER_TYPE + value: "training" + volumeMounts: + - name: training-server-storage + mountPath: /models + # Prediction Server Sidecar Container 1 + - name: prediction-server-1 + image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_prediction:latest + imagePullPolicy: Always + command: ["uvicorn"] + args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8001"] + ports: + - containerPort: 8001 + name: predict-port-1 + livenessProbe: + httpGet: + path: /healthz + port: 8001 + initialDelaySeconds: 15 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /readyz + port: 8001 + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 10 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" + envFrom: + - configMapRef: + name: prediction-server-config + env: + - name: PREDICT_PORT + value: "8001" + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SERVER_TYPE + value: "prediction-1" + - name: TRAINING_SERVER_URL + value: "http://localhost:8000" + volumeMounts: + - name: prediction-server-1-storage + mountPath: /server_models + # Prediction Server Sidecar Container 2 + - name: prediction-server-2 + image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_prediction:latest + imagePullPolicy: Always + command: ["uvicorn"] + args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8002"] + ports: + - containerPort: 8002 + name: predict-port-2 + livenessProbe: + httpGet: + path: /healthz + port: 8002 + initialDelaySeconds: 15 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /readyz + port: 8002 + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 10 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" + envFrom: + - configMapRef: + name: prediction-server-config + env: + - name: PREDICT_PORT + value: "8002" + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SERVER_TYPE + value: "prediction-2" + - name: TRAINING_SERVER_URL + value: "http://localhost:8000" + volumeMounts: + - name: prediction-server-2-storage + mountPath: /server_models + # Prediction Server Sidecar Container 3 + - name: 
prediction-server-3 + image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_prediction:latest + imagePullPolicy: Always + command: ["uvicorn"] + args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8003"] + ports: + - containerPort: 8003 + name: predict-port-3 + livenessProbe: + httpGet: + path: /healthz + port: 8003 + initialDelaySeconds: 15 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /readyz + port: 8003 + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 10 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" + envFrom: + - configMapRef: + name: prediction-server-config + env: + - name: PREDICT_PORT + value: "8003" + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SERVER_TYPE + value: "prediction-3" + - name: TRAINING_SERVER_URL + value: "http://localhost:8000" + volumeMounts: + - name: prediction-server-3-storage + mountPath: /server_models + volumes: + - name: training-server-storage + emptyDir: + sizeLimit: "20Gi" # Dedicated volume for training server + - name: prediction-server-1-storage + emptyDir: + sizeLimit: "10Gi" # Dedicated volume for prediction server 1 + - name: prediction-server-2-storage + emptyDir: + sizeLimit: "10Gi" # Dedicated volume for prediction server 2 + - name: prediction-server-3-storage + emptyDir: + sizeLimit: "10Gi" # Dedicated volume for prediction server 3 + - name: plugins-config-volume + configMap: + name: plugins-config +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: plugins-config + namespace: default +data: + default-plugins.yaml: | + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: queue-scorer + - type: kv-cache-utilization-scorer + - type: prefix-cache-scorer + - type: slo-aware-routing + - type: slo-aware-profile-handler + - type: max-score-picker + schedulingProfiles: + - name: prefix + plugins: + - pluginRef: prefix-cache-scorer + - name: default + plugins: + - pluginRef: slo-aware-routing + weight: 0 + - pluginRef: queue-scorer + - pluginRef: kv-cache-utilization-scorer + - pluginRef: max-score-picker + - name: slo + plugins: + - pluginRef: slo-aware-routing + - pluginRef: max-score-picker +--- +# --- RBAC --- +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: pod-read + namespace: default +rules: +- apiGroups: [ "inference.networking.x-k8s.io" ] + resources: [ "inferenceobjectives", "inferencepools" ] + verbs: [ "get", "watch", "list" ] +- apiGroups: [ "inference.networking.k8s.io" ] + resources: [ "inferencepools" ] + verbs: [ "get", "watch", "list" ] +- apiGroups: [ "" ] + resources: [ "pods" ] + verbs: [ "get", "watch", "list" ] +--- +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: pod-read-binding + namespace: default +subjects: +- kind: ServiceAccount + name: vllm-llama3-8b-instruct-epp + namespace: default +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: pod-read +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: auth-reviewer +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: auth-reviewer-binding +subjects: +- kind: ServiceAccount + name: vllm-llama3-8b-instruct-epp + namespace: default +roleRef: + apiGroup: 
rbac.authorization.k8s.io + kind: ClusterRole + name: auth-reviewer \ No newline at end of file diff --git a/go.mod b/go.mod index dc53db3815..fbb1e5de18 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/elastic/crd-ref-docs v0.2.0 github.com/envoyproxy/go-control-plane/envoy v1.36.0 github.com/go-logr/logr v1.4.3 + github.com/go-logr/zapr v1.3.0 github.com/google/go-cmp v0.7.0 github.com/google/uuid v1.6.0 github.com/hashicorp/golang-lru/v2 v2.0.7 diff --git a/pkg/epp/backend/metrics/metrics.go b/pkg/epp/backend/metrics/metrics.go index 5d2a85e96b..d2d9b60f97 100644 --- a/pkg/epp/backend/metrics/metrics.go +++ b/pkg/epp/backend/metrics/metrics.go @@ -97,6 +97,15 @@ func (p *PodMetricsClientImpl) promToPodMetrics( } } + if p.MetricMapping.TotalRunningRequests != nil { + running, err := p.getMetric(metricFamilies, *p.MetricMapping.TotalRunningRequests) + if err == nil { + updated.RunningQueueSize = int(running.GetGauge().GetValue()) + } else { + errs = multierr.Append(errs, err) + } + } + if p.MetricMapping.KVCacheUtilization != nil { usage, err := p.getMetric(metricFamilies, *p.MetricMapping.KVCacheUtilization) if err == nil { diff --git a/pkg/epp/backend/metrics/metrics_spec.go b/pkg/epp/backend/metrics/metrics_spec.go index 7407f4ed7b..b3c26db2c5 100644 --- a/pkg/epp/backend/metrics/metrics_spec.go +++ b/pkg/epp/backend/metrics/metrics_spec.go @@ -29,10 +29,11 @@ type MetricSpec struct { // MetricMapping holds named MetricSpecs. type MetricMapping struct { - TotalQueuedRequests *MetricSpec - KVCacheUtilization *MetricSpec - LoraRequestInfo *MetricSpec - CacheConfigInfo *MetricSpec + TotalQueuedRequests *MetricSpec + TotalRunningRequests *MetricSpec + KVCacheUtilization *MetricSpec + LoraRequestInfo *MetricSpec + CacheConfigInfo *MetricSpec } // stringToMetricSpec converts a string to a MetricSpec. @@ -94,11 +95,15 @@ func stringToMetricSpec(specStr string) (*MetricSpec, error) { } // NewMetricMapping creates a MetricMapping from string values. -func NewMetricMapping(queuedStr, kvUsageStr, loraReqInfoStr, cacheInfoMetric string) (*MetricMapping, error) { +func NewMetricMapping(queuedStr, runningStr, kvUsageStr, loraReqInfoStr, cacheInfoMetric string) (*MetricMapping, error) { queuedSpec, err := stringToMetricSpec(queuedStr) if err != nil { return nil, fmt.Errorf("error parsing WaitingRequests: %w", err) } + runningSpec, err := stringToMetricSpec(runningStr) + if err != nil { + return nil, fmt.Errorf("error parsing RunningRequests: %w", err) + } kvUsageSpec, err := stringToMetricSpec(kvUsageStr) if err != nil { return nil, fmt.Errorf("error parsing KVCacheUsage: %w", err) @@ -114,10 +119,11 @@ func NewMetricMapping(queuedStr, kvUsageStr, loraReqInfoStr, cacheInfoMetric str } mapping := &MetricMapping{ - TotalQueuedRequests: queuedSpec, - KVCacheUtilization: kvUsageSpec, - LoraRequestInfo: loraReqInfoSpec, - CacheConfigInfo: cacheInfoSpec, + TotalQueuedRequests: queuedSpec, + TotalRunningRequests: runningSpec, + KVCacheUtilization: kvUsageSpec, + LoraRequestInfo: loraReqInfoSpec, + CacheConfigInfo: cacheInfoSpec, } return mapping, nil diff --git a/pkg/epp/datalayer/metrics/extractor.go b/pkg/epp/datalayer/metrics/extractor.go index 27b1e07cd1..f08ccf95e9 100644 --- a/pkg/epp/datalayer/metrics/extractor.go +++ b/pkg/epp/datalayer/metrics/extractor.go @@ -66,8 +66,8 @@ func Produces() map[string]any { // configured with the given metrics' specifications. 
// These are mandatory metrics per the MSP specification, and are used // as the basis for the built-in scheduling plugins. -func NewExtractor(queueSpec, kvusageSpec, loraSpec, cacheInfoSpec string) (*Extractor, error) { - mapping, err := NewMapping(queueSpec, kvusageSpec, loraSpec, cacheInfoSpec) +func NewExtractor(queueSpec, runningSpec, kvusageSpec, loraSpec, cacheInfoSpec string) (*Extractor, error) { + mapping, err := NewMapping(queueSpec, runningSpec, kvusageSpec, loraSpec, cacheInfoSpec) if err != nil { return nil, fmt.Errorf("failed to create extractor metrics Mapping - %w", err) } @@ -109,6 +109,15 @@ func (ext *Extractor) Extract(ctx context.Context, data any, ep datalayer.Endpoi } } + if spec := ext.mapping.TotalRunningRequests; spec != nil { // extract running requests + if metric, err := spec.getLatestMetric(families); err != nil { + errs = append(errs, err) + } else { + clone.RunningQueueSize = int(extractValue(metric)) + updated = true + } + } + if spec := ext.mapping.KVCacheUtilization; spec != nil { // extract KV cache usage if metric, err := spec.getLatestMetric(families); err != nil { errs = append(errs, err) diff --git a/pkg/epp/datalayer/metrics/mapping.go b/pkg/epp/datalayer/metrics/mapping.go index fab6cf75f5..7b1fed9c1e 100644 --- a/pkg/epp/datalayer/metrics/mapping.go +++ b/pkg/epp/datalayer/metrics/mapping.go @@ -23,20 +23,25 @@ import ( // Mapping holds specifications for the well-known metrics defined // in the Model Server Protocol. type Mapping struct { - TotalQueuedRequests *Spec - KVCacheUtilization *Spec - LoraRequestInfo *LoRASpec - CacheInfo *Spec + TotalQueuedRequests *Spec + TotalRunningRequests *Spec + KVCacheUtilization *Spec + LoraRequestInfo *LoRASpec + CacheInfo *Spec } // NewMapping creates a metrics.Mapping from the input specification strings. -func NewMapping(queue, kvusage, lora, cacheInfo string) (*Mapping, error) { +func NewMapping(queue, running, kvusage, lora, cacheInfo string) (*Mapping, error) { var errs []error queueSpec, err := parseStringToSpec(queue) if err != nil { errs = append(errs, err) } + runningSpec, err := parseStringToSpec(running) + if err != nil { + errs = append(errs, err) + } kvusageSpec, err := parseStringToSpec(kvusage) if err != nil { errs = append(errs, err) @@ -55,9 +60,10 @@ func NewMapping(queue, kvusage, lora, cacheInfo string) (*Mapping, error) { return nil, errors.Join(errs...) } return &Mapping{ - TotalQueuedRequests: queueSpec, - KVCacheUtilization: kvusageSpec, - LoraRequestInfo: loraSpec, - CacheInfo: cacheInfoSpec, + TotalQueuedRequests: queueSpec, + TotalRunningRequests: runningSpec, + KVCacheUtilization: kvusageSpec, + LoraRequestInfo: loraSpec, + CacheInfo: cacheInfoSpec, }, nil } diff --git a/pkg/epp/datastore/datastore.go b/pkg/epp/datastore/datastore.go index 2ab2e98cb0..5153c1eb19 100644 --- a/pkg/epp/datastore/datastore.go +++ b/pkg/epp/datastore/datastore.go @@ -62,7 +62,7 @@ type Datastore interface { // PodList lists pods matching the given predicate. PodList(predicate func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics PodUpdateOrAddIfNotExist(pod *corev1.Pod) bool - PodDelete(podNAme string) + PodDelete(podName string) // Clears the store state, happens when the pool gets deleted. 
Clear() diff --git a/pkg/epp/metrics/testdata/request_tpot_seconds_metric b/pkg/epp/metrics/testdata/request_tpot_seconds_metric new file mode 100644 index 0000000000..beee502717 --- /dev/null +++ b/pkg/epp/metrics/testdata/request_tpot_seconds_metric @@ -0,0 +1,80 @@ +# HELP inference_model_request_tpot_seconds [ALPHA] Inference model response latency distribution in seconds for each model and target model. +# TYPE inference_model_request_tpot_seconds histogram +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.0005"} 0 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.0025"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.005"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.01"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.02"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.04"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.06"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.08"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.1"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.125"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.15"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.2"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.3"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.4"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.5"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.6"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.8"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="1"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="1.5"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="2"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="3"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="4.5"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="6"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="12"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="18"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="24"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="30"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="36"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="48"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="60"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", 
target_model_name="t10", le="90"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="120"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="180"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="270"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="360"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="Inf"} 2 +inference_model_request_tpot_seconds_sum{model_name="m20", target_model_name="t10"} 0.161 +inference_model_request_tpot_seconds_count{model_name="m20", target_model_name="t10"} 2 + + +iinference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.0005"} 0 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.0025"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.005"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.01"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.02"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.04"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.06"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.08"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.1"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.125"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.15"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.2"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.3"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.4"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.5"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.6"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.8"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="1"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="1.5"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="2"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="3"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="4.5"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="6"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="12"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="18"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="24"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="30"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="36"} 2 
+inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="48"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="60"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="90"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="120"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="180"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="270"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="360"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="Inf"} 2 +inference_model_request_tpot_seconds_sum{model_name="m20", target_model_name="t10"} 0.161 +inference_model_request_tpot_seconds_count{model_name="m20", target_model_name="t10"} 2 \ No newline at end of file diff --git a/pkg/epp/metrics/testdata/request_ttft_seconds_metric b/pkg/epp/metrics/testdata/request_ttft_seconds_metric new file mode 100644 index 0000000000..3154907274 --- /dev/null +++ b/pkg/epp/metrics/testdata/request_ttft_seconds_metric @@ -0,0 +1,116 @@ +# HELP inference_model_request_ttft_seconds [ALPHA] Inference model response latency distribution in seconds for each model and target model. +# TYPE inference_model_request_ttft_seconds histogram +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="0.005"} 0 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="0.025"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="0.05"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="0.1"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="0.2"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="0.4"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="0.6"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="0.8"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="1.0"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="1.25"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="1.5"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="2"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="3"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="4"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="5"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="6"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="8"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="10"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="15"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="20"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="30"} 2 
+inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="45"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="60"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="120"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="180"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="240"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="300"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="360"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="480"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="600"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="900"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="1200"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="1800"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="2700"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="3600"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="Inf"} 2 +inference_model_request_ttft_seconds_sum{model_name="m10", target_model_name="t10"} 1.61 +inference_model_request_ttft_seconds_count{model_name="m10", target_model_name="t10"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="0.005"} 0 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="0.025"} 0 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="0.05"} 0 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="0.1"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="0.2"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="0.4"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="0.6"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="0.8"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="1"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="1.25"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="1.5"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="2"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="3"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="4"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="5"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="6"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="8"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="10"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="15"} 1 
+inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="20"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="30"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="45"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="60"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="120"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="180"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="240"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="300"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="360"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="480"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="600"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="900"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="1200"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="1800"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="2700"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="3600"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1 +inference_model_request_ttft_seconds_sum{model_name="m10",target_model_name="t11"} 0.06 +inference_model_request_ttft_seconds_count{model_name="m10",target_model_name="t11"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="0.005"} 0 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="0.025"} 0 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="0.05"} 0 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="0.1"} 0 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="0.2"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="0.4"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="0.6"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="0.8"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="1"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="1.25"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="1.5"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="2"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="3"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="4"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="5"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="6"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="8"} 1 
+inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="10"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="15"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="20"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="30"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="45"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="60"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="120"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="180"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="240"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="300"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="360"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="480"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="600"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="900"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="1200"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="1800"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="2700"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="3600"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1 +inference_model_request_ttft_seconds_sum{model_name="m20",target_model_name="t20"} 0.12 +inference_model_request_ttft_seconds_count{model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/epp/server/runserver.go b/pkg/epp/server/runserver.go index e43d849232..39ea9a85fd 100644 --- a/pkg/epp/server/runserver.go +++ b/pkg/epp/server/runserver.go @@ -43,6 +43,7 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/saturationdetector" + latencypredictor "sigs.k8s.io/gateway-api-inference-extension/sidecars/latencypredictorasync" ) // ExtProcServerRunner provides methods to manage an external process server. @@ -59,6 +60,7 @@ type ExtProcServerRunner struct { Director *requestcontrol.Director SaturationDetector *saturationdetector.Detector UseExperimentalDatalayerV2 bool // Pluggable data layer feature flag + LatencyPredictor latencypredictor.PredictorInterface // This should only be used in tests. We won't need this once we do not inject metrics in the tests. 
// TODO:(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/432) Cleanup
 
@@ -78,6 +80,7 @@ const (
 	DefaultHealthChecking               = false                        // default for --health-checking
 	DefaultEnablePprof                  = true                         // default for --enable-pprof
 	DefaultTotalQueuedRequestsMetric    = "vllm:num_requests_waiting"  // default for --total-queued-requests-metric
+	DefaultTotalRunningRequestsMetric   = "vllm:num_requests_running"  // default for --total-running-requests-metric
 	DefaultKvCacheUsagePercentageMetric = "vllm:gpu_cache_usage_perc"  // default for --kv-cache-usage-percentage-metric
 	DefaultLoraInfoMetric               = "vllm:lora_requests_info"    // default for --lora-info-metric
 	DefaultCacheInfoMetric              = "vllm:cache_config_info"     // default for --cache-info-metric
diff --git a/sidecars/latencypredictorasync/types.go b/sidecars/latencypredictorasync/types.go
index 4b4a1ca0b9..c8eadefe23 100644
--- a/sidecars/latencypredictorasync/types.go
+++ b/sidecars/latencypredictorasync/types.go
@@ -120,7 +120,6 @@ type PredictorInterface interface {
 	PredictBulk(ctx context.Context, requests []PredictionRequest) (*BulkPredictionResponse, error)
 	PredictBulkStrict(ctx context.Context, requests []PredictionRequest) (*BulkPredictionResponse, error)
 	AddTrainingDataBulk(entry []TrainingEntry) error
-	GetServerStatus(ctx context.Context) (*ServerStatusResponse, error)
 }
 
 // --- Data Models ---

From 8c77f1f95accd030d209fda9f3246bd16618e945 Mon Sep 17 00:00:00 2001
From: BenjaminBraunDev
Date: Fri, 7 Nov 2025 00:55:31 +0000
Subject: [PATCH 02/11] Update Dockerfile, fix issues with SLO context not
 being set when prediction is off

---
 Dockerfile                                    |   1 +
 .../manifests/inferencepool-resources-lp.yaml | 384 +++++++++++++++++-
 2 files changed, 379 insertions(+), 6 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index d2fd2300b1..70038a0778 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -24,6 +24,7 @@ COPY internal ./internal
 COPY apix ./apix
 COPY api ./api
 COPY version ./version
+COPY sidecars ./sidecars
 
 WORKDIR /src/cmd/epp
 RUN go build -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/version.CommitSHA=${COMMIT_SHA} -X sigs.k8s.io/gateway-api-inference-extension/version.BuildRef=${BUILD_REF}" -o /epp
diff --git a/config/manifests/inferencepool-resources-lp.yaml b/config/manifests/inferencepool-resources-lp.yaml
index c2a3528deb..8a7951e96b 100644
--- a/config/manifests/inferencepool-resources-lp.yaml
+++ b/config/manifests/inferencepool-resources-lp.yaml
@@ -17,6 +17,7 @@ data:
   LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib"
   LATENCY_MODEL_TYPE: "xgboost"
   LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000"
+  LATENCY_QUANTILE_ALPHA: "0.9"
 ---
 apiVersion: v1
 kind: ConfigMap
@@ -74,6 +75,34 @@ spec:
     protocol: TCP
     port: 8003
     targetPort: 8003
+  - name: latency-predictor-4
+    protocol: TCP
+    port: 8004
+    targetPort: 8004
+  - name: latency-predictor-5
+    protocol: TCP
+    port: 8005
+    targetPort: 8005
+  - name: latency-predictor-6
+    protocol: TCP
+    port: 8006
+    targetPort: 8006
+  - name: latency-predictor-7
+    protocol: TCP
+    port: 8007
+    targetPort: 8007
+  - name: latency-predictor-8
+    protocol: TCP
+    port: 8008
+    targetPort: 8008
+  - name: latency-predictor-9
+    protocol: TCP
+    port: 8009
+    targetPort: 8009
+  - name: latency-predictor-10
+    protocol: TCP
+    port: 8010
+    targetPort: 8010
   - name: prometheus
     protocol: TCP
     port: 9090
@@ -106,7 +135,6 @@ spec:
     spec:
      serviceAccountName: vllm-llama3-8b-instruct-epp
      # Conservatively, this timeout should mirror the longest grace period of the pods within the pool
-      terminationGracePeriodSeconds: 130
containers: # EPP Container - name: epp @@ -132,15 +160,15 @@ spec: - "-enable-latency-predictor" env: - name: PREDICTION_SERVER_URL - value: "http://localhost:8001,http://localhost:8002,http://localhost:8003" # Multiple prediction servers + value: "http://localhost:8001,http://localhost:8002,http://localhost:8003,http://localhost:8004,http://localhost:8005,http://localhost:8006,http://localhost:8007,http://localhost:8008,http://localhost:8009,http://localhost:8010" # All 10 prediction servers - name: TRAINING_SERVER_URL value: "http://localhost:8000" # Single training server for sending training data - name: LATENCY_MAX_SAMPLE_SIZE value: "10000" # Maximum sample size for latency prediction - - name: NEG_HEADROOM_TPOT_WEIGHT - value: "0.2" # Weight for TPOT in negative headroom calculation - - name: NEG_HEADROOM_TTFT_WEIGHT - value: "0.8" # Weight for TTFT in negative headroom calculation + + + + ports: - containerPort: 9002 - containerPort: 9003 @@ -338,6 +366,328 @@ spec: volumeMounts: - name: prediction-server-3-storage mountPath: /server_models + # Prediction Server Sidecar Container 4 + - name: prediction-server-4 + image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_prediction:latest + imagePullPolicy: Always + command: ["uvicorn"] + args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8004"] + ports: + - containerPort: 8004 + name: predict-port-4 + livenessProbe: + httpGet: + path: /healthz + port: 8004 + initialDelaySeconds: 15 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /readyz + port: 8004 + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 10 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" + envFrom: + - configMapRef: + name: prediction-server-config + env: + - name: PREDICT_PORT + value: "8004" + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SERVER_TYPE + value: "prediction-4" + - name: TRAINING_SERVER_URL + value: "http://localhost:8000" + volumeMounts: + - name: prediction-server-4-storage + mountPath: /server_models + # Prediction Server Sidecar Container 5 + - name: prediction-server-5 + image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_prediction:latest + imagePullPolicy: Always + command: ["uvicorn"] + args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8005"] + ports: + - containerPort: 8005 + name: predict-port-5 + livenessProbe: + httpGet: + path: /healthz + port: 8005 + initialDelaySeconds: 15 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /readyz + port: 8005 + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 10 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" + envFrom: + - configMapRef: + name: prediction-server-config + env: + - name: PREDICT_PORT + value: "8005" + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SERVER_TYPE + value: "prediction-5" + - name: TRAINING_SERVER_URL + value: "http://localhost:8000" + volumeMounts: + - name: prediction-server-5-storage + mountPath: /server_models + # Prediction Server Sidecar Container 6 + - name: prediction-server-6 + image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_prediction:latest + imagePullPolicy: Always + command: ["uvicorn"] + args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8006"] + ports: + - containerPort: 8006 + name: predict-port-6 + livenessProbe: + httpGet: + path: /healthz + port: 8006 
+ initialDelaySeconds: 15 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /readyz + port: 8006 + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 10 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" + envFrom: + - configMapRef: + name: prediction-server-config + env: + - name: PREDICT_PORT + value: "8006" + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SERVER_TYPE + value: "prediction-6" + - name: TRAINING_SERVER_URL + value: "http://localhost:8000" + volumeMounts: + - name: prediction-server-6-storage + mountPath: /server_models + # Prediction Server Sidecar Container 7 + - name: prediction-server-7 + image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_prediction:latest + imagePullPolicy: Always + command: ["uvicorn"] + args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8007"] + ports: + - containerPort: 8007 + name: predict-port-7 + livenessProbe: + httpGet: + path: /healthz + port: 8007 + initialDelaySeconds: 15 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /readyz + port: 8007 + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 10 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" + envFrom: + - configMapRef: + name: prediction-server-config + env: + - name: PREDICT_PORT + value: "8007" + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SERVER_TYPE + value: "prediction-7" + - name: TRAINING_SERVER_URL + value: "http://localhost:8000" + volumeMounts: + - name: prediction-server-7-storage + mountPath: /server_models + # Prediction Server Sidecar Container 8 + - name: prediction-server-8 + image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_prediction:latest + imagePullPolicy: Always + command: ["uvicorn"] + args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8008"] + ports: + - containerPort: 8008 + name: predict-port-8 + livenessProbe: + httpGet: + path: /healthz + port: 8008 + initialDelaySeconds: 15 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /readyz + port: 8008 + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 10 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" + envFrom: + - configMapRef: + name: prediction-server-config + env: + - name: PREDICT_PORT + value: "8008" + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SERVER_TYPE + value: "prediction-8" + - name: TRAINING_SERVER_URL + value: "http://localhost:8000" + volumeMounts: + - name: prediction-server-8-storage + mountPath: /server_models + # Prediction Server Sidecar Container 9 + - name: prediction-server-9 + image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_prediction:latest + imagePullPolicy: Always + command: ["uvicorn"] + args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8009"] + ports: + - containerPort: 8009 + name: predict-port-9 + livenessProbe: + httpGet: + path: /healthz + port: 8009 + initialDelaySeconds: 15 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /readyz + port: 8009 + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 10 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" + envFrom: + - configMapRef: + name: prediction-server-config + env: + - name: PREDICT_PORT + value: "8009" + - name: POD_NAME + valueFrom: + fieldRef: + 
fieldPath: metadata.name + - name: SERVER_TYPE + value: "prediction-9" + - name: TRAINING_SERVER_URL + value: "http://localhost:8000" + volumeMounts: + - name: prediction-server-9-storage + mountPath: /server_models + # Prediction Server Sidecar Container 10 + - name: prediction-server-10 + image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_prediction:latest + imagePullPolicy: Always + command: ["uvicorn"] + args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8010"] + ports: + - containerPort: 8010 + name: predict-port-10 + livenessProbe: + httpGet: + path: /healthz + port: 8010 + initialDelaySeconds: 15 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /readyz + port: 8010 + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 10 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" + envFrom: + - configMapRef: + name: prediction-server-config + env: + - name: PREDICT_PORT + value: "8010" + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SERVER_TYPE + value: "prediction-10" + - name: TRAINING_SERVER_URL + value: "http://localhost:8000" + volumeMounts: + - name: prediction-server-10-storage + mountPath: /server_models volumes: - name: training-server-storage emptyDir: @@ -351,6 +701,27 @@ spec: - name: prediction-server-3-storage emptyDir: sizeLimit: "10Gi" # Dedicated volume for prediction server 3 + - name: prediction-server-4-storage + emptyDir: + sizeLimit: "10Gi" # Dedicated volume for prediction server 4 + - name: prediction-server-5-storage + emptyDir: + sizeLimit: "10Gi" # Dedicated volume for prediction server 5 + - name: prediction-server-6-storage + emptyDir: + sizeLimit: "10Gi" # Dedicated volume for prediction server 6 + - name: prediction-server-7-storage + emptyDir: + sizeLimit: "10Gi" # Dedicated volume for prediction server 7 + - name: prediction-server-8-storage + emptyDir: + sizeLimit: "10Gi" # Dedicated volume for prediction server 8 + - name: prediction-server-9-storage + emptyDir: + sizeLimit: "10Gi" # Dedicated volume for prediction server 9 + - name: prediction-server-10-storage + emptyDir: + sizeLimit: "10Gi" # Dedicated volume for prediction server 10 - name: plugins-config-volume configMap: name: plugins-config @@ -386,6 +757,7 @@ data: plugins: - pluginRef: slo-aware-routing - pluginRef: max-score-picker + --- # --- RBAC --- kind: Role From 8ca02145193068aa8d37cefb90dd569109de99ae Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Sat, 8 Nov 2025 00:10:22 +0000 Subject: [PATCH 03/11] Remove outdated inferencepool-resources deployment --- .../manifests/inferencepool-resources-lp.yaml | 822 ------------------ 1 file changed, 822 deletions(-) delete mode 100644 config/manifests/inferencepool-resources-lp.yaml diff --git a/config/manifests/inferencepool-resources-lp.yaml b/config/manifests/inferencepool-resources-lp.yaml deleted file mode 100644 index 8a7951e96b..0000000000 --- a/config/manifests/inferencepool-resources-lp.yaml +++ /dev/null @@ -1,822 +0,0 @@ -# Note: If you change this file, please also change the file used for e2e tests! 
-# -# https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/test/testdata/inferencepool-e2e.yaml - -# --- ConfigMaps --- -apiVersion: v1 -kind: ConfigMap -metadata: - name: latency-predictor-config - namespace: default -data: - LATENCY_RETRAINING_INTERVAL_SEC: "1" - LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100" - LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib" - LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib" - LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib" - LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib" - LATENCY_MODEL_TYPE: "xgboost" - LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000" - LATENCY_QUANTILE_ALPHA: "0.9" ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: prediction-server-config - namespace: default -data: - LATENCY_MODEL_TYPE: "xgboost" - PREDICT_HOST: "0.0.0.0" - LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib" # Use individual storage - LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib" - LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib" - LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib" ---- -# --- InferencePool --- -apiVersion: inference.networking.x-k8s.io/v1alpha2 -kind: InferencePool -metadata: - name: vllm-llama3-8b-instruct -spec: - targetPortNumber: 8000 - selector: - app: vllm-llama3-8b-instruct - extensionRef: - name: vllm-llama3-8b-instruct-epp ---- -# --- EPP Service --- -apiVersion: v1 -kind: Service -metadata: - name: vllm-llama3-8b-instruct-epp - namespace: default -spec: - selector: - app: vllm-llama3-8b-instruct-epp - ports: - - name: epp-grpc - protocol: TCP - port: 9002 - targetPort: 9002 - appProtocol: http2 - - name: latency-predictor-training - protocol: TCP - port: 8000 - targetPort: 8000 - - name: latency-predictor-1 - protocol: TCP - port: 8001 - targetPort: 8001 - - name: latency-predictor-2 - protocol: TCP - port: 8002 - targetPort: 8002 - - name: latency-predictor-3 - protocol: TCP - port: 8003 - targetPort: 8003 - - name: latency-predictor-4 - protocol: TCP - port: 8004 - targetPort: 8004 - - name: latency-predictor-5 - protocol: TCP - port: 8005 - targetPort: 8005 - - name: latency-predictor-6 - protocol: TCP - port: 8006 - targetPort: 8006 - - name: latency-predictor-7 - protocol: TCP - port: 8007 - targetPort: 8007 - - name: latency-predictor-8 - protocol: TCP - port: 8008 - targetPort: 8008 - - name: latency-predictor-9 - protocol: TCP - port: 8009 - targetPort: 8009 - - name: latency-predictor-10 - protocol: TCP - port: 8010 - targetPort: 8010 - - name: prometheus - protocol: TCP - port: 9090 - targetPort: 9090 - type: LoadBalancer ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: vllm-llama3-8b-instruct-epp - namespace: default ---- -# --- EPP Deployment with Individual Container Volumes --- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: vllm-llama3-8b-instruct-epp - namespace: default - labels: - app: vllm-llama3-8b-instruct-epp -spec: - replicas: 1 # Multiple EPP pods for scaling - selector: - matchLabels: - app: vllm-llama3-8b-instruct-epp - template: - metadata: - labels: - app: vllm-llama3-8b-instruct-epp - spec: - serviceAccountName: vllm-llama3-8b-instruct-epp - # Conservatively, this timeout should mirror the longest grace period of the pods within the pool - containers: - # EPP Container - - name: epp - image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/slo-routing-epp-exp - imagePullPolicy: Always - args: - - -pool-name - - "vllm-llama3-8b-instruct" - - "-pool-namespace" - - "default" - - --pool-group - - 
"inference.networking.x-k8s.io" - - -v - - "4" - - --zap-encoder - - "json" - - -grpc-port - - "9002" - - -grpc-health-port - - "9003" - - "--config-file" - - "/config/default-plugins.yaml" - - "-enable-latency-predictor" - env: - - name: PREDICTION_SERVER_URL - value: "http://localhost:8001,http://localhost:8002,http://localhost:8003,http://localhost:8004,http://localhost:8005,http://localhost:8006,http://localhost:8007,http://localhost:8008,http://localhost:8009,http://localhost:8010" # All 10 prediction servers - - name: TRAINING_SERVER_URL - value: "http://localhost:8000" # Single training server for sending training data - - name: LATENCY_MAX_SAMPLE_SIZE - value: "10000" # Maximum sample size for latency prediction - - - - - ports: - - containerPort: 9002 - - containerPort: 9003 - - name: metrics - containerPort: 9090 - livenessProbe: - grpc: - port: 9003 - service: inference-extension - initialDelaySeconds: 5 - periodSeconds: 10 - readinessProbe: - grpc: - port: 9003 - service: inference-extension - initialDelaySeconds: 5 - periodSeconds: 10 - volumeMounts: - - name: plugins-config-volume - mountPath: "/config" - # Training Server Sidecar Container - - name: training-server - image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_training:latest - imagePullPolicy: Always - ports: - - containerPort: 8000 - name: training-port - livenessProbe: - httpGet: - path: /healthz - port: 8000 - initialDelaySeconds: 30 - periodSeconds: 20 - readinessProbe: - httpGet: - path: /readyz - port: 8000 - initialDelaySeconds: 45 - periodSeconds: 10 - resources: - requests: - cpu: "2000m" - memory: "4Gi" - limits: - cpu: "4000m" - memory: "8Gi" - envFrom: - - configMapRef: - name: latency-predictor-config - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: SERVER_TYPE - value: "training" - volumeMounts: - - name: training-server-storage - mountPath: /models - # Prediction Server Sidecar Container 1 - - name: prediction-server-1 - image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_prediction:latest - imagePullPolicy: Always - command: ["uvicorn"] - args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8001"] - ports: - - containerPort: 8001 - name: predict-port-1 - livenessProbe: - httpGet: - path: /healthz - port: 8001 - initialDelaySeconds: 15 - periodSeconds: 15 - readinessProbe: - httpGet: - path: /readyz - port: 8001 - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 10 - resources: - requests: - cpu: "500m" - memory: "1Gi" - limits: - cpu: "1000m" - memory: "2Gi" - envFrom: - - configMapRef: - name: prediction-server-config - env: - - name: PREDICT_PORT - value: "8001" - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: SERVER_TYPE - value: "prediction-1" - - name: TRAINING_SERVER_URL - value: "http://localhost:8000" - volumeMounts: - - name: prediction-server-1-storage - mountPath: /server_models - # Prediction Server Sidecar Container 2 - - name: prediction-server-2 - image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_prediction:latest - imagePullPolicy: Always - command: ["uvicorn"] - args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8002"] - ports: - - containerPort: 8002 - name: predict-port-2 - livenessProbe: - httpGet: - path: /healthz - port: 8002 - initialDelaySeconds: 15 - periodSeconds: 15 - readinessProbe: - httpGet: - path: /readyz - port: 8002 - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 10 - 
resources: - requests: - cpu: "500m" - memory: "1Gi" - limits: - cpu: "1000m" - memory: "2Gi" - envFrom: - - configMapRef: - name: prediction-server-config - env: - - name: PREDICT_PORT - value: "8002" - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: SERVER_TYPE - value: "prediction-2" - - name: TRAINING_SERVER_URL - value: "http://localhost:8000" - volumeMounts: - - name: prediction-server-2-storage - mountPath: /server_models - # Prediction Server Sidecar Container 3 - - name: prediction-server-3 - image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_prediction:latest - imagePullPolicy: Always - command: ["uvicorn"] - args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8003"] - ports: - - containerPort: 8003 - name: predict-port-3 - livenessProbe: - httpGet: - path: /healthz - port: 8003 - initialDelaySeconds: 15 - periodSeconds: 15 - readinessProbe: - httpGet: - path: /readyz - port: 8003 - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 10 - resources: - requests: - cpu: "500m" - memory: "1Gi" - limits: - cpu: "1000m" - memory: "2Gi" - envFrom: - - configMapRef: - name: prediction-server-config - env: - - name: PREDICT_PORT - value: "8003" - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: SERVER_TYPE - value: "prediction-3" - - name: TRAINING_SERVER_URL - value: "http://localhost:8000" - volumeMounts: - - name: prediction-server-3-storage - mountPath: /server_models - # Prediction Server Sidecar Container 4 - - name: prediction-server-4 - image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_prediction:latest - imagePullPolicy: Always - command: ["uvicorn"] - args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8004"] - ports: - - containerPort: 8004 - name: predict-port-4 - livenessProbe: - httpGet: - path: /healthz - port: 8004 - initialDelaySeconds: 15 - periodSeconds: 15 - readinessProbe: - httpGet: - path: /readyz - port: 8004 - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 10 - resources: - requests: - cpu: "500m" - memory: "1Gi" - limits: - cpu: "1000m" - memory: "2Gi" - envFrom: - - configMapRef: - name: prediction-server-config - env: - - name: PREDICT_PORT - value: "8004" - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: SERVER_TYPE - value: "prediction-4" - - name: TRAINING_SERVER_URL - value: "http://localhost:8000" - volumeMounts: - - name: prediction-server-4-storage - mountPath: /server_models - # Prediction Server Sidecar Container 5 - - name: prediction-server-5 - image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_prediction:latest - imagePullPolicy: Always - command: ["uvicorn"] - args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8005"] - ports: - - containerPort: 8005 - name: predict-port-5 - livenessProbe: - httpGet: - path: /healthz - port: 8005 - initialDelaySeconds: 15 - periodSeconds: 15 - readinessProbe: - httpGet: - path: /readyz - port: 8005 - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 10 - resources: - requests: - cpu: "500m" - memory: "1Gi" - limits: - cpu: "1000m" - memory: "2Gi" - envFrom: - - configMapRef: - name: prediction-server-config - env: - - name: PREDICT_PORT - value: "8005" - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: SERVER_TYPE - value: "prediction-5" - - name: TRAINING_SERVER_URL - value: "http://localhost:8000" - volumeMounts: - - name: 
prediction-server-5-storage - mountPath: /server_models - # Prediction Server Sidecar Container 6 - - name: prediction-server-6 - image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_prediction:latest - imagePullPolicy: Always - command: ["uvicorn"] - args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8006"] - ports: - - containerPort: 8006 - name: predict-port-6 - livenessProbe: - httpGet: - path: /healthz - port: 8006 - initialDelaySeconds: 15 - periodSeconds: 15 - readinessProbe: - httpGet: - path: /readyz - port: 8006 - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 10 - resources: - requests: - cpu: "500m" - memory: "1Gi" - limits: - cpu: "1000m" - memory: "2Gi" - envFrom: - - configMapRef: - name: prediction-server-config - env: - - name: PREDICT_PORT - value: "8006" - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: SERVER_TYPE - value: "prediction-6" - - name: TRAINING_SERVER_URL - value: "http://localhost:8000" - volumeMounts: - - name: prediction-server-6-storage - mountPath: /server_models - # Prediction Server Sidecar Container 7 - - name: prediction-server-7 - image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_prediction:latest - imagePullPolicy: Always - command: ["uvicorn"] - args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8007"] - ports: - - containerPort: 8007 - name: predict-port-7 - livenessProbe: - httpGet: - path: /healthz - port: 8007 - initialDelaySeconds: 15 - periodSeconds: 15 - readinessProbe: - httpGet: - path: /readyz - port: 8007 - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 10 - resources: - requests: - cpu: "500m" - memory: "1Gi" - limits: - cpu: "1000m" - memory: "2Gi" - envFrom: - - configMapRef: - name: prediction-server-config - env: - - name: PREDICT_PORT - value: "8007" - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: SERVER_TYPE - value: "prediction-7" - - name: TRAINING_SERVER_URL - value: "http://localhost:8000" - volumeMounts: - - name: prediction-server-7-storage - mountPath: /server_models - # Prediction Server Sidecar Container 8 - - name: prediction-server-8 - image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_prediction:latest - imagePullPolicy: Always - command: ["uvicorn"] - args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8008"] - ports: - - containerPort: 8008 - name: predict-port-8 - livenessProbe: - httpGet: - path: /healthz - port: 8008 - initialDelaySeconds: 15 - periodSeconds: 15 - readinessProbe: - httpGet: - path: /readyz - port: 8008 - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 10 - resources: - requests: - cpu: "500m" - memory: "1Gi" - limits: - cpu: "1000m" - memory: "2Gi" - envFrom: - - configMapRef: - name: prediction-server-config - env: - - name: PREDICT_PORT - value: "8008" - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: SERVER_TYPE - value: "prediction-8" - - name: TRAINING_SERVER_URL - value: "http://localhost:8000" - volumeMounts: - - name: prediction-server-8-storage - mountPath: /server_models - # Prediction Server Sidecar Container 9 - - name: prediction-server-9 - image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_prediction:latest - imagePullPolicy: Always - command: ["uvicorn"] - args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8009"] - ports: - - containerPort: 8009 - name: predict-port-9 - livenessProbe: - 
httpGet: - path: /healthz - port: 8009 - initialDelaySeconds: 15 - periodSeconds: 15 - readinessProbe: - httpGet: - path: /readyz - port: 8009 - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 10 - resources: - requests: - cpu: "500m" - memory: "1Gi" - limits: - cpu: "1000m" - memory: "2Gi" - envFrom: - - configMapRef: - name: prediction-server-config - env: - - name: PREDICT_PORT - value: "8009" - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: SERVER_TYPE - value: "prediction-9" - - name: TRAINING_SERVER_URL - value: "http://localhost:8000" - volumeMounts: - - name: prediction-server-9-storage - mountPath: /server_models - # Prediction Server Sidecar Container 10 - - name: prediction-server-10 - image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_prediction:latest - imagePullPolicy: Always - command: ["uvicorn"] - args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8010"] - ports: - - containerPort: 8010 - name: predict-port-10 - livenessProbe: - httpGet: - path: /healthz - port: 8010 - initialDelaySeconds: 15 - periodSeconds: 15 - readinessProbe: - httpGet: - path: /readyz - port: 8010 - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 10 - resources: - requests: - cpu: "500m" - memory: "1Gi" - limits: - cpu: "1000m" - memory: "2Gi" - envFrom: - - configMapRef: - name: prediction-server-config - env: - - name: PREDICT_PORT - value: "8010" - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: SERVER_TYPE - value: "prediction-10" - - name: TRAINING_SERVER_URL - value: "http://localhost:8000" - volumeMounts: - - name: prediction-server-10-storage - mountPath: /server_models - volumes: - - name: training-server-storage - emptyDir: - sizeLimit: "20Gi" # Dedicated volume for training server - - name: prediction-server-1-storage - emptyDir: - sizeLimit: "10Gi" # Dedicated volume for prediction server 1 - - name: prediction-server-2-storage - emptyDir: - sizeLimit: "10Gi" # Dedicated volume for prediction server 2 - - name: prediction-server-3-storage - emptyDir: - sizeLimit: "10Gi" # Dedicated volume for prediction server 3 - - name: prediction-server-4-storage - emptyDir: - sizeLimit: "10Gi" # Dedicated volume for prediction server 4 - - name: prediction-server-5-storage - emptyDir: - sizeLimit: "10Gi" # Dedicated volume for prediction server 5 - - name: prediction-server-6-storage - emptyDir: - sizeLimit: "10Gi" # Dedicated volume for prediction server 6 - - name: prediction-server-7-storage - emptyDir: - sizeLimit: "10Gi" # Dedicated volume for prediction server 7 - - name: prediction-server-8-storage - emptyDir: - sizeLimit: "10Gi" # Dedicated volume for prediction server 8 - - name: prediction-server-9-storage - emptyDir: - sizeLimit: "10Gi" # Dedicated volume for prediction server 9 - - name: prediction-server-10-storage - emptyDir: - sizeLimit: "10Gi" # Dedicated volume for prediction server 10 - - name: plugins-config-volume - configMap: - name: plugins-config ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: plugins-config - namespace: default -data: - default-plugins.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: queue-scorer - - type: kv-cache-utilization-scorer - - type: prefix-cache-scorer - - type: slo-aware-routing - - type: slo-aware-profile-handler - - type: max-score-picker - schedulingProfiles: - - name: prefix - plugins: - - pluginRef: prefix-cache-scorer - - name: default - plugins: 
- - pluginRef: slo-aware-routing - weight: 0 - - pluginRef: queue-scorer - - pluginRef: kv-cache-utilization-scorer - - pluginRef: max-score-picker - - name: slo - plugins: - - pluginRef: slo-aware-routing - - pluginRef: max-score-picker - ---- -# --- RBAC --- -kind: Role -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: pod-read - namespace: default -rules: -- apiGroups: [ "inference.networking.x-k8s.io" ] - resources: [ "inferenceobjectives", "inferencepools" ] - verbs: [ "get", "watch", "list" ] -- apiGroups: [ "inference.networking.k8s.io" ] - resources: [ "inferencepools" ] - verbs: [ "get", "watch", "list" ] -- apiGroups: [ "" ] - resources: [ "pods" ] - verbs: [ "get", "watch", "list" ] ---- -kind: RoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: pod-read-binding - namespace: default -subjects: -- kind: ServiceAccount - name: vllm-llama3-8b-instruct-epp - namespace: default -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: pod-read ---- -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: auth-reviewer -rules: -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create ---- -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: auth-reviewer-binding -subjects: -- kind: ServiceAccount - name: vllm-llama3-8b-instruct-epp - namespace: default -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: auth-reviewer \ No newline at end of file From b337ade04d86b0c35c5a8020e1d09345f745052a Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Mon, 10 Nov 2025 21:20:22 +0000 Subject: [PATCH 04/11] Fix streamed request being called one final time after request complete, add predictor check to the beginning of each requestcontrol hook --- pkg/epp/requestcontrol/director.go | 5 +++-- .../slo_aware_router/requestcontrol_hooks.go | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/pkg/epp/requestcontrol/director.go b/pkg/epp/requestcontrol/director.go index c4f4f1c1b9..8cb356890a 100644 --- a/pkg/epp/requestcontrol/director.go +++ b/pkg/epp/requestcontrol/director.go @@ -305,8 +305,9 @@ func (d *Director) HandleResponseBodyStreaming(ctx context.Context, reqCtx *hand logger := log.FromContext(ctx).WithValues("stage", "bodyChunk") logger.V(logutil.TRACE).Info("Entering HandleResponseBodyChunk") response := &Response{ - RequestId: reqCtx.Request.Headers[requtil.RequestIdHeaderKey], - Headers: reqCtx.Response.Headers, + RequestId: reqCtx.Request.Headers[requtil.RequestIdHeaderKey], + Headers: reqCtx.Response.Headers, + EndOfStream: reqCtx.ResponseComplete, } d.runResponseStreamingPlugins(ctx, reqCtx.SchedulingRequest, response, reqCtx.TargetPod) diff --git a/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/requestcontrol_hooks.go b/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/requestcontrol_hooks.go index d505b11e38..c1e762815d 100644 --- a/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/requestcontrol_hooks.go +++ b/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/requestcontrol_hooks.go @@ -114,7 +114,11 @@ func (t *SLOAwareRouter) PreRequest(ctx context.Context, request *schedulingtype } targetPod := schedulingResult.ProfileResults[schedulingResult.PrimaryProfileName].TargetPods[0].GetPod() +<<<<<<< HEAD if !t.checkPredictor(logger, targetPod) { +======= + if 
!t.CheckPredictor(logger, targetPod) { +>>>>>>> b2a7d45 (Fix streamed request being called one final time after request complete, add predictor check to the beginning of each requestcontrol hook) return } @@ -157,7 +161,11 @@ func (t *SLOAwareRouter) PreRequest(ctx context.Context, request *schedulingtype func (t *SLOAwareRouter) ResponseReceived(ctx context.Context, request *schedulingtypes.LLMRequest, response *requestcontrol.Response, targetPod *backend.Pod) { logger := log.FromContext(ctx) +<<<<<<< HEAD if !t.checkPredictor(logger, targetPod) { +======= + if !t.CheckPredictor(logger, targetPod) { +>>>>>>> b2a7d45 (Fix streamed request being called one final time after request complete, add predictor check to the beginning of each requestcontrol hook) return } @@ -169,7 +177,11 @@ func (t *SLOAwareRouter) ResponseReceived(ctx context.Context, request *scheduli return } +<<<<<<< HEAD if err := processHeaderForLatencyPrediction(ctx, t.latencypredictor, sloCtx); err != nil { +======= + if err := ProcessHeaderForLatencyPrediction(ctx, t.latencypredictor, sloCtx); err != nil { +>>>>>>> b2a7d45 (Fix streamed request being called one final time after request complete, add predictor check to the beginning of each requestcontrol hook) logger.V(logutil.DEBUG).Error(err, "ProcessHeader in latencypredictor failed") } @@ -177,7 +189,11 @@ func (t *SLOAwareRouter) ResponseReceived(ctx context.Context, request *scheduli func (t *SLOAwareRouter) ResponseStreaming(ctx context.Context, request *schedulingtypes.LLMRequest, response *requestcontrol.Response, pod *backend.Pod) { logger := log.FromContext(ctx) +<<<<<<< HEAD if !t.checkPredictor(logger, pod) || response.EndOfStream { +======= + if !t.CheckPredictor(logger, pod) || response.EndOfStream { +>>>>>>> b2a7d45 (Fix streamed request being called one final time after request complete, add predictor check to the beginning of each requestcontrol hook) return } From 0ae94dfbf9dcdf8b7f42bc74e849239c7b8f020a Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Fri, 14 Nov 2025 01:35:39 +0000 Subject: [PATCH 05/11] add guide, update helm charts and readme, minor scorer changes --- config/charts/inferencepool/README.md | 40 ++++++++ .../inferencepool/templates/epp-config.yaml | 41 +++++++- .../templates/epp-deployment.yaml | 98 +++++++++++++++++++ .../inferencepool/templates/epp-service.yaml | 12 +++ site-src/guides/index.md | 6 ++ site-src/guides/slo-aware-routing.md | 85 ++++++++++++++++ 6 files changed, 278 insertions(+), 4 deletions(-) create mode 100644 site-src/guides/slo-aware-routing.md diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md index cf48fcaa6a..02f09330ca 100644 --- a/config/charts/inferencepool/README.md +++ b/config/charts/inferencepool/README.md @@ -121,6 +121,46 @@ $ helm install triton-llama3-8b-instruct \ oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0 ``` +### Install with SLO-Aware Routing + +To enable SLO-aware routing, you must enable the latency predictor, which is deployed as a set of sidecar containers alongside the Endpoint Picker. When the latency predictor is enabled, the `slo-aware-routing` and `slo-aware-profile-handler` plugins are automatically configured. + +You can enable the latency predictor by setting `inferenceExtension.latencyPredictor.enabled` to `true` in your `values.yaml` file, or by using the `--set` flag on the command line. 
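+
+For the `values.yaml` route, a minimal sketch might look like the following (showing only the enable flag; the key path matches what the chart templates consume):
+
+```yaml
+inferenceExtension:
+  latencyPredictor:
+    enabled: true
+```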
+ +Here is an example of how to install the chart with SLO-aware routing enabled: + +```txt +$ helm install vllm-llama3-8b-instruct . \ + --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ + --set inferenceExtension.latencyPredictor.enabled=true \ + --set provider.name=gke +``` + +#### SLO-Aware Router Environment Variables + +The behavior of the SLO-aware router can be fine-tuned using the following environment variables in the Endpoint Picker deployment. These can be set under `inferenceExtension.env` in your `values.yaml` file. + +| Environment Variable | Description | Default | +| -------------------------------- | ------------------------------------------------------------------------------------------------------- | ----------- | +| `SAMPLING_MEAN` | The sampling mean (lambda) for the Poisson distribution of token sampling. | `100.0` | +| `MAX_SAMPLED_TOKENS` | The maximum number of tokens to sample for TPOT prediction. | `20` | +| `SLO_BUFFER_FACTOR` | A buffer to apply to the SLO to make it more or less strict. | `1.0` | +| `NEG_HEADROOM_TTFT_WEIGHT` | The weight to give to the TTFT when a pod has negative headroom. | `0.8` | +| `NEG_HEADROOM_TPOT_WEIGHT` | The weight to give to the TPOT when a pod has negative headroom. | `0.2` | +| `HEADROOM_TTFT_WEIGHT` | The weight to give to the TTFT when a pod has positive headroom. | `0.8` | +| `HEADROOM_TPOT_WEIGHT` | The weight to give to the TPOT when a pod has positive headroom. | `0.2` | +| `HEADROOM_SELECTION_STRATEGY` | The strategy to use for selecting a pod based on headroom. Options: `least`, `most`, `composite-least`, `composite-most`, `composite-only`. | `least` | +| `COMPOSITE_KV_WEIGHT` | The weight to give to the KV cache utilization in the composite score. | `1` | +| `COMPOSITE_QUEUE_WEIGHT` | The weight to give to the queue size in the composite score. | `1` | +| `COMPOSITE_PREFIX_WEIGHT` | The weight to give to the prefix cache score in the composite score. | `1` | +| `STICKY_EPSILON` | The probability of exploring a non-sticky pod. | `0.01` | +| `NEG_HEADROOM_EPSILON` | The probability of exploring a pod with negative headroom. | `0.01` | +| `AFFINITY_GATE_TAU` | The stickiness threshold for the affinity gate. | `0.80` | +| `AFFINITY_GATE_TAU_GLOBAL` | The global stickiness threshold for the affinity gate. | `0.99` | +| `POD_SELECTION_MODE` | The mode for selecting a pod from the weighted list. Options: `linear` (weighted random), `max` (argmax). | `linear` | + +**Note:** Enabling SLO-aware routing also exposes a number of Prometheus metrics for monitoring the feature, including actual vs. predicted latency, SLO violations, and more. + ### Install with High Availability (HA) To deploy the EndpointPicker in a high-availability (HA) active-passive configuration set replicas to be greater than one. In such a setup, only one "leader" replica will be active and ready to process traffic at any given time. If the leader pod fails, another pod will be elected as the new leader, ensuring service continuity. 
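+
+For example, a minimal sketch of an HA install (release name, label selector, and provider reused from the SLO-aware example above; the chart template adds `--ha-enable-leader-election` automatically whenever `inferenceExtension.replicas` is greater than 1):
+
+```txt
+$ helm install vllm-llama3-8b-instruct . \
+  --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
+  --set inferenceExtension.replicas=3 \
+  --set provider.name=gke
+```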
diff --git a/config/charts/inferencepool/templates/epp-config.yaml b/config/charts/inferencepool/templates/epp-config.yaml index 31b9e532ad..057b217e48 100644 --- a/config/charts/inferencepool/templates/epp-config.yaml +++ b/config/charts/inferencepool/templates/epp-config.yaml @@ -11,7 +11,28 @@ data: - type: queue-scorer - type: kv-cache-utilization-scorer - type: prefix-cache-scorer + {{- if .Values.inferenceExtension.latencyPredictor.enabled }} + - type: slo-aware-routing + - type: slo-aware-profile-handler + - type: max-score-picker + {{- end }} schedulingProfiles: + {{- if .Values.inferenceExtension.latencyPredictor.enabled }} + - name: prefix + plugins: + - pluginRef: prefix-cache-scorer + - name: default + plugins: + - pluginRef: slo-aware-routing + weight: 0 + - pluginRef: queue-scorer + - pluginRef: kv-cache-utilization-scorer + - pluginRef: max-score-picker + - name: slo + plugins: + - pluginRef: slo-aware-routing + - pluginRef: max-score-picker + {{- else }} - name: default plugins: - pluginRef: queue-scorer @@ -20,17 +41,29 @@ data: weight: 2 - pluginRef: prefix-cache-scorer weight: 3 + {{- end }} {{- if (hasKey .Values.inferenceExtension "pluginsCustomConfig") }} {{- .Values.inferenceExtension.pluginsCustomConfig | toYaml | nindent 2 }} {{- end }} - --- -{{- if .Values.inferenceExtension.sidecar.enabled }} +{{- if .Values.inferenceExtension.latencyPredictor.enabled }} apiVersion: v1 kind: ConfigMap metadata: - name: {{ .Values.inferenceExtension.sidecar.configMap.name }} + name: {{ include "gateway-api-inference-extension.name" . }}-latency-predictor-training namespace: {{ .Release.Namespace }} data: - {{- .Values.inferenceExtension.sidecar.configMap.data | toYaml | nindent 2 }} + {{- range $key, $value := .Values.inferenceExtension.latencyPredictor.trainingServer.config }} + {{ $key }}: {{ $value | quote }} + {{- end }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "gateway-api-inference-extension.name" . }}-latency-predictor-prediction + namespace: {{ .Release.Namespace }} +data: + {{- range $key, $value := .Values.inferenceExtension.latencyPredictor.predictionServers.config }} + {{ $key }}: {{ $value | quote }} + {{- end }} {{- end }} diff --git a/config/charts/inferencepool/templates/epp-deployment.yaml b/config/charts/inferencepool/templates/epp-deployment.yaml index be6a39ead9..9edd64dd17 100644 --- a/config/charts/inferencepool/templates/epp-deployment.yaml +++ b/config/charts/inferencepool/templates/epp-deployment.yaml @@ -96,6 +96,9 @@ spec: {{- if gt (.Values.inferenceExtension.replicas | int) 1 }} - --ha-enable-leader-election {{- end }} + {{- if .Values.inferenceExtension.latencyPredictor.enabled }} + - --enable-latency-predictor + {{- end }} # Pass additional flags via the inferenceExtension.flags field in values.yaml. 
{{- range $key, $value := .Values.inferenceExtension.flags }} - --{{ $key }} @@ -151,6 +154,20 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + {{- if .Values.inferenceExtension.latencyPredictor.enabled }} + - name: PREDICTION_SERVER_URL + value: "{{- $count := int .Values.inferenceExtension.latencyPredictor.predictionServers.count -}} + {{- $startPort := int .Values.inferenceExtension.latencyPredictor.predictionServers.startPort -}} + {{- range $i := until $count -}} + {{- if $i }},{{ end }}http://localhost:{{ add $startPort $i }} + {{- end }}" + - name: TRAINING_SERVER_URL + value: "http://localhost:{{ .Values.inferenceExtension.latencyPredictor.trainingServer.port }}" + {{- range $key, $value := .Values.inferenceExtension.latencyPredictor.eppEnv }} + - name: {{ $key }} + value: {{ $value | quote }} + {{- end }} + {{- end }} {{- if .Values.inferenceExtension.tracing.enabled }} - name: OTEL_SERVICE_NAME value: "gateway-api-inference-extension" @@ -181,6 +198,77 @@ spec: volumeMounts: - name: plugins-config-volume mountPath: "/config" + {{- if .Values.inferenceExtension.latencyPredictor.enabled }} + # Training Server Sidecar Container + - name: training-server + image: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.hub }}/{{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.name }}:{{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.tag }} + imagePullPolicy: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.pullPolicy }} + ports: + - containerPort: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.port }} + name: training-port + livenessProbe: + {{- toYaml .Values.inferenceExtension.latencyPredictor.trainingServer.livenessProbe | nindent 10 }} + readinessProbe: + {{- toYaml .Values.inferenceExtension.latencyPredictor.trainingServer.readinessProbe | nindent 10 }} + resources: + {{- toYaml .Values.inferenceExtension.latencyPredictor.trainingServer.resources | nindent 10 }} + envFrom: + - configMapRef: + name: {{ include "gateway-api-inference-extension.name" . 
}}-latency-predictor-training + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SERVER_TYPE + value: "training" + volumeMounts: + - name: training-server-storage + mountPath: /models + {{- range $i := until (int .Values.inferenceExtension.latencyPredictor.predictionServers.count) }} + # Prediction Server Sidecar Container {{ add $i 1 }} + - name: prediction-server-{{ add $i 1 }} + image: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.image.hub }}/{{ $.Values.inferenceExtension.latencyPredictor.predictionServers.image.name }}:{{ $.Values.inferenceExtension.latencyPredictor.predictionServers.image.tag }} + imagePullPolicy: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.image.pullPolicy }} + command: ["uvicorn"] + args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "{{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }}"] + ports: + - containerPort: {{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }} + name: predict-port-{{ add $i 1 }} + livenessProbe: + httpGet: + path: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.livenessProbe.httpGet.path }} + port: {{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }} + initialDelaySeconds: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.livenessProbe.initialDelaySeconds }} + periodSeconds: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.livenessProbe.periodSeconds }} + readinessProbe: + httpGet: + path: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.readinessProbe.httpGet.path }} + port: {{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }} + initialDelaySeconds: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.readinessProbe.initialDelaySeconds }} + periodSeconds: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.readinessProbe.periodSeconds }} + failureThreshold: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.readinessProbe.failureThreshold }} + resources: + {{- toYaml $.Values.inferenceExtension.latencyPredictor.predictionServers.resources | nindent 10 }} + envFrom: + - configMapRef: + name: {{ include "gateway-api-inference-extension.name" $ }}-latency-predictor-prediction + env: + - name: PREDICT_PORT + value: "{{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }}" + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SERVER_TYPE + value: "prediction-{{ add $i 1 }}" + - name: TRAINING_SERVER_URL + value: "http://localhost:{{ $.Values.inferenceExtension.latencyPredictor.trainingServer.port }}" + volumeMounts: + - name: prediction-server-{{ add $i 1 }}-storage + mountPath: /server_models + {{- end }} + {{- end }} volumes: {{- if .Values.inferenceExtension.sidecar.volumes }} {{- tpl (toYaml .Values.inferenceExtension.sidecar.volumes) $ | nindent 6 }} @@ -188,6 +276,16 @@ spec: - name: plugins-config-volume configMap: name: {{ include "gateway-api-inference-extension.name" . 
}} + {{- if .Values.inferenceExtension.latencyPredictor.enabled }} + - name: training-server-storage + emptyDir: + sizeLimit: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.volumeSize }} + {{- range $i := until (int .Values.inferenceExtension.latencyPredictor.predictionServers.count) }} + - name: prediction-server-{{ add $i 1 }}-storage + emptyDir: + sizeLimit: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.volumeSize }} + {{- end }} + {{- end }} {{- if .Values.inferenceExtension.affinity }} affinity: {{- toYaml .Values.inferenceExtension.affinity | nindent 8 }} diff --git a/config/charts/inferencepool/templates/epp-service.yaml b/config/charts/inferencepool/templates/epp-service.yaml index b1a48df911..fe76d71c92 100644 --- a/config/charts/inferencepool/templates/epp-service.yaml +++ b/config/charts/inferencepool/templates/epp-service.yaml @@ -15,6 +15,18 @@ spec: - name: http-metrics protocol: TCP port: {{ .Values.inferenceExtension.metricsPort | default 9090 }} + {{- if .Values.inferenceExtension.latencyPredictor.enabled }} + - name: latency-predictor-training + protocol: TCP + port: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.port }} + targetPort: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.port }} + {{- range $i := until (int .Values.inferenceExtension.latencyPredictor.predictionServers.count) }} + - name: latency-predictor-{{ add $i 1 }} + protocol: TCP + port: {{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }} + targetPort: {{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }} + {{- end }} + {{- end }} {{- with .Values.inferenceExtension.extraServicePorts }} {{- toYaml . | nindent 4 }} {{- end }} diff --git a/site-src/guides/index.md b/site-src/guides/index.md index b655e56cd0..11d3eb5016 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -274,6 +274,12 @@ Deploy the sample InferenceObjective which allows you to specify priority of req --8<-- "site-src/_includes/bbr.md" +### Next Steps: Advanced Features + +You have now deployed a basic Inference Gateway with a simple routing strategy. To explore more advanced features, such as SLO-aware routing, please refer to the following guide: + +* [SLO-Aware Routing](./slo-aware-routing.md) + ### Cleanup The following instructions assume you would like to cleanup ALL resources that were created in this quickstart guide. diff --git a/site-src/guides/slo-aware-routing.md b/site-src/guides/slo-aware-routing.md new file mode 100644 index 0000000000..e2af53e5a2 --- /dev/null +++ b/site-src/guides/slo-aware-routing.md @@ -0,0 +1,85 @@ +# SLO-Aware Routing + +> For deployment instructions, jump to [Deploying with SLO-Aware Routing](#deploying-with-slo-aware-routing). + +SLO-aware routing is a feature of the Inference Gateway that enables intelligent routing of inference requests based on Service Level Objectives (SLOs). It uses a latency predictor to estimate the Time to First Token (TTFT) and Time Per Output Token (TPOT) for each request on each available model server. This allows the gateway to select the optimal server that can meet the request's SLOs, while also considering the overall health and utilization of the model servers. + +## How it Works + +The SLO-aware routing feature is implemented as a plugin for the Endpoint Picker (EPP). When a request is received, the plugin performs the following steps: + +1. 
**SLO Extraction**: The plugin extracts the TTFT and TPOT SLOs from the request headers (`x-slo-ttft-ms` and `x-slo-tpot-ms`). It also checks for the `x-prediction-based-scheduling` header to determine whether SLO-aware routing should be used for this request.
+
+2. **Latency Prediction**: The plugin uses a latency predictor, deployed as a set of sidecar containers alongside the EPP, to predict the TTFT and TPOT for the request on each of the available model servers. The prediction is based on the current state of each server, including its KV cache utilization and its number of running and waiting requests.
+
+3. **Headroom Calculation**: For each model server, the plugin calculates the "headroom", which is the difference between the predicted latency and the SLO. A positive headroom means the server is expected to meet the SLO, while a negative headroom means it is not.
+
+4. **Pod Selection**: The plugin selects a model server based on the calculated headrooms and a configurable selection strategy. The goal is to pick a server that can meet the SLOs without being overloaded.
+
+5. **Fallback**: If the latency predictor is not available or fails to make a prediction, the plugin falls back to a "composite scoring" mechanism, which combines metrics such as prefix cache scores and queue sizes to make a routing decision.
+
+## Request Headers
+
+To use SLO-aware routing, you need to include the following headers in your inference requests:
+
+- `x-prediction-based-scheduling`: Set to `true` to enable SLO-aware routing for the request.
+- `x-slo-ttft-ms`: The Time to First Token SLO in milliseconds.
+- `x-slo-tpot-ms`: The Time Per Output Token SLO in milliseconds.
+
+## Headroom Selection Strategies
+
+The SLO-aware routing plugin provides several strategies for selecting a model server based on the calculated headrooms:
+
+- `least`: (Default) Prefers the pod with the least positive headroom. This strategy is good for packing pods tightly and maximizing utilization.
+- `most`: Prefers the pod with the most positive headroom. This strategy is more conservative and leaves more room for unexpected latency spikes.
+- `composite-least`: Considers a composite score of various metrics and prefers the pod with the lowest score.
+- `composite-most`: Considers a composite score of various metrics and prefers the pod with the highest score.
+- `composite-only`: Uses only the composite score and ignores latency predictions.
+
+The selection strategy can be configured via the `HEADROOM_SELECTION_STRATEGY` environment variable in the Endpoint Picker deployment.
+
+## Deploying with SLO-Aware Routing
+
+### Prerequisites
+
+Before you begin, ensure you have a functional Inference Gateway with at least one model server deployed. If you haven't set this up yet, please follow the [Getting started guide](index.md).
+
+### Deployment
+
+To use SLO-aware routing, you must deploy the Endpoint Picker with the latency predictor sidecars. This can be done via the Helm chart by setting the `inferenceExtension.latencyPredictor.enabled` flag to `true`. When this flag is set, the necessary `slo-aware-routing` and `slo-aware-profile-handler` plugins are automatically configured.
+
+For specific deployment instructions and details on configuring environment variables for SLO-aware routing, refer to the [InferencePool Helm Chart README](../../config/charts/inferencepool/README.md). 
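+
+As a quick check, a request that opts into SLO-aware routing might look like the following sketch (the gateway address, model name, and SLO values are illustrative placeholders for your environment):
+
+```bash
+curl http://<gateway-address>/v1/completions \
+  -H "Content-Type: application/json" \
+  -H "x-prediction-based-scheduling: true" \
+  -H "x-slo-ttft-ms: 500" \
+  -H "x-slo-tpot-ms: 50" \
+  -d '{"model": "<target-model>", "prompt": "Hello", "max_tokens": 100}'
+```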
+ +## Monitoring + +When SLO-aware routing is enabled, a number of Prometheus metrics are exposed to allow for monitoring and observability of the feature. These metrics provide insight into the performance of the latency predictor and the effectiveness of the SLO-based routing. + +Key categories of metrics include: + +- **Actual vs. Predicted Latency**: Metrics for both actual and predicted Time to First Token (TTFT) and Time Per Output Token (TPOT) are available. This allows you to compare the accuracy of the latency predictor. +- **Prediction Duration**: The time it takes for the latency predictor to generate a prediction is also measured. +- **SLO Violations**: Counters and gauges are available to track when SLOs are violated. This can be used to alert on SLO breaches. +- **SLO Thresholds**: The current SLO thresholds for TTFT and TPOT are also exposed as metrics. + +The following is a comprehensive list of the Prometheus metrics exposed: + +| Metric Name | Description | +| :--------------------------------------------------------- | :--------------------------------------------------------------------------------------------------------------- | +| `inference_objective_request_ttft_seconds` | Inference model TTFT distribution in seconds for each model and target model. | +| `inference_objective_request_ttft_seconds_gauge` | Inference model TTFT gauge in seconds for each model and target model. | +| `inference_objective_request_predicted_ttft_seconds` | Inference model Predicted TTFT distribution in seconds for each model and target model. | +| `inference_objective_request_predicted_ttft_seconds_gauge` | Inference model Predicted TTFT gauge in seconds for each model and target model. | +| `inference_objective_request_ttft_prediction_duration_seconds` | Duration taken to generate TTFT predictions in seconds for each model and target model. | +| `inference_objective_request_ttft_prediction_duration_seconds_gauge` | Latest duration taken to generate TTFT predictions in seconds for each model and target model. | +| `inference_objective_request_tpot_seconds` | Inference model TPOT distribution in seconds for each model and target model. | +| `inference_objective_request_tpot_seconds_gauge` | Inference model TPOT gauge in seconds for each model and target model. | +| `inference_objective_request_predicted_tpot_seconds` | Inference model Predicted TPOT distribution in seconds for each model and target model. | +| `inference_objective_request_predicted_tpot_seconds_gauge` | Inference model Predicted TPOT gauge in seconds for each model and target model. | +| `inference_objective_request_tpot_prediction_duration_seconds` | Duration taken to generate TPOT predictions in seconds for each model and target model. | +| `inference_objective_request_tpot_prediction_duration_seconds_gauge` | Latest duration taken to generate TPOT predictions in seconds for each model and target model. | +| `inference_objective_request_ttft_slo_violation` | Boolean indicator (0 or 1) of whether the last TTFT measurement violated the SLO threshold for each model and target model. | +| `inference_objective_request_ttft_slo_violation_total` | Counter of TTFT SLO violations for each model and target model. | +| `inference_objective_request_tpot_slo_violation` | Boolean indicator (0 or 1) of whether the last TPOT measurement violated the SLO threshold for each model and target model. | +| `inference_objective_request_tpot_slo_violation_total` | Counter of TPOT SLO violations for each model and target model. 
| +| `inference_objective_request_ttft_slo_threshold_seconds` | Current TTFT SLO threshold in seconds for each model and target model. | +| `inference_objective_request_tpot_slo_threshold_seconds` | Current TPOT SLO threshold in seconds for each model and target model. | From 2e220d7e8c4bfdde6d89465a2f67cad2c321e5ac Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Fri, 14 Nov 2025 01:48:35 +0000 Subject: [PATCH 06/11] Make small guide update --- site-src/guides/slo-aware-routing.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site-src/guides/slo-aware-routing.md b/site-src/guides/slo-aware-routing.md index e2af53e5a2..c93d6cc30f 100644 --- a/site-src/guides/slo-aware-routing.md +++ b/site-src/guides/slo-aware-routing.md @@ -48,7 +48,7 @@ Before you begin, ensure you have a functional Inference Gateway with at least o To use SLO-aware routing, you must deploy the Endpoint Picker with the latency predictor sidecars. This can be done via the Helm chart by setting the `inferenceExtension.latencyPredictor.enabled` flag to `true`. When this flag is set, the necessary `slo-aware-routing` and `slo-aware-profile-handler` plugins are automatically configured. -For specific deployment instructions and details on configuring environment variables for SLO-aware routing, refer to the [InferencePool Helm Chart README](../../config/charts/inferencepool/README.md). +For specific deployment instructions and details on configuring environment variables for SLO-aware routing, refer to the [InferencePool Helm Chart README](../../config/charts/inferencepool/README.md#slo-aware-router-environment-variables). ## Monitoring From 729c53b9b8fe07a453e9544082ea59a920025e8a Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Thu, 20 Nov 2025 21:40:05 +0000 Subject: [PATCH 07/11] Add helm values and polish README and SLO routing guide --- config/charts/inferencepool/README.md | 12 +-- config/charts/inferencepool/values.yaml | 115 ++++++++++++++++++------ site-src/guides/slo-aware-routing.md | 4 +- 3 files changed, 93 insertions(+), 38 deletions(-) diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md index 02f09330ca..1edfca793f 100644 --- a/config/charts/inferencepool/README.md +++ b/config/charts/inferencepool/README.md @@ -132,8 +132,10 @@ Here is an example of how to install the chart with SLO-aware routing enabled: ```txt $ helm install vllm-llama3-8b-instruct . \ --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ + --set inferenceExtension.monitoring.gke.enabled=true \ --set inferenceExtension.latencyPredictor.enabled=true \ - --set provider.name=gke + --set provider.name=gke \ + -f values.yaml ``` #### SLO-Aware Router Environment Variables @@ -150,14 +152,6 @@ The behavior of the SLO-aware router can be fine-tuned using the following envir | `HEADROOM_TTFT_WEIGHT` | The weight to give to the TTFT when a pod has positive headroom. | `0.8` | | `HEADROOM_TPOT_WEIGHT` | The weight to give to the TPOT when a pod has positive headroom. | `0.2` | | `HEADROOM_SELECTION_STRATEGY` | The strategy to use for selecting a pod based on headroom. Options: `least`, `most`, `composite-least`, `composite-most`, `composite-only`. | `least` | -| `COMPOSITE_KV_WEIGHT` | The weight to give to the KV cache utilization in the composite score. | `1` | -| `COMPOSITE_QUEUE_WEIGHT` | The weight to give to the queue size in the composite score. | `1` | -| `COMPOSITE_PREFIX_WEIGHT` | The weight to give to the prefix cache score in the composite score. 
| `1` | -| `STICKY_EPSILON` | The probability of exploring a non-sticky pod. | `0.01` | -| `NEG_HEADROOM_EPSILON` | The probability of exploring a pod with negative headroom. | `0.01` | -| `AFFINITY_GATE_TAU` | The stickiness threshold for the affinity gate. | `0.80` | -| `AFFINITY_GATE_TAU_GLOBAL` | The global stickiness threshold for the affinity gate. | `0.99` | -| `POD_SELECTION_MODE` | The mode for selecting a pod from the weighted list. Options: `linear` (weighted random), `max` (argmax). | `linear` | **Note:** Enabling SLO-aware routing also exposes a number of Prometheus metrics for monitoring the feature, including actual vs. predicted latency, SLO violations, and more. diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml index bed1f46e38..35d929dd16 100644 --- a/config/charts/inferencepool/values.yaml +++ b/config/charts/inferencepool/values.yaml @@ -1,9 +1,9 @@ inferenceExtension: replicas: 1 image: - name: epp - hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension - tag: main + name: epp-wlp-latencypredictor-helm-v2 + hub: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo + tag: latest pullPolicy: Always extProcPort: 9002 env: [] @@ -12,11 +12,6 @@ inferenceExtension: extraContainerPorts: [] # Define additional service ports extraServicePorts: [] -# extraServicePorts: -# - name: http -# port: 8081 -# protocol: TCP -# targetPort: 8081 # This is the plugins configuration file. # pluginsCustomConfig: @@ -43,10 +38,6 @@ inferenceExtension: affinity: {} tolerations: [] - - # Sidecar configuration for EPP - sidecar: - enabled: false # Monitoring configuration for EPP monitoring: @@ -71,6 +62,89 @@ inferenceExtension: sampler: "parentbased_traceidratio" samplerArg: "0.1" + # Latency Predictor Configuration + latencyPredictor: + enabled: false + + # Training Server Configuration + trainingServer: + image: + hub: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo + name: latencypredictor-v3-training-server + tag: latest + pullPolicy: Always + port: 8000 + resources: + requests: + cpu: "2000m" + memory: "4Gi" + limits: + cpu: "4000m" + memory: "8Gi" + livenessProbe: + httpGet: + path: /healthz + port: 8000 + initialDelaySeconds: 30 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /readyz + port: 8000 + initialDelaySeconds: 45 + periodSeconds: 10 + volumeSize: "20Gi" + config: + LATENCY_RETRAINING_INTERVAL_SEC: "1" + LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100" + LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib" + LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib" + LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib" + LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib" + LATENCY_MODEL_TYPE: "xgboost" + LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000" + LATENCY_QUANTILE_ALPHA: "0.9" + + # Prediction Server Configuration + predictionServers: + count: 10 + startPort: 8001 + image: + hub: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo + name: latencypredictor-v3-prediction-server + tag: latest + pullPolicy: Always + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" + livenessProbe: + httpGet: + path: /healthz + initialDelaySeconds: 15 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /readyz + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 10 + volumeSize: "10Gi" + config: + LATENCY_MODEL_TYPE: "xgboost" + PREDICT_HOST: "0.0.0.0" + LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib" + LOCAL_TPOT_MODEL_PATH: 
"/server_models/tpot.joblib" + LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib" + LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib" + + # EPP Environment Variables for Latency Predictor + eppEnv: + LATENCY_MAX_SAMPLE_SIZE: "10000" + inferencePool: targetPorts: - number: 8000 @@ -94,25 +168,12 @@ provider: # Set to true if the cluster is an Autopilot cluster. autopilot: false - # Istio-specific configuration. - # This block is only used if name is "istio". - istio: - destinationRule: - # Provide a way to override the default calculated host - host: "" - # Optional: Enables customization of the traffic policy - trafficPolicy: {} - # connectionPool: - # http: - # maxRequestsPerConnection: 256000 - -# DEPRECATED and will be removed in v1.3. Instead, use `provider.istio.*`. istio: destinationRule: # Provide a way to override the default calculated host - host: "" + host: "" # Optional: Enables customization of the traffic policy trafficPolicy: {} # connectionPool: # http: - # maxRequestsPerConnection: 256000 + # maxRequestsPerConnection: 256000 \ No newline at end of file diff --git a/site-src/guides/slo-aware-routing.md b/site-src/guides/slo-aware-routing.md index c93d6cc30f..818bc0d22c 100644 --- a/site-src/guides/slo-aware-routing.md +++ b/site-src/guides/slo-aware-routing.md @@ -61,6 +61,8 @@ Key categories of metrics include: - **SLO Violations**: Counters and gauges are available to track when SLOs are violated. This can be used to alert on SLO breaches. - **SLO Thresholds**: The current SLO thresholds for TTFT and TPOT are also exposed as metrics. +NOTE: TPOT is equivalen to vLLM's **ITL** (Inter Token Latency), as vLLM defines TPOT as the average time per output token *including the TTFT*. This is commonly known as NTPOT in other contexts, and we don't capture that metric here. + The following is a comprehensive list of the Prometheus metrics exposed: | Metric Name | Description | @@ -81,5 +83,3 @@ The following is a comprehensive list of the Prometheus metrics exposed: | `inference_objective_request_ttft_slo_violation_total` | Counter of TTFT SLO violations for each model and target model. | | `inference_objective_request_tpot_slo_violation` | Boolean indicator (0 or 1) of whether the last TPOT measurement violated the SLO threshold for each model and target model. | | `inference_objective_request_tpot_slo_violation_total` | Counter of TPOT SLO violations for each model and target model. | -| `inference_objective_request_ttft_slo_threshold_seconds` | Current TTFT SLO threshold in seconds for each model and target model. | -| `inference_objective_request_tpot_slo_threshold_seconds` | Current TPOT SLO threshold in seconds for each model and target model. 
| From 7b59026274f00ad533401f696800628c164523d4 Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Thu, 20 Nov 2025 22:15:52 +0000 Subject: [PATCH 08/11] Clean up errors from rebase, add running request metric to datasource, add predictor to new 2 phase configuration parser --- cmd/epp/runner/runner.go | 61 ++++++++++--------- pkg/epp/datalayer/metrics/datasource_test.go | 2 +- pkg/epp/datalayer/metrics/extractor_test.go | 5 +- .../slo_aware_router/requestcontrol_hooks.go | 16 ----- 4 files changed, 37 insertions(+), 47 deletions(-) diff --git a/cmd/epp/runner/runner.go b/cmd/epp/runner/runner.go index 7127e2c946..c1b6b34dac 100644 --- a/cmd/epp/runner/runner.go +++ b/cmd/epp/runner/runner.go @@ -234,7 +234,20 @@ func (r *Runner) Run(ctx context.Context) error { return err } - rawConfig, err := r.parseConfigurationPhaseOne(ctx) + // =================================================================== + // == Latency Predictor Integration + // =================================================================== + var predictor latencypredictor.PredictorInterface // Use the interface type + if *enableLatencyPredictor { + setupLog.Info("Latency predictor is enabled. Initializing...") + predictor = latencypredictor.New(latencypredictor.ConfigFromEnv(), ctrl.Log.WithName("latency-predictor")) + } else { + setupLog.Info("Latency predictor is disabled.") + predictor = nil // This will be a true nil interface + } + // =================================================================== + + rawConfig, err := r.parseConfigurationPhaseOne(ctx, predictor) if err != nil { setupLog.Error(err, "Failed to parse configuration") return err @@ -315,32 +328,6 @@ func (r *Runner) Run(ctx context.Context) error { runtime.SetBlockProfileRate(1) } - // =================================================================== - // == Latency Predictor Integration - // =================================================================== - var predictor latencypredictor.PredictorInterface // Use the interface type - if *enableLatencyPredictor { - setupLog.Info("Latency predictor is enabled. Initializing...") - predictor = latencypredictor.New(latencypredictor.ConfigFromEnv(), ctrl.Log.WithName("latency-predictor")) - - // For the runnable, you'll need to type assert back to the concrete type - concretePredictor := predictor.(*latencypredictor.Predictor) - if err := mgr.Add(runnable.NoLeaderElection(&predictorRunnable{predictor: concretePredictor})); err != nil { - setupLog.Error(err, "Failed to register latency predictor runnable") - return err - } - } else { - setupLog.Info("Latency predictor is disabled.") - predictor = nil // This will be a true nil interface - } - // =================================================================== - - err = r.parsePluginsConfiguration(ctx, predictor, datastore) - if err != nil { - setupLog.Error(err, "Failed to parse the configuration") - return err - } - // --- Initialize Core EPP Components --- if r.schedulerConfig == nil { err := errors.New("scheduler config must be set either by config api or through code") @@ -417,6 +404,12 @@ func (r *Runner) Run(ctx context.Context) error { return err } + if *enableLatencyPredictor && predictor != nil { + if err := registerLatencyPredictorServer(mgr, predictor); err != nil { + return err + } + } + // --- Start Manager --- // This blocks until a signal is received. 
setupLog.Info("Controller manager starting") @@ -473,7 +466,7 @@ func (r *Runner) registerLatencyPredictorPlugins(predictor latencypredictor.Pred plugins.Register(profile.SLOAwareProfileHandlerType, profile.SLOAwareProfileHandlerFactory) } -func (r *Runner) parseConfigurationPhaseOne(ctx context.Context) (*configapi.EndpointPickerConfig, error) { +func (r *Runner) parseConfigurationPhaseOne(ctx context.Context, predictor latencypredictor.PredictorInterface) (*configapi.EndpointPickerConfig, error) { if *configText == "" && *configFile == "" { return nil, nil // configuring through code, not through file } @@ -702,6 +695,18 @@ func registerHealthServer(mgr manager.Manager, logger logr.Logger, ds datastore. return nil } +// registerLatencyPredictorServer adds the Latency Predictor server as a Runnable to the given manager. +func registerLatencyPredictorServer(mgr manager.Manager, predictor latencypredictor.PredictorInterface) error { + // For the runnable, you'll need to type assert back to the concrete type + concretePredictor := predictor.(*latencypredictor.Predictor) + if err := mgr.Add(runnable.NoLeaderElection(&predictorRunnable{predictor: concretePredictor})); err != nil { + setupLog.Error(err, "Failed to register latency predictor runnable") + return err + } + setupLog.Info("Latency predictor runnable added to manager.") + return nil +} + func validateFlags() error { if (*poolName != "" && *endpointSelector != "") || (*poolName == "" && *endpointSelector == "") { return errors.New("either pool-name or endpoint-selector must be set") diff --git a/pkg/epp/datalayer/metrics/datasource_test.go b/pkg/epp/datalayer/metrics/datasource_test.go index 7c293753f5..016622a40d 100644 --- a/pkg/epp/datalayer/metrics/datasource_test.go +++ b/pkg/epp/datalayer/metrics/datasource_test.go @@ -29,7 +29,7 @@ import ( func TestDatasource(t *testing.T) { source := NewDataSource("https", "/metrics", true, nil) - extractor, err := NewExtractor(defaultTotalQueuedRequestsMetric, "", "", "") + extractor, err := NewExtractor(defaultTotalQueuedRequestsMetric, "", "", "", "") assert.Nil(t, err, "failed to create extractor") name := source.Name() diff --git a/pkg/epp/datalayer/metrics/extractor_test.go b/pkg/epp/datalayer/metrics/extractor_test.go index d8aaca5566..5ca7fe2214 100644 --- a/pkg/epp/datalayer/metrics/extractor_test.go +++ b/pkg/epp/datalayer/metrics/extractor_test.go @@ -31,6 +31,7 @@ import ( const ( // use hardcoded values - importing causes cycle defaultTotalQueuedRequestsMetric = "vllm:num_requests_waiting" + defaultTotalRunningRequestsMetric = "vllm:num_requests_running" defaultKvCacheUsagePercentageMetric = "vllm:gpu_cache_usage_perc" defaultLoraInfoMetric = "vllm:lora_requests_info" defaultCacheInfoMetric = "vllm:cache_config_info" @@ -39,11 +40,11 @@ const ( func TestExtractorExtract(t *testing.T) { ctx := context.Background() - if _, err := NewExtractor("vllm: dummy", "", "", ""); err == nil { + if _, err := NewExtractor("vllm: dummy", "", "", "", ""); err == nil { t.Error("expected to fail to create extractor with invalid specification") } - extractor, err := NewExtractor(defaultTotalQueuedRequestsMetric, + extractor, err := NewExtractor(defaultTotalQueuedRequestsMetric, defaultTotalRunningRequestsMetric, defaultKvCacheUsagePercentageMetric, defaultLoraInfoMetric, defaultCacheInfoMetric) if err != nil { t.Fatalf("failed to create extractor: %v", err) diff --git a/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/requestcontrol_hooks.go 
b/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/requestcontrol_hooks.go index c1e762815d..d505b11e38 100644 --- a/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/requestcontrol_hooks.go +++ b/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/requestcontrol_hooks.go @@ -114,11 +114,7 @@ func (t *SLOAwareRouter) PreRequest(ctx context.Context, request *schedulingtype } targetPod := schedulingResult.ProfileResults[schedulingResult.PrimaryProfileName].TargetPods[0].GetPod() -<<<<<<< HEAD if !t.checkPredictor(logger, targetPod) { -======= - if !t.CheckPredictor(logger, targetPod) { ->>>>>>> b2a7d45 (Fix streamed request being called one final time after request complete, add predictor check to the beginning of each requestcontrol hook) return } @@ -161,11 +157,7 @@ func (t *SLOAwareRouter) PreRequest(ctx context.Context, request *schedulingtype func (t *SLOAwareRouter) ResponseReceived(ctx context.Context, request *schedulingtypes.LLMRequest, response *requestcontrol.Response, targetPod *backend.Pod) { logger := log.FromContext(ctx) -<<<<<<< HEAD if !t.checkPredictor(logger, targetPod) { -======= - if !t.CheckPredictor(logger, targetPod) { ->>>>>>> b2a7d45 (Fix streamed request being called one final time after request complete, add predictor check to the beginning of each requestcontrol hook) return } @@ -177,11 +169,7 @@ func (t *SLOAwareRouter) ResponseReceived(ctx context.Context, request *scheduli return } -<<<<<<< HEAD if err := processHeaderForLatencyPrediction(ctx, t.latencypredictor, sloCtx); err != nil { -======= - if err := ProcessHeaderForLatencyPrediction(ctx, t.latencypredictor, sloCtx); err != nil { ->>>>>>> b2a7d45 (Fix streamed request being called one final time after request complete, add predictor check to the beginning of each requestcontrol hook) logger.V(logutil.DEBUG).Error(err, "ProcessHeader in latencypredictor failed") } @@ -189,11 +177,7 @@ func (t *SLOAwareRouter) ResponseReceived(ctx context.Context, request *scheduli func (t *SLOAwareRouter) ResponseStreaming(ctx context.Context, request *schedulingtypes.LLMRequest, response *requestcontrol.Response, pod *backend.Pod) { logger := log.FromContext(ctx) -<<<<<<< HEAD if !t.checkPredictor(logger, pod) || response.EndOfStream { -======= - if !t.CheckPredictor(logger, pod) || response.EndOfStream { ->>>>>>> b2a7d45 (Fix streamed request being called one final time after request complete, add predictor check to the beginning of each requestcontrol hook) return } From 07f7ae153a6f458265303c86add591cd0ff5a11d Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Thu, 20 Nov 2025 22:47:14 +0000 Subject: [PATCH 09/11] Add running request metric to extractor test --- pkg/epp/datalayer/metrics/extractor_test.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pkg/epp/datalayer/metrics/extractor_test.go b/pkg/epp/datalayer/metrics/extractor_test.go index 5ca7fe2214..bb408f6db1 100644 --- a/pkg/epp/datalayer/metrics/extractor_test.go +++ b/pkg/epp/datalayer/metrics/extractor_test.go @@ -107,6 +107,14 @@ func TestExtractorExtract(t *testing.T) { }, }, }, + defaultTotalRunningRequestsMetric: &dto.MetricFamily{ + Type: dto.MetricType_GAUGE.Enum(), + Metric: []*dto.Metric{ + { + Gauge: &dto.Gauge{Value: ptr.To(1.0)}, + }, + }, + }, defaultKvCacheUsagePercentageMetric: &dto.MetricFamily{ Type: dto.MetricType_GAUGE.Enum(), Metric: []*dto.Metric{ From 678f608c8c2bfe2c01f316f1d548bb3cc49a058a Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Thu, 20 Nov 2025 23:22:13 +0000 Subject: [PATCH 
10/11] Fix epp image and add placeholder docker repos for latency sidecars --- cmd/epp/runner/runner.go | 6 +++- .../inferencepool/templates/epp-config.yaml | 12 ++++++- config/charts/inferencepool/values.yaml | 36 +++++++++++++++---- 3 files changed, 45 insertions(+), 9 deletions(-) diff --git a/cmd/epp/runner/runner.go b/cmd/epp/runner/runner.go index c1b6b34dac..211b87cf89 100644 --- a/cmd/epp/runner/runner.go +++ b/cmd/epp/runner/runner.go @@ -865,7 +865,11 @@ type predictorRunnable struct { func (p *predictorRunnable) Start(ctx context.Context) error { setupLog.Info("Starting latency predictor...") - p.predictor.Start(ctx) + if err := p.predictor.Start(ctx); err != nil { + setupLog.Error(err, "Failed to start latency predictor") + return err + } + setupLog.Info("Latency predictor started.") <-ctx.Done() setupLog.Info("Stopping latency predictor...") p.predictor.Stop() diff --git a/config/charts/inferencepool/templates/epp-config.yaml b/config/charts/inferencepool/templates/epp-config.yaml index 057b217e48..ca562d5823 100644 --- a/config/charts/inferencepool/templates/epp-config.yaml +++ b/config/charts/inferencepool/templates/epp-config.yaml @@ -46,6 +46,16 @@ data: {{- .Values.inferenceExtension.pluginsCustomConfig | toYaml | nindent 2 }} {{- end }} --- +{{- if .Values.inferenceExtension.sidecar.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Values.inferenceExtension.sidecar.configMap.name }} + namespace: {{ .Release.Namespace }} +data: + {{- .Values.inferenceExtension.sidecar.configMap.data | toYaml | nindent 2 }} +{{- end }} +--- {{- if .Values.inferenceExtension.latencyPredictor.enabled }} apiVersion: v1 kind: ConfigMap @@ -55,7 +65,7 @@ metadata: data: {{- range $key, $value := .Values.inferenceExtension.latencyPredictor.trainingServer.config }} {{ $key }}: {{ $value | quote }} - {{- end }} +{{- end }} --- apiVersion: v1 kind: ConfigMap diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml index 35d929dd16..1c4d8bbcfa 100644 --- a/config/charts/inferencepool/values.yaml +++ b/config/charts/inferencepool/values.yaml @@ -1,9 +1,9 @@ inferenceExtension: replicas: 1 image: - name: epp-wlp-latencypredictor-helm-v2 - hub: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo - tag: latest + name: epp + hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension + tag: main pullPolicy: Always extProcPort: 9002 env: [] @@ -12,6 +12,11 @@ inferenceExtension: extraContainerPorts: [] # Define additional service ports extraServicePorts: [] + # extraServicePorts: + # - name: http + # port: 8081 + # protocol: TCP + # targetPort: 8081 # This is the plugins configuration file. 
# pluginsCustomConfig: @@ -39,6 +44,10 @@ inferenceExtension: tolerations: [] + # Sidecar configuration for EPP + sidecar: + enabled: false + # Monitoring configuration for EPP monitoring: interval: "10s" @@ -69,7 +78,7 @@ inferenceExtension: # Training Server Configuration trainingServer: image: - hub: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo + hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars name: latencypredictor-v3-training-server tag: latest pullPolicy: Always @@ -110,7 +119,7 @@ inferenceExtension: count: 10 startPort: 8001 image: - hub: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo + hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars name: latencypredictor-v3-prediction-server tag: latest pullPolicy: Always @@ -168,12 +177,25 @@ provider: # Set to true if the cluster is an Autopilot cluster. autopilot: false + # Istio-specific configuration. + # This block is only used if name is "istio". + istio: + destinationRule: + # Provide a way to override the default calculated host + host: "" + # Optional: Enables customization of the traffic policy + trafficPolicy: {} + # connectionPool: + # http: + # maxRequestsPerConnection: 256000 + +# DEPRECATED and will be removed in v1.3. Instead, use `provider.istio.*`. istio: destinationRule: # Provide a way to override the default calculated host - host: "" + host: "" # Optional: Enables customization of the traffic policy trafficPolicy: {} # connectionPool: # http: - # maxRequestsPerConnection: 256000 \ No newline at end of file + # maxRequestsPerConnection: 256000 From ddee4c7651b80e51cba18efcc9575e4b5aa789bb Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Fri, 21 Nov 2025 00:54:42 +0000 Subject: [PATCH 11/11] Update guide, README, and values.yaml --- config/charts/inferencepool/README.md | 15 +------- config/charts/inferencepool/values.yaml | 4 +-- site-src/guides/slo-aware-routing.md | 46 ++++++++++++++++++++++--- 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md index 1edfca793f..ab69fa1a8a 100644 --- a/config/charts/inferencepool/README.md +++ b/config/charts/inferencepool/README.md @@ -123,20 +123,7 @@ $ helm install triton-llama3-8b-instruct \ ### Install with SLO-Aware Routing -To enable SLO-aware routing, you must enable the latency predictor, which is deployed as a set of sidecar containers alongside the Endpoint Picker. When the latency predictor is enabled, the `slo-aware-routing` and `slo-aware-profile-handler` plugins are automatically configured. - -You can enable the latency predictor by setting `inferenceExtension.latencyPredictor.enabled` to `true` in your `values.yaml` file, or by using the `--set` flag on the command line. - -Here is an example of how to install the chart with SLO-aware routing enabled: - -```txt -$ helm install vllm-llama3-8b-instruct . 
\ - --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ - --set inferenceExtension.monitoring.gke.enabled=true \ - --set inferenceExtension.latencyPredictor.enabled=true \ - --set provider.name=gke \ - -f values.yaml -``` +For full details, see the dedicated [SLO-Aware Routing Guide](../../../site-src/guides/slo-aware-routing.md). #### SLO-Aware Router Environment Variables diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml index 1c4d8bbcfa..290dd57550 100644 --- a/config/charts/inferencepool/values.yaml +++ b/config/charts/inferencepool/values.yaml @@ -79,7 +79,7 @@ inferenceExtension: trainingServer: image: hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars - name: latencypredictor-v3-training-server + name: latencypredictor-training-server tag: latest pullPolicy: Always port: 8000 @@ -120,7 +120,7 @@ inferenceExtension: startPort: 8001 image: hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars - name: latencypredictor-v3-prediction-server + name: latencypredictor-prediction-server tag: latest pullPolicy: Always resources: diff --git a/site-src/guides/slo-aware-routing.md b/site-src/guides/slo-aware-routing.md index 818bc0d22c..4e981cf081 100644 --- a/site-src/guides/slo-aware-routing.md +++ b/site-src/guides/slo-aware-routing.md @@ -22,9 +22,9 @@ The SLO-aware routing feature is implemented as a plugin for the Endpoint Picker To use SLO-aware routing, you need to include the following headers in your inference requests: -- `x-prediction-based-scheduling`: Set to `true` to enable SLO-aware routing for the request. +- `x-prediction-based-scheduling`: Set to `true` to enable SLO-aware routing for the request. Setting this to `false` or omitting the header falls back to non-SLO routing, but the request's latency data is still used to train the predictor. - `x-slo-ttft-ms`: The Time to First Token SLO in milliseconds. -- `x-slo-tpot-ms`: The Time Per Output Token SLO in milliseconds. +- `x-slo-tpot-ms`: The Time Per Output Token SLO in milliseconds (this is equivalent to vLLM's ITL; it is **not** NTPOT). ## Headroom Selection Strategies @@ -42,13 +42,49 @@ The selection strategy can be configured via the `HEADROOM_SELECTION_STRATEGY` e ### Prerequisites -Before you begin, ensure you have a functional Inference Gateway with at least one model server deployed. If you haven't set this up yet, please follow the [Getting started guide](../index.md). +Before you begin, ensure you have a functional Inference Gateway with at least one model server deployed. If you haven't set this up yet, please follow the [Getting Started Guide](./getting-started-latest.md). ### Deployment -To use SLO-aware routing, you must deploy the Endpoint Picker with the latency predictor sidecars. This can be done via the Helm chart by setting the `inferenceExtension.latencyPredictor.enabled` flag to `true`. When this flag is set, the necessary `slo-aware-routing` and `slo-aware-profile-handler` plugins are automatically configured. +To enable SLO-aware routing, you must build the images for the training and prediction sidecars and enable the latency predictor in the chart; the sidecars are then deployed as containers alongside the Endpoint Picker. When the latency predictor is enabled, the `slo-aware-routing` and `slo-aware-profile-handler` plugins are automatically configured.
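+The chart wires these plugins into the EPP's plugins configuration for you, so no manual `pluginsCustomConfig` is required. As a rough sketch only (the real config is rendered by the chart's `epp-config.yaml` template; the profile layout and picker shown here are assumptions, not the exact generated output), the resulting configuration is shaped like:
+
+```yaml
+# Illustrative sketch only -- the chart's epp-config.yaml template renders the
+# actual EndpointPickerConfig; profile names and the picker below are assumptions.
+apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: EndpointPickerConfig
+plugins:
+- type: slo-aware-routing
+- type: slo-aware-profile-handler
+- type: max-score-picker
+schedulingProfiles:
+- name: default
+  plugins:
+  - pluginRef: slo-aware-routing
+  - pluginRef: max-score-picker
+```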
-For specific deployment instructions and details on configuring environment variables for SLO-aware routing, refer to the [InferencePool Helm Chart README](../../config/charts/inferencepool/README.md#slo-aware-router-environment-variables). + +#### Steps: + +1. Build the predictor and sidecar images from inside the `latencypredictor` package. See the [Latency Predictor - Build Guide](../../../latencypredictor/README.md) for instructions. + +2. Set your Docker repository path by replacing the placeholders in the Helm chart [values.yaml](../../../config/charts/inferencepool/values.yaml) with the repository you used to build the sidecars in step 1, in the format `us-docker.pkg.dev/PROJECT_ID/REPOSITORY`. + +3. Deploy the chart with the latency predictor enabled by setting `inferenceExtension.latencyPredictor.enabled` to `true` in your `values.yaml` file, or by using the `--set` flag on the command line: + +```txt +helm install vllm-llama3-8b-instruct . \ + --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ + --set inferenceExtension.monitoring.gke.enabled=true \ + --set inferenceExtension.latencyPredictor.enabled=true \ + --set provider.name=gke \ + -f values.yaml +``` + +After these steps, the Inference Gateway will be ready to predict latencies, train the predictor, and route requests based on their SLOs. + +For details on configuring specific environment variables for SLO-aware routing, refer to the [InferencePool Helm Chart README](../../config/charts/inferencepool/README.md#slo-aware-router-environment-variables). + +### Sending Requests + +To send a request with SLO-aware routing, specify the request SLOs and whether to use prediction-based routing via the request headers described in the [Request Headers](#request-headers) section above. + +If you have a standard setup from the [Getting Started Guide](./getting-started-latest.md) and followed the steps outlined above, the following is an example inference request with SLOs specified and routing enabled: + +```txt +export GW_IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'):80 + +curl -v $GW_IP/v1/completions -H 'Content-Type: application/json' -H 'x-slo-ttft-ms: 100' -H 'x-slo-tpot-ms: 100' -H 'x-prediction-based-scheduling: true' -d '{ +"model": "meta-llama/Llama-3.1-8B-Instruct", +"prompt": "Write as if you were a critic: San Francisco where the ", +"max_tokens": 100, +"temperature": 0, +"stream_options": {"include_usage": true}, +"stream": true +}' +``` ## Monitoring