diff --git a/Dockerfile b/Dockerfile index d2fd2300b..70038a077 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,6 +24,7 @@ COPY internal ./internal COPY apix ./apix COPY api ./api COPY version ./version +COPY sidecars ./sidecars WORKDIR /src/cmd/epp RUN go build -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/version.CommitSHA=${COMMIT_SHA} -X sigs.k8s.io/gateway-api-inference-extension/version.BuildRef=${BUILD_REF}" -o /epp diff --git a/cmd/epp/runner/runner.go b/cmd/epp/runner/runner.go index 24a59ebf7..211b87cf8 100644 --- a/cmd/epp/runner/runner.go +++ b/cmd/epp/runner/runner.go @@ -19,6 +19,7 @@ package runner import ( "context" "crypto/tls" + "encoding/json" "errors" "flag" "fmt" @@ -69,6 +70,7 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/saturationdetector" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/picker" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/scorer" @@ -76,6 +78,7 @@ import ( runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" + latencypredictor "sigs.k8s.io/gateway-api-inference-extension/sidecars/latencypredictorasync" "sigs.k8s.io/gateway-api-inference-extension/version" ) @@ -126,6 +129,7 @@ var ( "then a self-signed certificate is used.") // metric flags totalQueuedRequestsMetric = flag.String("total-queued-requests-metric", runserver.DefaultTotalQueuedRequestsMetric, "Prometheus metric for the number of queued requests.") + totalRunningRequestsMetric = flag.String("total-running-requests-metric", runserver.DefaultTotalRunningRequestsMetric, "Prometheus metric for the number of running requests.") kvCacheUsagePercentageMetric = flag.String("kv-cache-usage-percentage-metric", runserver.DefaultKvCacheUsagePercentageMetric, "Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).") // LoRA metrics loraInfoMetric = flag.String("lora-info-metric", runserver.DefaultLoraInfoMetric, "Prometheus metric for the LoRA info metrics (must be in vLLM label format).") @@ -145,7 +149,10 @@ var ( modelServerMetricsScheme = flag.String("model-server-metrics-scheme", "http", "Scheme to scrape metrics from pods") modelServerMetricsHttpsInsecureSkipVerify = flag.Bool("model-server-metrics-https-insecure-skip-verify", true, "When using 'https' scheme for 'model-server-metrics-scheme', configure 'InsecureSkipVerify' (default to true)") haEnableLeaderElection = flag.Bool("ha-enable-leader-election", false, "Enables leader election for high availability. 
When enabled, readiness probes will only pass on the leader.") - tracing = flag.Bool("tracing", true, "Enables emitting traces") + + // Latency Predictor Flag + enableLatencyPredictor = flag.Bool("enable-latency-predictor", false, "Enable the regression-based latency predictor and scheduler scorer.") + tracing = flag.Bool("tracing", true, "Enables emitting traces") setupLog = ctrl.Log.WithName("setup") ) @@ -227,7 +234,20 @@ func (r *Runner) Run(ctx context.Context) error { return err } - rawConfig, err := r.parseConfigurationPhaseOne(ctx) + // =================================================================== + // == Latency Predictor Integration + // =================================================================== + var predictor latencypredictor.PredictorInterface // Use the interface type + if *enableLatencyPredictor { + setupLog.Info("Latency predictor is enabled. Initializing...") + predictor = latencypredictor.New(latencypredictor.ConfigFromEnv(), ctrl.Log.WithName("latency-predictor")) + } else { + setupLog.Info("Latency predictor is disabled.") + predictor = nil // This will be a true nil interface + } + // =================================================================== + + rawConfig, err := r.parseConfigurationPhaseOne(ctx, predictor) if err != nil { setupLog.Error(err, "Failed to parse configuration") return err @@ -366,6 +386,7 @@ func (r *Runner) Run(ctx context.Context) error { Director: director, SaturationDetector: saturationDetector, UseExperimentalDatalayerV2: r.featureGates[datalayer.FeatureGate], // pluggable data layer feature flag + LatencyPredictor: predictor, } if err := serverRunner.SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "Failed to setup EPP controllers") @@ -383,6 +404,12 @@ func (r *Runner) Run(ctx context.Context) error { return err } + if *enableLatencyPredictor && predictor != nil { + if err := registerLatencyPredictorServer(mgr, predictor); err != nil { + return err + } + } + // --- Start Manager --- // This blocks until a signal is received. setupLog.Info("Controller manager starting") @@ -432,7 +459,14 @@ func (r *Runner) registerInTreePlugins() { plugins.Register(testresponsereceived.DestinationEndpointServedVerifierType, testresponsereceived.DestinationEndpointServedVerifierFactory) } -func (r *Runner) parseConfigurationPhaseOne(ctx context.Context) (*configapi.EndpointPickerConfig, error) { +func (r *Runner) registerLatencyPredictorPlugins(predictor latencypredictor.PredictorInterface) { + plugins.Register(slo_aware_router.SLOAwareRouterPluginType, func(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) { + return slo_aware_router.NewSLOAwareRouter(predictor, slo_aware_router.HeadroomSelectionStrategy).WithName(name), nil + }) + plugins.Register(profile.SLOAwareProfileHandlerType, profile.SLOAwareProfileHandlerFactory) +} + +func (r *Runner) parseConfigurationPhaseOne(ctx context.Context, predictor latencypredictor.PredictorInterface) (*configapi.EndpointPickerConfig, error) { if *configText == "" && *configFile == "" { return nil, nil // configuring through code, not through file } @@ -454,6 +488,12 @@ func (r *Runner) parseConfigurationPhaseOne(ctx context.Context) (*configapi.End loader.RegisterFeatureGate(flowcontrol.FeatureGate) r.registerInTreePlugins() + // If we have a latency predictor enabled and predictor and datastore are not nil, + // register the latency predictor plugins (currently just the SLO scorer). 
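+	// Illustrative only: once these plugins are registered, an EndpointPickerConfig
+	// file can reference them by type. The fragment below mirrors the SLO-related
+	// entries that the Helm chart's epp-config template renders when the latency
+	// predictor is enabled (see config/charts/inferencepool/templates/epp-config.yaml):
+	//
+	//	plugins:
+	//	- type: slo-aware-routing
+	//	- type: slo-aware-profile-handler
+	//	- type: max-score-picker
+	//	schedulingProfiles:
+	//	- name: slo
+	//	  plugins:
+	//	  - pluginRef: slo-aware-routing
+	//	  - pluginRef: max-score-picker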
+ if *enableLatencyPredictor && predictor != nil { + setupLog.Info("Registering latency predictor plugins") + r.registerLatencyPredictorPlugins(predictor) + } rawConfig, featureGates, err := loader.LoadConfigPhaseOne(configBytes, logger) if err != nil { @@ -538,6 +578,7 @@ func (r *Runner) setupMetricsCollection(setupLog logr.Logger, useExperimentalDat func setupMetricsV1(setupLog logr.Logger) (datalayer.EndpointFactory, error) { mapping, err := backendmetrics.NewMetricMapping( *totalQueuedRequestsMetric, + *totalRunningRequestsMetric, *kvCacheUsagePercentageMetric, *loraInfoMetric, *cacheInfoMetric, @@ -586,6 +627,7 @@ func setupDatalayer(logger logr.Logger) (datalayer.EndpointFactory, error) { *modelServerMetricsHttpsInsecureSkipVerify, nil) extractor, err := dlmetrics.NewExtractor(*totalQueuedRequestsMetric, + *totalRunningRequestsMetric, *kvCacheUsagePercentageMetric, *loraInfoMetric, *cacheInfoMetric) @@ -653,6 +695,18 @@ func registerHealthServer(mgr manager.Manager, logger logr.Logger, ds datastore. return nil } +// registerLatencyPredictorServer adds the Latency Predictor server as a Runnable to the given manager. +func registerLatencyPredictorServer(mgr manager.Manager, predictor latencypredictor.PredictorInterface) error { + // For the runnable, you'll need to type assert back to the concrete type + concretePredictor := predictor.(*latencypredictor.Predictor) + if err := mgr.Add(runnable.NoLeaderElection(&predictorRunnable{predictor: concretePredictor})); err != nil { + setupLog.Error(err, "Failed to register latency predictor runnable") + return err + } + setupLog.Info("Latency predictor runnable added to manager.") + return nil +} + func validateFlags() error { if (*poolName != "" && *endpointSelector != "") || (*poolName == "" && *endpointSelector == "") { return errors.New("either pool-name or endpoint-selector must be set") @@ -799,3 +853,25 @@ func resolvePoolNamespace(poolNamespace string) string { } return runserver.DefaultPoolNamespace } + +// =================================================================== +// == Latency Predictor Plugin and Helpers +// =================================================================== + +// predictorRunnable implements controller-runtime's Runnable interface to manage the predictor's lifecycle. +type predictorRunnable struct { + predictor *latencypredictor.Predictor +} + +func (p *predictorRunnable) Start(ctx context.Context) error { + setupLog.Info("Starting latency predictor...") + if err := p.predictor.Start(ctx); err != nil { + setupLog.Error(err, "Failed to start latency predictor") + return err + } + setupLog.Info("Latency predictor started.") + <-ctx.Done() + setupLog.Info("Stopping latency predictor...") + p.predictor.Stop() + return nil +} diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md index cf48fcaa6..ab69fa1a8 100644 --- a/config/charts/inferencepool/README.md +++ b/config/charts/inferencepool/README.md @@ -121,6 +121,27 @@ $ helm install triton-llama3-8b-instruct \ oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0 ``` +### Install with SLO-Aware Routing + +For full details see the dedicated [SLO-Aware Routing Guide](../../../site-src/guides/slo-aware-routing.md) + +#### SLO-Aware Router Environment Variables + +The behavior of the SLO-aware router can be fine-tuned using the following environment variables in the Endpoint Picker deployment. 
These can be set under `inferenceExtension.env` in your `values.yaml` file. + +| Environment Variable | Description | Default | +| -------------------------------- | ------------------------------------------------------------------------------------------------------- | ----------- | +| `SAMPLING_MEAN` | The sampling mean (lambda) for the Poisson distribution of token sampling. | `100.0` | +| `MAX_SAMPLED_TOKENS` | The maximum number of tokens to sample for TPOT prediction. | `20` | +| `SLO_BUFFER_FACTOR` | A buffer to apply to the SLO to make it more or less strict. | `1.0` | +| `NEG_HEADROOM_TTFT_WEIGHT` | The weight to give to the TTFT when a pod has negative headroom. | `0.8` | +| `NEG_HEADROOM_TPOT_WEIGHT` | The weight to give to the TPOT when a pod has negative headroom. | `0.2` | +| `HEADROOM_TTFT_WEIGHT` | The weight to give to the TTFT when a pod has positive headroom. | `0.8` | +| `HEADROOM_TPOT_WEIGHT` | The weight to give to the TPOT when a pod has positive headroom. | `0.2` | +| `HEADROOM_SELECTION_STRATEGY` | The strategy to use for selecting a pod based on headroom. Options: `least`, `most`, `composite-least`, `composite-most`, `composite-only`. | `least` | + +**Note:** Enabling SLO-aware routing also exposes a number of Prometheus metrics for monitoring the feature, including actual vs. predicted latency, SLO violations, and more. + ### Install with High Availability (HA) To deploy the EndpointPicker in a high-availability (HA) active-passive configuration set replicas to be greater than one. In such a setup, only one "leader" replica will be active and ready to process traffic at any given time. If the leader pod fails, another pod will be elected as the new leader, ensuring service continuity. diff --git a/config/charts/inferencepool/templates/epp-config.yaml b/config/charts/inferencepool/templates/epp-config.yaml index 31b9e532a..ca562d582 100644 --- a/config/charts/inferencepool/templates/epp-config.yaml +++ b/config/charts/inferencepool/templates/epp-config.yaml @@ -11,7 +11,28 @@ data: - type: queue-scorer - type: kv-cache-utilization-scorer - type: prefix-cache-scorer + {{- if .Values.inferenceExtension.latencyPredictor.enabled }} + - type: slo-aware-routing + - type: slo-aware-profile-handler + - type: max-score-picker + {{- end }} schedulingProfiles: + {{- if .Values.inferenceExtension.latencyPredictor.enabled }} + - name: prefix + plugins: + - pluginRef: prefix-cache-scorer + - name: default + plugins: + - pluginRef: slo-aware-routing + weight: 0 + - pluginRef: queue-scorer + - pluginRef: kv-cache-utilization-scorer + - pluginRef: max-score-picker + - name: slo + plugins: + - pluginRef: slo-aware-routing + - pluginRef: max-score-picker + {{- else }} - name: default plugins: - pluginRef: queue-scorer @@ -20,10 +41,10 @@ data: weight: 2 - pluginRef: prefix-cache-scorer weight: 3 + {{- end }} {{- if (hasKey .Values.inferenceExtension "pluginsCustomConfig") }} {{- .Values.inferenceExtension.pluginsCustomConfig | toYaml | nindent 2 }} {{- end }} - --- {{- if .Values.inferenceExtension.sidecar.enabled }} apiVersion: v1 @@ -34,3 +55,25 @@ metadata: data: {{- .Values.inferenceExtension.sidecar.configMap.data | toYaml | nindent 2 }} {{- end }} +--- +{{- if .Values.inferenceExtension.latencyPredictor.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "gateway-api-inference-extension.name" . 
}}-latency-predictor-training + namespace: {{ .Release.Namespace }} +data: + {{- range $key, $value := .Values.inferenceExtension.latencyPredictor.trainingServer.config }} + {{ $key }}: {{ $value | quote }} +{{- end }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "gateway-api-inference-extension.name" . }}-latency-predictor-prediction + namespace: {{ .Release.Namespace }} +data: + {{- range $key, $value := .Values.inferenceExtension.latencyPredictor.predictionServers.config }} + {{ $key }}: {{ $value | quote }} + {{- end }} +{{- end }} diff --git a/config/charts/inferencepool/templates/epp-deployment.yaml b/config/charts/inferencepool/templates/epp-deployment.yaml index be6a39ead..9edd64dd1 100644 --- a/config/charts/inferencepool/templates/epp-deployment.yaml +++ b/config/charts/inferencepool/templates/epp-deployment.yaml @@ -96,6 +96,9 @@ spec: {{- if gt (.Values.inferenceExtension.replicas | int) 1 }} - --ha-enable-leader-election {{- end }} + {{- if .Values.inferenceExtension.latencyPredictor.enabled }} + - --enable-latency-predictor + {{- end }} # Pass additional flags via the inferenceExtension.flags field in values.yaml. {{- range $key, $value := .Values.inferenceExtension.flags }} - --{{ $key }} @@ -151,6 +154,20 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + {{- if .Values.inferenceExtension.latencyPredictor.enabled }} + - name: PREDICTION_SERVER_URL + value: "{{- $count := int .Values.inferenceExtension.latencyPredictor.predictionServers.count -}} + {{- $startPort := int .Values.inferenceExtension.latencyPredictor.predictionServers.startPort -}} + {{- range $i := until $count -}} + {{- if $i }},{{ end }}http://localhost:{{ add $startPort $i }} + {{- end }}" + - name: TRAINING_SERVER_URL + value: "http://localhost:{{ .Values.inferenceExtension.latencyPredictor.trainingServer.port }}" + {{- range $key, $value := .Values.inferenceExtension.latencyPredictor.eppEnv }} + - name: {{ $key }} + value: {{ $value | quote }} + {{- end }} + {{- end }} {{- if .Values.inferenceExtension.tracing.enabled }} - name: OTEL_SERVICE_NAME value: "gateway-api-inference-extension" @@ -181,6 +198,77 @@ spec: volumeMounts: - name: plugins-config-volume mountPath: "/config" + {{- if .Values.inferenceExtension.latencyPredictor.enabled }} + # Training Server Sidecar Container + - name: training-server + image: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.hub }}/{{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.name }}:{{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.tag }} + imagePullPolicy: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.pullPolicy }} + ports: + - containerPort: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.port }} + name: training-port + livenessProbe: + {{- toYaml .Values.inferenceExtension.latencyPredictor.trainingServer.livenessProbe | nindent 10 }} + readinessProbe: + {{- toYaml .Values.inferenceExtension.latencyPredictor.trainingServer.readinessProbe | nindent 10 }} + resources: + {{- toYaml .Values.inferenceExtension.latencyPredictor.trainingServer.resources | nindent 10 }} + envFrom: + - configMapRef: + name: {{ include "gateway-api-inference-extension.name" . 
}}-latency-predictor-training + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SERVER_TYPE + value: "training" + volumeMounts: + - name: training-server-storage + mountPath: /models + {{- range $i := until (int .Values.inferenceExtension.latencyPredictor.predictionServers.count) }} + # Prediction Server Sidecar Container {{ add $i 1 }} + - name: prediction-server-{{ add $i 1 }} + image: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.image.hub }}/{{ $.Values.inferenceExtension.latencyPredictor.predictionServers.image.name }}:{{ $.Values.inferenceExtension.latencyPredictor.predictionServers.image.tag }} + imagePullPolicy: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.image.pullPolicy }} + command: ["uvicorn"] + args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "{{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }}"] + ports: + - containerPort: {{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }} + name: predict-port-{{ add $i 1 }} + livenessProbe: + httpGet: + path: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.livenessProbe.httpGet.path }} + port: {{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }} + initialDelaySeconds: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.livenessProbe.initialDelaySeconds }} + periodSeconds: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.livenessProbe.periodSeconds }} + readinessProbe: + httpGet: + path: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.readinessProbe.httpGet.path }} + port: {{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }} + initialDelaySeconds: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.readinessProbe.initialDelaySeconds }} + periodSeconds: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.readinessProbe.periodSeconds }} + failureThreshold: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.readinessProbe.failureThreshold }} + resources: + {{- toYaml $.Values.inferenceExtension.latencyPredictor.predictionServers.resources | nindent 10 }} + envFrom: + - configMapRef: + name: {{ include "gateway-api-inference-extension.name" $ }}-latency-predictor-prediction + env: + - name: PREDICT_PORT + value: "{{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }}" + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SERVER_TYPE + value: "prediction-{{ add $i 1 }}" + - name: TRAINING_SERVER_URL + value: "http://localhost:{{ $.Values.inferenceExtension.latencyPredictor.trainingServer.port }}" + volumeMounts: + - name: prediction-server-{{ add $i 1 }}-storage + mountPath: /server_models + {{- end }} + {{- end }} volumes: {{- if .Values.inferenceExtension.sidecar.volumes }} {{- tpl (toYaml .Values.inferenceExtension.sidecar.volumes) $ | nindent 6 }} @@ -188,6 +276,16 @@ spec: - name: plugins-config-volume configMap: name: {{ include "gateway-api-inference-extension.name" . 
}} + {{- if .Values.inferenceExtension.latencyPredictor.enabled }} + - name: training-server-storage + emptyDir: + sizeLimit: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.volumeSize }} + {{- range $i := until (int .Values.inferenceExtension.latencyPredictor.predictionServers.count) }} + - name: prediction-server-{{ add $i 1 }}-storage + emptyDir: + sizeLimit: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.volumeSize }} + {{- end }} + {{- end }} {{- if .Values.inferenceExtension.affinity }} affinity: {{- toYaml .Values.inferenceExtension.affinity | nindent 8 }} diff --git a/config/charts/inferencepool/templates/epp-service.yaml b/config/charts/inferencepool/templates/epp-service.yaml index b1a48df91..fe76d71c9 100644 --- a/config/charts/inferencepool/templates/epp-service.yaml +++ b/config/charts/inferencepool/templates/epp-service.yaml @@ -15,6 +15,18 @@ spec: - name: http-metrics protocol: TCP port: {{ .Values.inferenceExtension.metricsPort | default 9090 }} + {{- if .Values.inferenceExtension.latencyPredictor.enabled }} + - name: latency-predictor-training + protocol: TCP + port: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.port }} + targetPort: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.port }} + {{- range $i := until (int .Values.inferenceExtension.latencyPredictor.predictionServers.count) }} + - name: latency-predictor-{{ add $i 1 }} + protocol: TCP + port: {{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }} + targetPort: {{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }} + {{- end }} + {{- end }} {{- with .Values.inferenceExtension.extraServicePorts }} {{- toYaml . | nindent 4 }} {{- end }} diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml index bed1f46e3..290dd5755 100644 --- a/config/charts/inferencepool/values.yaml +++ b/config/charts/inferencepool/values.yaml @@ -12,11 +12,11 @@ inferenceExtension: extraContainerPorts: [] # Define additional service ports extraServicePorts: [] -# extraServicePorts: -# - name: http -# port: 8081 -# protocol: TCP -# targetPort: 8081 + # extraServicePorts: + # - name: http + # port: 8081 + # protocol: TCP + # targetPort: 8081 # This is the plugins configuration file. 
# pluginsCustomConfig: @@ -43,7 +43,7 @@ inferenceExtension: affinity: {} tolerations: [] - + # Sidecar configuration for EPP sidecar: enabled: false @@ -71,6 +71,89 @@ inferenceExtension: sampler: "parentbased_traceidratio" samplerArg: "0.1" + # Latency Predictor Configuration + latencyPredictor: + enabled: false + + # Training Server Configuration + trainingServer: + image: + hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars + name: latencypredictor-training-server + tag: latest + pullPolicy: Always + port: 8000 + resources: + requests: + cpu: "2000m" + memory: "4Gi" + limits: + cpu: "4000m" + memory: "8Gi" + livenessProbe: + httpGet: + path: /healthz + port: 8000 + initialDelaySeconds: 30 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /readyz + port: 8000 + initialDelaySeconds: 45 + periodSeconds: 10 + volumeSize: "20Gi" + config: + LATENCY_RETRAINING_INTERVAL_SEC: "1" + LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100" + LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib" + LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib" + LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib" + LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib" + LATENCY_MODEL_TYPE: "xgboost" + LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000" + LATENCY_QUANTILE_ALPHA: "0.9" + + # Prediction Server Configuration + predictionServers: + count: 10 + startPort: 8001 + image: + hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars + name: latencypredictor-prediction-server + tag: latest + pullPolicy: Always + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" + livenessProbe: + httpGet: + path: /healthz + initialDelaySeconds: 15 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /readyz + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 10 + volumeSize: "10Gi" + config: + LATENCY_MODEL_TYPE: "xgboost" + PREDICT_HOST: "0.0.0.0" + LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib" + LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib" + LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib" + LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib" + + # EPP Environment Variables for Latency Predictor + eppEnv: + LATENCY_MAX_SAMPLE_SIZE: "10000" + inferencePool: targetPorts: - number: 8000 diff --git a/go.mod b/go.mod index dc53db381..fbb1e5de1 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/elastic/crd-ref-docs v0.2.0 github.com/envoyproxy/go-control-plane/envoy v1.36.0 github.com/go-logr/logr v1.4.3 + github.com/go-logr/zapr v1.3.0 github.com/google/go-cmp v0.7.0 github.com/google/uuid v1.6.0 github.com/hashicorp/golang-lru/v2 v2.0.7 diff --git a/pkg/epp/backend/metrics/metrics.go b/pkg/epp/backend/metrics/metrics.go index 5d2a85e96..d2d9b60f9 100644 --- a/pkg/epp/backend/metrics/metrics.go +++ b/pkg/epp/backend/metrics/metrics.go @@ -97,6 +97,15 @@ func (p *PodMetricsClientImpl) promToPodMetrics( } } + if p.MetricMapping.TotalRunningRequests != nil { + running, err := p.getMetric(metricFamilies, *p.MetricMapping.TotalRunningRequests) + if err == nil { + updated.RunningQueueSize = int(running.GetGauge().GetValue()) + } else { + errs = multierr.Append(errs, err) + } + } + if p.MetricMapping.KVCacheUtilization != nil { usage, err := p.getMetric(metricFamilies, *p.MetricMapping.KVCacheUtilization) if err == nil { diff --git a/pkg/epp/backend/metrics/metrics_spec.go b/pkg/epp/backend/metrics/metrics_spec.go index 7407f4ed7..b3c26db2c 100644 --- 
a/pkg/epp/backend/metrics/metrics_spec.go +++ b/pkg/epp/backend/metrics/metrics_spec.go @@ -29,10 +29,11 @@ type MetricSpec struct { // MetricMapping holds named MetricSpecs. type MetricMapping struct { - TotalQueuedRequests *MetricSpec - KVCacheUtilization *MetricSpec - LoraRequestInfo *MetricSpec - CacheConfigInfo *MetricSpec + TotalQueuedRequests *MetricSpec + TotalRunningRequests *MetricSpec + KVCacheUtilization *MetricSpec + LoraRequestInfo *MetricSpec + CacheConfigInfo *MetricSpec } // stringToMetricSpec converts a string to a MetricSpec. @@ -94,11 +95,15 @@ func stringToMetricSpec(specStr string) (*MetricSpec, error) { } // NewMetricMapping creates a MetricMapping from string values. -func NewMetricMapping(queuedStr, kvUsageStr, loraReqInfoStr, cacheInfoMetric string) (*MetricMapping, error) { +func NewMetricMapping(queuedStr, runningStr, kvUsageStr, loraReqInfoStr, cacheInfoMetric string) (*MetricMapping, error) { queuedSpec, err := stringToMetricSpec(queuedStr) if err != nil { return nil, fmt.Errorf("error parsing WaitingRequests: %w", err) } + runningSpec, err := stringToMetricSpec(runningStr) + if err != nil { + return nil, fmt.Errorf("error parsing RunningRequests: %w", err) + } kvUsageSpec, err := stringToMetricSpec(kvUsageStr) if err != nil { return nil, fmt.Errorf("error parsing KVCacheUsage: %w", err) @@ -114,10 +119,11 @@ func NewMetricMapping(queuedStr, kvUsageStr, loraReqInfoStr, cacheInfoMetric str } mapping := &MetricMapping{ - TotalQueuedRequests: queuedSpec, - KVCacheUtilization: kvUsageSpec, - LoraRequestInfo: loraReqInfoSpec, - CacheConfigInfo: cacheInfoSpec, + TotalQueuedRequests: queuedSpec, + TotalRunningRequests: runningSpec, + KVCacheUtilization: kvUsageSpec, + LoraRequestInfo: loraReqInfoSpec, + CacheConfigInfo: cacheInfoSpec, } return mapping, nil diff --git a/pkg/epp/datalayer/metrics/datasource_test.go b/pkg/epp/datalayer/metrics/datasource_test.go index 7c293753f..016622a40 100644 --- a/pkg/epp/datalayer/metrics/datasource_test.go +++ b/pkg/epp/datalayer/metrics/datasource_test.go @@ -29,7 +29,7 @@ import ( func TestDatasource(t *testing.T) { source := NewDataSource("https", "/metrics", true, nil) - extractor, err := NewExtractor(defaultTotalQueuedRequestsMetric, "", "", "") + extractor, err := NewExtractor(defaultTotalQueuedRequestsMetric, "", "", "", "") assert.Nil(t, err, "failed to create extractor") name := source.Name() diff --git a/pkg/epp/datalayer/metrics/extractor.go b/pkg/epp/datalayer/metrics/extractor.go index 27b1e07cd..f08ccf95e 100644 --- a/pkg/epp/datalayer/metrics/extractor.go +++ b/pkg/epp/datalayer/metrics/extractor.go @@ -66,8 +66,8 @@ func Produces() map[string]any { // configured with the given metrics' specifications. // These are mandatory metrics per the MSP specification, and are used // as the basis for the built-in scheduling plugins. 
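+// A minimal usage sketch, assuming the Model Server Protocol defaults used
+// elsewhere in this repository ("vllm:num_requests_waiting" and friends):
+//
+//	ext, err := NewExtractor(
+//		"vllm:num_requests_waiting",
+//		"vllm:num_requests_running",
+//		"vllm:gpu_cache_usage_perc",
+//		"vllm:lora_requests_info",
+//		"vllm:cache_config_info",
+//	)
+//	if err != nil {
+//		// handle invalid metric specification
+//	}
+//	_ = ext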
-func NewExtractor(queueSpec, kvusageSpec, loraSpec, cacheInfoSpec string) (*Extractor, error) { - mapping, err := NewMapping(queueSpec, kvusageSpec, loraSpec, cacheInfoSpec) +func NewExtractor(queueSpec, runningSpec, kvusageSpec, loraSpec, cacheInfoSpec string) (*Extractor, error) { + mapping, err := NewMapping(queueSpec, runningSpec, kvusageSpec, loraSpec, cacheInfoSpec) if err != nil { return nil, fmt.Errorf("failed to create extractor metrics Mapping - %w", err) } @@ -109,6 +109,15 @@ func (ext *Extractor) Extract(ctx context.Context, data any, ep datalayer.Endpoi } } + if spec := ext.mapping.TotalRunningRequests; spec != nil { // extract running requests + if metric, err := spec.getLatestMetric(families); err != nil { + errs = append(errs, err) + } else { + clone.RunningQueueSize = int(extractValue(metric)) + updated = true + } + } + if spec := ext.mapping.KVCacheUtilization; spec != nil { // extract KV cache usage if metric, err := spec.getLatestMetric(families); err != nil { errs = append(errs, err) diff --git a/pkg/epp/datalayer/metrics/extractor_test.go b/pkg/epp/datalayer/metrics/extractor_test.go index d8aaca556..bb408f6db 100644 --- a/pkg/epp/datalayer/metrics/extractor_test.go +++ b/pkg/epp/datalayer/metrics/extractor_test.go @@ -31,6 +31,7 @@ import ( const ( // use hardcoded values - importing causes cycle defaultTotalQueuedRequestsMetric = "vllm:num_requests_waiting" + defaultTotalRunningRequestsMetric = "vllm:num_requests_running" defaultKvCacheUsagePercentageMetric = "vllm:gpu_cache_usage_perc" defaultLoraInfoMetric = "vllm:lora_requests_info" defaultCacheInfoMetric = "vllm:cache_config_info" @@ -39,11 +40,11 @@ const ( func TestExtractorExtract(t *testing.T) { ctx := context.Background() - if _, err := NewExtractor("vllm: dummy", "", "", ""); err == nil { + if _, err := NewExtractor("vllm: dummy", "", "", "", ""); err == nil { t.Error("expected to fail to create extractor with invalid specification") } - extractor, err := NewExtractor(defaultTotalQueuedRequestsMetric, + extractor, err := NewExtractor(defaultTotalQueuedRequestsMetric, defaultTotalRunningRequestsMetric, defaultKvCacheUsagePercentageMetric, defaultLoraInfoMetric, defaultCacheInfoMetric) if err != nil { t.Fatalf("failed to create extractor: %v", err) @@ -106,6 +107,14 @@ func TestExtractorExtract(t *testing.T) { }, }, }, + defaultTotalRunningRequestsMetric: &dto.MetricFamily{ + Type: dto.MetricType_GAUGE.Enum(), + Metric: []*dto.Metric{ + { + Gauge: &dto.Gauge{Value: ptr.To(1.0)}, + }, + }, + }, defaultKvCacheUsagePercentageMetric: &dto.MetricFamily{ Type: dto.MetricType_GAUGE.Enum(), Metric: []*dto.Metric{ diff --git a/pkg/epp/datalayer/metrics/mapping.go b/pkg/epp/datalayer/metrics/mapping.go index fab6cf75f..7b1fed9c1 100644 --- a/pkg/epp/datalayer/metrics/mapping.go +++ b/pkg/epp/datalayer/metrics/mapping.go @@ -23,20 +23,25 @@ import ( // Mapping holds specifications for the well-known metrics defined // in the Model Server Protocol. type Mapping struct { - TotalQueuedRequests *Spec - KVCacheUtilization *Spec - LoraRequestInfo *LoRASpec - CacheInfo *Spec + TotalQueuedRequests *Spec + TotalRunningRequests *Spec + KVCacheUtilization *Spec + LoraRequestInfo *LoRASpec + CacheInfo *Spec } // NewMapping creates a metrics.Mapping from the input specification strings. 
-func NewMapping(queue, kvusage, lora, cacheInfo string) (*Mapping, error) { +func NewMapping(queue, running, kvusage, lora, cacheInfo string) (*Mapping, error) { var errs []error queueSpec, err := parseStringToSpec(queue) if err != nil { errs = append(errs, err) } + runningSpec, err := parseStringToSpec(running) + if err != nil { + errs = append(errs, err) + } kvusageSpec, err := parseStringToSpec(kvusage) if err != nil { errs = append(errs, err) @@ -55,9 +60,10 @@ func NewMapping(queue, kvusage, lora, cacheInfo string) (*Mapping, error) { return nil, errors.Join(errs...) } return &Mapping{ - TotalQueuedRequests: queueSpec, - KVCacheUtilization: kvusageSpec, - LoraRequestInfo: loraSpec, - CacheInfo: cacheInfoSpec, + TotalQueuedRequests: queueSpec, + TotalRunningRequests: runningSpec, + KVCacheUtilization: kvusageSpec, + LoraRequestInfo: loraSpec, + CacheInfo: cacheInfoSpec, }, nil } diff --git a/pkg/epp/datastore/datastore.go b/pkg/epp/datastore/datastore.go index 2ab2e98cb..5153c1eb1 100644 --- a/pkg/epp/datastore/datastore.go +++ b/pkg/epp/datastore/datastore.go @@ -62,7 +62,7 @@ type Datastore interface { // PodList lists pods matching the given predicate. PodList(predicate func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics PodUpdateOrAddIfNotExist(pod *corev1.Pod) bool - PodDelete(podNAme string) + PodDelete(podName string) // Clears the store state, happens when the pool gets deleted. Clear() diff --git a/pkg/epp/metrics/testdata/request_tpot_seconds_metric b/pkg/epp/metrics/testdata/request_tpot_seconds_metric new file mode 100644 index 000000000..beee50271 --- /dev/null +++ b/pkg/epp/metrics/testdata/request_tpot_seconds_metric @@ -0,0 +1,80 @@ +# HELP inference_model_request_tpot_seconds [ALPHA] Inference model response latency distribution in seconds for each model and target model. 
+# TYPE inference_model_request_tpot_seconds histogram +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.0005"} 0 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.0025"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.005"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.01"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.02"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.04"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.06"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.08"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.1"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.125"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.15"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.2"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.3"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.4"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.5"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.6"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.8"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="1"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="1.5"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="2"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="3"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="4.5"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="6"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="12"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="18"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="24"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="30"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="36"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="48"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="60"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="90"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="120"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="180"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="270"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="360"} 2 
+inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="Inf"} 2 +inference_model_request_tpot_seconds_sum{model_name="m20", target_model_name="t10"} 0.161 +inference_model_request_tpot_seconds_count{model_name="m20", target_model_name="t10"} 2 + + +iinference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.0005"} 0 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.0025"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.005"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.01"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.02"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.04"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.06"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.08"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.1"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.125"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.15"} 1 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.2"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.3"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.4"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.5"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.6"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="0.8"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="1"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="1.5"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="2"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="3"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="4.5"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="6"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="12"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="18"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="24"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="30"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="36"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="48"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="60"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="90"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="120"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", 
target_model_name="t10", le="180"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="270"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="360"} 2 +inference_model_request_tpot_seconds_bucket{model_name="m20", target_model_name="t10", le="Inf"} 2 +inference_model_request_tpot_seconds_sum{model_name="m20", target_model_name="t10"} 0.161 +inference_model_request_tpot_seconds_count{model_name="m20", target_model_name="t10"} 2 \ No newline at end of file diff --git a/pkg/epp/metrics/testdata/request_ttft_seconds_metric b/pkg/epp/metrics/testdata/request_ttft_seconds_metric new file mode 100644 index 000000000..315490727 --- /dev/null +++ b/pkg/epp/metrics/testdata/request_ttft_seconds_metric @@ -0,0 +1,116 @@ +# HELP inference_model_request_ttft_seconds [ALPHA] Inference model response latency distribution in seconds for each model and target model. +# TYPE inference_model_request_ttft_seconds histogram +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="0.005"} 0 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="0.025"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="0.05"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="0.1"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="0.2"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="0.4"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="0.6"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="0.8"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="1.0"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="1.25"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="1.5"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="2"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="3"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="4"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="5"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="6"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="8"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="10"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="15"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="20"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="30"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="45"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="60"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="120"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="180"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", 
target_model_name="t10", le="240"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="300"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="360"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="480"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="600"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="900"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="1200"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="1800"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="2700"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="3600"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10", target_model_name="t10", le="Inf"} 2 +inference_model_request_ttft_seconds_sum{model_name="m10", target_model_name="t10"} 1.61 +inference_model_request_ttft_seconds_count{model_name="m10", target_model_name="t10"} 2 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="0.005"} 0 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="0.025"} 0 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="0.05"} 0 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="0.1"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="0.2"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="0.4"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="0.6"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="0.8"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="1"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="1.25"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="1.5"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="2"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="3"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="4"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="5"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="6"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="8"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="10"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="15"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="20"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="30"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="45"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="60"} 1 
+inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="120"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="180"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="240"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="300"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="360"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="480"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="600"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="900"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="1200"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="1800"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="2700"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="3600"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1 +inference_model_request_ttft_seconds_sum{model_name="m10",target_model_name="t11"} 0.06 +inference_model_request_ttft_seconds_count{model_name="m10",target_model_name="t11"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="0.005"} 0 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="0.025"} 0 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="0.05"} 0 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="0.1"} 0 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="0.2"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="0.4"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="0.6"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="0.8"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="1"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="1.25"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="1.5"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="2"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="3"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="4"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="5"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="6"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="8"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="10"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="15"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="20"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="30"} 1 
+inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="45"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="60"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="120"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="180"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="240"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="300"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="360"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="480"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="600"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="900"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="1200"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="1800"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="2700"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="3600"} 1 +inference_model_request_ttft_seconds_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1 +inference_model_request_ttft_seconds_sum{model_name="m20",target_model_name="t20"} 0.12 +inference_model_request_ttft_seconds_count{model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/epp/requestcontrol/director.go b/pkg/epp/requestcontrol/director.go index c4f4f1c1b..8cb356890 100644 --- a/pkg/epp/requestcontrol/director.go +++ b/pkg/epp/requestcontrol/director.go @@ -305,8 +305,9 @@ func (d *Director) HandleResponseBodyStreaming(ctx context.Context, reqCtx *hand logger := log.FromContext(ctx).WithValues("stage", "bodyChunk") logger.V(logutil.TRACE).Info("Entering HandleResponseBodyChunk") response := &Response{ - RequestId: reqCtx.Request.Headers[requtil.RequestIdHeaderKey], - Headers: reqCtx.Response.Headers, + RequestId: reqCtx.Request.Headers[requtil.RequestIdHeaderKey], + Headers: reqCtx.Response.Headers, + EndOfStream: reqCtx.ResponseComplete, } d.runResponseStreamingPlugins(ctx, reqCtx.SchedulingRequest, response, reqCtx.TargetPod) diff --git a/pkg/epp/server/runserver.go b/pkg/epp/server/runserver.go index e43d84923..39ea9a85f 100644 --- a/pkg/epp/server/runserver.go +++ b/pkg/epp/server/runserver.go @@ -43,6 +43,7 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/saturationdetector" + latencypredictor "sigs.k8s.io/gateway-api-inference-extension/sidecars/latencypredictorasync" ) // ExtProcServerRunner provides methods to manage an external process server. @@ -59,6 +60,7 @@ type ExtProcServerRunner struct { Director *requestcontrol.Director SaturationDetector *saturationdetector.Detector UseExperimentalDatalayerV2 bool // Pluggable data layer feature flag + LatencyPredictor latencypredictor.PredictorInterface // This should only be used in tests. We won't need this once we do not inject metrics in the tests. 
// TODO:(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/432) Cleanup @@ -78,6 +80,7 @@ const ( DefaultHealthChecking = false // default for --health-checking DefaultEnablePprof = true // default for --enable-pprof DefaultTotalQueuedRequestsMetric = "vllm:num_requests_waiting" // default for --total-queued-requests-metric + DefaultTotalRunningRequestsMetric = "vllm:num_requests_running" // default for --total-running-requests-metric DefaultKvCacheUsagePercentageMetric = "vllm:gpu_cache_usage_perc" // default for --kv-cache-usage-percentage-metric DefaultLoraInfoMetric = "vllm:lora_requests_info" // default for --lora-info-metric DefaultCacheInfoMetric = "vllm:cache_config_info" // default for --cache-info-metric diff --git a/sidecars/latencypredictorasync/types.go b/sidecars/latencypredictorasync/types.go index 4b4a1ca0b..c8eadefe2 100644 --- a/sidecars/latencypredictorasync/types.go +++ b/sidecars/latencypredictorasync/types.go @@ -120,7 +120,6 @@ type PredictorInterface interface { PredictBulk(ctx context.Context, requests []PredictionRequest) (*BulkPredictionResponse, error) PredictBulkStrict(ctx context.Context, requests []PredictionRequest) (*BulkPredictionResponse, error) AddTrainingDataBulk(entry []TrainingEntry) error - GetServerStatus(ctx context.Context) (*ServerStatusResponse, error) } // --- Data Models --- diff --git a/site-src/guides/index.md b/site-src/guides/index.md index b655e56cd..11d3eb501 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -274,6 +274,12 @@ Deploy the sample InferenceObjective which allows you to specify priority of req --8<-- "site-src/_includes/bbr.md" +### Next Steps: Advanced Features + +You have now deployed a basic Inference Gateway with a simple routing strategy. To explore more advanced features, such as SLO-aware routing, please refer to the following guide: + +* [SLO-Aware Routing](./slo-aware-routing.md) + ### Cleanup The following instructions assume you would like to cleanup ALL resources that were created in this quickstart guide. diff --git a/site-src/guides/slo-aware-routing.md b/site-src/guides/slo-aware-routing.md new file mode 100644 index 000000000..4e981cf08 --- /dev/null +++ b/site-src/guides/slo-aware-routing.md @@ -0,0 +1,121 @@ +# SLO-Aware Routing + +> For deployment instructions, jump to [Deploying with SLO-Aware Routing](#deploying-with-slo-aware-routing). + +SLO-aware routing is a feature of the Inference Gateway that enables intelligent routing of inference requests based on Service Level Objectives (SLOs). It uses a latency predictor to estimate the Time to First Token (TTFT) and Time Per Output Token (TPOT) for each request on each available model server. This allows the gateway to select the optimal server that can meet the request's SLOs, while also considering the overall health and utilization of the model servers. + +## How it Works + +The SLO-aware routing feature is implemented as a plugin for the Endpoint Picker (EPP). When a request is received, the plugin performs the following steps: + +1. **SLO Extraction**: The plugin extracts the TTFT and TPOT SLOs from the request headers (`x-slo-ttft-ms` and `x-slo-tpot-ms`). It also checks for the `x-prediction-based-scheduling` header to determine if SLO-aware routing should be used for this request. + +2. **Latency Prediction**: The plugin uses a latency predictor, deployed as a set of sidecar containers to the EPP, to predict the TTFT and TPOT for the request on each of the available model servers. 
The prediction is based on the current state of the server, including its KV cache utilization and the number of running and waiting requests.
+
+3. **Headroom Calculation**: For each model server, the plugin calculates the "headroom": the difference between the SLO and the predicted latency. A positive headroom means the server is expected to meet the SLO, while a negative headroom means it is not (a minimal sketch of this calculation appears after the strategy list below).
+
+4. **Pod Selection**: The plugin selects a model server based on the calculated headrooms and a configurable selection strategy. The goal is to pick a server that can meet the SLOs without being overloaded.
+
+5. **Fallback**: If the latency predictor is not available or fails to make a prediction, the plugin falls back to a "composite scoring" mechanism. This mechanism uses a combination of metrics, including prefix cache scores and queue sizes, to make a routing decision.
+
+## Request Headers
+
+To use SLO-aware routing, include the following headers in your inference requests:
+
+- `x-prediction-based-scheduling`: Set to `true` to enable SLO-aware routing for the request. Setting this to `false` or omitting the header uses non-SLO routing, but the request's latency data is still used to train the predictor.
+- `x-slo-ttft-ms`: The Time to First Token SLO in milliseconds.
+- `x-slo-tpot-ms`: The Time Per Output Token SLO in milliseconds (this is vLLM's equivalent of ITL; it is **not** NTPOT).
+
+## Headroom Selection Strategies
+
+The SLO-aware routing plugin provides several strategies for selecting a model server based on the calculated headrooms:
+
+- `least`: (Default) Prefers the pod with the least positive headroom. This strategy packs pods tightly and maximizes utilization.
+- `most`: Prefers the pod with the most positive headroom. This strategy is more conservative and leaves more room for unexpected latency spikes.
+- `composite-least`: Considers a composite score of various metrics and prefers the pod with the lowest score.
+- `composite-most`: Considers a composite score of various metrics and prefers the pod with the highest score.
+- `composite-only`: Uses only the composite score and ignores latency predictions.
+
+The selection strategy can be configured via the `HEADROOM_SELECTION_STRATEGY` environment variable in the Endpoint Picker deployment.
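+
+To make the headroom calculation and the default `least` strategy concrete, the following is a minimal, self-contained Go sketch. The type and function names (`podPrediction`, `pickLeastHeadroom`) and the hard-coded predictions are illustrative assumptions, not the actual `slo_aware_router` plugin API; in the real plugin the predictions come from the latency predictor sidecars.
+
+```go
+// Illustrative only: headroom calculation and "least positive headroom"
+// selection as described above. Not the production plugin code.
+package main
+
+import (
+    "fmt"
+    "math"
+)
+
+// podPrediction pairs a candidate endpoint with its predicted latencies (ms).
+type podPrediction struct {
+    Pod           string
+    PredictedTTFT float64
+    PredictedTPOT float64
+}
+
+// headroom returns the slack (ms) before the tighter of the two SLOs is
+// violated. Positive means the pod is expected to meet both SLOs.
+func headroom(p podPrediction, sloTTFT, sloTPOT float64) float64 {
+    return math.Min(sloTTFT-p.PredictedTTFT, sloTPOT-p.PredictedTPOT)
+}
+
+// pickLeastHeadroom implements the default "least" strategy: among pods with
+// positive headroom, choose the one with the smallest slack (tight packing).
+// If no pod has positive headroom, the caller should fall back to composite
+// scoring.
+func pickLeastHeadroom(preds []podPrediction, sloTTFT, sloTPOT float64) (string, bool) {
+    best, bestHeadroom := "", math.MaxFloat64
+    for _, p := range preds {
+        if h := headroom(p, sloTTFT, sloTPOT); h > 0 && h < bestHeadroom {
+            best, bestHeadroom = p.Pod, h
+        }
+    }
+    return best, best != ""
+}
+
+func main() {
+    preds := []podPrediction{
+        {Pod: "pod-a", PredictedTTFT: 80, PredictedTPOT: 40},
+        {Pod: "pod-b", PredictedTTFT: 60, PredictedTPOT: 90},
+        {Pod: "pod-c", PredictedTTFT: 150, PredictedTPOT: 30},
+    }
+    // SLOs as they would arrive in x-slo-ttft-ms / x-slo-tpot-ms.
+    if pod, ok := pickLeastHeadroom(preds, 100, 100); ok {
+        fmt.Println("selected:", pod) // pod-b: smallest positive headroom
+    } else {
+        fmt.Println("no pod meets the SLOs; fall back to composite scoring")
+    }
+}
+```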
+
+## Deploying with SLO-Aware Routing
+
+### Prerequisites
+
+Before you begin, ensure you have a functional Inference Gateway with at least one model server deployed. If you haven't set this up yet, please follow the [Getting Started Guide](./getting-started-latest.md).
+
+### Deployment
+
+To enable SLO-aware routing, you must enable the latency predictor in the chart and have built the images for the training/prediction sidecars, which are then deployed as containers alongside the Endpoint Picker. When the latency predictor is enabled, the `slo-aware-routing` and `slo-aware-profile-handler` plugins are automatically configured.
+
+#### Steps:
+
+1. Build the predictor and sidecar images from inside the `latencypredictor` package. See the [Latency Predictor - Build Guide](../../../latencypredictor/README.md) for instructions.
+
+2. Set your Docker repository path by replacing the placeholders in the Helm chart [values.yaml](../../../config/charts/inferencepool/values.yaml) with the repository you used to build the sidecars in step 1, in the format `us-docker.pkg.dev/PROJECT_ID/REPOSITORY`.
+
+3. Deploy the chart with the latency predictor enabled by setting `inferenceExtension.latencyPredictor.enabled` to `true` in your `values.yaml` file, or by using the `--set` flag on the command line:
+
+```bash
+helm install vllm-llama3-8b-instruct . \
+  --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
+  --set inferenceExtension.monitoring.gke.enabled=true \
+  --set inferenceExtension.latencyPredictor.enabled=true \
+  --set provider.name=gke \
+  -f values.yaml
+```
+
+After these steps, the Inference Gateway is ready to predict request latencies, train the predictor, and route requests based on their SLOs.
+
+For details on configuring specific environment variables for SLO-aware routing, refer to the [InferencePool Helm Chart README](../../config/charts/inferencepool/README.md#slo-aware-router-environment-variables).
+
+### Sending Requests
+
+To send a request with SLO-aware routing, specify the request's SLOs and whether to use prediction-based scheduling in the request headers. See the [Request Headers](#request-headers) section above.
+
+If you have a standard setup from the [Getting Started Guide](./getting-started-latest.md) and have followed the steps outlined above, the following is an example inference request with SLOs specified and routing enabled:
+
+```bash
+export GW_IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'):80
+
+curl -v $GW_IP/v1/completions -H 'Content-Type: application/json' -H 'x-slo-ttft-ms: 100' -H 'x-slo-tpot-ms: 100' -H 'x-prediction-based-scheduling: true' -d '{
+"model": "meta-llama/Llama-3.1-8B-Instruct",
+"prompt": "Write as if you were a critic: San Francisco where the ",
+"max_tokens": 100,
+"temperature": 0, "stream_options": {"include_usage": "true"}, "stream" : "true"
+}'
+```
+
+## Monitoring
+
+When SLO-aware routing is enabled, a number of Prometheus metrics are exposed to allow for monitoring and observability of the feature. These metrics provide insight into the performance of the latency predictor and the effectiveness of the SLO-based routing.
+
+Key categories of metrics include:
+
+- **Actual vs. Predicted Latency**: Metrics for both actual and predicted Time to First Token (TTFT) and Time Per Output Token (TPOT) are available. This allows you to compare the accuracy of the latency predictor.
+- **Prediction Duration**: The time it takes for the latency predictor to generate a prediction is also measured.
+- **SLO Violations**: Counters and gauges are available to track when SLOs are violated. This can be used to alert on SLO breaches.
+- **SLO Thresholds**: The current SLO thresholds for TTFT and TPOT are also exposed as metrics.
+
+NOTE: The TPOT metric here is equivalent to vLLM's **ITL** (Inter-Token Latency). vLLM defines TPOT as the average time per output token *including the TTFT*; that quantity is commonly known as NTPOT in other contexts, and we do not capture it here.
+
+The following is a comprehensive list of the Prometheus metrics exposed:
+
+| Metric Name | Description |
+| :--------------------------------------------------------- | :--------------------------------------------------------------------------------------------------------------- |
+| `inference_objective_request_ttft_seconds` | Inference model TTFT distribution in seconds for each model and target model. |
+| `inference_objective_request_ttft_seconds_gauge` | Inference model TTFT gauge in seconds for each model and target model. |
+| `inference_objective_request_predicted_ttft_seconds` | Inference model Predicted TTFT distribution in seconds for each model and target model. |
+| `inference_objective_request_predicted_ttft_seconds_gauge` | Inference model Predicted TTFT gauge in seconds for each model and target model. |
+| `inference_objective_request_ttft_prediction_duration_seconds` | Duration taken to generate TTFT predictions in seconds for each model and target model. |
+| `inference_objective_request_ttft_prediction_duration_seconds_gauge` | Latest duration taken to generate TTFT predictions in seconds for each model and target model. |
+| `inference_objective_request_tpot_seconds` | Inference model TPOT distribution in seconds for each model and target model. |
+| `inference_objective_request_tpot_seconds_gauge` | Inference model TPOT gauge in seconds for each model and target model. |
+| `inference_objective_request_predicted_tpot_seconds` | Inference model Predicted TPOT distribution in seconds for each model and target model. |
+| `inference_objective_request_predicted_tpot_seconds_gauge` | Inference model Predicted TPOT gauge in seconds for each model and target model. |
+| `inference_objective_request_tpot_prediction_duration_seconds` | Duration taken to generate TPOT predictions in seconds for each model and target model. |
+| `inference_objective_request_tpot_prediction_duration_seconds_gauge` | Latest duration taken to generate TPOT predictions in seconds for each model and target model. |
+| `inference_objective_request_ttft_slo_violation` | Boolean indicator (0 or 1) of whether the last TTFT measurement violated the SLO threshold for each model and target model. |
+| `inference_objective_request_ttft_slo_violation_total` | Counter of TTFT SLO violations for each model and target model. |
+| `inference_objective_request_tpot_slo_violation` | Boolean indicator (0 or 1) of whether the last TPOT measurement violated the SLO threshold for each model and target model. |
+| `inference_objective_request_tpot_slo_violation_total` | Counter of TPOT SLO violations for each model and target model. |
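+
+To sanity-check predictor accuracy from these metrics, you can compare the actual and predicted TTFT series. The following is a minimal Go sketch, not shipped tooling: it scrapes an EPP metrics endpoint and derives the mean actual vs. predicted TTFT from the standard histogram `_sum`/`_count` samples. The `localhost:9090` address is a placeholder for wherever your EPP metrics are exposed (for example via `kubectl port-forward`); in practice you would typically graph these series in Prometheus or Grafana instead.
+
+```go
+// Illustrative only: compare mean actual vs. predicted TTFT from the
+// exposition-format metrics listed above. Assumes at least one request
+// has been served so the _count series are non-zero.
+package main
+
+import (
+    "bufio"
+    "fmt"
+    "net/http"
+    "strconv"
+    "strings"
+)
+
+// sumSamples adds up the values of every sample whose metric name starts with
+// the given prefix; labels are ignored, so values aggregate across models.
+func sumSamples(lines []string, prefix string) float64 {
+    var total float64
+    for _, l := range lines {
+        if !strings.HasPrefix(l, prefix) {
+            continue
+        }
+        fields := strings.Fields(l)
+        if v, err := strconv.ParseFloat(fields[len(fields)-1], 64); err == nil {
+            total += v
+        }
+    }
+    return total
+}
+
+func main() {
+    // Placeholder address; point this at your EPP metrics endpoint.
+    resp, err := http.Get("http://localhost:9090/metrics")
+    if err != nil {
+        panic(err)
+    }
+    defer resp.Body.Close()
+
+    var lines []string
+    sc := bufio.NewScanner(resp.Body)
+    for sc.Scan() {
+        lines = append(lines, sc.Text())
+    }
+
+    actual := sumSamples(lines, "inference_objective_request_ttft_seconds_sum") /
+        sumSamples(lines, "inference_objective_request_ttft_seconds_count")
+    predicted := sumSamples(lines, "inference_objective_request_predicted_ttft_seconds_sum") /
+        sumSamples(lines, "inference_objective_request_predicted_ttft_seconds_count")
+    fmt.Printf("mean TTFT: actual=%.3fs predicted=%.3fs\n", actual, predicted)
+}
+```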