# Copyright Envoy AI Gateway Authors
# SPDX-License-Identifier: Apache-2.0
# The full text of the Apache license is available in the LICENSE file at
# the root of the repo.

# This example demonstrates how to use InferencePool annotations to configure
# the external processor's processing body mode and allow-mode-override settings.

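# Test upstream Service for the pods labeled app=mistral-upstream; the
# InferencePool below selects the same pods.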
apiVersion: v1
kind: Service
metadata:
  name: mistral-upstream
  namespace: default
spec:
  selector:
    app: mistral-upstream
  ports:
    - protocol: TCP
      port: 8080
      targetPort: 8080
  type: ClusterIP
---
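# Test upstream Deployment. The testupstream image is a lightweight stand-in
# for a real model-serving backend, listening on port 8080 with a /health
# readiness endpoint.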
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mistral-upstream
  namespace: default
  labels:
    app: mistral-upstream
spec:
  replicas: 1
  selector:
    matchLabels:
      app: mistral-upstream
  template:
    metadata:
      labels:
        app: mistral-upstream
    spec:
      containers:
        - name: upstream
          image: registry.k8s.io/ai-gateway/testupstream:v0.0.0-latest
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 8080
          env:
            - name: UPSTREAM_PORT
              value: "8080"
          readinessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 1
            periodSeconds: 1
---
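# InferencePool selecting the mistral-upstream pods. The annotations below are
# read by the gateway to configure the external processor (ext_proc) it sets up
# for the endpoint picker referenced in extensionRef.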
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
metadata:
  name: mistral-with-annotations
  namespace: default
  annotations:
    # Configure processing body mode: "duplex" (default) or "buffered"
    # "duplex" corresponds to ProcessingMode_FULL_DUPLEX_STREAMED
    # "buffered" corresponds to ProcessingMode_BUFFERED
    aigateway.envoyproxy.io/processing-body-mode: "buffered"

    # Configure allow mode override: "false" (default) or "true"
    # This corresponds to the AllowModeOverride field in Envoy's ExternalProcessor
    aigateway.envoyproxy.io/allow-mode-override: "true"
spec:
  targetPortNumber: 8080
  selector:
    app: mistral-upstream
  extensionRef:
    name: mistral-epp-with-annotations
---
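# InferenceModel binding the model name "mistral:latest" to the InferencePool
# above.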
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: mistral-with-annotations
  namespace: default
spec:
  modelName: mistral:latest
  criticality: Critical
  poolRef:
    # Bind the InferenceModel to the InferencePool.
    name: mistral-with-annotations
---
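# Service exposing the endpoint picker (EPP) over gRPC on port 9002.
# appProtocol: http2 marks the port as HTTP/2, which gRPC requires.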
apiVersion: v1
kind: Service
metadata:
  name: mistral-epp-with-annotations
  namespace: default
spec:
  selector:
    app: mistral-epp-with-annotations
  ports:
    - protocol: TCP
      port: 9002
      targetPort: 9002
      appProtocol: http2
  type: ClusterIP
---
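# Endpoint picker (EPP) Deployment from the Gateway API Inference Extension.
# It watches InferencePools, InferenceModels, and the upstream pods (hence the
# RBAC rules at the bottom of this file) and picks an endpoint per request.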
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mistral-epp-with-annotations
  namespace: default
  labels:
    app: mistral-epp-with-annotations
spec:
  replicas: 1
  selector:
    matchLabels:
      app: mistral-epp-with-annotations
  template:
    metadata:
      labels:
        app: mistral-epp-with-annotations
    spec:
      # Conservatively, this timeout should mirror the longest grace period of the pods within the pool
      terminationGracePeriodSeconds: 130
      containers:
        - name: epp
          image: registry.k8s.io/gateway-api-inference-extension/epp:v0.5.1
          imagePullPolicy: IfNotPresent
          args:
            - -poolName
            - "mistral-with-annotations"
            - "-poolNamespace"
            - "default"
            - -v
            - "4"
            - --zap-encoder
            - "json"
            - -grpcPort
            - "9002"
            - -grpcHealthPort
            - "9003"
            - "-configFile"
            - "/config/default-plugins.yaml"
          ports:
            - containerPort: 9002
            - containerPort: 9003
            - name: metrics
              containerPort: 9090
          livenessProbe:
            grpc:
              port: 9003
              service: inference-extension
            initialDelaySeconds: 5
            periodSeconds: 10
          readinessProbe:
            grpc:
              port: 9003
              service: inference-extension
            initialDelaySeconds: 5
            periodSeconds: 10
          volumeMounts:
            - name: plugins-config-volume
              mountPath: "/config"
      volumes:
        - name: plugins-config-volume
          configMap:
            name: plugins-config
---
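# Plugin configuration mounted into the EPP container at /config (see the
# -configFile flag above). It chains filters, scorers, and a picker into the
# "default" scheduling profile.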
apiVersion: v1
kind: ConfigMap
metadata:
  name: plugins-config
  namespace: default
data:
  default-plugins.yaml: |
    apiVersion: inference.networking.x-k8s.io/v1alpha1
    kind: EndpointPickerConfig
    plugins:
      - type: low-queue-filter
        parameters:
          threshold: 128
      - type: lora-affinity-filter
        parameters:
          threshold: 0.999
      - type: least-queue-filter
      - type: least-kv-cache-filter
      - type: decision-tree-filter
        name: low-latency-filter
        parameters:
          current:
            pluginRef: low-queue-filter
          nextOnSuccess:
            decisionTree:
              current:
                pluginRef: lora-affinity-filter
              nextOnSuccess:
                pluginRef: least-queue-filter
              nextOnFailure:
                pluginRef: least-kv-cache-filter
          nextOnFailure:
            pluginRef: least-kv-cache-filter
      - type: queue-scorer
        parameters:
          maxQueueSize: 128
      - type: kv-cache-scorer
        parameters:
          maxKVCacheUsage: 0.95
      - type: prefix-cache-scorer
        parameters:
          hashBlockSize: 64
          maxPrefixBlocksToMatch: 256
          lruCapacityPerServer: 31250
      - type: max-score-picker
        parameters:
          maxNumOfEndpoints: 1
      - type: single-profile-handler
    schedulingProfiles:
      - name: default
        plugins:
          - pluginRef: queue-scorer
            weight: 1
          - pluginRef: kv-cache-scorer
            weight: 1
          - pluginRef: prefix-cache-scorer
            weight: 1
          - pluginRef: max-score-picker
---
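# RBAC: grants the default ServiceAccount (used by the EPP pod) read access to
# InferencePools, InferenceModels, and Pods.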
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: pod-read
rules:
  - apiGroups: ["inference.networking.x-k8s.io"]
    resources: ["inferencepools"]
    verbs: ["get", "watch", "list"]
  - apiGroups: ["inference.networking.x-k8s.io"]
    resources: ["inferencemodels"]
    verbs: ["get", "watch", "list"]
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "watch", "list"]
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: pod-read
subjects:
  - kind: ServiceAccount
    name: default
    namespace: default
roleRef:
  kind: ClusterRole
  name: pod-read
  apiGroup: rbac.authorization.k8s.io