Add support for plugin configuration in the InferencePool helm chart (#1168)

ahg-g · nirrozenbaum · commit f18cf4cba5d2 · 2025-07-16T18:14:42.000+03:00
* Add support for plugin configuration in the InferencePool helm chart

* rename flags
diff --git a/cmd/epp/runner/runner.go b/cmd/epp/runner/runner.go
@@ -348,6 +348,8 @@ func (r *Runner) parsePluginsConfiguration(ctx context.Context) error {
 		return fmt.Errorf("failed to load the configuration - %w", err)
 	}
 
+	setupLog.Info("Configuration file loaded", "config", config)
+
 	r.schedulerConfig, err = loader.LoadSchedulerConfig(config.SchedulingProfiles, handle)
 	if err != nil {
 		return fmt.Errorf("failed to create Scheduler configuration - %w", err)
diff --git a/config/charts/inferencepool/templates/epp-config.yaml b/config/charts/inferencepool/templates/epp-config.yaml
@@ -0,0 +1,85 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "gateway-api-inference-extension.name" . }}
+  namespace: {{ .Release.Namespace }}
+data:
+  default-plugins.yaml: |
+    apiVersion: inference.networking.x-k8s.io/v1alpha1
+    kind: EndpointPickerConfig
+    plugins:
+    - type: low-queue-filter
+      parameters:
+        threshold: 128
+    - type: lora-affinity-filter
+      parameters:
+        threshold: 0.999
+    - type: least-queue-filter
+    - type: least-kv-cache-filter
+    - type: decision-tree-filter
+      name: low-latency-filter
+      parameters:
+        current:
+          pluginRef: low-queue-filter
+        nextOnSuccess:
+          decisionTree:
+            current:
+              pluginRef: lora-affinity-filter
+            nextOnSuccessOrFailure:
+              decisionTree:
+                current:
+                  pluginRef: least-queue-filter
+                nextOnSuccessOrFailure:
+                  decisionTree:
+                    current:
+                      pluginRef: least-kv-cache-filter
+        nextOnFailure:
+          decisionTree:
+            current:
+              pluginRef: least-queue-filter
+            nextOnSuccessOrFailure:
+              decisionTree:
+                current:
+                  pluginRef: lora-affinity-filter
+                nextOnSuccessOrFailure:
+                  decisionTree:
+                    current:
+                      pluginRef: least-kv-cache-filter
+    - type: random-picker
+      parameters:
+        maxNumOfEndpoints: 1
+    - type: single-profile-handler
+    schedulingProfiles:
+    - name: default
+      plugins:
+      - pluginRef: low-latency-filter
+      - pluginRef: random-picker
+  plugins-v2.yaml: |
+    apiVersion: inference.networking.x-k8s.io/v1alpha1
+    kind: EndpointPickerConfig
+    plugins:
+    - type: queue-scorer
+    - type: kv-cache-scorer
+    - type: prefix-cache-scorer
+      parameters:
+        hashBlockSize: 64
+        maxPrefixBlocksToMatch: 256
+        lruCapacityPerServer: 31250
+    - type: max-score-picker
+      parameters:
+        maxNumOfEndpoints: 1
+    - type: single-profile-handler
+    schedulingProfiles:
+    - name: default
+      plugins:
+      - pluginRef: queue-scorer
+        weight: 1
+      - pluginRef: kv-cache-scorer
+        weight: 1
+      - pluginRef: prefix-cache-scorer
+        weight: 1
+      - pluginRef: max-score-picker
+  {{- if (hasKey .Values.inferenceExtension "pluginsCustomConfig") }}
+  {{- .Values.inferenceExtension.pluginsCustomConfig | toYaml | nindent 2 }}
+  {{- end }}
+  
diff --git a/config/charts/inferencepool/templates/epp-deployment.yaml b/config/charts/inferencepool/templates/epp-deployment.yaml
@@ -35,6 +35,8 @@ spec:
         - "9003"
         - -metricsPort
         - "9090"
+        - -configFile
+        - "config/{{ .Values.inferenceExtension.pluginsConfigFile }}"
         # https://pkg.go.dev/flag#hdr-Command_line_flag_syntax; space is only for non-bool flags
         - "-enablePprof={{ .Values.inferenceExtension.enablePprof }}"
         {{- if eq (.Values.inferencePool.modelServerType | default "vllm") "triton-tensorrt-llm" }}
@@ -69,3 +71,10 @@ spec:
         - name: {{ $key }}
           value: {{ $value | quote }}
         {{- end }}
+        volumeMounts:
+        - name: plugins-config-volume
+          mountPath: "/config"
+      volumes:
+      - name: plugins-config-volume
+        configMap:
+          name: {{ include "gateway-api-inference-extension.name" . }}
diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml
@@ -8,6 +8,26 @@ inferenceExtension:
   extProcPort: 9002
   env: {}
   enablePprof: true # Enable pprof handlers for profiling and debugging
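+  # Name of the plugins configuration file to load; it must match a key in the EPP ConfigMap (default-plugins.yaml, plugins-v2.yaml, or a key defined under pluginsCustomConfig).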
+  pluginsConfigFile: "default-plugins.yaml"
+  # pluginsCustomConfig:
+  #   custom-plugins.yaml: |
+  #     apiVersion: inference.networking.x-k8s.io/v1alpha1
+  #     kind: EndpointPickerConfig
+  #     plugins:
+  #     - type: custom-scorer
+  #       parameters:
+  #         custom-threshold: 64
+  #     - type: max-score-picker
+  #     - type: single-profile-handler
+  #     schedulingProfiles:
+  #     - name: default
+  #       plugins:
+  #       - pluginRef: custom-scorer
+  #         weight: 1
+  #       - pluginRef: max-score-picker
+  #         weight: 1
+
   # Example environment variables:
   # env:
   #   KV_CACHE_SCORE_WEIGHT: "1"

Original file line number	Diff line number	Diff line change
`@@ -348,6 +348,8 @@ func (r *Runner) parsePluginsConfiguration(ctx context.Context) error {`
`348`	`348`	`return fmt.Errorf("failed to load the configuration - %w", err)`
`349`	`349`	`}`
`350`	`350`
	`351`	`+ setupLog.Info("Configuration file loaded", "config", config)`
	`352`	`+`
`351`	`353`	`r.schedulerConfig, err = loader.LoadSchedulerConfig(config.SchedulingProfiles, handle)`
`352`	`354`	`if err != nil {`
`353`	`355`	`return fmt.Errorf("failed to create Scheduler configuration - %w", err)`