Skip to content

Commit f18cf4c

Browse files
ahg-gnirrozenbaum
authored and committed
Add support for plugin configuration in the InferencePool helm chart (#1168)
* Add support for plugin configuration in the InferencePool helm chart * rename flags
1 parent 73fd266 commit f18cf4c

File tree

4 files changed

+116
-0
lines changed

4 files changed

+116
-0
lines changed

cmd/epp/runner/runner.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,8 @@ func (r *Runner) parsePluginsConfiguration(ctx context.Context) error {
348348
return fmt.Errorf("failed to load the configuration - %w", err)
349349
}
350350

351+
setupLog.Info("Configuration file loaded", "config", config)
352+
351353
r.schedulerConfig, err = loader.LoadSchedulerConfig(config.SchedulingProfiles, handle)
352354
if err != nil {
353355
return fmt.Errorf("failed to create Scheduler configuration - %w", err)
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
apiVersion: v1
2+
kind: ConfigMap
3+
metadata:
4+
name: {{ include "gateway-api-inference-extension.name" . }}
5+
namespace: {{ .Release.Namespace }}
6+
data:
7+
default-plugins.yaml: |
8+
apiVersion: inference.networking.x-k8s.io/v1alpha1
9+
kind: EndpointPickerConfig
10+
plugins:
11+
- type: low-queue-filter
12+
parameters:
13+
threshold: 128
14+
- type: lora-affinity-filter
15+
parameters:
16+
threshold: 0.999
17+
- type: least-queue-filter
18+
- type: least-kv-cache-filter
19+
- type: decision-tree-filter
20+
name: low-latency-filter
21+
parameters:
22+
current:
23+
pluginRef: low-queue-filter
24+
nextOnSuccess:
25+
decisionTree:
26+
current:
27+
pluginRef: lora-affinity-filter
28+
nextOnSuccessOrFailure:
29+
decisionTree:
30+
current:
31+
pluginRef: least-queue-filter
32+
nextOnSuccessOrFailure:
33+
decisionTree:
34+
current:
35+
pluginRef: least-kv-cache-filter
36+
nextOnFailure:
37+
decisionTree:
38+
current:
39+
pluginRef: least-queue-filter
40+
nextOnSuccessOrFailure:
41+
decisionTree:
42+
current:
43+
pluginRef: lora-affinity-filter
44+
nextOnSuccessOrFailure:
45+
decisionTree:
46+
current:
47+
pluginRef: least-kv-cache-filter
48+
- type: random-picker
49+
parameters:
50+
maxNumOfEndpoints: 1
51+
- type: single-profile-handler
52+
schedulingProfiles:
53+
- name: default
54+
plugins:
55+
- pluginRef: low-latency-filter
56+
- pluginRef: random-picker
57+
plugins-v2.yaml: |
58+
apiVersion: inference.networking.x-k8s.io/v1alpha1
59+
kind: EndpointPickerConfig
60+
plugins:
61+
- type: queue-scorer
62+
- type: kv-cache-scorer
63+
- type: prefix-cache-scorer
64+
parameters:
65+
hashBlockSize: 64
66+
maxPrefixBlocksToMatch: 256
67+
lruCapacityPerServer: 31250
68+
- type: max-score-picker
69+
parameters:
70+
maxNumOfEndpoints: 1
71+
- type: single-profile-handler
72+
schedulingProfiles:
73+
- name: default
74+
plugins:
75+
- pluginRef: queue-scorer
76+
weight: 1
77+
- pluginRef: kv-cache-scorer
78+
weight: 1
79+
- pluginRef: prefix-cache-scorer
80+
weight: 1
81+
- pluginRef: max-score-picker
82+
{{- if (hasKey .Values.inferenceExtension "pluginsCustomConfig") }}
83+
{{- .Values.inferenceExtension.pluginsCustomConfig | toYaml | nindent 2 }}
84+
{{- end }}
85+

config/charts/inferencepool/templates/epp-deployment.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ spec:
3535
- "9003"
3636
- -metricsPort
3737
- "9090"
38+
- -configFile
39+
- "config/{{ .Values.inferenceExtension.pluginsConfigFile }}"
3840
# Bool flags must use the -flag=value form; the space-separated "-flag value" form is only valid for non-bool flags (see https://pkg.go.dev/flag#hdr-Command_line_flag_syntax)
3941
- "-enablePprof={{ .Values.inferenceExtension.enablePprof }}"
4042
{{- if eq (.Values.inferencePool.modelServerType | default "vllm") "triton-tensorrt-llm" }}
@@ -69,3 +71,10 @@ spec:
6971
- name: {{ $key }}
7072
value: {{ $value | quote }}
7173
{{- end }}
74+
volumeMounts:
75+
- name: plugins-config-volume
76+
mountPath: "/config"
77+
volumes:
78+
- name: plugins-config-volume
79+
configMap:
80+
name: {{ include "gateway-api-inference-extension.name" . }}

config/charts/inferencepool/values.yaml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,26 @@ inferenceExtension:
88
extProcPort: 9002
99
env: {}
1010
enablePprof: true # Enable pprof handlers for profiling and debugging
11+
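# Name of the plugins configuration file to load. It must be a key of the plugins ConfigMap (e.g. default-plugins.yaml, plugins-v2.yaml, or a key added via pluginsCustomConfig below).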
12+
pluginsConfigFile: "default-plugins.yaml"
13+
# pluginsCustomConfig:
14+
# custom-plugins.yaml: |
15+
# apiVersion: inference.networking.x-k8s.io/v1alpha1
16+
# kind: EndpointPickerConfig
17+
# plugins:
18+
# - type: custom-scorer
19+
# parameters:
20+
# custom-threshold: 64
21+
# - type: max-score-picker
22+
# - type: single-profile-handler
23+
# schedulingProfiles:
24+
# - name: default
25+
# plugins:
26+
# - pluginRef: custom-scorer
27+
# weight: 1
28+
# - pluginRef: max-score-picker
29+
# weight: 1
30+
1131
# Example environment variables:
1232
# env:
1333
# KV_CACHE_SCORE_WEIGHT: "1"

0 commit comments

Comments
 (0)