Skip to content

Commit 5c00a5f

Browse files
committed
chore: Ensure EPP flags are configurable via Helm chart
1 parent 3dd33b7 commit 5c00a5f

File tree: 2 files changed (+42 lines, −6 lines)

config/charts/inferencepool/templates/epp-deployment.yaml

Lines changed: 14 additions & 0 deletions

@@ -44,6 +44,20 @@ spec:
         - "--model-server-metrics-path={{ .Values.inferenceExtension.modelServerMetricsPath }}"
         - "--model-server-metrics-scheme={{ .Values.inferenceExtension.modelServerMetricsScheme }}"
         - "--model-server-metrics-https-insecure-skip-verify={{ .Values.inferenceExtension.modelServerMetricsHttpsInsecureSkipVerify }}"
+        - "--model-server-metrics-port={{ .Values.inferenceExtension.modelServerMetricsPort }}"
+        - "--secure-serving={{ .Values.inferenceExtension.secureServing }}"
+        - "--health-checking={{ .Values.inferenceExtension.healthChecking }}"
+        - "--cert-path={{ .Values.inferenceExtension.certPath }}"
+        - "--destination-endpoint-hint-key={{ .Values.inferenceExtension.destinationEndpointHintKey }}"
+        - "--destination-endpoint-hint-metadata-namespace={{ .Values.inferenceExtension.destinationEndpointHintMetadataNamespace }}"
+        - "--fairness-id-header-key={{ .Values.inferenceExtension.fairnessIDHeaderKey }}"
+        - "--total-queued-requests-metric={{ .Values.inferenceExtension.totalQueuedRequestsMetric }}"
+        - "--kv-cache-usage-percentage-metric={{ .Values.inferenceExtension.kvCacheUsagePercentageMetric }}"
+        - "--lora-info-metric={{ .Values.inferenceExtension.loraInfoMetric }}"
+        - "--refresh-metrics-interval={{ .Values.inferenceExtension.refreshMetricsInterval }}"
+        - "--refresh-prometheus-metrics-interval={{ .Values.inferenceExtension.refreshPrometheusMetricsInterval }}"
+        - "--metrics-staleness-threshold={{ .Values.inferenceExtension.metricsStalenessThreshold }}"
+        - "--config-text={{ .Values.inferenceExtension.configText }}"
         {{- if eq (.Values.inferencePool.modelServerType | default "vllm") "triton-tensorrt-llm" }}
         - --total-queued-requests-metric
         - "nv_trt_llm_request_metrics{request_type=waiting}"

(NOTE: context-line indentation reconstructed from typical Helm deployment templates — the scrape dropped leading whitespace; verify against the original file.)

config/charts/inferencepool/values.yaml

Lines changed: 28 additions & 6 deletions

@@ -7,12 +7,34 @@ inferenceExtension:
   pullPolicy: Always
   extProcPort: 9002
   env: []
-  enablePprof: true # Enable pprof handlers for profiling and debugging
+  enablePprof: true # Enable pprof handlers for profiling and debugging
   modelServerMetricsPath: "/metrics"
   modelServerMetricsScheme: "http"
   modelServerMetricsHttpsInsecureSkipVerify: true
-  # This is the plugins configuration file.
+  grpcPort: 9002
+  grpcHealthPort: 9003
+  metricsPort: 9090
+  destinationEndpointHintMetadataNamespace: "envoy.lb"
+  destinationEndpointHintKey: "x-gateway-destination-endpoint"
+  fairnessIDHeaderKey: "x-gateway-inference-fairness-id"
+  poolName: ""
+  poolNamespace: "default"
+  refreshMetricsInterval: "50ms"
+  refreshPrometheusMetricsInterval: "5s"
+  secureServing: true
+  healthChecking: false
+  totalQueuedRequestsMetric: "vllm:num_requests_waiting"
+  kvCacheUsagePercentageMetric: "vllm:gpu_cache_usage_perc"
+  loraInfoMetric: "vllm:lora_requests_info"
+  certPath: ""
+  configFile: ""
+  configText: ""
+  metricsStalenessThreshold: "2s"
+
   pluginsConfigFile: "default-plugins.yaml"
+  logVerbosity: 1
+
+  # This is the plugins configuration file.
   # pluginsCustomConfig:
   #   custom-plugins.yaml: |
   #     apiVersion: inference.networking.x-k8s.io/v1alpha1

(NOTE: the removed and added `enablePprof` lines appear identical in the scrape — presumably a whitespace-only change; confirm against the original diff.)

@@ -34,18 +56,18 @@ inferenceExtension:
   # Example environment variables:
   # env:
   #   KV_CACHE_SCORE_WEIGHT: "1"
-
   # Define additional container ports
+  modelServerMetricsPort: 0
   extraContainerPorts: []
   # Define additional service ports
   extraServicePorts: []

 inferencePool:
   targetPortNumber: 8000
   modelServerType: vllm # vllm, triton-tensorrt-llm
-  # modelServers: # REQUIRED
-  #   matchLabels:
-  #     app: vllm-llama3-8b-instruct
+  modelServers:
+    matchLabels:
+      app: vllm-llama3-8b-instruct

 provider:
   name: none

0 commit comments

Comments (0)