Skip to content

Commit 6d0cad6

Browse files
config: read main config from file directly (#713)
* read main config from file directly
  Signed-off-by: Lionel Villard <villard@us.ibm.com>
* address copilot review
  Signed-off-by: Lionel Villard <villard@us.ibm.com>
* removed unused constant
  Signed-off-by: Lionel Villard <villard@us.ibm.com>
---------
Signed-off-by: Lionel Villard <villard@us.ibm.com>
1 parent c1ba964 commit 6d0cad6

27 files changed

+991
-1894
lines changed

charts/workload-variant-autoscaler/templates/manager/wva-configmap.yaml

Lines changed: 39 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -10,106 +10,45 @@ metadata:
1010
immutable: true
1111
{{- end }}
1212
data:
13-
# ============================================================================
14-
# UNIFIED CONFIGURATION SYSTEM
15-
# ============================================================================
16-
# This ConfigMap contains both static (immutable) and dynamic (mutable) settings.
13+
# The main configuration is stored as a single YAML file (config.yaml) that is
14+
# volume-mounted into the controller container at /etc/wva/config.yaml.
1715
#
18-
# IMMUTABLE PARAMETERS (require controller restart to change):
19-
# - PROMETHEUS_BASE_URL: Prometheus connection endpoint
20-
# - TLS certificate paths (security-sensitive)
21-
# - Metrics/Probe bind addresses (infrastructure)
22-
# - Leader election ID (coordination)
16+
# Precedence: CLI flags > environment variables > config file > defaults
2317
#
24-
# MUTABLE PARAMETERS (can be changed at runtime via ConfigMap updates):
25-
# - GLOBAL_OPT_INTERVAL: Optimization interval
26-
# - WVA_SCALE_TO_ZERO: Feature flag (can be changed, but may require restart for full effect)
27-
# - Prometheus cache settings
28-
#
29-
# Attempts to change immutable parameters at runtime will be rejected and emit
30-
# Warning events. See documentation for details.
31-
# ============================================================================
32-
33-
# ----------------------------------------------------------------------------
34-
# IMMUTABLE: Prometheus Configuration (requires restart to change)
35-
# ----------------------------------------------------------------------------
36-
# REQUIRED: Set your Prometheus server URL
37-
# Examples:
38-
# - General: "https://prometheus:9090"
39-
# - OpenShift: "https://thanos-querier.openshift-monitoring.svc.cluster.local:9091"
40-
# - KIND cluster: "https://kube-prometheus-stack-prometheus.workload-variant-autoscaler-monitoring.svc.cluster.local:9090"
41-
#PROMETHEUS_BASE_URL: "https://kube-prometheus-stack-prometheus.workload-variant-autoscaler-monitoring.svc.cluster.local:9090"
42-
PROMETHEUS_BASE_URL: {{ .Values.wva.prometheus.baseURL | quote }}
43-
44-
# TLS Configuration (TLS is always enabled for HTTPS-only support)
45-
# PROMETHEUS_TLS_INSECURE_SKIP_VERIFY: "true" # Skip certificate verification (development/testing only)
46-
PROMETHEUS_CA_CERT_PATH: {{ .Values.wva.prometheus.tls.caCertPath | default "/etc/ssl/certs/prometheus-ca.crt" | quote }} # CA certificate for server validation
47-
# PROMETHEUS_CLIENT_CERT_PATH: "/path/to/client.crt" # Client certificate for mutual TLS
48-
# PROMETHEUS_CLIENT_KEY_PATH: "/path/to/client.key" # Client private key for mutual TLS
49-
# PROMETHEUS_SERVER_NAME: "prometheus.example.com" # Expected server name for SNI
50-
PROMETHEUS_TLS_INSECURE_SKIP_VERIFY: {{ if and .Values.wva.prometheus.tls (hasKey .Values.wva.prometheus.tls "insecureSkipVerify") }}{{ .Values.wva.prometheus.tls.insecureSkipVerify | quote }}{{ else }}"true"{{ end }}
51-
52-
# Authentication Configuration (BearerToken takes precedence over TokenPath)
53-
# PROMETHEUS_BEARER_TOKEN: "your-token-here" # Direct bearer token (development/testing)
54-
# PROMETHEUS_TOKEN_PATH: "/path/to/token/file" # Path to bearer token file (production with mounted secrets)
55-
56-
# ----------------------------------------------------------------------------
57-
# MUTABLE: EPP Integration Configuration (runtime-updatable)
58-
# ----------------------------------------------------------------------------
59-
# EPP metric reader bearer token for pod scraping
60-
EPP_METRIC_READER_BEARER_TOKEN: ""
61-
62-
# ----------------------------------------------------------------------------
63-
# MUTABLE: Optimization Configuration (runtime-updatable)
64-
# ----------------------------------------------------------------------------
65-
# Global optimization interval - how often the controller runs optimization cycles
66-
# Can be changed at runtime via ConfigMap updates (no restart required)
67-
GLOBAL_OPT_INTERVAL: {{ .Values.wva.reconcileInterval | quote }}
68-
69-
# ----------------------------------------------------------------------------
70-
# MUTABLE: Feature Flags (runtime-updatable, but may require restart for full effect)
71-
# ----------------------------------------------------------------------------
72-
# Option to scale variants to zero replicas (default: false)
73-
# Note: While this can be changed at runtime, some features may require restart
74-
WVA_SCALE_TO_ZERO: {{ .Values.wva.scaleToZero | default "false" | quote }}
75-
76-
# ----------------------------------------------------------------------------
77-
# MUTABLE: Prometheus Metrics Cache Configuration (runtime-updatable)
78-
# ----------------------------------------------------------------------------
79-
# Each collector (Prometheus, EPP, etc.) has its own cache configuration
80-
# Enable/disable Prometheus metrics caching (default: "true") - this is for debugging purposes, can be removed in the future
81-
PROMETHEUS_METRICS_CACHE_ENABLED: {{ if and .Values.wva.prometheus .Values.wva.prometheus.metricsCache }}{{ .Values.wva.prometheus.metricsCache.enabled | default "true" | quote }}{{ else }}"true"{{ end }}
82-
# Prometheus cache TTL - how long metrics are cached before expiring (default: "30s")
83-
PROMETHEUS_METRICS_CACHE_TTL: {{ if and .Values.wva.prometheus .Values.wva.prometheus.metricsCache }}{{ .Values.wva.prometheus.metricsCache.ttl | default "30s" | quote }}{{ else }}"30s"{{ end }}
84-
# Interval for background cleanup of expired Prometheus cache entries (default: "1m")
85-
PROMETHEUS_METRICS_CACHE_CLEANUP_INTERVAL: {{ if and .Values.wva.prometheus .Values.wva.prometheus.metricsCache }}{{ .Values.wva.prometheus.metricsCache.cleanupInterval | default "1m" | quote }}{{ else }}"1m"{{ end }}
86-
# Background fetch interval - how often to fetch metrics in background (default: "30s", 0 = disable)
87-
PROMETHEUS_METRICS_CACHE_FETCH_INTERVAL: {{ if and .Values.wva.prometheus .Values.wva.prometheus.metricsCache }}{{ .Values.wva.prometheus.metricsCache.fetchInterval | default "30s" | quote }}{{ else }}"30s"{{ end }}
88-
# Freshness thresholds - when metrics are considered fresh/stale/unavailable
89-
PROMETHEUS_METRICS_CACHE_FRESH_THRESHOLD: {{ if and .Values.wva.prometheus .Values.wva.prometheus.metricsCache }}{{ .Values.wva.prometheus.metricsCache.freshThreshold | default "1m" | quote }}{{ else }}"1m"{{ end }}
90-
PROMETHEUS_METRICS_CACHE_STALE_THRESHOLD: {{ if and .Values.wva.prometheus .Values.wva.prometheus.metricsCache }}{{ .Values.wva.prometheus.metricsCache.staleThreshold | default "2m" | quote }}{{ else }}"2m"{{ end }}
91-
PROMETHEUS_METRICS_CACHE_UNAVAILABLE_THRESHOLD: {{ if and .Values.wva.prometheus .Values.wva.prometheus.metricsCache }}{{ .Values.wva.prometheus.metricsCache.unavailableThreshold | default "5m" | quote }}{{ else }}"5m"{{ end }}
92-
93-
# EPP metrics cache configuration (for future EPP collector)
94-
# Uncomment and configure when EPP collector is implemented - future implementation
95-
# EPP_METRICS_CACHE_ENABLED: "true"
96-
# EPP_METRICS_CACHE_TTL: "15s"
97-
# EPP_METRICS_CACHE_MAX_SIZE: "500"
98-
# EPP_METRICS_CACHE_CLEANUP_INTERVAL: "30s"
99-
# ============================================================================
100-
# END OF CONFIGURATION
101-
# ============================================================================
102-
# For more information about immutable vs mutable parameters, see:
103-
# https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/docs/user-guide/configuration.md#unified-configuration-system
104-
#
105-
# IMMUTABLE CONFIGMAP:
106-
# If wva.configMap.immutable is set to true, this ConfigMap becomes immutable
107-
# after creation. This provides security benefits:
108-
# - Prevents accidental configuration changes
109-
# - Protects against malicious modifications
110-
# - Ensures configuration integrity
111-
# However, this disables runtime config updates. To change configuration:
112-
# 1. Delete the ConfigMap (kubectl delete configmap <name>)
113-
# 2. Update Helm values and upgrade the release
114-
# 3. Restart the controller pod
18+
# For more information see:
19+
# https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/docs/user-guide/configuration.md
20+
config.yaml: |
21+
# Prometheus Configuration (REQUIRED)
22+
# Base URL for Prometheus API (must use HTTPS).
23+
PROMETHEUS_BASE_URL: {{ .Values.wva.prometheus.baseURL | quote }}
24+
# Filesystem path to the CA certificate used to verify the Prometheus TLS certificate (falls back to the default path when wva.prometheus.tls or its caCertPath is unset).
24+
PROMETHEUS_CA_CERT_PATH: {{ if .Values.wva.prometheus.tls }}{{ .Values.wva.prometheus.tls.caCertPath | default "/etc/ssl/certs/prometheus-ca.crt" | quote }}{{ else }}"/etc/ssl/certs/prometheus-ca.crt"{{ end }}
26+
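# Whether to skip TLS certificate verification when connecting to Prometheus. NOTE: rendered as "true" (verification disabled) when wva.prometheus.tls.insecureSkipVerify is not set; set it to false explicitly for production.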
27+
PROMETHEUS_TLS_INSECURE_SKIP_VERIFY: {{ if and .Values.wva.prometheus.tls (hasKey .Values.wva.prometheus.tls "insecureSkipVerify") }}{{ .Values.wva.prometheus.tls.insecureSkipVerify | quote }}{{ else }}"true"{{ end }}
28+
29+
# EPP Integration
30+
# Bearer token used to authenticate metric reads from EPP.
31+
EPP_METRIC_READER_BEARER_TOKEN: ""
32+
33+
# Optimization
34+
# Global optimization loop interval for autoscaling decisions.
35+
GLOBAL_OPT_INTERVAL: {{ .Values.wva.reconcileInterval | quote }}
36+
37+
# Feature Flags
38+
# Enables scale-to-zero behavior across managed workloads.
39+
WVA_SCALE_TO_ZERO: {{ .Values.wva.scaleToZero | default "false" | quote }}
40+
41+
# Prometheus Metrics Cache
42+
# Time-to-live for cached Prometheus metric responses.
43+
PROMETHEUS_METRICS_CACHE_TTL: {{ if and .Values.wva.prometheus .Values.wva.prometheus.metricsCache }}{{ .Values.wva.prometheus.metricsCache.ttl | default "30s" | quote }}{{ else }}"30s"{{ end }}
44+
# Interval for cleaning up expired entries from the metrics cache.
45+
PROMETHEUS_METRICS_CACHE_CLEANUP_INTERVAL: {{ if and .Values.wva.prometheus .Values.wva.prometheus.metricsCache }}{{ .Values.wva.prometheus.metricsCache.cleanupInterval | default "1m" | quote }}{{ else }}"1m"{{ end }}
46+
# Interval for background refresh of metrics cache entries.
47+
PROMETHEUS_METRICS_CACHE_FETCH_INTERVAL: {{ if and .Values.wva.prometheus .Values.wva.prometheus.metricsCache }}{{ .Values.wva.prometheus.metricsCache.fetchInterval | default "30s" | quote }}{{ else }}"30s"{{ end }}
48+
# Maximum age for metrics to be considered fresh.
49+
PROMETHEUS_METRICS_CACHE_FRESH_THRESHOLD: {{ if and .Values.wva.prometheus .Values.wva.prometheus.metricsCache }}{{ .Values.wva.prometheus.metricsCache.freshThreshold | default "1m" | quote }}{{ else }}"1m"{{ end }}
50+
# Maximum age for metrics to be considered stale (before unavailable).
51+
PROMETHEUS_METRICS_CACHE_STALE_THRESHOLD: {{ if and .Values.wva.prometheus .Values.wva.prometheus.metricsCache }}{{ .Values.wva.prometheus.metricsCache.staleThreshold | default "2m" | quote }}{{ else }}"2m"{{ end }}
52+
# Maximum age for metrics before they are considered unavailable.
53+
PROMETHEUS_METRICS_CACHE_UNAVAILABLE_THRESHOLD: {{ if and .Values.wva.prometheus .Values.wva.prometheus.metricsCache }}{{ .Values.wva.prometheus.metricsCache.unavailableThreshold | default "5m" | quote }}{{ else }}"5m"{{ end }}
11554
{{- end }}

charts/workload-variant-autoscaler/templates/manager/wva-deployment-controller-manager.yaml

Lines changed: 8 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -21,30 +21,7 @@ spec:
2121
control-plane: controller-manager
2222
{{- include "workload-variant-autoscaler.selectorLabels" . | nindent 8 }}
2323
spec:
24-
# TODO(user): Uncomment the following code to configure the nodeAffinity expression
25-
# according to the platforms which are supported by your solution.
26-
# It is considered best practice to support multiple architectures. You can
27-
# build your manager image using the makefile target docker-buildx.
28-
# affinity:
29-
# nodeAffinity:
30-
# requiredDuringSchedulingIgnoredDuringExecution:
31-
# nodeSelectorTerms:
32-
# - matchExpressions:
33-
# - key: kubernetes.io/arch
34-
# operator: In
35-
# values:
36-
# - amd64
37-
# - arm64
38-
# - ppc64le
39-
# - s390x
40-
# - key: kubernetes.io/os
41-
# operator: In
42-
# values:
43-
# - linux
4424
securityContext:
45-
# Projects are configured by default to adhere to the "restricted" Pod Security Standards.
46-
# This ensures that deployments meet the highest security requirements for Kubernetes.
47-
# For more details, see: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted
4825
runAsNonRoot: true
4926
seccompProfile:
5027
type: RuntimeDefault
@@ -54,6 +31,7 @@ spec:
5431
args:
5532
- --leader-elect=true
5633
- --health-probe-bind-address=:8081
34+
- --config-file=/etc/wva/config.yaml
5735
{{- if .Values.wva.namespaceScoped }}
5836
- --watch-namespace=$(POD_NAMESPACE)
5937
{{- end }}
@@ -64,39 +42,14 @@ spec:
6442
image: "{{ .Values.wva.image.repository }}:{{ .Values.wva.image.tag }}"
6543
imagePullPolicy: "{{ .Values.wva.imagePullPolicy }}"
6644
env:
67-
- name: EPP_METRIC_READER_BEARER_TOKEN
68-
valueFrom:
69-
configMapKeyRef:
70-
name: {{ include "workload-variant-autoscaler.fullname" . }}-variantautoscaling-config
71-
key: EPP_METRIC_READER_BEARER_TOKEN
7245
- name: LOG_LEVEL
7346
value: {{ if .Values.wva.logging }}{{ .Values.wva.logging.level | default "info" | quote }}{{ else }}"info"{{ end }}
7447
- name: CONFIG_MAP_NAME
7548
value: {{ include "workload-variant-autoscaler.fullname" . }}-variantautoscaling-config
7649
- name: SATURATION_CONFIG_MAP_NAME
7750
value: {{ include "workload-variant-autoscaler.fullname" . }}-wva-saturation-scaling-config
78-
- name: PROMETHEUS_BASE_URL
79-
valueFrom:
80-
configMapKeyRef:
81-
name: {{ include "workload-variant-autoscaler.fullname" . }}-variantautoscaling-config
82-
key: PROMETHEUS_BASE_URL
83-
- name: PROMETHEUS_TLS_INSECURE_SKIP_VERIFY
84-
valueFrom:
85-
configMapKeyRef:
86-
name: {{ include "workload-variant-autoscaler.fullname" . }}-variantautoscaling-config
87-
key: PROMETHEUS_TLS_INSECURE_SKIP_VERIFY
88-
- name: PROMETHEUS_CA_CERT_PATH
89-
valueFrom:
90-
configMapKeyRef:
91-
name: {{ include "workload-variant-autoscaler.fullname" . }}-variantautoscaling-config
92-
key: PROMETHEUS_CA_CERT_PATH
9351
- name: PROMETHEUS_TOKEN_PATH
9452
value: "/var/run/secrets/kubernetes.io/serviceaccount/token"
95-
- name: WVA_SCALE_TO_ZERO
96-
valueFrom:
97-
configMapKeyRef:
98-
name: {{ include "workload-variant-autoscaler.fullname" . }}-variantautoscaling-config
99-
key: WVA_SCALE_TO_ZERO
10053
- name: WVA_LIMITED_MODE
10154
value: {{ .Values.wva.limitedMode | quote }}
10255
- name: WVA_NODE_SELECTOR
@@ -136,8 +89,6 @@ spec:
13689
port: 8081
13790
initialDelaySeconds: 5
13891
periodSeconds: 10
139-
# TODO(user): Configure the resources accordingly based on the project requirements.
140-
# More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
14192
resources:
14293
limits:
14394
cpu: 500m
@@ -146,11 +97,18 @@ spec:
14697
cpu: 10m
14798
memory: 64Mi
14899
volumeMounts:
100+
- name: wva-config
101+
mountPath: /etc/wva/config.yaml
102+
subPath: config.yaml
103+
readOnly: true
149104
- name: prometheus-ca-cert
150105
mountPath: /etc/ssl/certs/prometheus-ca.crt
151106
subPath: ca.crt
152107
readOnly: true
153108
volumes:
109+
- name: wva-config
110+
configMap:
111+
name: {{ include "workload-variant-autoscaler.fullname" . }}-variantautoscaling-config
154112
- name: prometheus-ca-cert
155113
configMap:
156114
name: {{ include "workload-variant-autoscaler.fullname" . }}-prometheus-ca

0 commit comments

Comments
 (0)