Skip to content

Commit a3b801b

Browse files
committed
fix: Run dcgm-exporter on GKE
1 parent 8925beb commit a3b801b

File tree

5 files changed

+59
-17
lines changed

5 files changed

+59
-17
lines changed

charts/kvisor/templates/_helpers.tpl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,13 @@ Only used as a fallback when controller.extraArgs.cloud-provider is not set.
345345
{{- dig "castai" "provider" "" (.Values.global | default dict) -}}
346346
{{- end }}
347347

348+
{{/*
349+
Returns the string "true" if GPU metrics collection (agent.gpu.enabled) is enabled, and an empty string otherwise. `include` always yields a string and any non-empty string (even "false") is truthy in `if`, so the disabled case must render nothing for `if (include "kvisor.gpuEnabled" .)` to work.
350+
*/}}
351+
{{- define "kvisor.gpuEnabled" -}}
352+
{{- if (dig "gpu" "enabled" false .Values.agent) -}}true{{- end -}}
353+
{{- end }}
354+
348355
{{/*
349356
Resolve CASTAI_API_GRPC_ADDR: global.castai.grpcURL > .Values.castai.grpcAddr
350357
*/}}

charts/kvisor/templates/agent.yaml

Lines changed: 48 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ spec:
5656
securityContext:
5757
{{- toYaml .Values.agent.securityContext | nindent 8 }}
5858
priorityClassName: {{ .Values.agent.priorityClass }}
59-
{{- if (dig "gpu" "enabled" false .Values.agent) }}
59+
{{- if (include "kvisor.gpuEnabled" .) }}
6060
initContainers:
6161
- name: gpu-feature-discovery
6262
image: "{{ .Values.image.repository }}-agent:{{ .Values.image.tag | default .Chart.AppVersion }}"
@@ -108,7 +108,7 @@ spec:
108108
{{- range $key, $value := .Values.agent.extraArgs }}
109109
- "--{{ $key }}={{ $value }}"
110110
{{- end }}
111-
{{- if (dig "gpu" "enabled" false .Values.agent) }}
111+
{{- if (include "kvisor.gpuEnabled" .) }}
112112
- "--gpu-enabled=true"
113113
- "--gpu-export-interval={{ .Values.agent.gpu.exportInterval }}"
114114
- "--gpu-dcgm-port={{ .Values.agent.gpu.dcgmExporter.port }}"
@@ -197,7 +197,7 @@ spec:
197197
{{- with .Values.agent.extraVolumeMounts }}
198198
{{- toYaml . | nindent 12 }}
199199
{{- end }}
200-
{{- if (dig "gpu" "enabled" false .Values.agent) }}
200+
{{- if (include "kvisor.gpuEnabled" .) }}
201201
- name: gpu-shared
202202
mountPath: {{ .Values.agent.gpu.sharedDir }}
203203
{{- end }}
@@ -307,11 +307,13 @@ spec:
307307
readOnly: true
308308
{{- end }}
309309
{{- end }}
310-
{{- if (dig "gpu" "enabled" false .Values.agent) }}
310+
{{- if (include "kvisor.gpuEnabled" .) }}
311+
{{- $provider := include "kvisor.cloudProvider" . }}
311312
- name: dcgm-exporter
312313
image: "{{ .Values.agent.gpu.dcgmExporter.image.repository }}:{{ .Values.agent.gpu.dcgmExporter.image.tag }}"
313314
imagePullPolicy: {{ .Values.agent.gpu.dcgmExporter.image.pullPolicy }}
314315
securityContext:
316+
privileged: {{ eq $provider "gke" }}
315317
runAsNonRoot: false
316318
runAsUser: 0
317319
allowPrivilegeEscalation: true
@@ -331,7 +333,32 @@ spec:
331333
echo "Existing DCGM exporter found on node. Standing by."
332334
while true; do sleep 3600; done
333335
fi
336+
{{- if eq $provider "gke" }}
337+
# Wait for GKE NVIDIA driver installation (cos-nvidia-installer runs asynchronously).
338+
# 30s timeout handles CPU-only nodes where drivers will never appear.
339+
elapsed=0
340+
while [ $elapsed -lt 30 ]; do
341+
if ls "/usr/local/nvidia/lib64/libnvidia-ml.so.1" &>/dev/null; then
342+
echo "nvml drivers found"
343+
break
344+
fi
345+
echo "waiting for nvml drivers ($elapsed/30s)"
346+
sleep 1
347+
elapsed=$((elapsed + 1))
348+
done
349+
if ! ls "/usr/local/nvidia/lib64/libnvidia-ml.so.1" &>/dev/null; then
350+
echo "nvml drivers not found after 30s (CPU-only node). Standing by."
351+
while true; do sleep 3600; done
352+
fi
334353
hostname $NODE_NAME; exec dcgm-exporter -f /etc/dcgm-exporter/counters.csv
354+
{{- else }}
355+
hostname $NODE_NAME
356+
for ((;;)); do
357+
dcgm-exporter -f /etc/dcgm-exporter/counters.csv
358+
echo "dcgm-exporter exited (no GPU or driver not ready). Retrying in 60s."
359+
sleep 60
360+
done
361+
{{- end }}
335362
ports:
336363
- name: dcgm-metrics
337364
containerPort: {{ .Values.agent.gpu.dcgmExporter.port }}
@@ -349,6 +376,10 @@ spec:
349376
valueFrom:
350377
fieldRef:
351378
fieldPath: spec.nodeName
379+
{{- if eq $provider "gke" }}
380+
- name: DISABLE_STARTUP_VALIDATE
381+
value: "true"
382+
{{- end }}
352383
volumeMounts:
353384
- name: gpu-shared
354385
mountPath: {{ .Values.agent.gpu.sharedDir }}
@@ -358,6 +389,11 @@ spec:
358389
- name: dcgm-config
359390
mountPath: /etc/dcgm-exporter
360391
readOnly: true
392+
{{- if eq $provider "gke" }}
393+
- name: nvidia-install-dir-host
394+
mountPath: /usr/local/nvidia
395+
readOnly: true
396+
{{- end }}
361397
{{- end }}
362398
dnsPolicy: {{.Values.agent.dnsPolicy}}
363399
{{- with .Values.agent.nodeSelector }}
@@ -412,7 +448,7 @@ spec:
412448
fieldRef:
413449
fieldPath: metadata.namespace
414450
{{- end }}
415-
{{- if (dig "gpu" "enabled" false .Values.agent) }}
451+
{{- if (include "kvisor.gpuEnabled" .) }}
416452
- name: gpu-shared
417453
emptyDir: {}
418454
- name: pod-gpu-resources
@@ -421,6 +457,12 @@ spec:
421457
- name: dcgm-config
422458
configMap:
423459
name: {{ include "kvisor.agent.fullname" . }}-dcgm-config
460+
{{- if eq (include "kvisor.cloudProvider" .) "gke" }}
461+
- name: nvidia-install-dir-host
462+
hostPath:
463+
path: /home/kubernetes/bin/nvidia
464+
type: DirectoryOrCreate
465+
{{- end }}
424466
- name: gpu-sa-token
425467
projected:
426468
sources:
@@ -473,7 +515,7 @@ rules:
473515
- get
474516
- list
475517
- watch
476-
{{- if (dig "gpu" "enabled" false .Values.agent) }}
518+
{{- if (include "kvisor.gpuEnabled" .) }}
477519
- apiGroups:
478520
- ""
479521
resources:

charts/kvisor/templates/controller.yaml

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -403,14 +403,6 @@ rules:
403403
- get
404404
- list
405405
- watch
406-
- apiGroups:
407-
- "argoproj.io"
408-
resources:
409-
- rollouts
410-
verbs:
411-
- get
412-
- list
413-
- watch
414406
- apiGroups:
415407
- "networking.k8s.io"
416408
resources:

charts/kvisor/templates/dcgm-configmap.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
{{- if (dig "gpu" "enabled" false .Values.agent) }}
1+
{{- if (include "kvisor.gpuEnabled" .) }}
22
apiVersion: v1
33
kind: ConfigMap
44
metadata:

charts/kvisor/values.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -189,9 +189,10 @@ agent:
189189
port: 9400
190190
resources:
191191
requests:
192-
memory: 64Mi
192+
cpu: 100m
193+
memory: 128Mi
193194
limits:
194-
memory: 256Mi
195+
memory: 500Mi
195196

196197
# Reliability metrics collection via OBI (OpenTelemetry eBPF Instrumentation).
197198
# Deploys OBI as a sidecar container in the kvisor agent DaemonSet to collect

0 commit comments

Comments
 (0)