5656 securityContext :
5757 {{- toYaml .Values.agent.securityContext | nindent 8 }}
5858 priorityClassName : {{ .Values.agent.priorityClass }}
59- {{- if (dig "gpu" "enabled" false .Values.agent ) }}
59+ {{- if (include "kvisor.gpuEnabled" . ) }}
6060 initContainers :
6161 - name : gpu-feature-discovery
6262 image : " {{ .Values.image.repository }}-agent:{{ .Values.image.tag | default .Chart.AppVersion }}"
@@ -108,7 +108,7 @@ spec:
# Emit one "--<key>=<value>" argument for every entry in .Values.agent.extraArgs.
108108 {{- range $key, $value := .Values.agent.extraArgs }}
109109 - " --{{ $key }}={{ $value }}"
110110 {{- end }}
111- {{- if (dig "gpu" "enabled" false .Values.agent ) }}
111+ {{- if (include "kvisor.gpuEnabled" . ) }}
112112 - " --gpu-enabled=true"
113113 - " --gpu-export-interval={{ .Values.agent.gpu.exportInterval }}"
114114 - " --gpu-dcgm-port={{ .Values.agent.gpu.dcgmExporter.port }}"
@@ -197,7 +197,7 @@ spec:
# Append .Values.agent.extraVolumeMounts, when set, rendered as YAML and re-indented by 12 spaces.
197197 {{- with .Values.agent.extraVolumeMounts }}
198198 {{- toYaml . | nindent 12 }}
199199 {{- end }}
# Mount the gpu-shared volume at .Values.agent.gpu.sharedDir when GPU support is on.
# The condition moves from reading .Values.agent.gpu.enabled directly to the kvisor.gpuEnabled helper.
# NOTE(review): include returns a string, so this test is true for any non-empty output
# (including the text "false") - confirm the helper renders nothing when GPU is disabled.
200- {{- if (dig "gpu" "enabled" false .Values.agent ) }}
200+ {{- if (include "kvisor.gpuEnabled" . ) }}
201201 - name : gpu-shared
202202 mountPath : {{ .Values.agent.gpu.sharedDir }}
203203 {{- end }}
@@ -307,11 +307,13 @@ spec:
307307 readOnly : true
308308 {{- end }}
309309 {{- end }}
310- {{- if (dig "gpu" "enabled" false .Values.agent) }}
310+ {{- if (include "kvisor.gpuEnabled" .) }}
311+ {{- $provider := include "kvisor.cloudProvider" . }}
311312 - name : dcgm-exporter
312313 image : " {{ .Values.agent.gpu.dcgmExporter.image.repository }}:{{ .Values.agent.gpu.dcgmExporter.image.tag }}"
313314 imagePullPolicy : {{ .Values.agent.gpu.dcgmExporter.image.pullPolicy }}
314315 securityContext :
316+ privileged : {{ eq $provider "gke" }}
315317 runAsNonRoot : false
316318 runAsUser : 0
317319 allowPrivilegeEscalation : true
@@ -331,7 +333,32 @@ spec:
331333 echo "Existing DCGM exporter found on node. Standing by."
332334 while true; do sleep 3600; done
333335 fi
# Final stage of the dcgm-exporter start script; which branch is rendered depends on $provider.
# gke: poll once per second, at most 30 times, for /usr/local/nvidia/lib64/libnvidia-ml.so.1,
# then exec dcgm-exporter; if the library never shows up, idle in a sleep loop instead of exiting.
# NOTE(review): "&>" and "for ((;;))" are bash syntax. Under a POSIX sh, "ls ... &>/dev/null"
# backgrounds ls and the test always succeeds - confirm this script is run by bash.
336+ {{- if eq $provider "gke" }}
337+ # Wait for GKE NVIDIA driver installation (cos-nvidia-installer runs asynchronously).
338+ # 30s timeout handles CPU-only nodes where drivers will never appear.
339+ elapsed=0
340+ while [ $elapsed -lt 30 ]; do
341+ if ls "/usr/local/nvidia/lib64/libnvidia-ml.so.1" &>/dev/null; then
342+ echo "nvml drivers found"
343+ break
344+ fi
345+ echo "waiting for nvml drivers ($elapsed/30s)"
346+ sleep 1
347+ elapsed=$((elapsed + 1))
348+ done
# Re-check after the loop to tell "library found" (break) apart from "timed out".
349+ if ! ls "/usr/local/nvidia/lib64/libnvidia-ml.so.1" &>/dev/null; then
350+ echo "nvml drivers not found after 30s (CPU-only node). Standing by."
351+ while true; do sleep 3600; done
352+ fi
334353 hostname $NODE_NAME; exec dcgm-exporter -f /etc/dcgm-exporter/counters.csv
354+ {{- else }}
# Other providers: run dcgm-exporter in the foreground and start it again 60s after every exit.
355+ hostname $NODE_NAME
356+ for ((;;)); do
357+ dcgm-exporter -f /etc/dcgm-exporter/counters.csv
358+ echo "dcgm-exporter exited (no GPU or driver not ready). Retrying in 60s."
359+ sleep 60
360+ done
361+ {{- end }}
335362 ports :
336363 - name : dcgm-metrics
337364 containerPort : {{ .Values.agent.gpu.dcgmExporter.port }}
@@ -349,6 +376,10 @@ spec:
349376 valueFrom :
350377 fieldRef :
351378 fieldPath : spec.nodeName
# Set DISABLE_STARTUP_VALIDATE in the container environment only when $provider is "gke".
# NOTE(review): the consumer of this variable is not visible here - presumably the
# dcgm-exporter image's entrypoint; confirm the expected value.
379+ {{- if eq $provider "gke" }}
380+ - name : DISABLE_STARTUP_VALIDATE
381+ value : " true"
382+ {{- end }}
352383 volumeMounts :
353384 - name : gpu-shared
354385 mountPath : {{ .Values.agent.gpu.sharedDir }}
@@ -358,6 +389,11 @@ spec:
358389 - name : dcgm-config
359390 mountPath : /etc/dcgm-exporter
360391 readOnly : true
# On gke only: mount the nvidia-install-dir-host volume read-only at /usr/local/nvidia,
# the directory the gke start script polls for lib64/libnvidia-ml.so.1.
392+ {{- if eq $provider "gke" }}
393+ - name : nvidia-install-dir-host
394+ mountPath : /usr/local/nvidia
395+ readOnly : true
396+ {{- end }}
361397 {{- end }}
362398 dnsPolicy : {{.Values.agent.dnsPolicy}}
363399 {{- with .Values.agent.nodeSelector }}
@@ -412,7 +448,7 @@ spec:
412448 fieldRef :
413449 fieldPath : metadata.namespace
414450 {{- end }}
415- {{- if (dig "gpu" "enabled" false .Values.agent ) }}
451+ {{- if (include "kvisor.gpuEnabled" . ) }}
416452 - name : gpu-shared
417453 emptyDir : {}
418454 - name : pod-gpu-resources
@@ -421,6 +457,12 @@ spec:
421457 - name : dcgm-config
422458 configMap :
423459 name : {{ include "kvisor.agent.fullname" . }}-dcgm-config
# On gke only: declare the nvidia-install-dir-host volume backed by the node path
# /home/kubernetes/bin/nvidia. With type DirectoryOrCreate an empty directory is created
# on the node if the path does not exist yet.
# The provider is looked up again here rather than through a $provider variable.
460+ {{- if eq (include "kvisor.cloudProvider" .) "gke" }}
461+ - name : nvidia-install-dir-host
462+ hostPath :
463+ path : /home/kubernetes/bin/nvidia
464+ type : DirectoryOrCreate
465+ {{- end }}
424466 - name : gpu-sa-token
425467 projected :
426468 sources :
@@ -473,7 +515,7 @@ rules:
473515 - get
474516 - list
475517 - watch
476- {{- if (dig "gpu" "enabled" false .Values.agent ) }}
518+ {{- if (include "kvisor.gpuEnabled" . ) }}
477519 - apiGroups :
478520 - " "
479521 resources :
0 commit comments