Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion deployments/helm/nvidia-device-plugin/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ apiVersion: v2
name: nvidia-device-plugin
type: application
description: A Helm chart for the nvidia-device-plugin on Kubernetes
version: "0.18.0"
version: "0.19.0"
appVersion: "0.18.0"
kubeVersion: ">= 1.10.0-0"
home: https://github.com/NVIDIA/k8s-device-plugin
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,29 +63,29 @@ spec:
command: ["config-manager"]
env:
- name: ONESHOT
value: "true"
value: {{ .Values.configManager.init.oneshot | quote }}
- name: KUBECONFIG
value: ""
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: "spec.nodeName"
- name: NODE_LABEL
value: "nvidia.com/device-plugin.config"
value: {{ .Values.configManager.init.nodeLabel | quote }}
- name: CONFIG_FILE_SRCDIR
value: "/available-configs"
value: {{ .Values.configManager.init.configFileSrcDir | quote }}
- name: CONFIG_FILE_DST
value: "/config/config.yaml"
value: {{ .Values.configManager.init.configFileDst | quote }}
- name: DEFAULT_CONFIG
value: {{ .Values.config.default }}
- name: FALLBACK_STRATEGIES
value: {{ join "," .Values.config.fallbackStrategies }}
- name: SEND_SIGNAL
value: "false"
value: {{ .Values.configManager.init.sendSignal | quote }}
- name: SIGNAL
value: ""
value: {{ .Values.configManager.init.signal | quote }}
- name: PROCESS_TO_SIGNAL
value: ""
value: {{ .Values.configManager.init.processToSignal | quote }}
volumeMounts:
- name: available-configs
mountPath: /available-configs
Expand All @@ -99,27 +99,27 @@ spec:
command: ["config-manager"]
env:
- name: ONESHOT
value: "false"
value: {{ .Values.configManager.sidecar.oneshot | quote }}
- name: KUBECONFIG
value: ""
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: "spec.nodeName"
- name: NODE_LABEL
value: "nvidia.com/device-plugin.config"
value: {{ .Values.configManager.init.nodeLabel | quote }}
- name: CONFIG_FILE_SRCDIR
value: "/available-configs"
value: {{ .Values.configManager.init.configFileSrcDir | quote }}
- name: CONFIG_FILE_DST
value: "/config/config.yaml"
value: {{ .Values.configManager.init.configFileDst | quote }}
- name: DEFAULT_CONFIG
value: {{ .Values.config.default }}
- name: FALLBACK_STRATEGIES
value: {{ join "," .Values.config.fallbackStrategies }}
- name: SEND_SIGNAL
value: "true"
value: {{ .Values.configManager.sidecar.sendSignal | quote }}
- name: SIGNAL
value: "1" # SIGHUP
value: {{ .Values.configManager.sidecar.signal | quote }}
- name: PROCESS_TO_SIGNAL
value: "nvidia-device-plugin"
volumeMounts:
Expand Down Expand Up @@ -187,20 +187,20 @@ spec:
{{- end }}
{{- if $options.hasConfigMap }}
- name: CONFIG_FILE
value: /config/config.yaml
value: {{ .Values.env.configFile | quote }}
{{- end }}
{{- if $options.addMigMonitorDevices }}
- name: NVIDIA_MIG_MONITOR_DEVICES
value: all
value: {{ .Values.env.nvidia.migMonitorDevices | quote }}
{{- end }}
{{- if typeIs "string" .Values.deviceDiscoveryStrategy }}
- name: DEVICE_DISCOVERY_STRATEGY
value: {{ .Values.deviceDiscoveryStrategy }}
{{- end }}
- name: NVIDIA_VISIBLE_DEVICES
value: all
value: {{ .Values.env.nvidia.visibleDevices | quote }}
- name: NVIDIA_DRIVER_CAPABILITIES
value: compute,utility
value: {{ .Values.env.nvidia.driverCapabilities | quote }}
securityContext:
{{- include "nvidia-device-plugin.securityContext" . | nindent 10 }}
volumeMounts:
Expand Down
30 changes: 15 additions & 15 deletions deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,29 +65,29 @@ spec:
command: ["config-manager"]
env:
- name: ONESHOT
value: "true"
value: {{ .Values.configManager.init.oneshot | quote }}
- name: KUBECONFIG
value: ""
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: "spec.nodeName"
- name: NODE_LABEL
value: "nvidia.com/device-plugin.config"
value: {{ .Values.configManager.init.nodeLabel | quote }}
- name: CONFIG_FILE_SRCDIR
value: "/available-configs"
value: {{ .Values.configManager.init.configFileSrcDir | quote }}
- name: CONFIG_FILE_DST
value: "/config/config.yaml"
value: {{ .Values.configManager.init.configFileDst | quote }}
- name: DEFAULT_CONFIG
value: {{ .Values.config.default }}
- name: FALLBACK_STRATEGIES
value: {{ join "," .Values.config.fallbackStrategies }}
- name: SEND_SIGNAL
value: "false"
value: {{ .Values.configManager.init.sendSignal | quote }}
- name: SIGNAL
value: ""
value: {{ .Values.configManager.init.signal | quote }}
- name: PROCESS_TO_SIGNAL
value: ""
value: {{ .Values.configManager.init.processToSignal | quote }}
volumeMounts:
- name: available-configs
mountPath: /available-configs
Expand All @@ -101,27 +101,27 @@ spec:
command: ["config-manager"]
env:
- name: ONESHOT
value: "false"
value: {{ .Values.configManager.sidecar.oneshot | quote }}
- name: KUBECONFIG
value: ""
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: "spec.nodeName"
- name: NODE_LABEL
value: "nvidia.com/device-plugin.config"
value: {{ .Values.configManager.init.nodeLabel | quote }}
- name: CONFIG_FILE_SRCDIR
value: "/available-configs"
value: {{ .Values.configManager.init.configFileSrcDir | quote }}
- name: CONFIG_FILE_DST
value: "/config/config.yaml"
value: {{ .Values.configManager.init.configFileDst | quote }}
- name: DEFAULT_CONFIG
value: {{ .Values.config.default }}
- name: FALLBACK_STRATEGIES
value: {{ join "," .Values.config.fallbackStrategies }}
- name: SEND_SIGNAL
value: "true"
value: {{ .Values.configManager.sidecar.sendSignal | quote }}
- name: SIGNAL
value: "1" # SIGHUP
value: {{ .Values.configManager.sidecar.signal | quote }}
- name: PROCESS_TO_SIGNAL
value: "gpu-feature-discovery"
volumeMounts:
Expand Down Expand Up @@ -167,11 +167,11 @@ spec:
{{- end }}
{{- if $options.hasConfigMap }}
- name: CONFIG_FILE
value: /config/config.yaml
value: {{ .Values.env.configFile | quote }}
{{- end }}
{{- if $options.addMigMonitorDevices }}
- name: NVIDIA_MIG_MONITOR_DEVICES
value: all
value: {{ .Values.env.nvidia.migMonitorDevices | quote }}
{{- end }}
{{- if typeIs "string" .Values.deviceDiscoveryStrategy }}
- name: DEVICE_DISCOVERY_STRATEGY
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,29 +79,29 @@ spec:
command: ["config-manager"]
env:
- name: ONESHOT
value: "true"
value: {{ .Values.configManager.init.oneshot | quote }}
- name: KUBECONFIG
value: ""
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: "spec.nodeName"
- name: NODE_LABEL
value: "nvidia.com/device-plugin.config"
value: {{ .Values.configManager.init.nodeLabel | quote }}
- name: CONFIG_FILE_SRCDIR
value: "/available-configs"
value: {{ .Values.configManager.init.configFileSrcDir | quote }}
- name: CONFIG_FILE_DST
value: "/config/config.yaml"
value: {{ .Values.configManager.init.configFileDst | quote }}
- name: DEFAULT_CONFIG
value: {{ .Values.config.default }}
- name: FALLBACK_STRATEGIES
value: {{ join "," .Values.config.fallbackStrategies }}
- name: SEND_SIGNAL
value: "false"
value: {{ .Values.configManager.init.sendSignal | quote }}
- name: SIGNAL
value: ""
value: {{ .Values.configManager.init.signal | quote }}
- name: PROCESS_TO_SIGNAL
value: ""
value: {{ .Values.configManager.init.processToSignal | quote }}
volumeMounts:
- name: available-configs
mountPath: /available-configs
Expand All @@ -116,27 +116,27 @@ spec:
command: ["config-manager"]
env:
- name: ONESHOT
value: "false"
value: {{ .Values.configManager.sidecar.oneshot | quote }}
- name: KUBECONFIG
value: ""
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: "spec.nodeName"
- name: NODE_LABEL
value: "nvidia.com/device-plugin.config"
value: {{ .Values.configManager.init.nodeLabel | quote }}
- name: CONFIG_FILE_SRCDIR
value: "/available-configs"
value: {{ .Values.configManager.init.configFileSrcDir | quote }}
- name: CONFIG_FILE_DST
value: "/config/config.yaml"
value: {{ .Values.configManager.init.configFileDst | quote }}
- name: DEFAULT_CONFIG
value: {{ .Values.config.default }}
- name: FALLBACK_STRATEGIES
value: {{ join "," .Values.config.fallbackStrategies }}
- name: SEND_SIGNAL
value: "true"
value: {{ .Values.configManager.sidecar.sendSignal | quote }}
- name: SIGNAL
value: "1"
value: {{ .Values.configManager.sidecar.signal | quote }}
- name: PROCESS_TO_SIGNAL
value: "/usr/bin/mps-control-daemon"
volumeMounts:
Expand All @@ -161,16 +161,16 @@ spec:
{{- end }}
{{- if $options.hasConfigMap }}
- name: CONFIG_FILE
value: /config/config.yaml
value: {{ .Values.env.configFile | quote }}
{{- end }}
{{- if $options.addMigMonitorDevices }}
- name: NVIDIA_MIG_MONITOR_DEVICES
value: all
value: {{ .Values.env.nvidia.migMonitorDevices | quote }}
{{- end }}
- name: NVIDIA_VISIBLE_DEVICES
value: all
value: {{ .Values.env.nvidia.visibleDevices | quote }}
- name: NVIDIA_DRIVER_CAPABILITIES
value: compute,utility
value: {{ .Values.env.nvidia.driverCapabilities | quote }}
securityContext:
privileged: true
volumeMounts:
Expand Down
49 changes: 49 additions & 0 deletions deployments/helm/nvidia-device-plugin/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,55 @@ config:
# List of fallback strategies to attempt if no config is selected and no default is provided
fallbackStrategies: ["named" , "single"]

# Configuration Manager Environment Variables
# Controls the config-manager init and sidecar containers that handle dynamic
# config updates. Every value is kept as a quoted string because the DaemonSet
# templates render each one into a container env var through `quote`.
configManager:
  # Init container configuration (runs once on pod startup)
  init:
    # Kubernetes node label to watch for device plugin configuration (NODE_LABEL).
    # The sidecar templates also read this key; there is no sidecar.nodeLabel.
    nodeLabel: "nvidia.com/device-plugin.config"
    # Directory containing available configuration files (CONFIG_FILE_SRCDIR).
    # Also read by the sidecar templates.
    # NOTE(review): the templates mount the available-configs volume at a
    # hard-coded /available-configs path - presumably this must stay in sync.
    configFileSrcDir: "/available-configs"
    # Destination path for the active configuration file (CONFIG_FILE_DST).
    # Also read by the sidecar templates.
    configFileDst: "/config/config.yaml"
    # Whether to send signal after config update (init container: false)
    sendSignal: "false"
    # Signal number to send (empty for init container)
    signal: ""
    # Process name to signal (empty for init container)
    processToSignal: ""
    # Run once and exit (init container: true)
    oneshot: "true"

  # Sidecar container configuration (runs continuously for hot-reload)
  sidecar:
    # Run continuously to watch for config changes
    oneshot: "false"
    # Send signal to main container after config update
    sendSignal: "true"
    # SIGHUP (1) to trigger config reload
    signal: "1"
    # PROCESS_TO_SIGNAL is not configurable here: each DaemonSet template
    # hard-codes its own process name (nvidia-device-plugin,
    # gpu-feature-discovery, /usr/bin/mps-control-daemon).

# Main Container Environment Variables
# Apply to device-plugin, gpu-feature-discovery, and mps-control-daemon containers
env:
  # Path to the active configuration file (CONFIG_FILE).
  # Only rendered when the template's $options.hasConfigMap is true.
  # NOTE(review): presumably this should match configManager.init.configFileDst,
  # which has the same default - confirm before changing either one.
  configFile: "/config/config.yaml"

  # NVIDIA runtime environment variables
  nvidia:
    # MIG (Multi-Instance GPU) devices to monitor (NVIDIA_MIG_MONITOR_DEVICES).
    # Only rendered when the template's $options.addMigMonitorDevices is true.
    # Options: "all" or comma-separated GPU indices
    migMonitorDevices: "all"
    # GPU devices visible to containers (NVIDIA_VISIBLE_DEVICES)
    # Options: "all", "none", or comma-separated GPU indices/UUIDs
    visibleDevices: "all"
    # NVIDIA driver capabilities to enable (NVIDIA_DRIVER_CAPABILITIES)
    # Common values: "compute,utility" for standard workloads
    # Full list: compute, compat32, graphics, utility, video, display, ngx
    driverCapabilities: "compute,utility"

compatWithCPUManager: null
migStrategy: null
failOnInitError: null
Expand Down