Merge pull request #277881 from schaffererin/bug262390

prmerger-automator[bot] · web-flow · commit 1d16fbb09df6 · 2024-06-11T19:59:28.000Z
Updated DaemonSet YAML with priorityClassName
diff --git a/articles/aks/gpu-cluster.md b/articles/aks/gpu-cluster.md
@@ -164,7 +164,7 @@ To use Azure Linux, you specify the OS SKU by setting `os-sku` to `AzureLinux` d
     kind: DaemonSet
     metadata:
       name: nvidia-device-plugin-daemonset
-      namespace: gpu-resources
+      namespace: kube-system
     spec:
       selector:
         matchLabels:
@@ -173,40 +173,35 @@ To use Azure Linux, you specify the OS SKU by setting `os-sku` to `AzureLinux` d
         type: RollingUpdate
       template:
         metadata:
-          # Mark this pod as a critical add-on; when enabled, the critical add-on scheduler
-          # reserves resources for critical add-on pods so that they can be rescheduled after
-          # a failure.  This annotation works in tandem with the toleration below.
-          annotations:
-            scheduler.alpha.kubernetes.io/critical-pod: ""
           labels:
             name: nvidia-device-plugin-ds
         spec:
           tolerations:
-          # Allow this pod to be rescheduled while the node is in "critical add-ons only" mode.
-          # This, along with the annotation above marks this pod as a critical add-on.
-          - key: CriticalAddonsOnly
-            operator: Exists
           - key: nvidia.com/gpu
             operator: Exists
             effect: NoSchedule
-          - key: "sku"
-            operator: "Equal"
-            value: "gpu"
-            effect: "NoSchedule"
+          # Mark this pod as node-critical: the system-node-critical priority class
+          # lets the scheduler preempt lower-priority pods to place it, so the
+          # device plugin can be rescheduled after a failure.
+          # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
+          priorityClassName: "system-node-critical"
           containers:
-          - image: mcr.microsoft.com/oss/nvidia/k8s-device-plugin:v0.14.1
+          - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
             name: nvidia-device-plugin-ctr
+            env:
+              - name: FAIL_ON_INIT_ERROR
+                value: "false"
             securityContext:
               allowPrivilegeEscalation: false
               capabilities:
                 drop: ["ALL"]
             volumeMounts:
-              - name: device-plugin
-                mountPath: /var/lib/kubelet/device-plugins
-          volumes:
             - name: device-plugin
-              hostPath:
-                path: /var/lib/kubelet/device-plugins
+              mountPath: /var/lib/kubelet/device-plugins
+          volumes:
+          - name: device-plugin
+            hostPath:
+              path: /var/lib/kubelet/device-plugins
     ```
 
 3. Create the DaemonSet and confirm the NVIDIA device plugin is created successfully using the [`kubectl apply`][kubectl-apply] command.
@@ -499,7 +494,7 @@ To see the GPU in action, you can schedule a GPU-enabled workload with the appro
 [kubectl-create]: https://kubernetes.io/docs/reference/generated/kubectl/kubectl-commands#create
 [azure-pricing]: https://azure.microsoft.com/pricing/
 [azure-availability]: https://azure.microsoft.com/global-infrastructure/services/
-[nvidia-github]: https://github.com/NVIDIA/k8s-device-plugin
+[nvidia-github]: https://github.com/NVIDIA/k8s-device-plugin/blob/4b3d6b0a6613a3672f71ea4719fd8633eaafb4f3/deployments/static/nvidia-device-plugin.yml
 
 <!-- LINKS - internal -->
 [az-aks-create]: /cli/azure/aks#az_aks_create