Skip to content

Commit d50cee8

Browse files
committed
add nvidia monitor and device plugin crds
1 parent 52db6d1 commit d50cee8

File tree

3 files changed

+105
-0
lines changed

3 files changed

+105
-0
lines changed

chart/templates/daemonset.yaml

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
# nvidia-monitor DaemonSet.
#
# Runs one idle pod ("sleep infinity") from the CUDA runtime image on every
# node labelled gpuEnabled=true and gpuType=a100. The pod shares the host PID
# and IPC namespaces and mounts the host root filesystem at /host.
# NOTE(review): presumably meant as a shell to exec into for inspecting GPU
# processes on the node - confirm the intended workflow.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: "{{ $.Release.Name }}-nvidia-monitor"
  namespace: "{{ $.Values.namespace }}"
  labels:
    app: "{{ $.Release.Name }}-nvidia-monitor"
spec:
  # The selector keys on the pod-template "name" label below, not on the
  # "app" label carried by the DaemonSet object itself.
  selector:
    matchLabels:
      name: "{{ $.Release.Name }}-nvidia-monitor"
  template:
    metadata:
      labels:
        name: "{{ $.Release.Name }}-nvidia-monitor"
    spec:
      nodeSelector:
        gpuEnabled: "true"
        gpuType: "a100"
      # Allow scheduling onto nodes carrying the Azure spot-priority taint.
      tolerations:
        - key: "kubernetes.azure.com/scalesetpriority"
          operator: "Equal"
          value: "spot"
          effect: "NoSchedule"
      {{/*
      Disabled for now:
      hostNetwork: true
      */}}
      hostPID: true
      hostIPC: true
      volumes:
        # Entire host filesystem, exposed to the container under /host.
        - name: host
          hostPath:
            path: /
      containers:
        - name: monitor
          {{/*
          Disabled for now:
          securityContext:
            capabilities:
              add:
                - SYS_ADMIN
            privileged: true
            runAsNonRoot: false
            runAsUser: 0
          */}}
          image: "nvcr.io/nvidia/cuda:12.5.0-runtime-ubuntu22.04"
          # Keep the container alive indefinitely without doing any work.
          args: ["sleep", "infinity"]
          volumeMounts:
            - name: host
              mountPath: /host

nvidia-device-plugin-ds.yaml

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
# Installs the NVIDIA device plugin as a DaemonSet. Adapted from the AKS guide:
# https://learn.microsoft.com/en-us/azure/aks/gpu-cluster?tabs=add-ubuntu-gpu-node-pool#manually-install-the-nvidia-device-plugin
#
# Usage:
#   kubectl apply -f nvidia-device-plugin-ds.yaml
#
# NOTE: the guide also lists `kubectl create namespace gpu-resources`, but
# nothing in this manifest references that namespace - the DaemonSet below is
# created in kube-system.
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-ds
    spec:
      # Only run on nodes labelled as GPU-enabled.
      nodeSelector:
        gpuEnabled: "true"
      # Allow scheduling onto nodes carrying the Azure spot-priority taint.
      tolerations:
        - key: "kubernetes.azure.com/scalesetpriority"
          operator: "Equal"
          value: "spot"
          effect: "NoSchedule"
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: system-node-critical
      volumes:
        # Host directory shared with the plugin container at the same path.
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
      containers:
        - name: nvidia-device-plugin-ctr
          image: "nvcr.io/nvidia/k8s-device-plugin:v0.15.0"
          env:
            # NOTE(review): presumably keeps the plugin from failing hard when
            # initialization errors occur - confirm against the plugin docs.
            - name: FAIL_ON_INIT_ERROR
              value: "false"
          # Minimal privileges: no privilege escalation, all capabilities dropped.
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - "ALL"
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins

nvidia-monitor.Dockerfile

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
# CUDA runtime base image with the nerdctl CLI added on top.
# NOTE(review): presumably the image intended for the nvidia-monitor
# DaemonSet - confirm, since the chart currently references the base image.
FROM nvcr.io/nvidia/cuda:12.5.0-runtime-ubuntu22.04

# Install the minimal nerdctl release.
#
# * wget is installed explicitly because the build should not depend on the
#   base image happening to ship a download tool.
# * The minimal archive keeps the nerdctl binary at the archive root, so it is
#   unpacked into /usr/local/bin to end up on PATH. Only the "full" bundle
#   below carries its own bin/ directory and is meant to be unpacked into
#   /usr/local:
#   "https://github.com/containerd/nerdctl/releases/download/v1.7.5/nerdctl-full-1.7.5-linux-amd64.tar.gz"
RUN apt-get update \
    && apt-get install -y --no-install-recommends ca-certificates wget \
    && rm -rf /var/lib/apt/lists/* \
    && wget -qO nerdctl.tar.gz "https://github.com/containerd/nerdctl/releases/download/v1.7.6/nerdctl-1.7.6-linux-amd64.tar.gz" \
    && tar Cxzvvf /usr/local/bin nerdctl.tar.gz \
    && rm nerdctl.tar.gz

# Usage notes (run inside the container, with the host root mounted at /host).
#
# Print "<pid> <pod name>" for every container in the k8s.io namespace and
# filter for the gooey-gpu pods:
# for i in $(nerdctl -a /host/run/containerd/containerd.sock -n k8s.io container ls --format "{{.ID}}"); do nerdctl -a /host/run/containerd/containerd.sock -n k8s.io inspect -f '{{.State.Pid}} {{index .Config.Labels "io.kubernetes.pod.name"}}' $i; done | grep gooey-gpu
#
# Then compare against the PIDs reported by:
# nvidia-smi

0 commit comments

Comments
 (0)