|
1 | 1 | # NVIDIA Driver container for CoreOS
|
2 | 2 |
|
3 |
| -## Based on https://gitlab.com/nvidia/container-images/driver/-/tree/main/coreos?ref_type=heads |
4 |
| - |
5 |
| -### Prerequisites |
6 |
| - |
7 |
| -#### Enable required modules |
8 |
| -```sh |
9 |
| -sudo modprobe -a loop ipmi_msghandler |
10 |
| -echo -e "loop\nipmi_msghandler" | sudo tee /etc/modules-load.d/driver.conf |
11 |
| -``` |
12 |
| - |
13 |
| -#### Add nvidia-runtime |
14 |
| -```bash |
15 |
| -sudo mkdir -p /etc/systemd/system/docker.service.d |
16 |
| - |
17 |
| -sudo tee /etc/systemd/system/docker.service.d/override.conf <<EOF |
18 |
| -[Service] |
19 |
| -Environment=PATH=$PATH:/run/nvidia/driver/usr/bin |
20 |
| -ExecStart= |
21 |
| -ExecStart=/usr/bin/dockerd --host=fd:// --add-runtime=nvidia=/run/nvidia/driver/usr/bin/nvidia-container-runtime |
22 |
| -EOF |
23 |
| - |
24 |
| -sudo systemctl daemon-reload |
25 |
| -sudo systemctl restart docker |
26 |
| - |
27 |
| -# Make sure the runtime is added |
28 |
| -docker info | grep -i "runtime" |
29 |
| -``` |
30 |
| - |
31 |
| -#### CoreOS AWS image |
32 |
| -```sh |
33 |
| -# Run driver container in detached mode |
34 |
| - |
35 |
| -docker run -d --privileged --pid=host --restart=unless-stopped -v /run/nvidia:/run/nvidia:shared --name nvidia-driver nvidia/driver:410.72-4.14.81-coreos |
36 |
| - |
37 |
| -# Check logs to make sure driver container ran properly |
38 |
| - |
39 |
| -docker logs -f nvidia-driver |
40 |
| - |
41 |
| -# Test nvidia-smi with official CUDA image |
42 |
| - |
43 |
| -docker run --runtime=nvidia --rm nvidia/cuda:9.2-base sh -c 'uname -r && nvidia-smi --query-gpu=driver_version --format=csv,noheader' |
44 |
| -``` |
45 |
| - |
46 |
| -#### In Kubernetes |
47 |
| -```sh |
48 |
| -# Set up the cluster for Container Linux (https://kubernetes.io/docs/setup/independent/install-kubeadm/#installing-kubeadm-kubelet-and-kubectl) |
49 |
| - |
50 |
| -# Run driver container |
51 |
| - |
52 |
| -docker run -d --privileged --pid=host --restart=unless-stopped -v /run/nvidia:/run/nvidia:shared --name nvidia-driver nvidia/driver:410.72-4.14.81-coreos |
53 |
| - |
54 |
| -# Make sure the driver container is running before moving on to next steps |
55 |
| -docker ps -a | grep -i driver |
56 |
| - |
57 |
| -# Set the default runtime to nvidia by editing /etc/systemd/system/docker.service.d/override.conf |
58 |
| - |
59 |
| -sudo sed -i "s|nvidia=/run/nvidia/driver/usr/bin/nvidia-container-runtime|nvidia=/run/nvidia/driver/usr/bin/nvidia-container-runtime --default-runtime=nvidia|" /etc/systemd/system/docker.service.d/override.conf |
60 |
| - |
61 |
| -sudo systemctl daemon-reload |
62 |
| -sudo systemctl restart docker |
63 |
| - |
64 |
| -# Deploy nvidia k8s-device-plugin (https://github.com/NVIDIA/k8s-device-plugin#enabling-gpu-support-in-kubernetes) |
65 |
| - |
66 |
| -# Deploy GPU pods |
67 |
| - |
68 |
| -# Set up monitoring |
69 |
| - |
70 |
| -# Install helm (https://docs.helm.sh/using_helm/#installing-the-helm-client) to /opt/bin |
71 |
| - |
72 |
| -# Initialize helm |
73 |
| - |
74 |
| -kubectl create serviceaccount tiller --namespace kube-system |
75 |
| -kubectl create clusterrolebinding tiller-cluster-rule --clusterrole=cluster-admin --serviceaccount=kube-system:tiller |
76 |
| -helm init --service-account tiller |
77 |
| - |
78 |
| -# Label GPU nodes |
79 |
| - |
80 |
| -kubectl label nodes <coreos-gpu-node> hardware-type=NVIDIAGPU |
81 |
| - |
82 |
| -# Install the monitoring charts |
83 |
| - |
84 |
| -helm repo add gpu-helm-charts https://nvidia.github.io/gpu-monitoring-tools/helm-charts |
85 |
| -helm repo update |
86 |
| -helm install gpu-helm-charts/prometheus-operator --name prometheus-operator --namespace monitoring |
87 |
| -helm install gpu-helm-charts/kube-prometheus --name kube-prometheus --namespace monitoring |
88 |
| - |
89 |
| -# Check the status of the pods |
90 |
| - |
91 |
| -kubectl get pods -n monitoring |
92 |
| - |
93 |
| -# Forward the port for Grafana |
94 |
| - |
95 |
| -kubectl -n monitoring port-forward $(kubectl get pods -n monitoring -lapp=kube-prometheus-grafana -ojsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}') 3000 & |
96 |
| - |
97 |
| -# Open a browser window and type http://localhost:3000 to view the Nodes Dashboard in Grafana |
98 |
| -``` |
| 3 | +Based on https://gitlab.com/nvidia/container-images/driver/-/tree/main/coreos?ref_type=heads and https://github.com/NVIDIA/gpu-driver-container/tree/main/fedora
0 commit comments