Skip to content

Commit 7fcd85c

Browse files
committed
Initialised GitHub repo to build our own Nvidia driver images for the DSRI
Based on the following Nvidia repo: https://gitlab.com/nvidia/container-images/driver
0 parents  commit 7fcd85c

File tree

192 files changed

+40433
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

192 files changed

+40433
-0
lines changed

Dockerfile

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
# Multi-stage build of an NVIDIA driver container image.
#
# Stages:
#   license       - CUDA base image, used only as the source of the NGC license file.
#   build-runtime - Go toolchain image that compiles nvidia-container-runtime.
#   (final)       - Ubuntu image holding the driver userspace, the kernel module
#                   sources and the nvidia-driver entrypoint script.
#
# Build arguments:
#   BASE_URL       - download root for the NVIDIA .run installer.
#   DRIVER_VERSION - driver version to download; also exported as an ENV variable.
#   PUBLIC_KEY     - build-context file copied to kernel/pubkey.x509
#                    (defaults to the file named "empty").

FROM nvcr.io/nvidia/cuda:12.9.1-base-ubi8 AS license

# Build nvidia-container-runtime binary
# NOTE(review): Go >= 1.16 defaults to module mode; this GOPATH-style
# `go install` needs a go.mod in nvidia-container-runtime/ (or GO111MODULE=off).
# Confirm that this stage still builds with golang:1.24.4.
FROM golang:1.24.4 AS build-runtime

WORKDIR /go/src/nvidia-container-runtime
COPY nvidia-container-runtime .
RUN go install -v nvidia-container-runtime

# Build driver image
FROM ubuntu:16.04

# Toolchain and utilities; i386 is enabled so that libc6:i386 can be installed.
RUN dpkg --add-architecture i386 && \
    apt-get update && apt-get install -y --no-install-recommends \
        apt-transport-https \
        apt-utils \
        bc \
        binutils \
        build-essential \
        ca-certificates \
        curl \
        gnupg2 \
        jq \
        kmod \
        libc6:i386 \
        libelf-dev \
        libssl-dev \
        module-init-tools \
        software-properties-common && \
    rm -rf /var/lib/apt/lists/*

# Replace the apt sources with the amd64 "bionic" pockets and give the _apt
# user uid/gid 0 (-o permits the duplicate uid).
# NOTE(review): the base image is 16.04 while the sources point at bionic;
# presumably intentional (inherited from the upstream image) - confirm.
RUN echo "deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ bionic main" > /etc/apt/sources.list && \
    echo "deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ bionic-updates main" >> /etc/apt/sources.list && \
    echo "deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ bionic-security main" >> /etc/apt/sources.list && \
    usermod -o -u 0 -g 0 _apt

# Helper executables placed in /usr/local/bin.
# NOTE(review): extract-vmlinux is fetched from the kernel tree's master
# branch, so its content is not pinned between builds.
RUN curl -fsSL -o /usr/local/bin/donkey https://github.com/3XX0/donkey/releases/download/v1.1.0/donkey && \
    curl -fsSL -o /usr/local/bin/extract-vmlinux https://raw.githubusercontent.com/torvalds/linux/master/scripts/extract-vmlinux && \
    chmod +x /usr/local/bin/donkey /usr/local/bin/extract-vmlinux

#ARG BASE_URL=http://us.download.nvidia.com/XFree86/Linux-x86_64
ARG BASE_URL=https://us.download.nvidia.com/tesla
ARG DRIVER_VERSION=450.80.02
ENV DRIVER_VERSION=$DRIVER_VERSION

# Install the userspace components and copy the kernel module sources.
# The installer is run without building a kernel module; LICENSE, mkprecompiled
# and the kernel/ sources are moved to /usr/src/nvidia-$DRIVER_VERSION.
# The sed keeps the first 8 lines of .manifest and, from line 9 onwards, only
# the entries that start with "kernel" or "LICENSE".
# curl uses -L (follow redirects), matching the other downloads in this file;
# the previous lowercase -l is curl's --list-only switch.
RUN cd /tmp && \
    curl -fsSL -O $BASE_URL/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run && \
    sh NVIDIA-Linux-x86_64-$DRIVER_VERSION.run -x && \
    cd NVIDIA-Linux-x86_64-$DRIVER_VERSION* && \
    ./nvidia-installer --silent \
                       --no-kernel-module \
                       --install-compat32-libs \
                       --no-nouveau-check \
                       --no-nvidia-modprobe \
                       --no-rpms \
                       --no-backup \
                       --no-check-for-alternate-installs \
                       --no-libglx-indirect \
                       --no-install-libglvnd \
                       --x-prefix=/tmp/null \
                       --x-module-path=/tmp/null \
                       --x-library-path=/tmp/null \
                       --x-sysconfig-path=/tmp/null && \
    mkdir -p /usr/src/nvidia-$DRIVER_VERSION && \
    mv LICENSE mkprecompiled kernel /usr/src/nvidia-$DRIVER_VERSION && \
    sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-$DRIVER_VERSION/.manifest && \
    rm -rf /tmp/*

# Install and configure nvidia-container-runtime
# NOTE(review): "void" presumably keeps the NVIDIA runtime from injecting GPUs
# into the driver container itself - confirm against the runtime documentation.
ENV NVIDIA_VISIBLE_DEVICES=void

COPY --from=build-runtime /go/bin/nvidia-container-runtime /usr/bin/nvidia-container-runtime

# Add the NVIDIA apt repository for this distribution, install the runtime
# hook, create the /usr/lib64 symlink, then edit config.toml: uncomment the
# "root" setting and point ldconfig at /run/nvidia/driver/sbin/ldconfig.real.
RUN curl -s -L https://nvidia.github.io/nvidia-container-runtime/gpgkey | apt-key add - && \
    distribution=$(. /etc/os-release;echo $ID$VERSION_ID) && \
    curl -s -L https://nvidia.github.io/nvidia-container-runtime/$distribution/nvidia-container-runtime.list | tee /etc/apt/sources.list.d/nvidia-container-runtime.list && \
    apt-get update && \
    apt-get install -y nvidia-container-runtime-hook && \
    cd /usr && ln -s lib/x86_64-linux-gnu lib64 && cd - && \
    sed -i 's/^#root/root/; s;@/sbin/ldconfig.real;@/run/nvidia/driver/sbin/ldconfig.real;' /etc/nvidia-container-runtime/config.toml

COPY nvidia-driver /usr/local/bin

WORKDIR /usr/src/nvidia-$DRIVER_VERSION

# PUBLIC_KEY names a file in the build context; the default is the file "empty".
ARG PUBLIC_KEY=empty
COPY ${PUBLIC_KEY} kernel/pubkey.x509

# Add NGC DL license
COPY --from=license /NGC-DL-CONTAINER-LICENSE /licenses/NGC-DL-CONTAINER-LICENSE

ENTRYPOINT ["nvidia-driver"]

README.md

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
# NVIDIA Driver container for CoreOS
2+
3+
## Based on https://gitlab.com/nvidia/container-images/driver/-/tree/main/coreos?ref_type=heads
4+
5+
### Prerequisites
6+
7+
#### Enable required modules
8+
```sh
9+
sudo modprobe -a loop ipmi_msghandler
10+
echo -e "loop\nipmi_msghandler" | sudo tee /etc/modules-load.d/driver.conf
11+
```
12+
13+
#### Add nvidia-runtime
14+
```bash
15+
sudo mkdir -p /etc/systemd/system/docker.service.d
16+
17+
sudo tee /etc/systemd/system/docker.service.d/override.conf <<EOF
18+
[Service]
19+
Environment=PATH=$PATH:/run/nvidia/driver/usr/bin
20+
ExecStart=
21+
ExecStart=/usr/bin/dockerd --host=fd:// --add-runtime=nvidia=/run/nvidia/driver/usr/bin/nvidia-container-runtime
22+
EOF
23+
24+
sudo systemctl daemon-reload
25+
sudo systemctl restart docker
26+
27+
# Make sure the runtime is added
28+
docker info | grep -i "runtime"
29+
```
30+
31+
#### CoreOS AWS image
32+
```sh
33+
# Run driver container in detached mode
34+
35+
docker run -d --privileged --pid=host --restart=unless-stopped -v /run/nvidia:/run/nvidia:shared --name nvidia-driver nvidia/driver:410.72-4.14.81-coreos
36+
37+
# Check logs to make sure driver container ran properly
38+
39+
docker logs -f nvidia-driver
40+
41+
# Test nvidia-smi with official CUDA image
42+
43+
docker run --runtime=nvidia --rm nvidia/cuda:9.2-base sh -c 'uname -r && nvidia-smi --query-gpu=driver_version --format=csv,noheader'
44+
```
45+
46+
#### In Kubernetes
47+
```sh
48+
# Set up the cluster for Container Linux (https://kubernetes.io/docs/setup/independent/install-kubeadm/#installing-kubeadm-kubelet-and-kubectl)
49+
50+
# Run driver container
51+
52+
docker run -d --privileged --pid=host --restart=unless-stopped -v /run/nvidia:/run/nvidia:shared --name nvidia-driver nvidia/driver:410.72-4.14.81-coreos
53+
54+
# Make sure the driver container is running before moving on to next steps
55+
docker ps -a | grep -i driver
56+
57+
# Set default runtime to nvidia by editing /etc/systemd/system/docker.service.d
58+
59+
sudo sed -i "s|nvidia=/run/nvidia/driver/usr/bin/nvidia-container-runtime|nvidia=/run/nvidia/driver/usr/bin/nvidia-container-runtime --default-runtime=nvidia|" /etc/systemd/system/docker.service.d/override.conf
60+
61+
sudo systemctl daemon-reload
62+
sudo systemctl restart docker
63+
64+
# Deploy nvidia k8s-device-plugin (https://github.com/NVIDIA/k8s-device-plugin#enabling-gpu-support-in-kubernetes)
65+
66+
# Deploy GPU pods
67+
68+
# Set up monitoring
69+
70+
# Install helm (https://docs.helm.sh/using_helm/#installing-the-helm-client) to /opt/bin
71+
72+
# Initialize helm
73+
74+
kubectl create serviceaccount tiller --namespace kube-system
75+
kubectl create clusterrolebinding tiller-cluster-rule --clusterrole=cluster-admin --serviceaccount=kube-system:tiller
76+
helm init --service-account tiller
77+
78+
# Label GPU nodes
79+
80+
kubectl label nodes <coreos-gpu-node> hardware-type=NVIDIAGPU
81+
82+
# Install the monitoring charts
83+
84+
helm repo add gpu-helm-charts https://nvidia.github.io/gpu-monitoring-tools/helm-charts
85+
helm repo update
86+
helm install gpu-helm-charts/prometheus-operator --name prometheus-operator --namespace monitoring
87+
helm install gpu-helm-charts/kube-prometheus --name kube-prometheus --namespace monitoring
88+
89+
# Check the status of the pods
90+
91+
kubectl get pods -n monitoring
92+
93+
# Forward the port for Grafana
94+
95+
kubectl -n monitoring port-forward $(kubectl get pods -n monitoring -lapp=kube-prometheus-grafana -ojsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}') 3000 &
96+
97+
# Open a browser window and type http://localhost:3000 to view the Nodes Dashboard in Grafana
98+
```

empty

Whitespace-only changes.

nvidia-container-runtime/Gopkg.lock

Lines changed: 56 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
2+
# Gopkg.toml example
3+
#
4+
# Refer to https://github.com/golang/dep/blob/master/docs/Gopkg.toml.md
5+
# for detailed Gopkg.toml documentation.
6+
#
7+
# required = ["github.com/user/thing/cmd/thing"]
8+
# ignored = ["github.com/user/project/pkgX", "bitbucket.org/user/project/pkgA/pkgY"]
9+
#
10+
# [[constraint]]
11+
# name = "github.com/user/project"
12+
# version = "1.0.0"
13+
#
14+
# [[constraint]]
15+
# name = "github.com/user/project2"
16+
# branch = "dev"
17+
# source = "github.com/myfork/project2"
18+
#
19+
# [[override]]
20+
# name = "github.com/x/y"
21+
# version = "2.4.0"
22+
23+
24+
[[constraint]]
25+
name = "github.com/opencontainers/runtime-spec"
26+
version = "1.0.1"

nvidia-container-runtime/Makefile

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Build and test helpers for nvidia-container-runtime.
#
# Targets:
#   all        build the runtime binary (default goal)
#   mock-runc  generate a stub "runc" script in the current directory
#   test       build, generate the stub, run `go test -v`, remove the stub
#   clean      remove the runtime binary and the stub

# Name of the binary produced by `go build`.
RUNTIME=nvidia-container-runtime
# Location of the generated stub script.
MOCK_RUNC=$(CURDIR)/runc

# No target below names the file its recipe creates, so all of them are phony;
# this also keeps a stray file called "test" or "clean" from masking a target.
.PHONY: all mock-runc test clean

all:
	@go build -o ${RUNTIME}

# printf is used instead of echo: whether echo expands "\n" is
# implementation-defined, so with a bash /bin/sh the script would end up as
# one line with a broken shebang.
mock-runc:
	@printf '#!/bin/bash\necho mock runc\n' > ${MOCK_RUNC}
	@chmod +x ${MOCK_RUNC}

# The stub is removed whether or not the tests pass; the exit status of
# `go test` is preserved so a failing test run still fails the target.
test: all mock-runc
	@go test -v; status=$$?; ${RM} ${MOCK_RUNC}; exit $$status

clean:
	@${RM} ${RUNTIME} ${MOCK_RUNC}

0 commit comments

Comments
 (0)