Skip to content

Commit 9fcfdeb

Browse files
committed
Added necessary install to install driver in image on OpenShift
1 parent 34d7c80 commit 9fcfdeb

File tree

4 files changed

+365
-0
lines changed

4 files changed

+365
-0
lines changed

common.sh

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#!/usr/bin/env bash
2+
# Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved.
3+
4+
# Whether GPUDirect RDMA handling is requested. Keeps any value already set
# by the caller's environment and falls back to "false" when unset or empty.
: "${GPU_DIRECT_RDMA_ENABLED:=false}"
5+
6+
#######################################
# Check whether any PCI device reports the vendor ID 0x15b3 (reported below
# as a Mellanox device).
# Arguments:
#   $1 - optional directory holding one sub-directory per PCI device, each
#        with a "vendor" file. Defaults to /sys/bus/pci/devices.
# Outputs:
#   One line on stdout naming the first matching device, or a line saying
#   that no device was found.
# Returns:
#   0 if a matching device exists, 1 otherwise.
#######################################
_mellanox_devices_present() {
    local pci_devices_dir="${1:-/sys/bus/pci/devices}"
    local dev vendor

    for dev in "${pci_devices_dir}"/*; do
        # Reset on every iteration so a failed read can neither reuse the
        # previous device's ID nor leave the variable unset under `set -u`.
        vendor=""
        # Also covers the unmatched-glob case, where $dev is the literal pattern.
        [ -r "${dev}/vendor" ] || continue
        read -r vendor < "${dev}/vendor" || true
        if [ "${vendor}" = "0x15b3" ]; then
            echo "Mellanox device found at ${dev##*/}"
            return 0
        fi
    done
    echo "No Mellanox devices were found..."
    return 1
}
18+
19+
#######################################
# Tell whether GPUDirect RDMA should be handled: it must be requested through
# GPU_DIRECT_RDMA_ENABLED=true AND a Mellanox device must be detected.
# Globals:
#   GPU_DIRECT_RDMA_ENABLED (read)
# Outputs:
#   Whatever _mellanox_devices_present prints (only when the flag is "true").
# Returns:
#   0 when both conditions hold, 1 otherwise.
#######################################
_gpu_direct_rdma_enabled() {
    # ":-false" keeps the check safe when the caller runs with `set -u`.
    if [ "${GPU_DIRECT_RDMA_ENABLED:-false}" = "true" ]; then
        # check if mellanox cards are present
        if _mellanox_devices_present; then
            return 0
        fi
    fi
    return 1
}

drivers/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Folder for downloading vGPU drivers and dependent metadata files

install.sh

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
#!/bin/bash
2+
3+
# Derive DRIVER_ARCH from TARGETARCH by rewriting "amd64" to "x86_64" and
# "arm64" to "aarch64"; any other value passes through unchanged.
# NOTE(review): TARGETARCH is presumably supplied by the image build - confirm.
DRIVER_ARCH=${TARGETARCH/amd64/x86_64}
DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64}
printf 'DRIVER_ARCH is %s\n' "$DRIVER_ARCH"
5+
6+
#######################################
# Install the OS packages needed by the driver installation with dnf.
# The package set is the same on every supported architecture except for
# glibc: x86_64 installs glibc.i686, ppc64le and aarch64 install glibc.
# Globals:
#   DRIVER_ARCH (read) - one of x86_64, ppc64le, aarch64
# Returns:
#   0 on success; the dnf exit status if the installation fails; 1 when
#   DRIVER_ARCH is not a supported architecture.
#######################################
dep_installer () {
    local glibc_pkg

    case "${DRIVER_ARCH:-}" in
        x86_64)
            glibc_pkg="glibc.i686"
            ;;
        ppc64le | aarch64)
            glibc_pkg="glibc"
            ;;
        *)
            # Fail loudly instead of silently producing an image without
            # its dependencies.
            echo "DRIVER_ARCH doesn't match a known arch target" >&2
            return 1
            ;;
    esac

    # Propagate a dnf failure; otherwise the cache cleanup below would
    # overwrite the exit status and hide the error from the caller.
    dnf install -y \
        libglvnd-glx \
        ca-certificates \
        curl \
        gcc \
        "${glibc_pkg}" \
        make \
        cpio \
        kmod \
        jq || return

    rm -rf /var/cache/yum/*
}
43+
44+
#######################################
# Run ./nvidia-installer (from the current directory) in silent mode without
# building the kernel module. All supported architectures share the same
# flags; x86_64 additionally passes --install-compat32-libs.
# Globals:
#   DRIVER_ARCH (read) - one of x86_64, ppc64le, aarch64
# Returns:
#   The exit status of ./nvidia-installer, or 1 when DRIVER_ARCH is not a
#   supported architecture.
#######################################
nvidia_installer () {
    local -a installer_args=(--silent --no-kernel-module)

    case "${DRIVER_ARCH:-}" in
        x86_64)
            installer_args+=(--install-compat32-libs)
            ;;
        ppc64le | aarch64)
            ;;
        *)
            # Report the problem through the exit status too, so the
            # caller does not carry on without an installed driver.
            echo "DRIVER_ARCH doesn't match a known arch target" >&2
            return 1
            ;;
    esac

    installer_args+=(
        --no-nouveau-check
        --no-nvidia-modprobe
        --no-rpms
        --no-backup
        --no-check-for-alternate-installs
        --no-libglx-indirect
        --no-install-libglvnd
        --x-prefix=/tmp/null
        --x-module-path=/tmp/null
        --x-library-path=/tmp/null
        --x-sysconfig-path=/tmp/null
    )

    ./nvidia-installer "${installer_args[@]}"
}
92+
93+
# Dispatch on the first argument:
#   nvinstall  - run the NVIDIA installer
#   depinstall - install the OS package dependencies
# Anything else (including no argument) is an error: it is reported on
# stderr and the script exits with status 1 so callers can detect the misuse.
case "${1:-}" in
    nvinstall)
        nvidia_installer
        ;;
    depinstall)
        dep_installer
        ;;
    *)
        echo "Unknown function: ${1:-}" >&2
        exit 1
        ;;
esac

ocp_dtk_entrypoint

Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
#!/usr/bin/env bash
2+
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3+
4+
# Abort on any unhandled command failure and on use of an unset variable.
set -eu

# Directory used to exchange files and flag files with the other container.
DRIVER_TOOLKIT_SHARED_DIR=/mnt/shared-nvidia-driver-toolkit

echo "Running $*"

# Resolve the directory holding this script so common.sh is found no matter
# what the current working directory is.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Quoted so that a script directory containing spaces or glob characters is
# not word-split.
# shellcheck source=common.sh
source "$SCRIPT_DIR/common.sh"
12+
13+
#######################################
# Driver-container side of the Driver Toolkit (DTK) workflow.
#
# Steps:
#   1. If RHCOS_IMAGE_MISSING is "true", replace this process with
#      `nvidia-driver init` (entitlement-based fallback).
#   2. Unless the "dir_prepared" flag exists, copy the scripts, /drivers and
#      /licenses into DRIVER_TOOLKIT_SHARED_DIR, dump the environment into
#      its "env" file, and create the flag.
#   3. Poll every 15s for "driver_build_started"; if "driver_toolkit_broken"
#      shows up meanwhile, switch to the same fallback as step 1.
#   4. Poll every 15s for "driver_built".
#   5. Copy the shared modules into /lib/modules/$(uname -r) and replace this
#      process with `nvidia-driver load`.
#
# Globals:
#   DRIVER_TOOLKIT_SHARED_DIR (read), RHCOS_IMAGE_MISSING (read),
#   RHCOS_VERSION (read), MODULES_SHARED / MODULES_LOCAL (written)
# Returns:
#   Does not return on success: every path ends in `exec`.
#######################################
nv-ctr-run-with-dtk() {
    set -x

    if [[ "${RHCOS_IMAGE_MISSING:-}" == "true" ]]; then
        echo "WARNING: RHCOS '${RHCOS_VERSION:-}' imagetag missing, using entitlement-based fallback"
        exec bash -x nvidia-driver init
    fi

    if [[ ! -f "$DRIVER_TOOLKIT_SHARED_DIR/dir_prepared" ]]; then
        # Everything the other container needs from this image.
        local -a shared_payload=(
            /tmp/install.sh
            /usr/local/bin/ocp_dtk_entrypoint
            /usr/local/bin/nvidia-driver
            /usr/local/bin/common.sh
            /usr/local/bin/extract-vmlinux
            /usr/local/bin/vgpu-util
            /drivers
            /licenses
        )
        cp -r "${shared_payload[@]}" "$DRIVER_TOOLKIT_SHARED_DIR/"

        # Turn each NAME=value line of the environment into NAME="value" so
        # the resulting file can be sourced.
        env | sed -e 's/=/="/' -e 's/$/"/' > "$DRIVER_TOOLKIT_SHARED_DIR/env"

        touch "$DRIVER_TOOLKIT_SHARED_DIR/dir_prepared"
    fi

    # Tracing is switched off while polling to keep the log readable.
    set +x
    until [[ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_build_started" ]]; do
        if [[ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_toolkit_broken" ]]; then
            echo "WARNING: broken driver toolkit detected, using entitlement-based fallback"
            exec bash -x nvidia-driver init
        fi
        echo "$(date) Waiting for openshift-driver-toolkit-ctr container to start ..."
        sleep 15
    done

    echo "$(date) openshift-driver-toolkit-ctr started."

    until [[ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]]; do
        echo "$(date) Waiting for openshift-driver-toolkit-ctr container to build the precompiled driver ..."
        sleep 15
    done
    set -x

    MODULES_SHARED=${DRIVER_TOOLKIT_SHARED_DIR}/modules/

    # Copy the modules to their standard location
    MODULES_LOCAL="/lib/modules/$(uname -r)"
    mkdir -p "${MODULES_LOCAL}"

    cp -rv "${MODULES_SHARED}"/* "${MODULES_LOCAL}"

    # tell SELinux to allow loading these files
    # NOTE(review): this relabels *.txt and *.go files below the current
    # directory, not the modules copied above - confirm whether the module
    # files under ${MODULES_LOCAL} were the intended target.
    find . -type f \
        \( -name "*.txt" -or -name "*.go" \) \
        -exec chcon -t modules_object_t "{}" \;

    echo "#"
    echo "# Executing nvidia-driver load script ..."
    echo "#"

    exec bash -x nvidia-driver load
}
75+
76+
#######################################
# Driver Toolkit (DTK) container side of the workflow: build the NVIDIA
# kernel modules with the shared nvidia-driver script and publish them, with
# the modprobe metadata, under ${DRIVER_TOOLKIT_SHARED_DIR}/modules.
#
# Flag files handled under DRIVER_TOOLKIT_SHARED_DIR:
#   driver_toolkit_broken - created when /lib/modules/$(uname -r)/vmlinuz is
#                           missing
#   driver_build_started  - created right before the build logic starts
#   driver_built          - created once the modules have been published;
#                           the function then idles until it disappears
#
# Globals:
#   DRIVER_TOOLKIT_SHARED_DIR, RHCOS_IMAGE_MISSING, RHCOS_VERSION (read);
#   everything defined in ${DRIVER_TOOLKIT_SHARED_DIR}/env is sourced and
#   exported (TARGETARCH and DRIVER_VERSION are used here);
#   DRIVER_ARCH, MODULES_SHARED, PATH (written)
# Exits:
#   1 when the build produced no nvidia*.ko module;
#   0 once the driver_built flag disappears after a successful build.
#######################################
dtk-build-driver() {
    local kernel_release
    kernel_release=$(uname -r)

    if [[ "${RHCOS_IMAGE_MISSING:-}" == "true" ]]; then
        # ":-" keeps `set -u` from aborting before the warning is printed.
        echo "WARNING: 'istag/driver-toolkit:${RHCOS_VERSION:-} -n openshift' missing, nothing to do in openshift-driver-toolkit-ctr container"
        sleep +inf
    fi

    if ! [[ -f "/lib/modules/${kernel_release}/vmlinuz" ]]; then
        echo "WARNING: broken Driver Toolkit image detected:"
        echo "- Node kernel: ${kernel_release}"
        echo "- Kernel package: $(rpm -q --qf "%{VERSION}-%{RELEASE}.%{ARCH}" kernel-core)"

        echo "INFO: informing nvidia-driver-ctr to fallback on entitled-build."
        touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_toolkit_broken"
        echo "INFO: nothing else to do in openshift-driver-toolkit-ctr container, sleeping forever."
        sleep +inf
    fi

    # Shared directory is prepared before entering this script. See
    # 'until [ -f /mnt/shared-nvidia-driver-toolkit/dir_prepared ] ...'
    # in the Pod command/args

    touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_build_started"

    if [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; then
        echo "NVIDIA drivers already generated, nothing to do ..."

        while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do
            sleep 30
        done
        echo "WARNING: driver_built flag disappeared, rebuilding the drivers ..."
    else
        echo "Start building nvidia.ko driver ..."
    fi

    set -x
    # Export every variable defined by the env file written by the driver
    # container.
    set -o allexport
    source "${DRIVER_TOOLKIT_SHARED_DIR}/env"
    set +o allexport

    DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64}
    echo "DRIVER_ARCH is $DRIVER_ARCH"

    # If this directory already exists,
    # NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run fails to run
    # and doesn't create its files. This may happen when the
    # container fails and restarts its execution, leading to
    # hard-to-understand "unrelated" errors later in the script.

    rm -rf "${DRIVER_TOOLKIT_SHARED_DIR}/drivers/NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}"

    # elfutils-libelf-devel.x86_64 is already installed in the DTK and is enough
    sed 's/elfutils-libelf.x86_64//' -i "${DRIVER_TOOLKIT_SHARED_DIR}/nvidia-driver"
    # install script assumes these directories can be deleted->recreated,
    # but recreation doesn't happen in the DTK
    sed 's|rm -rf /lib/modules/${KERNEL_VERSION}/video||' -i "${DRIVER_TOOLKIT_SHARED_DIR}/nvidia-driver"
    sed 's|rm -rf /lib/modules/${KERNEL_VERSION}||' -i "${DRIVER_TOOLKIT_SHARED_DIR}/nvidia-driver"

    mkdir -p "${DRIVER_TOOLKIT_SHARED_DIR}/bin"

    cp -v \
        "$DRIVER_TOOLKIT_SHARED_DIR/nvidia-driver" \
        "$DRIVER_TOOLKIT_SHARED_DIR/common.sh" \
        "$DRIVER_TOOLKIT_SHARED_DIR/extract-vmlinux" \
        "$DRIVER_TOOLKIT_SHARED_DIR/vgpu-util" \
        "${DRIVER_TOOLKIT_SHARED_DIR}/bin"

    # Shadow dnf with `true`: with the bin directory first in PATH, any dnf
    # invocation becomes a successful no-op. `type -P` forces a PATH lookup,
    # since `true` is also a shell builtin.
    ln -sf "$(type -P true)" "${DRIVER_TOOLKIT_SHARED_DIR}/bin/dnf"

    export PATH="${DRIVER_TOOLKIT_SHARED_DIR}/bin:$PATH"

    # install.sh script is mandatory
    cp "${DRIVER_TOOLKIT_SHARED_DIR}/install.sh" /tmp/

    cd "${DRIVER_TOOLKIT_SHARED_DIR}/drivers"
    echo "#"
    echo "# Executing nvidia-driver build script ..."
    echo "#"
    bash -x "${DRIVER_TOOLKIT_SHARED_DIR}/nvidia-driver" build --tag builtin

    echo "#"
    echo "# nvidia-driver build script completed."
    echo "#"

    # Run ls inside the condition: as a plain assignment, a failing `ls`
    # would abort the script under `set -e` before the FATAL message below
    # could be printed.
    if ! ls "/lib/modules/${kernel_release}/kernel/drivers/video"/nvidia*.ko 2>/dev/null; then
        echo "FATAL: no NVIDIA driver generated ..."
        exit 1
    fi

    MODULES_SHARED="${DRIVER_TOOLKIT_SHARED_DIR}/modules"
    mkdir -p "${MODULES_SHARED}"

    # prepare the list of modules required by NVIDIA

    modprobe -a i2c_core ipmi_msghandler ipmi_devintf --show-depends > "${MODULES_SHARED}/insmod_nvidia"
    modprobe -a nvidia nvidia-uvm nvidia-modeset --show-depends >> "${MODULES_SHARED}/insmod_nvidia"
    if _gpu_direct_rdma_enabled; then
        modprobe -a nvidia-peermem --show-depends >> "${MODULES_SHARED}/insmod_nvidia"
    fi
    set +x

    # copy the modules to the shared directory
    local action modsrc _rest moddir moddst
    while read -r action modsrc _rest; do
        # eg: "builtin i2c_core" - compiled into the kernel, nothing to copy
        if [[ "${action}" == "builtin" ]]; then
            continue
        fi
        # ignore blank or malformed lines rather than calling cp with an
        # empty source
        if [[ -z "${modsrc}" ]]; then
            continue
        fi
        # eg: "insmod /lib/modules/4.18.0-305.10.2.el8_4.x86_64/kernel/drivers/gpu/drm/drm.ko.x"
        # Keep the path relative to the kernel's module directory.
        moddir=$(dirname "${modsrc#"/lib/modules/${kernel_release}/"}")
        moddst="${MODULES_SHARED}/${moddir}"
        mkdir -p "${moddst}"
        cp -v "${modsrc}" "${moddst}"
    done < "${MODULES_SHARED}/insmod_nvidia"

    # copies modules location and dependency files
    cp "/lib/modules/${kernel_release}"/modules.* "${MODULES_SHARED}"

    echo "NVIDIA drivers generated, inform nvidia-driver-ctr container about it and sleep forever."
    touch "${DRIVER_TOOLKIT_SHARED_DIR}/driver_built"

    while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do
        sleep 30
    done

    echo "WARNING: driver_built flag disappeared, restart this container"

    exit 0
}
205+
206+
#######################################
# Print the command-line help on stderr and terminate the script.
# Outputs:
#   Usage text on stderr, listing the two supported commands.
# Exits:
#   Always exits with status 1.
#######################################
usage() {
    {
        printf 'Usage: %s COMMAND\n' "$0"
        printf '\n'
        printf 'Commands:\n'
        printf '%s\n' "dtk-build-driver" "nv-ctr-run-with-dtk"
    } >&2
    exit 1
}
216+
# Entry point: the first argument selects which function of this script to
# run. Any further arguments are ignored, as before.
if [ $# -eq 0 ]; then
    usage
fi

subcommand=$1
shift

# Only the two documented commands are accepted; usage exits with status 1.
case "${subcommand}" in
    dtk-build-driver | nv-ctr-run-with-dtk) ;;
    *) usage ;;
esac

if ! [ -d "${DRIVER_TOOLKIT_SHARED_DIR:-}" ]; then
    echo "FATAL: DRIVER_TOOLKIT_SHARED_DIR env variable must be populated with a valid directory"
    usage
fi

# The command name was validated above, so it is safe to invoke directly.
"${subcommand}"

0 commit comments

Comments
 (0)