1
+ #! /usr/bin/env bash
2
+ # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3
+
4
# Abort on any unhandled command failure and on use of unset variables.
set -eu

# Directory used to exchange files and flag files between the two containers
# of the driver Pod (see the 'dir_prepared', 'driver_build_started',
# 'driver_built' and 'driver_toolkit_broken' flags below).
DRIVER_TOOLKIT_SHARED_DIR=/mnt/shared-nvidia-driver-toolkit

echo "Running $*"

# Resolve the directory holding this script so that common.sh is found
# regardless of the caller's working directory.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# shellcheck source=/dev/null
source "${SCRIPT_DIR}/common.sh"
12
+
13
#######################################
# Driver-container side of the Driver Toolkit (DTK) workflow.
#
# Publishes the driver sources, helper scripts and the current environment
# into the shared directory, waits for the openshift-driver-toolkit-ctr
# container to build the kernel modules, copies the built modules under
# /lib/modules/<running kernel> and finally hands over to
# 'nvidia-driver load'.
#
# Falls back to the entitlement-based build ('nvidia-driver init') when
# RHCOS_IMAGE_MISSING is "true" or when the 'driver_toolkit_broken' flag
# file shows up in the shared directory.
#
# Globals:
#   DRIVER_TOOLKIT_SHARED_DIR (read)
#   RHCOS_IMAGE_MISSING, RHCOS_VERSION (read, optional)
#   MODULES_SHARED, MODULES_LOCAL (written)
# Returns:
#   Does not return on the paths shown here: the shell is replaced via
#   'exec' by the nvidia-driver script.
#######################################
nv-ctr-run-with-dtk () {
    set -x

    if [[ "${RHCOS_IMAGE_MISSING:-}" == "true" ]]; then
        echo "WARNING: RHCOS '${RHCOS_VERSION:-}' imagetag missing, using entitlement-based fallback"
        exec bash -x nvidia-driver init
    fi

    # Populate the shared directory only once; 'dir_prepared' is the marker
    # that the copy below already completed.
    if [[ ! -f "${DRIVER_TOOLKIT_SHARED_DIR}/dir_prepared" ]]; then
        cp -r \
           /tmp/install.sh \
           /usr/local/bin/ocp_dtk_entrypoint \
           /usr/local/bin/nvidia-driver \
           /usr/local/bin/common.sh \
           /usr/local/bin/extract-vmlinux \
           /usr/local/bin/vgpu-util \
           /drivers \
           /licenses \
           "${DRIVER_TOOLKIT_SHARED_DIR}/"

        # Dump the environment as NAME="value" lines so that it can be
        # sourced by dtk-build-driver.
        # NOTE(review): values containing double quotes or newlines will not
        # survive this quoting -- confirm none are expected.
        env | sed -e 's/=/="/' -e 's/$/"/' > "${DRIVER_TOOLKIT_SHARED_DIR}/env"

        touch "${DRIVER_TOOLKIT_SHARED_DIR}/dir_prepared"
    fi

    # Tracing is silenced while polling to keep the logs readable.
    set +x
    while [[ ! -f "${DRIVER_TOOLKIT_SHARED_DIR}/driver_build_started" ]]; do
        if [[ -f "${DRIVER_TOOLKIT_SHARED_DIR}/driver_toolkit_broken" ]]; then
            echo "WARNING: broken driver toolkit detected, using entitlement-based fallback"
            exec bash -x nvidia-driver init
        fi
        echo "$(date) Waiting for openshift-driver-toolkit-ctr container to start ..."
        sleep 15
    done

    echo "$(date) openshift-driver-toolkit-ctr started."

    while [[ ! -f "${DRIVER_TOOLKIT_SHARED_DIR}/driver_built" ]]; do
        echo "$(date) Waiting for openshift-driver-toolkit-ctr container to build the precompiled driver ..."
        sleep 15
    done
    set -x

    MODULES_SHARED=${DRIVER_TOOLKIT_SHARED_DIR}/modules/

    # Copy the modules to their standard location
    MODULES_LOCAL="/lib/modules/$(uname -r)"
    mkdir -p "${MODULES_LOCAL}"

    cp -rv "${MODULES_SHARED}"/* "${MODULES_LOCAL}"

    # tell SELinux to allow loading these files
    # NOTE(review): this walks the current directory and only matches
    # '*.txt' / '*.go' files, not the kernel modules copied above -- the
    # pattern and the search root look unintended; confirm before changing.
    find . -type f \
         \( -name "*.txt" -or -name "*.go" \) \
         -exec chcon -t modules_object_t "{}" \;

    echo "#"
    echo "# Executing nvidia-driver load script ..."
    echo "#"

    exec bash -x nvidia-driver load
}
75
+
76
#######################################
# Driver Toolkit (openshift-driver-toolkit-ctr) side of the workflow.
#
# Builds the NVIDIA kernel modules with the 'nvidia-driver build' script
# found in the shared directory, then publishes the modules (and the
# modules they depend on) under ${DRIVER_TOOLKIT_SHARED_DIR}/modules and
# raises the 'driver_built' flag for the nvidia-driver-ctr container.
#
# Globals:
#   DRIVER_TOOLKIT_SHARED_DIR (read)
#   RHCOS_IMAGE_MISSING, RHCOS_VERSION (read, optional)
#   TARGETARCH, DRIVER_VERSION (read; expected from the sourced 'env' file
#     or the container environment)
#   DRIVER_ARCH, MODULES_SHARED, PATH (written)
# Returns:
#   Exits 0 once the 'driver_built' flag disappears (so that the container
#   gets restarted), exits 1 when no NVIDIA module was produced, and sleeps
#   forever when there is nothing to build.
#######################################
dtk-build-driver () {
    if [[ "${RHCOS_IMAGE_MISSING:-}" == "true" ]]; then
        # ':-' keeps 'set -u' from aborting before the warning is printed
        # when RHCOS_VERSION is not set.
        echo "WARNING: 'istag/driver-toolkit:${RHCOS_VERSION:-} -n openshift' missing, nothing to do in openshift-driver-toolkit-ctr container"
        sleep +inf
    fi

    local running_kernel
    running_kernel=$(uname -r)

    if ! [[ -f "/lib/modules/${running_kernel}/vmlinuz" ]]; then
        echo "WARNING: broken Driver Toolkit image detected:"
        echo "- Node kernel: ${running_kernel}"
        echo "- Kernel package: $(rpm -q --qf "%{VERSION}-%{RELEASE}.%{ARCH}" kernel-core)"

        echo "INFO: informing nvidia-driver-ctr to fallback on entitled-build."
        touch "${DRIVER_TOOLKIT_SHARED_DIR}/driver_toolkit_broken"
        echo "INFO: nothing else to do in openshift-driver-toolkit-ctr container, sleeping forever."
        sleep +inf
    fi

    # Shared directory is prepared before entering this script. See
    # 'until [ -f /mnt/shared-nvidia-driver-toolkit/dir_prepared ] ...'
    # in the Pod command/args

    touch "${DRIVER_TOOLKIT_SHARED_DIR}/driver_build_started"

    if [[ -f "${DRIVER_TOOLKIT_SHARED_DIR}/driver_built" ]]; then
        echo "NVIDIA drivers already generated, nothing to do ..."

        while [[ -f "${DRIVER_TOOLKIT_SHARED_DIR}/driver_built" ]]; do
            sleep 30
        done
        echo "WARNING: driver_built flag disappeared, rebuilding the drivers ..."
    else
        echo "Start building nvidia.ko driver ..."
    fi

    set -x
    # Import (and export) the environment captured by the driver container.
    set -o allexport
    # shellcheck source=/dev/null
    source "${DRIVER_TOOLKIT_SHARED_DIR}/env"
    set +o allexport

    # Deliberately not 'local': if DRIVER_ARCH was exported by the sourced
    # env file, the updated value must stay visible to the build script.
    DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64}
    echo "DRIVER_ARCH is $DRIVER_ARCH"

    # if this directory already exists,
    # NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run fails to run
    # and doesn't create its files. This may happen when the
    # container fails and restarts its execution, leading to
    # hard-to-understand "unrelated" errors later in the script execution

    rm -rf "${DRIVER_TOOLKIT_SHARED_DIR}/drivers/NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}"

    # elfutils-libelf-devel.x86_64 is already installed in the DTK and enough
    sed 's/elfutils-libelf.x86_64//' -i "${DRIVER_TOOLKIT_SHARED_DIR}/nvidia-driver"
    # install script assumes these directories can be deleted->recreated,
    # but recreation doesn't happen in the DTK
    sed 's|rm -rf /lib/modules/${KERNEL_VERSION}/video||' -i "${DRIVER_TOOLKIT_SHARED_DIR}/nvidia-driver"
    sed 's|rm -rf /lib/modules/${KERNEL_VERSION}||' -i "${DRIVER_TOOLKIT_SHARED_DIR}/nvidia-driver"

    mkdir -p "${DRIVER_TOOLKIT_SHARED_DIR}/bin"

    cp -v \
       "${DRIVER_TOOLKIT_SHARED_DIR}/nvidia-driver" \
       "${DRIVER_TOOLKIT_SHARED_DIR}/common.sh" \
       "${DRIVER_TOOLKIT_SHARED_DIR}/extract-vmlinux" \
       "${DRIVER_TOOLKIT_SHARED_DIR}/vgpu-util" \
       "${DRIVER_TOOLKIT_SHARED_DIR}/bin"

    # Shadow 'dnf' with a no-op on PATH. 'type -P' forces a PATH lookup
    # ('true' is also a shell builtin, so 'command -v' would not return a
    # file path).
    ln -s --force "$(type -P true)" "${DRIVER_TOOLKIT_SHARED_DIR}/bin/dnf"

    export PATH="${DRIVER_TOOLKIT_SHARED_DIR}/bin:$PATH"

    # install.sh script is mandatory
    cp "${DRIVER_TOOLKIT_SHARED_DIR}/install.sh" /tmp/

    cd "${DRIVER_TOOLKIT_SHARED_DIR}/drivers"
    echo "#"
    echo "# Executing nvidia-driver build script ..."
    echo "#"
    bash -x "${DRIVER_TOOLKIT_SHARED_DIR}/nvidia-driver" build --tag builtin

    echo "#"
    echo "# nvidia-driver build script completed."
    echo "#"

    # Expand the glob directly instead of parsing 'ls': when nothing
    # matches, the pattern stays literal and the '-e' test fails, so the
    # FATAL message is actually reached (a failing 'ls' inside an
    # assignment would abort the script first under 'set -e').
    local -a built_modules=( "/lib/modules/${running_kernel}/kernel/drivers/video/"nvidia*.ko )
    if [[ ! -e "${built_modules[0]}" ]]; then
        echo "FATAL: no NVIDIA driver generated ..."
        exit 1
    fi
    printf '%s\n' "${built_modules[@]}"

    MODULES_SHARED="${DRIVER_TOOLKIT_SHARED_DIR}/modules"
    mkdir -p "${MODULES_SHARED}"

    # prepare the list of modules required by NVIDIA

    modprobe -a i2c_core ipmi_msghandler ipmi_devintf --show-depends > "${MODULES_SHARED}/insmod_nvidia"
    modprobe -a nvidia nvidia-uvm nvidia-modeset --show-depends >> "${MODULES_SHARED}/insmod_nvidia"
    if _gpu_direct_rdma_enabled; then
        modprobe -a nvidia-peermem --show-depends >> "${MODULES_SHARED}/insmod_nvidia"
    fi
    set +x

    # copy the modules to the shared directory
    local kind modsrc modrel moddst
    while read -r kind modsrc _; do
        if [[ "${kind}" == "builtin" ]]; then
            # eg: line="builtin i2c_core"
            continue
        fi
        # Skip blank lines: there is nothing to copy.
        if [[ -z "${modsrc}" ]]; then
            continue
        fi
        # eg: line="insmod /lib/modules/4.18.0-305.10.2.el8_4.x86_64/kernel/drivers/gpu/drm/drm.ko.x"
        # Keep the path relative to the kernel's module tree so that the
        # same layout is recreated under the shared directory.
        modrel=${modsrc#"/lib/modules/${running_kernel}/"}
        moddst="${MODULES_SHARED}/$(dirname -- "${modrel}")"
        mkdir -p "${moddst}"
        cp -v "${modsrc}" "${moddst}"
    done < "${MODULES_SHARED}/insmod_nvidia"

    # copies modules location and dependency files
    cp "/lib/modules/${running_kernel}/"modules.* "${MODULES_SHARED}"

    echo "NVIDIA drivers generated, inform nvidia-driver-ctr container about it and sleep forever."
    touch "${DRIVER_TOOLKIT_SHARED_DIR}/driver_built"

    while [[ -f "${DRIVER_TOOLKIT_SHARED_DIR}/driver_built" ]]; do
        sleep 30
    done

    echo "WARNING: driver_built flag disappeared, restart this container"

    exit 0
}
205
+
206
#######################################
# Print the command-line help on stderr and terminate the script.
# Outputs:
#   Usage text on stderr.
# Returns:
#   Never returns; exits with status 1.
#######################################
usage () {
    cat >&2 <<EOF
Usage: $0 COMMAND

Commands:
  dtk-build-driver
  nv-ctr-run-with-dtk
EOF
    exit 1
}
216
# Command dispatch: the first argument selects the function to run.
if [ $# -eq 0 ]; then
    usage
fi

command=$1; shift
case "${command}" in
    dtk-build-driver | nv-ctr-run-with-dtk) ;;
    *) usage ;;
esac

# Neither command takes options: any trailing argument is discarded, as it
# has always been, so existing callers keep working.
set --

if ! [ -d "${DRIVER_TOOLKIT_SHARED_DIR:-}" ]; then
    echo "FATAL: DRIVER_TOOLKIT_SHARED_DIR env variable must be populated with a valid directory"
    usage
fi

"${command}"
0 commit comments