Skip to content

Commit 0790ce1

Browse files
authored
fix(al2023): install grid drivers from ec2 grid runfile (#2450)
* fix: install grid drivers from ec2 grid runfile
1 parent 405b829 commit 0790ce1

File tree

1 file changed

+99
-15
lines changed

1 file changed

+99
-15
lines changed

templates/al2023/provisioners/install-nvidia-driver.sh

Lines changed: 99 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,23 @@ function rpm_install() {
2424

2525
echo "Installing NVIDIA ${NVIDIA_DRIVER_MAJOR_VERSION} drivers..."
2626

27+
# The AL2023 GPU AMI currently builds and archives the following nvidia kernel modules
28+
# in /var/lib/dkms-archive: nvidia, nvidia-open, nvidia-open-grid. To maintain the stability
29+
# of the AMI, we want to ensure that all three kernel modules (and also the userspace modules)
30+
# are on the same NVIDIA driver version. The script handles the NVIDIA GRID drivers first: the highest
31+
# GRID runfile version published for the major version becomes the full driver version for the whole AMI.
32+
EC2_GRID_DRIVER_S3_BUCKET="ec2-linux-nvidia-drivers"
33+
NVIDIA_DRIVER_FULL_VERSION=$(aws s3 ls --recursive s3://${EC2_GRID_DRIVER_S3_BUCKET}/ \
34+
| grep -Eo "(NVIDIA-Linux-x86_64-)${NVIDIA_DRIVER_MAJOR_VERSION}\.[0-9]+\.[0-9]+(-grid-aws\.run)" \
35+
| cut -d'-' -f4 \
36+
| sort -V \
37+
| tail -1)
38+
39+
if [[ -z "$NVIDIA_DRIVER_FULL_VERSION" ]]; then
40+
echo "ERROR: Could not determine the full nvidia driver version to install"
41+
exit 1
42+
fi
43+
2744
################################################################################
2845
### Add repository #############################################################
2946
################################################################################
@@ -94,11 +111,16 @@ else
94111
fi
95112

96113
function archive-open-kmods() {
114+
local NVIDIA_OPEN_MODULE
97115
echo "Archiving open kmods"
116+
98117
if is-isolated-partition; then
99118
sudo dnf -y install "kmod-nvidia-open-dkms-${NVIDIA_DRIVER_MAJOR_VERSION}.*"
100119
else
101-
sudo dnf -y module install nvidia-driver:${NVIDIA_DRIVER_MAJOR_VERSION}-open
120+
# Output of `sudo dnf module provides -q kmod-nvidia-open-dkms-570.172.08* | grep Module` is:
121+
# Module : nvidia-driver:570-open:20251009011129:f132e61741:x86_64
122+
NVIDIA_OPEN_MODULE=$(sudo dnf module provides -q kmod-nvidia-open-dkms-${NVIDIA_DRIVER_FULL_VERSION}* | grep Module | awk -F' : ' '{print $2}')
123+
sudo dnf -y module install ${NVIDIA_OPEN_MODULE}
102124
fi
103125
dkms status
104126
ls -la /var/lib/dkms/
@@ -107,7 +129,16 @@ function archive-open-kmods() {
107129

108130
# The open kernel module name changed from nvidia-open to nvidia in 570.148.08
109131
# Remove and re-add dkms module with the correct name. This maintains the current install and archive behavior
132+
local NVIDIA_OPEN_VERSION
110133
NVIDIA_OPEN_VERSION=$(kmod-util module-version nvidia)
134+
135+
# Sanity check: the installed open driver version must match the GRID-derived full driver version
136+
if [[ "$NVIDIA_OPEN_VERSION" != "$NVIDIA_DRIVER_FULL_VERSION" ]]; then
137+
echo "ERROR: NVIDIA open driver version ($NVIDIA_OPEN_VERSION) does not match GRID driver version ($NVIDIA_DRIVER_FULL_VERSION)"
138+
echo "All NVIDIA drivers must be on the same version."
139+
exit 1
140+
fi
141+
111142
sudo dkms remove "nvidia/$NVIDIA_OPEN_VERSION" --all
112143
sudo sed -i 's/PACKAGE_NAME="nvidia"/PACKAGE_NAME="nvidia-open"/' /usr/src/nvidia-$NVIDIA_OPEN_VERSION/dkms.conf
113144
sudo mv /usr/src/nvidia-$NVIDIA_OPEN_VERSION /usr/src/nvidia-open-$NVIDIA_OPEN_VERSION
@@ -117,10 +148,6 @@ function archive-open-kmods() {
117148

118149
sudo kmod-util archive nvidia-open
119150

120-
# Copy the source files to a new directory for GRID driver installation
121-
sudo mkdir /usr/src/nvidia-open-grid-$NVIDIA_OPEN_VERSION
122-
sudo cp -R /usr/src/nvidia-open-$NVIDIA_OPEN_VERSION/* /usr/src/nvidia-open-grid-$NVIDIA_OPEN_VERSION
123-
124151
KMOD_MAJOR_VERSION=$(sudo kmod-util module-version nvidia-open | cut -d. -f1)
125152
SUPPORTED_DEVICE_FILE="${WORKING_DIR}/gpu/nvidia-open-supported-devices-${KMOD_MAJOR_VERSION}.txt"
126153
sudo mv "${SUPPORTED_DEVICE_FILE}" /etc/eks/
@@ -131,43 +158,100 @@ function archive-open-kmods() {
131158
sudo dnf -y remove --all nvidia-driver
132159
sudo dnf -y remove --all "kmod-nvidia-open*"
133160
else
134-
sudo dnf -y module remove --all nvidia-driver
135-
sudo dnf -y module reset nvidia-driver
161+
sudo dnf -y module remove --all ${NVIDIA_OPEN_MODULE}
162+
sudo dnf -y module reset ${NVIDIA_OPEN_MODULE}
136163
fi
137164
}
138165

139166
# Builds the NVIDIA GRID open kernel module from the EC2 GRID runfile and archives
# it under the DKMS package name `nvidia-open-grid`.
#
# Globals read: NVIDIA_DRIVER_MAJOR_VERSION, NVIDIA_DRIVER_FULL_VERSION,
#               EC2_GRID_DRIVER_S3_BUCKET
# Returns early (no-op) when `uname -m` is not x86_64.
# Exits 1 when no GRID runfile for NVIDIA_DRIVER_FULL_VERSION is listed in the bucket.
function archive-grid-kmod() {
  local MACHINE
  local NVIDIA_GRID_RUNFILE_NAME
  local GRID_RUNFILE_LOCAL_NAME
  local GRID_INSTALLATION_TEMP_DIR
  local EXTRACT_DIR

  # Check the architecture before touching anything so the early return leaves
  # no temporary directory behind.
  MACHINE=$(uname -m)
  if [ "$MACHINE" != "x86_64" ]; then
    return
  fi

  echo "Archiving NVIDIA GRID kernel modules for major version ${NVIDIA_DRIVER_MAJOR_VERSION}"
  # Fixed-string match on the complete runfile name: the dots in the version are
  # literal and only `-grid-aws.run` files qualify. When several keys match, the
  # most recently modified one (date and time are listing columns 1-2) wins.
  # `|| true` keeps a no-match from aborting the assignment under errexit/pipefail,
  # so the explicit error below is what gets reported.
  NVIDIA_GRID_RUNFILE_NAME=$(aws s3 ls --recursive "s3://${EC2_GRID_DRIVER_S3_BUCKET}/" \
    | grep -F -- "NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_FULL_VERSION}-grid-aws.run" \
    | sort -k1,2 \
    | tail -1 \
    | awk '{print $4}') || true

  if [[ -z "$NVIDIA_GRID_RUNFILE_NAME" ]]; then
    echo "ERROR: No GRID driver found for driver version ${NVIDIA_DRIVER_FULL_VERSION} in EC2 S3 bucket"
    exit 1
  fi

  echo "Found GRID runfile: ${NVIDIA_GRID_RUNFILE_NAME}"
  GRID_RUNFILE_LOCAL_NAME=$(basename "${NVIDIA_GRID_RUNFILE_NAME}")

  # Create the scratch directory only once there is something to download.
  GRID_INSTALLATION_TEMP_DIR=$(mktemp -d)
  EXTRACT_DIR="${GRID_INSTALLATION_TEMP_DIR}/NVIDIA-GRID-extract"

  echo "Downloading GRID driver runfile..."
  aws s3 cp "s3://${EC2_GRID_DRIVER_S3_BUCKET}/${NVIDIA_GRID_RUNFILE_NAME}" "${GRID_INSTALLATION_TEMP_DIR}/${GRID_RUNFILE_LOCAL_NAME}"
  chmod +x "${GRID_INSTALLATION_TEMP_DIR}/${GRID_RUNFILE_LOCAL_NAME}"
  echo "Extracting NVIDIA GRID driver runfile..."
  sudo "${GRID_INSTALLATION_TEMP_DIR}/${GRID_RUNFILE_LOCAL_NAME}" --extract-only --target "${EXTRACT_DIR}"

  pushd "${EXTRACT_DIR}"

  echo "Installing NVIDIA GRID kernel modules..."
  # NOTE(review): an installer failure only dumps the log and execution continues;
  # the dkms steps below are then the first thing that can fail. Confirm this
  # best-effort behavior is intended.
  sudo ./nvidia-installer \
    --dkms \
    --kernel-module-type open \
    --silent || sudo cat /var/log/nvidia-installer.log

  # Manual DKMS registration with package name changed to `nvidia-open-grid`
  sudo dkms remove "nvidia/${NVIDIA_DRIVER_FULL_VERSION}" --all
  sudo sed -i 's/PACKAGE_NAME="nvidia"/PACKAGE_NAME="nvidia-open-grid"/' "/usr/src/nvidia-${NVIDIA_DRIVER_FULL_VERSION}/dkms.conf"
  sudo mv "/usr/src/nvidia-${NVIDIA_DRIVER_FULL_VERSION}" "/usr/src/nvidia-open-grid-${NVIDIA_DRIVER_FULL_VERSION}"
  sudo dkms add -m nvidia-open-grid -v "${NVIDIA_DRIVER_FULL_VERSION}"
  sudo dkms build -m nvidia-open-grid -v "${NVIDIA_DRIVER_FULL_VERSION}"
  sudo dkms install -m nvidia-open-grid -v "${NVIDIA_DRIVER_FULL_VERSION}"

  sudo kmod-util archive nvidia-open-grid
  sudo kmod-util remove nvidia-open-grid
  sudo rm -rf /usr/src/nvidia-open-grid*

  popd
  sudo rm -rf "${GRID_INSTALLATION_TEMP_DIR}"
}
156225

157226
# Installs the proprietary NVIDIA kernel module, checks that its version equals
# NVIDIA_DRIVER_FULL_VERSION, archives it with kmod-util and then removes it.
#
# Globals read: NVIDIA_DRIVER_MAJOR_VERSION, NVIDIA_DRIVER_FULL_VERSION
# Exits 1 when no dnf module provides the pinned package, or when the installed
# module version differs from NVIDIA_DRIVER_FULL_VERSION.
function archive-proprietary-kmod() {
  local NVIDIA_PROPRIETARY_MODULE
  local NVIDIA_PROPRIETARY_VERSION

  echo "Archiving proprietary kmods"

  if is-isolated-partition; then
    sudo dnf -y install "kmod-nvidia-latest-dkms-${NVIDIA_DRIVER_MAJOR_VERSION}.*"
  else
    # Output of `sudo dnf module provides -q kmod-nvidia-latest-dkms-570.172.08* | grep Module` is:
    # Module   : nvidia-driver:570-dkms:20251009011129:61f77618b4:x86_64
    # The pattern is quoted so dnf, not the shell, expands the trailing `*`.
    NVIDIA_PROPRIETARY_MODULE=$(sudo dnf module provides -q "kmod-nvidia-latest-dkms-${NVIDIA_DRIVER_FULL_VERSION}*" \
      | awk -F' : ' '/Module/ {print $2}')

    if [[ -z "$NVIDIA_PROPRIETARY_MODULE" ]]; then
      echo "ERROR: Could not find a dnf module providing kmod-nvidia-latest-dkms-${NVIDIA_DRIVER_FULL_VERSION}"
      exit 1
    fi

    # Left unquoted on purpose: if the lookup returns more than one module spec,
    # word splitting passes each one to dnf as a separate argument.
    # shellcheck disable=SC2086
    sudo dnf -y module install ${NVIDIA_PROPRIETARY_MODULE}
  fi

  NVIDIA_PROPRIETARY_VERSION=$(kmod-util module-version nvidia)

  if [[ "$NVIDIA_PROPRIETARY_VERSION" != "$NVIDIA_DRIVER_FULL_VERSION" ]]; then
    echo "ERROR: NVIDIA proprietary driver version ($NVIDIA_PROPRIETARY_VERSION) does not match GRID driver version ($NVIDIA_DRIVER_FULL_VERSION)"
    echo "All NVIDIA drivers must be on the same version. GRID driver determines the version."
    exit 1
  fi

  sudo kmod-util archive nvidia
  sudo kmod-util remove nvidia
  sudo rm -rf /usr/src/nvidia*
}
168252

169-
archive-open-kmods
170253
archive-grid-kmod
254+
archive-open-kmods
171255
archive-proprietary-kmod
172256

173257
################################################################################

0 commit comments

Comments
 (0)