@@ -24,6 +24,23 @@ function rpm_install() {
2424
2525echo " Installing NVIDIA ${NVIDIA_DRIVER_MAJOR_VERSION} drivers..."
2626
# The AL2023 GPU AMI builds and archives the following nvidia kernel modules
# in /var/lib/dkms-archive: nvidia, nvidia-open, nvidia-open-grid. To keep the
# AMI stable, all three kernel modules (and the userspace packages) must be on
# the same NVIDIA driver version. The GRID driver decides that version: pick the
# newest "<major>.<minor>.<patch>" for which a GRID runfile exists in the EC2
# driver bucket.
EC2_GRID_DRIVER_S3_BUCKET="ec2-linux-nvidia-drivers"

# Object keys look like ".../NVIDIA-Linux-x86_64-570.172.08-grid-aws.run"; the
# 4th '-'-separated field of the matched name is the full driver version.
# `|| true` keeps a failed listing / no-match grep from aborting the script under
# `set -e -o pipefail` before the explicit check below can report the problem.
NVIDIA_DRIVER_FULL_VERSION=$(aws s3 ls --recursive "s3://${EC2_GRID_DRIVER_S3_BUCKET}/" \
  | grep -Eo "NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_MAJOR_VERSION}\.[0-9]+\.[0-9]+-grid-aws\.run" \
  | cut -d'-' -f4 \
  | sort -V \
  | tail -1) || true

if [[ -z "${NVIDIA_DRIVER_FULL_VERSION}" ]]; then
  echo "ERROR: Could not determine the full nvidia driver version to install" >&2
  exit 1
fi
43+
2744# ###############################################################################
2845# ## Add repository #############################################################
2946# ###############################################################################
@@ -94,11 +111,16 @@ else
94111fi
95112
96113function archive-open-kmods() {
114+ local NVIDIA_OPEN_MODULE
97115 echo " Archiving open kmods"
116+
98117 if is-isolated-partition; then
99118 sudo dnf -y install " kmod-nvidia-open-dkms-${NVIDIA_DRIVER_MAJOR_VERSION} .*"
100119 else
101- sudo dnf -y module install nvidia-driver:${NVIDIA_DRIVER_MAJOR_VERSION} -open
120+ # Output of `sudo dnf module provides -q kmod-nvidia-open-dkms-570.172.08* | grep Module` is:
121+ # Module : nvidia-driver:570-open:20251009011129:f132e61741:x86_64
122+ NVIDIA_OPEN_MODULE=$( sudo dnf module provides -q kmod-nvidia-open-dkms-${NVIDIA_DRIVER_FULL_VERSION} * | grep Module | awk -F' : ' ' {print $2}' )
123+ sudo dnf -y module install ${NVIDIA_OPEN_MODULE}
102124 fi
103125 dkms status
104126 ls -la /var/lib/dkms/
@@ -107,7 +129,16 @@ function archive-open-kmods() {
107129
108130 # The open kernel module name changed from nvidia-open to nvidia in 570.148.08
109131 # Remove and re-add dkms module with the correct name. This maintains the current install and archive behavior
132+ local NVIDIA_OPEN_VERSION
110133 NVIDIA_OPEN_VERSION=$( kmod-util module-version nvidia)
134+
135+ # Sanity check to have consistent NVIDIA driver versions
136+ if [[ " $NVIDIA_OPEN_VERSION " != " $NVIDIA_DRIVER_FULL_VERSION " ]]; then
137+ echo " ERROR: NVIDIA open driver version ($NVIDIA_OPEN_VERSION ) does not match GRID driver version ($NVIDIA_DRIVER_FULL_VERSION )"
138+ echo " All NVIDIA drivers must be on the same version."
139+ exit 1
140+ fi
141+
111142 sudo dkms remove " nvidia/$NVIDIA_OPEN_VERSION " --all
112143 sudo sed -i ' s/PACKAGE_NAME="nvidia"/PACKAGE_NAME="nvidia-open"/' /usr/src/nvidia-$NVIDIA_OPEN_VERSION /dkms.conf
113144 sudo mv /usr/src/nvidia-$NVIDIA_OPEN_VERSION /usr/src/nvidia-open-$NVIDIA_OPEN_VERSION
@@ -117,10 +148,6 @@ function archive-open-kmods() {
117148
118149 sudo kmod-util archive nvidia-open
119150
120- # Copy the source files to a new directory for GRID driver installation
121- sudo mkdir /usr/src/nvidia-open-grid-$NVIDIA_OPEN_VERSION
122- sudo cp -R /usr/src/nvidia-open-$NVIDIA_OPEN_VERSION /* /usr/src/nvidia-open-grid-$NVIDIA_OPEN_VERSION
123-
124151 KMOD_MAJOR_VERSION=$( sudo kmod-util module-version nvidia-open | cut -d. -f1)
125152 SUPPORTED_DEVICE_FILE=" ${WORKING_DIR} /gpu/nvidia-open-supported-devices-${KMOD_MAJOR_VERSION} .txt"
126153 sudo mv " ${SUPPORTED_DEVICE_FILE} " /etc/eks/
@@ -131,43 +158,100 @@ function archive-open-kmods() {
131158 sudo dnf -y remove --all nvidia-driver
132159 sudo dnf -y remove --all " kmod-nvidia-open*"
133160 else
134- sudo dnf -y module remove --all nvidia-driver
135- sudo dnf -y module reset nvidia-driver
161+ sudo dnf -y module remove --all ${NVIDIA_OPEN_MODULE}
162+ sudo dnf -y module reset ${NVIDIA_OPEN_MODULE}
136163 fi
137164}
138165
#######################################
# Build and archive the NVIDIA GRID open kernel module as "nvidia-open-grid".
#
# Downloads the GRID runfile matching NVIDIA_DRIVER_FULL_VERSION from the EC2
# driver bucket, runs its installer in DKMS/open mode, re-registers the DKMS
# source tree under the package name "nvidia-open-grid", archives it with
# kmod-util and then removes the module, its source tree and the temp files.
#
# Globals (read): EC2_GRID_DRIVER_S3_BUCKET, NVIDIA_DRIVER_MAJOR_VERSION,
#                 NVIDIA_DRIVER_FULL_VERSION
# Returns: 0 immediately (no side effects) on non-x86_64 machines.
# Exits:   1 when no matching runfile is found or the work dir is unusable.
#######################################
function archive-grid-kmod() {
  local MACHINE
  local NVIDIA_GRID_RUNFILE_NAME
  local GRID_RUNFILE_LOCAL_NAME
  local GRID_INSTALLATION_TEMP_DIR
  local EXTRACT_DIR

  # Only x86_64 is handled. Check this before allocating anything so that other
  # architectures do not leave an orphaned temp directory behind.
  MACHINE=$(uname -m)
  if [[ "${MACHINE}" != "x86_64" ]]; then
    return
  fi

  echo "Archiving NVIDIA GRID kernel modules for major version ${NVIDIA_DRIVER_MAJOR_VERSION}"

  # `aws s3 ls` prints "<date> <time> <size> <key>": sort by timestamp and take
  # the key of the most recent object. -F matches the version literally (its
  # dots are not regex wildcards).
  NVIDIA_GRID_RUNFILE_NAME=$(aws s3 ls --recursive "s3://${EC2_GRID_DRIVER_S3_BUCKET}/" \
    | grep -F "NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_FULL_VERSION}" \
    | sort -k1,2 \
    | tail -1 \
    | awk '{print $4}')

  if [[ -z "${NVIDIA_GRID_RUNFILE_NAME}" ]]; then
    echo "ERROR: No GRID driver found for driver version ${NVIDIA_DRIVER_FULL_VERSION} in EC2 S3 bucket" >&2
    exit 1
  fi

  echo "Found GRID runfile: ${NVIDIA_GRID_RUNFILE_NAME}"
  GRID_RUNFILE_LOCAL_NAME=$(basename "${NVIDIA_GRID_RUNFILE_NAME}")

  # Allocate the scratch directory only once there is something to download.
  if ! GRID_INSTALLATION_TEMP_DIR=$(mktemp -d); then
    echo "ERROR: Could not create a temporary directory for the GRID driver" >&2
    exit 1
  fi
  EXTRACT_DIR="${GRID_INSTALLATION_TEMP_DIR}/NVIDIA-GRID-extract"

  echo "Downloading GRID driver runfile..."
  # Download from the same bucket that was listed above.
  aws s3 cp "s3://${EC2_GRID_DRIVER_S3_BUCKET}/${NVIDIA_GRID_RUNFILE_NAME}" "${GRID_INSTALLATION_TEMP_DIR}/${GRID_RUNFILE_LOCAL_NAME}"
  chmod +x "${GRID_INSTALLATION_TEMP_DIR}/${GRID_RUNFILE_LOCAL_NAME}"
  echo "Extracting NVIDIA GRID driver runfile..."
  sudo "${GRID_INSTALLATION_TEMP_DIR}/${GRID_RUNFILE_LOCAL_NAME}" --extract-only --target "${EXTRACT_DIR}"

  if ! pushd "${EXTRACT_DIR}"; then
    echo "ERROR: Could not enter ${EXTRACT_DIR}" >&2
    sudo rm -rf "${GRID_INSTALLATION_TEMP_DIR}"
    exit 1
  fi

  echo "Installing NVIDIA GRID kernel modules..."
  # NOTE(review): when the installer fails, its log is printed and execution
  # continues (the `||` branch succeeds). Confirm that continuing is intended;
  # the DKMS steps below will fail if the installer did not lay down its sources.
  sudo ./nvidia-installer \
    --dkms \
    --kernel-module-type open \
    --silent || sudo cat /var/log/nvidia-installer.log

  # Manual DKMS registration with package name changed to `nvidia-open-grid`
  sudo dkms remove "nvidia/${NVIDIA_DRIVER_FULL_VERSION}" --all
  sudo sed -i 's/PACKAGE_NAME="nvidia"/PACKAGE_NAME="nvidia-open-grid"/' "/usr/src/nvidia-${NVIDIA_DRIVER_FULL_VERSION}/dkms.conf"
  sudo mv "/usr/src/nvidia-${NVIDIA_DRIVER_FULL_VERSION}" "/usr/src/nvidia-open-grid-${NVIDIA_DRIVER_FULL_VERSION}"
  sudo dkms add -m nvidia-open-grid -v "${NVIDIA_DRIVER_FULL_VERSION}"
  sudo dkms build -m nvidia-open-grid -v "${NVIDIA_DRIVER_FULL_VERSION}"
  sudo dkms install -m nvidia-open-grid -v "${NVIDIA_DRIVER_FULL_VERSION}"

  sudo kmod-util archive nvidia-open-grid
  sudo kmod-util remove nvidia-open-grid
  sudo rm -rf /usr/src/nvidia-open-grid*

  popd
  sudo rm -rf "${GRID_INSTALLATION_TEMP_DIR}"
}
156225
#######################################
# Install, archive and then remove the proprietary NVIDIA kernel module.
#
# In isolated partitions the DKMS package for the major version is installed
# directly; otherwise the dnf module that provides the package for the exact
# driver version is looked up and installed. The installed module version must
# equal NVIDIA_DRIVER_FULL_VERSION before it is archived.
#
# Globals (read): NVIDIA_DRIVER_MAJOR_VERSION, NVIDIA_DRIVER_FULL_VERSION
# Exits: 1 when no providing dnf module is found or on a version mismatch.
#######################################
function archive-proprietary-kmod() {
  local NVIDIA_PROPRIETARY_MODULE
  local NVIDIA_PROPRIETARY_VERSION
  local -a module_specs=()

  echo "Archiving proprietary kmods"

  if is-isolated-partition; then
    sudo dnf -y install "kmod-nvidia-latest-dkms-${NVIDIA_DRIVER_MAJOR_VERSION}.*"
  else
    # Output of `sudo dnf module provides -q kmod-nvidia-latest-dkms-570.172.08* | grep Module` is:
    # Module : nvidia-driver:570-dkms:20251009011129:61f77618b4:x86_64
    # The glob is quoted so dnf (not the shell) expands it.
    NVIDIA_PROPRIETARY_MODULE=$(sudo dnf module provides -q "kmod-nvidia-latest-dkms-${NVIDIA_DRIVER_FULL_VERSION}*" \
      | awk -F': ' '/Module/ && $2 != "" {print $2}')

    if [[ -z "${NVIDIA_PROPRIETARY_MODULE}" ]]; then
      echo "ERROR: No dnf module provides kmod-nvidia-latest-dkms-${NVIDIA_DRIVER_FULL_VERSION}" >&2
      exit 1
    fi

    # One module spec per output line; pass each as its own argument.
    mapfile -t module_specs <<< "${NVIDIA_PROPRIETARY_MODULE}"
    sudo dnf -y module install "${module_specs[@]}"
  fi

  NVIDIA_PROPRIETARY_VERSION=$(kmod-util module-version nvidia)

  # Sanity check to have consistent NVIDIA driver versions
  if [[ "${NVIDIA_PROPRIETARY_VERSION}" != "${NVIDIA_DRIVER_FULL_VERSION}" ]]; then
    echo "ERROR: NVIDIA proprietary driver version (${NVIDIA_PROPRIETARY_VERSION}) does not match GRID driver version (${NVIDIA_DRIVER_FULL_VERSION})" >&2
    echo "All NVIDIA drivers must be on the same version. GRID driver determines the version." >&2
    exit 1
  fi

  sudo kmod-util archive nvidia
  sudo kmod-util remove nvidia
  sudo rm -rf /usr/src/nvidia*
}
168252
# Run the three archive steps in sequence: GRID, then open, then proprietary.
# NOTE(review): this change moves the GRID step ahead of the open step; the
# reason for the ordering is not visible here -- confirm before reordering.
169- archive-open-kmods
170253archive-grid-kmod
254+ archive-open-kmods
171255archive-proprietary-kmod
172256
173257# ###############################################################################
0 commit comments