Skip to content

Commit 3cb8b05

Browse files
committed
chore(al2023/nvidia): default to 580 driver for 1.34+
1 parent 81179a5 commit 3cb8b05

File tree

10 files changed

+397
-19
lines changed

10 files changed

+397
-19
lines changed

templates/al2023/provisioners/install-nvidia-driver.sh

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,9 @@ function archive-open-kmods() {
119119
else
120120
# Output of `sudo dnf module provides -q kmod-nvidia-open-dkms-570.172.08* | grep Module` is:
121121
# Module : nvidia-driver:570-open:20251009011129:f132e61741:x86_64
122-
NVIDIA_OPEN_MODULE=$(sudo dnf module provides -q kmod-nvidia-open-dkms-${NVIDIA_DRIVER_FULL_VERSION}* | grep Module | awk -F' : ' '{print $2}')
122+
# There is also a latest module stream that may provide the same package version, so we select only
123+
# the one with the version in the name
124+
NVIDIA_OPEN_MODULE=$(sudo dnf module provides -q kmod-nvidia-open-dkms-${NVIDIA_DRIVER_FULL_VERSION}* | grep Module | awk -F' : ' '{print $2}' | grep "$NVIDIA_DRIVER_MAJOR_VERSION")
123125
sudo dnf -y module install ${NVIDIA_OPEN_MODULE}
124126
fi
125127
dkms status
@@ -232,7 +234,9 @@ function archive-proprietary-kmod() {
232234
else
233235
# Output of `sudo dnf module provides -q kmod-nvidia-latest-dkms-570.172.08* | grep Module` is:
234236
# Module : nvidia-driver:570-dkms:20251009011129:61f77618b4:x86_64
235-
NVIDIA_PROPRIETARY_MODULE=$(sudo dnf module provides -q kmod-nvidia-latest-dkms-${NVIDIA_DRIVER_FULL_VERSION}* | grep Module | awk -F' : ' '{print $2}')
237+
# There is also a latest module stream that may provide the same package version, so we select only
238+
# the one with the version in the name
239+
NVIDIA_PROPRIETARY_MODULE=$(sudo dnf module provides -q kmod-nvidia-latest-dkms-${NVIDIA_DRIVER_FULL_VERSION}* | grep Module | awk -F' : ' '{print $2}' | grep "$NVIDIA_DRIVER_MAJOR_VERSION")
236240
sudo dnf -y module install ${NVIDIA_PROPRIETARY_MODULE}
237241
fi
238242

@@ -282,14 +286,30 @@ sudo systemctl enable set-nvidia-clocks.service
282286
################################################################################
283287
### Install other dependencies #################################################
284288
################################################################################
285-
sudo dnf -y install "nvidia-fabric-manager-${NVIDIA_DRIVER_MAJOR_VERSION}.*"
286-
sudo dnf -y install "nvidia-imex-${NVIDIA_DRIVER_MAJOR_VERSION}.*"
289+
if [[ "$NVIDIA_DRIVER_MAJOR_VERSION" -lt "580" ]]; then
290+
# driver versions before 580 name the package with a dash between "fabric" and "manager"
291+
sudo dnf -y install "nvidia-fabric-manager-${NVIDIA_DRIVER_FULL_VERSION}"
292+
else
293+
sudo dnf -y install "nvidia-fabricmanager-${NVIDIA_DRIVER_FULL_VERSION}"
294+
fi
295+
# versions of nvidia-imex < 580 use nvidia-imex-<major-version>-<full-version>
296+
sudo dnf -y install "nvidia-imex-${NVIDIA_DRIVER_MAJOR_VERSION}*"
287297

288298
# NVIDIA Container toolkit needs to be locally installed for isolated partitions, also install NVIDIA-Persistenced
289299
if is-isolated-partition; then
290300
sudo dnf -y install nvidia-container-toolkit
291-
sudo dnf -y install "nvidia-persistenced-${NVIDIA_DRIVER_MAJOR_VERSION}.*"
292-
sudo dnf -y install "nvidia-driver-cuda-${NVIDIA_DRIVER_MAJOR_VERSION}.*"
301+
sudo dnf -y install "nvidia-persistenced-${NVIDIA_DRIVER_FULL_VERSION}"
302+
sudo dnf -y install "nvidia-driver-cuda-${NVIDIA_DRIVER_FULL_VERSION}"
303+
# TODO: standardize this across partitions
304+
if [[ "$NVIDIA_DRIVER_MAJOR_VERSION" -ge "580" ]]; then
305+
sudo dnf -y install \
306+
"libnvidia-fbc-${NVIDIA_DRIVER_FULL_VERSION}" \
307+
"nvidia-driver-${NVIDIA_DRIVER_FULL_VERSION}" \
308+
"nvidia-libXNVCtrl-devel-${NVIDIA_DRIVER_FULL_VERSION}" \
309+
"nvidia-settings-${NVIDIA_DRIVER_FULL_VERSION}" \
310+
"nvidia-xconfig-${NVIDIA_DRIVER_FULL_VERSION}" \
311+
"xorg-x11-nvidia-${NVIDIA_DRIVER_FULL_VERSION}"
312+
fi
293313
else
294314
sudo dnf -y install nvidia-container-toolkit
295315
fi

templates/al2023/runtime/gpu/nvidia-kmod-load.sh

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@ else
7373
MODULE_NAME="nvidia"
7474
fi
7575

76+
# TODO: disable dkms autoinstall and write these configurations to /run to ensure even lower
77+
# priority (more room for user overrides) and expected behavior between reboots
78+
# dkms install is enabled by default for nvidia modules through the AUTOINSTALL="yes" line
79+
# in the dkms.conf, which is picked up and run by dkms.service at boot
7680
# Disable GSP for the nvidia kernel module by writing a modprobe.d options file.
# Takes no arguments. Overwrites /etc/modprobe.d/nvidia-disable-gsp.conf with a
# single "options nvidia NVreg_EnableGpuFirmware=0" line.
function disable-gsp() {
7781
echo "options nvidia NVreg_EnableGpuFirmware=0" > /etc/modprobe.d/nvidia-disable-gsp.conf
7882
}
@@ -84,7 +88,7 @@ case "${INSTANCE_TYPE}" in
8488
g4dn.* | g5.* | g5g.*)
8589
echo "Disabling GSP for instance type: ${INSTANCE_TYPE}"
8690
disable-gsp
87-
echo "Using propreitary module for instance type: ${INSTANCE_TYPE}"
91+
echo "Using proprietary module for instance type: ${INSTANCE_TYPE}"
8892
MODULE_NAME="nvidia"
8993
;;
9094

@@ -93,4 +97,8 @@ case "${INSTANCE_TYPE}" in
9397
;;
9498
esac
9599

100+
# Enable CDMM, only applies for driver versions 580 or later and machines with coherent memory (e.g. GB200)
101+
# https://nvdam.widen.net/s/gpqp6wmz7s/cuda-whitepaper--cdmm-pdf
102+
echo "options nvidia NVreg_CoherentGPUMemoryMode=driver" > /etc/modprobe.d/40-eks-nvidia-openrm.conf
103+
96104
kmod-util load "${MODULE_NAME}"

0 commit comments

Comments
 (0)