Skip to content

Commit be8a570

Browse files
Shiva Kumar (shivakunv)
authored and committed
Add support for rhel10.0 and rhel10.1
Signed-off-by: Shiva Kumar (SW-CLOUD) <[email protected]> Add support for rhel10.0 and rhel10.1 Signed-off-by: Shiva Kumar (SW-CLOUD) <[email protected]>
1 parent 1627344 commit be8a570

File tree

3 files changed

+132
-55
lines changed

3 files changed

+132
-55
lines changed

rhel10/common.sh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,13 @@ _gdrcopy_enabled() {
4545
fi
4646
return 1
4747
}
48+
49+
# Decide whether the "fast path" applies, i.e. whether the kernel module
# build/reload can be skipped.
# Globals:
#   DRIVER_CONFIG_DIGEST (read) - digest of the desired driver configuration
# Returns:
#   0 when /sys/module/nvidia/refcnt and /run/nvidia/nvidia-driver.state both
#   exist, DRIVER_CONFIG_DIGEST is non-empty, and it equals the contents of
#   the state file; non-zero otherwise.
_should_skip_kernel_module_reload() {
    local state_file=/run/nvidia/nvidia-driver.state

    # Both the module's sysfs entry and the stored state are required.
    if [ ! -f /sys/module/nvidia/refcnt ] || [ ! -f "${state_file}" ]; then
        return 1
    fi

    # Without a digest to compare there is nothing to match against.
    local wanted="${DRIVER_CONFIG_DIGEST:-}"
    if [ -z "${wanted}" ]; then
        return 1
    fi

    # An unreadable state file is treated as an empty digest (never matches).
    local recorded
    recorded=$(cat "${state_file}" 2>/dev/null || echo "")

    [ "${wanted}" = "${recorded}" ]
}

rhel10/nvidia-driver

Lines changed: 100 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -166,10 +166,10 @@ _install_prerequisites() (
166166
# Parse gcc version
167167
# gcc_version is expected to match x.y.z
168168
# current_gcc is expected to match 'gcc-x.y.z-rel.el8.x86_64
169-
local gcc_version=$(cat /lib/modules/${KERNEL_VERSION}/proc/version | sed -n 's/.*gcc (GCC) \([0-9.]*\).*/\1/p')
169+
local gcc_version=$(cat /lib/modules/${KERNEL_VERSION}/proc/version | grep -Eo "gcc \(GCC\) ([0-9\.]+)" | grep -Eo "([0-9\.]+)")
170170
local current_gcc=$(rpm -qa gcc)
171171
echo "kernel requires gcc version: 'gcc-${gcc_version}', current gcc version is '${current_gcc}'"
172-
172+
173173
if ! [[ "${current_gcc}" =~ "gcc-${gcc_version}"-.* ]]; then
174174
echo "WARNING: GCC version mismatch detected, but attempting to continue..."
175175
echo "Kernel built with: gcc-${gcc_version}"
@@ -423,44 +423,7 @@ _load_driver() {
423423
set +o xtrace -o nounset
424424
fi
425425

426-
echo "Starting NVIDIA persistence daemon..."
427-
nvidia-persistenced --persistence-mode
428-
429-
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
430-
echo "Copying gridd.conf..."
431-
cp /drivers/gridd.conf /etc/nvidia/gridd.conf
432-
if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
433-
echo "Copying ClientConfigToken..."
434-
mkdir -p /etc/nvidia/ClientConfigToken/
435-
cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
436-
fi
437-
438-
echo "Starting nvidia-gridd.."
439-
LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd
440-
441-
# Start virtual topology daemon
442-
_start_vgpu_topology_daemon
443-
fi
444-
445-
if _assert_nvlink5_system; then
446-
_ensure_nvlink5_prerequisites || return 1
447-
echo "Starting NVIDIA fabric manager daemon for NVLink5+..."
448-
449-
fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
450-
fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
451-
nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
452-
nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
453-
/usr/bin/nvidia-fabricmanager-start.sh --mode start \
454-
--fm-config-file $fm_config_file \
455-
--fm-pid-file $fm_pid_file \
456-
--nvlsm-config-file $nvlsm_config_file \
457-
--nvlsm-pid-file $nvlsm_pid_file
458-
459-
# If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
460-
elif _assert_nvswitch_system; then
461-
echo "Starting NVIDIA fabric manager daemon..."
462-
nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
463-
fi
426+
_start_daemons
464427
}
465428

466429
# Stop persistenced and unload the kernel modules if they are currently loaded.
@@ -502,6 +465,21 @@ _unload_driver() {
502465
fi
503466
fi
504467

468+
if [ -f /var/run/nvidia-topologyd/nvidia-topologyd.pid ]; then
469+
echo "Stopping NVIDIA topology daemon..."
470+
local pid=$(< /var/run/nvidia-topologyd/nvidia-topologyd.pid)
471+
472+
kill -SIGTERM "${pid}"
473+
for i in $(seq 1 50); do
474+
kill -0 "${pid}" 2> /dev/null || break
475+
sleep 0.1
476+
done
477+
if [ $i -eq 50 ]; then
478+
echo "Could not stop NVIDIA topology daemon" >&2
479+
return 1
480+
fi
481+
fi
482+
505483
if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then
506484
echo "Stopping NVIDIA fabric manager daemon..."
507485
local pid=$(< /var/run/nvidia-fabricmanager/nv-fabricmanager.pid)
@@ -592,10 +570,6 @@ _install_driver() {
592570
fi
593571

594572
IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"}
595-
# May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path
596-
# /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point
597-
# TODO: remove the -a flag. its not needed. in the new driver version, license-acceptance is implicit
598-
#nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check --no-cc-version-check --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc ${install_args[@]+"${install_args[@]}"}
599573
}
600574

601575
# Mount the driver rootfs into the run directory with the exception of sysfs.
@@ -726,6 +700,74 @@ _start_vgpu_topology_daemon() {
726700
nvidia-topologyd
727701
}
728702

703+
# Start the user-space NVIDIA daemons.
# Globals:
#   DRIVER_TYPE (read)              - "vgpu" enables the gridd/topology setup
#   VGPU_LICENSE_SERVER_TYPE (read) - "NLS" additionally copies the
#                                     ClientConfigToken directory
# Steps, in order:
#   1. nvidia-persistenced is always started.
#   2. For vGPU drivers: gridd.conf (and, for NLS, the client config tokens)
#      are copied into /etc/nvidia, then nvidia-gridd and the topology daemon
#      are started.
#   3. Fabric manager: the NVLink5+ start script is used when
#      _assert_nvlink5_system succeeds, otherwise nv-fabricmanager is started
#      when _assert_nvswitch_system succeeds.
# Returns:
#   1 if _ensure_nvlink5_prerequisites fails; otherwise the status of the last
#   command executed.
_start_daemons() {
    echo "Starting NVIDIA persistence daemon..."
    nvidia-persistenced --persistence-mode

    if [ "${DRIVER_TYPE}" = "vgpu" ]; then
        echo "Copying gridd.conf..."
        cp /drivers/gridd.conf /etc/nvidia/gridd.conf
        if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
            echo "Copying ClientConfigToken..."
            mkdir -p /etc/nvidia/ClientConfigToken/
            cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
        fi

        echo "Starting nvidia-gridd.."
        LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd

        # Start virtual topology daemon
        _start_vgpu_topology_daemon
    fi

    if _assert_nvlink5_system; then
        _ensure_nvlink5_prerequisites || return 1
        echo "Starting NVIDIA fabric manager daemon for NVLink5+..."

        # Keep the paths function-local so they do not leak into the
        # global scope of the script.
        local fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
        local fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
        local nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
        local nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
        /usr/bin/nvidia-fabricmanager-start.sh --mode start \
            --fm-config-file "${fm_config_file}" \
            --fm-pid-file "${fm_pid_file}" \
            --nvlsm-config-file "${nvlsm_config_file}" \
            --nvlsm-pid-file "${nvlsm_pid_file}"

    # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
    elif _assert_nvswitch_system; then
        echo "Starting NVIDIA fabric manager daemon..."
        nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
    fi
}
743+
744+
# Persist the current driver configuration digest.
# Globals:
#   RUN_DIR (read)              - directory that holds nvidia-driver.state
#   DRIVER_CONFIG_DIGEST (read) - digest to record; an empty line is written
#                                 when it is unset or empty
# Outputs:
#   Progress messages on stdout; the digest in ${RUN_DIR}/nvidia-driver.state.
_store_driver_digest() {
    local state_path="${RUN_DIR}/nvidia-driver.state"
    local digest="${DRIVER_CONFIG_DIGEST:-}"

    echo "Storing driver configuration digest..."
    echo "${digest}" > "${state_path}"
    echo "Driver configuration digest stored at ${state_path}"
}
751+
752+
# Block until a termination signal arrives, then shut down.
# A background `sleep infinity` serves as the process to wait on. On
# HUP/INT/QUIT/PIPE/TERM the handler prints a message and runs _shutdown;
# when _shutdown succeeds the sleeper is killed and the shell exits 0. When
# _shutdown fails the function keeps waiting for the next signal.
# The EXIT trap is cleared so that whatever handler was installed for EXIT
# does not run again on the way out.
# This function does not return; the only way out is the `exit 0` in the
# signal handler.
_wait_for_signal() {
    echo "Done, now waiting for signal"
    sleep infinity &
    # Capture the sleeper's PID once: "$!" would change if the signal handler
    # started another background job, and the loop would wait on the wrong one.
    local sleeper_pid=$!

    trap "echo 'Caught signal'; _shutdown && { kill ${sleeper_pid}; exit 0; }" HUP INT QUIT PIPE TERM
    trap - EXIT

    # `wait` returns early whenever a trapped signal is delivered; resume
    # waiting in case the handler did not exit.
    while true; do
        wait "${sleeper_pid}" || continue
    done
}
760+
761+
# Userspace-only install, taken when the driver is already loaded with the
# desired configuration. Runs, in order: _unmount_rootfs, _start_daemons,
# _mount_rootfs, _write_kernel_update_hook and _store_driver_digest, with a
# progress message before and after.
_userspace_install() {
    echo "The NVIDIA driver is already loaded with the desired configuration, performing userspace-only install"

    local install_step
    for install_step in \
        _unmount_rootfs \
        _start_daemons \
        _mount_rootfs \
        _write_kernel_update_hook \
        _store_driver_digest; do
        "${install_step}"
    done

    echo "Userspace-only install complete"
}
770+
729771
_prepare() {
730772
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
731773
_find_vgpu_driver_version || exit 1
@@ -763,8 +805,6 @@ _prepare_exclusive() {
763805
trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
764806
trap "_shutdown" EXIT
765807

766-
_unload_driver || exit 1
767-
_unmount_rootfs
768808
}
769809

770810
_build() {
@@ -786,17 +826,21 @@ _load() {
786826
_mount_rootfs
787827
_write_kernel_update_hook
788828

789-
echo "Done, now waiting for signal"
790-
sleep infinity &
791-
trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
792-
trap - EXIT
793-
while true; do wait $! || continue; done
794-
exit 0
829+
_store_driver_digest
830+
_wait_for_signal
795831
}
796832

797833
init() {
798834
_prepare_exclusive
799835

836+
if _should_skip_kernel_module_reload; then
837+
_userspace_install
838+
_wait_for_signal
839+
fi
840+
841+
_unload_driver || exit 1
842+
_unmount_rootfs
843+
800844
_build
801845

802846
_load
@@ -811,6 +855,9 @@ build() {
811855
# "load" entry point: runs _prepare_exclusive, unloads the driver (exiting
# with status 1 if _unload_driver fails), unmounts the driver rootfs and then
# hands over to _load.
load() {
    _prepare_exclusive

    if ! _unload_driver; then
        exit 1
    fi
    _unmount_rootfs

    _load
}
816863

@@ -882,7 +929,7 @@ reload_nvidia_peermem() {
882929
fi
883930
# get any parameters provided for nvidia-peermem
884931
_get_module_params && set +o nounset
885-
if chroot /run/nvidia/driver modprobe nvidia-peermem; then
932+
if chroot /run/nvidia/driver modprobe nvidia-peermem "${NVIDIA_PEERMEM_MODULE_PARAMS[@]}"; then
886933
if [ -f /sys/module/nvidia_peermem/refcnt ]; then
887934
echo "successfully loaded nvidia-peermem module, now waiting for signal"
888935
sleep inf

rhel10/ocp_dtk_entrypoint

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,13 @@ nv-ctr-run-with-dtk() {
1818
exec bash -x nvidia-driver init
1919
fi
2020

21+
if _should_skip_kernel_module_reload; then
22+
echo "The NVIDIA driver is already loaded with the desired configuration, skipping kernel module build and proceeding with userspace-only install"
23+
exec bash -x nvidia-driver init
24+
fi
25+
26+
echo "Fast path not detected: building driver and modules"
27+
2128
if [[ ! -f "$DRIVER_TOOLKIT_SHARED_DIR/dir_prepared" ]]; then
2229
cp -r \
2330
/tmp/install.sh \
@@ -81,6 +88,19 @@ dtk-build-driver() {
8188
sleep inf
8289
fi
8390

91+
# Check if fast path is being used - if so, skip building and signal completion
92+
if _should_skip_kernel_module_reload; then
93+
echo "The NVIDIA driver is already loaded with the desired configuration, skipping build"
94+
echo "Signaling driver_built to the main container and sleeping forever..."
95+
touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_build_started"
96+
touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_built"
97+
while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do
98+
sleep 5
99+
done
100+
echo "WARNING: driver_built flag disappeared"
101+
exit 0
102+
fi
103+
84104
if ! [[ -f "/lib/modules/$(uname -r)/vmlinuz" ]]; then
85105
echo "WARNING: broken Driver Toolkit image detected:"
86106
echo "- Node kernel: $(uname -r)"
@@ -101,7 +121,7 @@ dtk-build-driver() {
101121
echo "NVIDIA drivers already generated, nothing to do ..."
102122

103123
while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do
104-
sleep 30
124+
sleep 5
105125
done
106126
echo "WARNING: driver_built flag disappeared, rebuilding the drivers ..."
107127
else
@@ -254,7 +274,7 @@ dtk-build-driver() {
254274
fi
255275

256276
while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do
257-
sleep 30
277+
sleep 5
258278
done
259279

260280
echo "WARNING: driver_built flag disappeared, restart this container"

0 commit comments

Comments
 (0)