Skip to content

Commit d38fdc5

Browse files
Shiva Kumar (shivakunv)
authored and committed
Add support for rhel10.0 and rhel10.1
Signed-off-by: Shiva Kumar (SW-CLOUD) <shivaku@nvidia.com>
1 parent 1627344 commit d38fdc5

File tree

3 files changed

+128
-52
lines changed

3 files changed

+128
-52
lines changed

rhel10/common.sh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,13 @@ _gdrcopy_enabled() {
4545
fi
4646
return 1
4747
}
48+
49+
# Check whether the fast path can be used, i.e. the driver is already loaded
# with a configuration matching the one requested now.
#
# Compares the digest in the DRIVER_CONFIG_DIGEST environment variable with
# the digest stored in the nvidia-driver.state file.
#
# Globals:
#   DRIVER_CONFIG_DIGEST (read) - digest of the requested configuration
#   RUN_DIR (read, optional)    - directory holding nvidia-driver.state;
#                                 defaults to /run/nvidia when unset or empty
# Returns:
#   0 if /sys/module/nvidia/refcnt exists, the state file exists, the current
#   digest is non-empty and it equals the stored digest; non-zero otherwise.
_should_skip_kernel_module_reload() {
    local state_file="${RUN_DIR:-/run/nvidia}/nvidia-driver.state"
    local current_digest="${DRIVER_CONFIG_DIGEST:-}"
    local stored_digest

    # The kernel module must currently be loaded ...
    [ -f /sys/module/nvidia/refcnt ] || return 1
    # ... a digest must have been recorded earlier ...
    [ -f "${state_file}" ] || return 1
    # ... and a digest must have been requested for this run.
    [ -n "${current_digest}" ] || return 1

    # Declaration and assignment are split so a read failure is not masked by
    # 'local'; an unreadable file is treated as "no stored digest".
    stored_digest=$(cat "${state_file}" 2>/dev/null) || stored_digest=""

    [ "${current_digest}" = "${stored_digest}" ]
}

rhel10/ocp_dtk_entrypoint

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,13 @@ nv-ctr-run-with-dtk() {
1818
exec bash -x nvidia-driver init
1919
fi
2020

21+
if _should_skip_kernel_module_reload; then
22+
echo "The NVIDIA driver is already loaded with the desired configuration, skipping kernel module build and proceeding with userspace-only install"
23+
exec bash -x nvidia-driver init
24+
fi
25+
26+
echo "Fast path not detected: building driver and modules"
27+
2128
if [[ ! -f "$DRIVER_TOOLKIT_SHARED_DIR/dir_prepared" ]]; then
2229
cp -r \
2330
/tmp/install.sh \
@@ -81,6 +88,19 @@ dtk-build-driver() {
8188
sleep inf
8289
fi
8390

91+
# Check if fast path is being used - if so, skip building and signal completion
92+
if _should_skip_kernel_module_reload; then
93+
echo "The NVIDIA driver is already loaded with the desired configuration, skipping build"
94+
echo "Signaling driver_built to the main container and sleeping forever..."
95+
touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_build_started"
96+
touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_built"
97+
while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do
98+
sleep 5
99+
done
100+
echo "WARNING: driver_built flag disappeared"
101+
exit 0
102+
fi
103+
84104
if ! [[ -f "/lib/modules/$(uname -r)/vmlinuz" ]]; then
85105
echo "WARNING: broken Driver Toolkit image detected:"
86106
echo "- Node kernel: $(uname -r)"
@@ -101,7 +121,7 @@ dtk-build-driver() {
101121
echo "NVIDIA drivers already generated, nothing to do ..."
102122

103123
while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do
104-
sleep 30
124+
sleep 5
105125
done
106126
echo "WARNING: driver_built flag disappeared, rebuilding the drivers ..."
107127
else
@@ -254,7 +274,7 @@ dtk-build-driver() {
254274
fi
255275

256276
while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do
257-
sleep 30
277+
sleep 5
258278
done
259279

260280
echo "WARNING: driver_built flag disappeared, restart this container"

rhel9/nvidia-driver

Lines changed: 96 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -418,44 +418,7 @@ _load_driver() {
418418
set +o xtrace -o nounset
419419
fi
420420

421-
echo "Starting NVIDIA persistence daemon..."
422-
nvidia-persistenced --persistence-mode
423-
424-
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
425-
echo "Copying gridd.conf..."
426-
cp /drivers/gridd.conf /etc/nvidia/gridd.conf
427-
if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
428-
echo "Copying ClientConfigToken..."
429-
mkdir -p /etc/nvidia/ClientConfigToken/
430-
cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
431-
fi
432-
433-
echo "Starting nvidia-gridd.."
434-
LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd
435-
436-
# Start virtual topology daemon
437-
_start_vgpu_topology_daemon
438-
fi
439-
440-
if _assert_nvlink5_system; then
441-
_ensure_nvlink5_prerequisites || return 1
442-
echo "Starting NVIDIA fabric manager daemon for NVLink5+..."
443-
444-
fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
445-
fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
446-
nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
447-
nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
448-
/usr/bin/nvidia-fabricmanager-start.sh --mode start \
449-
--fm-config-file $fm_config_file \
450-
--fm-pid-file $fm_pid_file \
451-
--nvlsm-config-file $nvlsm_config_file \
452-
--nvlsm-pid-file $nvlsm_pid_file
453-
454-
# If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
455-
elif _assert_nvswitch_system; then
456-
echo "Starting NVIDIA fabric manager daemon..."
457-
nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
458-
fi
421+
_start_daemons
459422
}
460423

461424
# Stop persistenced and unload the kernel modules if they are currently loaded.
@@ -497,6 +460,21 @@ _unload_driver() {
497460
fi
498461
fi
499462

463+
if [ -f /var/run/nvidia-topologyd/nvidia-topologyd.pid ]; then
464+
echo "Stopping NVIDIA topology daemon..."
465+
local pid=$(< /var/run/nvidia-topologyd/nvidia-topologyd.pid)
466+
467+
kill -SIGTERM "${pid}"
468+
for i in $(seq 1 50); do
469+
kill -0 "${pid}" 2> /dev/null || break
470+
sleep 0.1
471+
done
472+
if [ $i -eq 50 ]; then
473+
echo "Could not stop NVIDIA topology daemon" >&2
474+
return 1
475+
fi
476+
fi
477+
500478
if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then
501479
echo "Stopping NVIDIA fabric manager daemon..."
502480
local pid=$(< /var/run/nvidia-fabricmanager/nv-fabricmanager.pid)
@@ -587,10 +565,6 @@ _install_driver() {
587565
fi
588566

589567
IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"}
590-
# May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path
591-
# /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point
592-
# TODO: remove the -a flag. its not needed. in the new driver version, license-acceptance is implicit
593-
#nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check --no-cc-version-check --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc ${install_args[@]+"${install_args[@]}"}
594568
}
595569

596570
# Mount the driver rootfs into the run directory with the exception of sysfs.
@@ -721,6 +695,73 @@ _start_vgpu_topology_daemon() {
721695
nvidia-topologyd
722696
}
723697

698+
# Start the user-space daemons that accompany the NVIDIA driver.
#
# Steps, in order:
#   1. nvidia-persistenced is always started in persistence mode.
#   2. When DRIVER_TYPE is "vgpu": gridd.conf is copied into /etc/nvidia
#      (plus the ClientConfigToken files when VGPU_LICENSE_SERVER_TYPE is
#      "NLS"), nvidia-gridd is started, then the vGPU topology daemon.
#   3. Fabric manager is started through nvidia-fabricmanager-start.sh when
#      _assert_nvlink5_system succeeds, or through nv-fabricmanager when
#      _assert_nvswitch_system succeeds instead.
#
# Globals:
#   DRIVER_TYPE, VGPU_LICENSE_SERVER_TYPE (read)
# Returns:
#   1 if _ensure_nvlink5_prerequisites fails; otherwise the status of the
#   last command executed.
_start_daemons() {
    # Kept local so these paths do not leak into the caller's scope.
    local fm_config_file fm_pid_file nvlsm_config_file nvlsm_pid_file

    echo "Starting NVIDIA persistence daemon..."
    nvidia-persistenced --persistence-mode

    if [ "${DRIVER_TYPE}" = "vgpu" ]; then
        echo "Copying gridd.conf..."
        cp /drivers/gridd.conf /etc/nvidia/gridd.conf
        if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
            echo "Copying ClientConfigToken..."
            mkdir -p /etc/nvidia/ClientConfigToken/
            cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
        fi

        echo "Starting nvidia-gridd.."
        LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd

        # Start virtual topology daemon
        _start_vgpu_topology_daemon
    fi

    if _assert_nvlink5_system; then
        _ensure_nvlink5_prerequisites || return 1
        echo "Starting NVIDIA fabric manager daemon for NVLink5+..."

        fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
        fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
        nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
        nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
        /usr/bin/nvidia-fabricmanager-start.sh --mode start \
            --fm-config-file "${fm_config_file}" \
            --fm-pid-file "${fm_pid_file}" \
            --nvlsm-config-file "${nvlsm_config_file}" \
            --nvlsm-pid-file "${nvlsm_pid_file}"

    # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
    elif _assert_nvswitch_system; then
        echo "Starting NVIDIA fabric manager daemon..."
        nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
    fi
}
738+
739+
# Persist the digest of the current driver configuration so that a later run
# can compare it against the requested one.
#
# Globals:
#   RUN_DIR (read)                        - directory receiving nvidia-driver.state
#   DRIVER_CONFIG_DIGEST (read, optional) - digest to store; an unset or empty
#                                           value is stored as an empty line
# Returns:
#   0 on success, 1 if the state file could not be written.
_store_driver_digest() {
    local digest_file="${RUN_DIR}/nvidia-driver.state"

    echo "Storing driver configuration digest..."

    # ':-' keeps this safe under 'set -o nounset' when no digest was provided;
    # printf avoids echo's option parsing on unusual values.
    if ! printf '%s\n' "${DRIVER_CONFIG_DIGEST:-}" > "${digest_file}"; then
        echo "Failed to store driver configuration digest at ${digest_file}" >&2
        return 1
    fi

    echo "Driver configuration digest stored at ${digest_file}"
}
745+
746+
# Park the script until a termination signal arrives.
#
# A background "sleep infinity" gives the shell something to wait on. On
# HUP/INT/QUIT/PIPE/TERM the handler runs _shutdown and, only if that
# succeeds, kills the sleeper and exits 0. The EXIT trap is reset to its
# default first.
_wait_for_signal() {
    echo "Done, now waiting for signal"

    sleep infinity &

    # Double quotes are deliberate: $! is expanded while the trap is being
    # installed, so the handler kills the sleeper started just above.
    trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
    trap - EXIT

    # Keep waiting on the sleeper; a non-zero wait simply re-enters the loop.
    while :; do
        wait $! || continue
    done

    exit 0
}
754+
755+
# Userspace-only install: used when the driver is already loaded with the
# desired configuration. Runs, in this order: unmount the driver rootfs, start
# the daemons, mount the rootfs again, write the kernel update hook and store
# the driver configuration digest.
_userspace_install() {
    local install_step

    echo "The NVIDIA driver is already loaded with the desired configuration, performing userspace-only install"

    # Order matters; each step is invoked exactly once, as a plain command.
    for install_step in \
        _unmount_rootfs \
        _start_daemons \
        _mount_rootfs \
        _write_kernel_update_hook \
        _store_driver_digest; do
        "${install_step}"
    done

    echo "Userspace-only install complete"
}
764+
724765
_prepare() {
725766
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
726767
_find_vgpu_driver_version || exit 1
@@ -758,8 +799,6 @@ _prepare_exclusive() {
758799
trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
759800
trap "_shutdown" EXIT
760801

761-
_unload_driver || exit 1
762-
_unmount_rootfs
763802
}
764803

765804
_build() {
@@ -781,17 +820,21 @@ _load() {
781820
_mount_rootfs
782821
_write_kernel_update_hook
783822

784-
echo "Done, now waiting for signal"
785-
sleep infinity &
786-
trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
787-
trap - EXIT
788-
while true; do wait $! || continue; done
789-
exit 0
823+
_store_driver_digest
824+
_wait_for_signal
790825
}
791826

792827
init() {
793828
_prepare_exclusive
794829

830+
if _should_skip_kernel_module_reload; then
831+
_userspace_install
832+
_wait_for_signal
833+
fi
834+
835+
_unload_driver || exit 1
836+
_unmount_rootfs
837+
795838
_build
796839

797840
_load
@@ -806,6 +849,9 @@ build() {
806849
# "load" entry point: runs the exclusive preparation step, unloads any
# currently loaded driver (exiting with status 1 if that fails), unmounts the
# driver rootfs and then hands over to _load.
load() {
    _prepare_exclusive

    if ! _unload_driver; then
        exit 1
    fi
    _unmount_rootfs

    _load
}
811857

0 commit comments

Comments
 (0)