Skip to content

Commit d612e96

Browse files
Add userspace-only install for RHEL8 and Ubuntu 24.04
Signed-off-by: Karthik Vetrivel <kvetrivel@nvidia.com>
1 parent a83583e commit d612e96

File tree

4 files changed

+284
-99
lines changed

4 files changed

+284
-99
lines changed

rhel8/common.sh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,13 @@ _gdrcopy_enabled() {
4545
fi
4646
return 1
4747
}
48+
49+
# Decide whether the "fast path" (userspace-only install) may be used.
#
# Globals:
#   DRIVER_CONFIG_DIGEST (read, optional) - digest of the desired driver config
# Returns:
#   0 when /sys/module/nvidia/refcnt exists, a state file exists at
#     /run/nvidia/nvidia-driver.state, and its content equals the non-empty
#     DRIVER_CONFIG_DIGEST;
#   non-zero in every other case (full build/reload required).
_should_skip_kernel_module_reload() {
    local state_file=/run/nvidia/nvidia-driver.state
    local wanted_digest="${DRIVER_CONFIG_DIGEST:-}"
    local recorded_digest

    # Both the module's sysfs entry and a previously stored digest must exist.
    if [ ! -f /sys/module/nvidia/refcnt ] || [ ! -f "${state_file}" ]; then
        return 1
    fi

    # No digest supplied by the caller: nothing to compare against.
    if [ -z "${wanted_digest}" ]; then
        return 1
    fi

    # An unreadable state file is treated as an empty digest (never matches).
    recorded_digest=$(cat "${state_file}" 2>/dev/null || echo "")

    [ "${wanted_digest}" = "${recorded_digest}" ]
}

rhel8/nvidia-driver

Lines changed: 96 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -401,44 +401,7 @@ _load_driver() {
401401
set +o xtrace -o nounset
402402
fi
403403

404-
echo "Starting NVIDIA persistence daemon..."
405-
nvidia-persistenced --persistence-mode
406-
407-
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
408-
echo "Copying gridd.conf..."
409-
cp /drivers/gridd.conf /etc/nvidia/gridd.conf
410-
if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
411-
echo "Copying ClientConfigToken..."
412-
mkdir -p /etc/nvidia/ClientConfigToken/
413-
cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
414-
fi
415-
416-
echo "Starting nvidia-gridd.."
417-
LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd
418-
419-
# Start virtual topology daemon
420-
_start_vgpu_topology_daemon
421-
fi
422-
423-
if _assert_nvlink5_system; then
424-
_ensure_nvlink5_prerequisites || return 1
425-
echo "Starting NVIDIA fabric manager daemon for NVLink5+..."
426-
427-
fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
428-
fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
429-
nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
430-
nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
431-
/usr/bin/nvidia-fabricmanager-start.sh --mode start \
432-
--fm-config-file $fm_config_file \
433-
--fm-pid-file $fm_pid_file \
434-
--nvlsm-config-file $nvlsm_config_file \
435-
--nvlsm-pid-file $nvlsm_pid_file
436-
437-
# If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
438-
elif _assert_nvswitch_system; then
439-
echo "Starting NVIDIA fabric manager daemon..."
440-
nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
441-
fi
404+
_start_daemons
442405
}
443406

444407
# Stop persistenced and unload the kernel modules if they are currently loaded.
@@ -480,6 +443,21 @@ _unload_driver() {
480443
fi
481444
fi
482445

446+
if [ -f /var/run/nvidia-topologyd/nvidia-topologyd.pid ]; then
447+
echo "Stopping NVIDIA topology daemon..."
448+
local pid=$(< /var/run/nvidia-topologyd/nvidia-topologyd.pid)
449+
450+
kill -SIGTERM "${pid}"
451+
for i in $(seq 1 50); do
452+
kill -0 "${pid}" 2> /dev/null || break
453+
sleep 0.1
454+
done
455+
if [ $i -eq 50 ]; then
456+
echo "Could not stop NVIDIA topology daemon" >&2
457+
return 1
458+
fi
459+
fi
460+
483461
if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then
484462
echo "Stopping NVIDIA fabric manager daemon..."
485463
local pid=$(< /var/run/nvidia-fabricmanager/nv-fabricmanager.pid)
@@ -570,10 +548,6 @@ _install_driver() {
570548
fi
571549

572550
IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"}
573-
# May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path
574-
# /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point
575-
# TODO: remove the -a flag. its not needed. in the new driver version, license-acceptance is implicit
576-
#nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check --no-cc-version-check --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc ${install_args[@]+"${install_args[@]}"}
577551
}
578552

579553
# Mount the driver rootfs into the run directory with the exception of sysfs.
@@ -704,6 +678,73 @@ _start_vgpu_topology_daemon() {
704678
nvidia-topologyd
705679
}
706680

681+
# Start the user-space NVIDIA daemons that accompany the kernel driver.
#
# Always starts nvidia-persistenced. For vGPU drivers it additionally installs
# gridd.conf (and the ClientConfigToken for NLS licensing), starts nvidia-gridd
# and the virtual topology daemon. Finally it starts the fabric manager using
# the NVLink5+ start script or, failing that, nv-fabricmanager for older
# NVSwitch systems.
#
# Globals:
#   DRIVER_TYPE (read), VGPU_LICENSE_SERVER_TYPE (read when DRIVER_TYPE=vgpu)
# Returns:
#   1 if the NVLink5 prerequisites check fails; otherwise the status of the
#   last command executed.
_start_daemons() {
    # Kept local so these paths do not leak into the caller's global scope.
    local fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
    local fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
    local nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
    local nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid

    echo "Starting NVIDIA persistence daemon..."
    nvidia-persistenced --persistence-mode

    if [ "${DRIVER_TYPE}" = "vgpu" ]; then
        echo "Copying gridd.conf..."
        cp /drivers/gridd.conf /etc/nvidia/gridd.conf
        if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
            echo "Copying ClientConfigToken..."
            mkdir -p /etc/nvidia/ClientConfigToken/
            cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
        fi

        echo "Starting nvidia-gridd.."
        LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd

        # Start virtual topology daemon
        _start_vgpu_topology_daemon
    fi

    if _assert_nvlink5_system; then
        _ensure_nvlink5_prerequisites || return 1
        echo "Starting NVIDIA fabric manager daemon for NVLink5+..."

        /usr/bin/nvidia-fabricmanager-start.sh --mode start \
            --fm-config-file "${fm_config_file}" \
            --fm-pid-file "${fm_pid_file}" \
            --nvlsm-config-file "${nvlsm_config_file}" \
            --nvlsm-pid-file "${nvlsm_pid_file}"

    # If not an NVLink5+ switch, check for the presence of NVLink4 (or below) switches
    elif _assert_nvswitch_system; then
        echo "Starting NVIDIA fabric manager daemon..."
        nv-fabricmanager -c "${fm_config_file}"
    fi
}
721+
722+
# Persist the driver configuration digest so that a later start can compare it
# with its own DRIVER_CONFIG_DIGEST.
#
# Globals:
#   RUN_DIR (read)                        - directory holding the state file
#   DRIVER_CONFIG_DIGEST (read, optional) - digest to record; when unset or
#                                           empty an empty digest is written
# Returns:
#   0 on success, 1 if the state file could not be written.
_store_driver_digest() {
    local digest_file="${RUN_DIR}/nvidia-driver.state"

    echo "Storing driver configuration digest..."
    # ':-' keeps this safe under 'set -o nounset' when no digest was provided.
    if ! echo "${DRIVER_CONFIG_DIGEST:-}" > "${digest_file}"; then
        echo "Failed to write driver configuration digest to ${digest_file}" >&2
        return 1
    fi
    echo "Driver configuration digest stored at $digest_file"
}
728+
729+
# Park the process until a termination signal arrives.
#
# A background 'sleep infinity' is started and waited on. On HUP/INT/QUIT/
# PIPE/TERM the handler runs _shutdown and, only if that succeeds, kills the
# sleeper and exits 0. The EXIT trap is reset to its default before waiting.
# This function does not return.
_wait_for_signal() {
    echo "Done, now waiting for signal"

    sleep infinity &

    # Double quotes are intentional: $! is expanded now, so the handler kills
    # the sleeper started just above.
    trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
    trap - EXIT

    # 'wait' returns early whenever a trapped signal is delivered; keep waiting.
    while :; do
        if ! wait $!; then
            continue
        fi
    done
    exit 0
}
737+
738+
# Fast-path install: leave the kernel modules alone and only refresh the
# userspace side. Runs, in order: unmount and remount of the driver rootfs,
# daemon start-up, kernel update hook creation, and digest storage.
_userspace_install() {
    local install_step

    echo "The NVIDIA driver is already loaded with the desired configuration, performing userspace-only install"

    for install_step in \
        _unmount_rootfs \
        _mount_rootfs \
        _start_daemons \
        _write_kernel_update_hook \
        _store_driver_digest; do
        "${install_step}"
    done

    echo "Userspace-only install complete"
}
747+
707748
_prepare() {
708749
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
709750
_find_vgpu_driver_version || exit 1
@@ -740,9 +781,6 @@ _prepare_exclusive() {
740781

741782
trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
742783
trap "_shutdown" EXIT
743-
744-
_unload_driver || exit 1
745-
_unmount_rootfs
746784
}
747785

748786
_build() {
@@ -763,18 +801,21 @@ _load() {
763801
_load_driver
764802
_mount_rootfs
765803
_write_kernel_update_hook
766-
767-
echo "Done, now waiting for signal"
768-
sleep infinity &
769-
trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
770-
trap - EXIT
771-
while true; do wait $! || continue; done
772-
exit 0
804+
_store_driver_digest
805+
_wait_for_signal
773806
}
774807

775808
init() {
776809
_prepare_exclusive
777810

811+
if _should_skip_kernel_module_reload; then
812+
_userspace_install
813+
_wait_for_signal
814+
fi
815+
816+
_unload_driver || exit 1
817+
_unmount_rootfs
818+
778819
_build
779820

780821
_load
@@ -789,6 +830,9 @@ build() {
789830
# Entry point for the "load" command: take the exclusive preparation step,
# unload any currently loaded driver (exiting with status 1 if that fails),
# unmount the driver rootfs, then hand over to _load.
load() {
    _prepare_exclusive

    if ! _unload_driver; then
        exit 1
    fi
    _unmount_rootfs

    _load
}
794838

rhel8/ocp_dtk_entrypoint

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,13 @@ nv-ctr-run-with-dtk() {
1818
exec bash -x nvidia-driver init
1919
fi
2020

21+
if _should_skip_kernel_module_reload; then
22+
echo "The NVIDIA driver is already loaded with the desired configuration, skipping kernel module build and proceeding with userspace-only install"
23+
exec bash -x nvidia-driver init
24+
fi
25+
26+
echo "Fast path not detected: building driver and modules"
27+
2128
if [[ ! -f "$DRIVER_TOOLKIT_SHARED_DIR/dir_prepared" ]]; then
2229
cp -r \
2330
/tmp/install.sh \
@@ -79,6 +86,19 @@ dtk-build-driver() {
7986
sleep inf
8087
fi
8188

89+
# Check if fast path is being used - if so, skip building and signal completion
90+
if _should_skip_kernel_module_reload; then
91+
echo "The NVIDIA driver is already loaded with the desired configuration, skipping build"
92+
echo "Signaling driver_built to the main container and sleeping forever..."
93+
touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_build_started"
94+
touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_built"
95+
while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do
96+
sleep 30
97+
done
98+
echo "WARNING: driver_built flag disappeared"
99+
exit 0
100+
fi
101+
82102
if ! [[ -f "/lib/modules/$(uname -r)/vmlinuz" ]]; then
83103
echo "WARNING: broken Driver Toolkit image detected:"
84104
echo "- Node kernel: $(uname -r)"

0 commit comments

Comments
 (0)