Skip to content

Commit a577c50

Browse files
Add userspace-only install for RHEL8 and Ubuntu 24.04
Signed-off-by: Karthik Vetrivel <[email protected]>
1 parent a83583e commit a577c50

File tree

7 files changed

+291
-106
lines changed

7 files changed

+291
-106
lines changed

rhel8/common.sh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,13 @@ _gdrcopy_enabled() {
4545
fi
4646
return 1
4747
}
48+
49+
# Check if the fast path should be used (driver already loaded with a matching config).
#
# Compares the digest supplied in the DRIVER_CONFIG_DIGEST env var with the digest
# stored in the nvidia-driver.state file. The state file is looked up under
# ${RUN_DIR} when that variable is set (the same location _store_driver_digest
# writes to) and under /run/nvidia otherwise.
#
# Globals:
#   DRIVER_CONFIG_DIGEST (read) - digest of the desired driver configuration
#   RUN_DIR              (read) - optional; directory holding nvidia-driver.state
# Returns:
#   0 if /sys/module/nvidia/refcnt exists, the state file exists, the current
#     digest is non-empty and it equals the stored digest;
#   1 otherwise.
_should_skip_kernel_module_reload() {
    local state_file="${RUN_DIR:-/run/nvidia}/nvidia-driver.state"
    local current_digest="${DRIVER_CONFIG_DIGEST:-}"
    local stored_digest

    # Both the loaded-module marker and the stored state are required.
    if [ ! -f /sys/module/nvidia/refcnt ] || [ ! -f "${state_file}" ]; then
        return 1
    fi

    # Without a digest for the desired configuration there is nothing to compare.
    if [ -z "${current_digest}" ]; then
        return 1
    fi

    # An unreadable state file is treated as "no stored digest", which can never
    # match the (non-empty) current digest.
    stored_digest=$(cat "${state_file}" 2>/dev/null) || stored_digest=""

    [ "${current_digest}" = "${stored_digest}" ]
}

rhel8/nvidia-driver

Lines changed: 96 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -401,44 +401,7 @@ _load_driver() {
401401
set +o xtrace -o nounset
402402
fi
403403

404-
echo "Starting NVIDIA persistence daemon..."
405-
nvidia-persistenced --persistence-mode
406-
407-
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
408-
echo "Copying gridd.conf..."
409-
cp /drivers/gridd.conf /etc/nvidia/gridd.conf
410-
if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
411-
echo "Copying ClientConfigToken..."
412-
mkdir -p /etc/nvidia/ClientConfigToken/
413-
cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
414-
fi
415-
416-
echo "Starting nvidia-gridd.."
417-
LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd
418-
419-
# Start virtual topology daemon
420-
_start_vgpu_topology_daemon
421-
fi
422-
423-
if _assert_nvlink5_system; then
424-
_ensure_nvlink5_prerequisites || return 1
425-
echo "Starting NVIDIA fabric manager daemon for NVLink5+..."
426-
427-
fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
428-
fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
429-
nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
430-
nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
431-
/usr/bin/nvidia-fabricmanager-start.sh --mode start \
432-
--fm-config-file $fm_config_file \
433-
--fm-pid-file $fm_pid_file \
434-
--nvlsm-config-file $nvlsm_config_file \
435-
--nvlsm-pid-file $nvlsm_pid_file
436-
437-
# If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
438-
elif _assert_nvswitch_system; then
439-
echo "Starting NVIDIA fabric manager daemon..."
440-
nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
441-
fi
404+
_start_daemons
442405
}
443406

444407
# Stop persistenced and unload the kernel modules if they are currently loaded.
@@ -480,6 +443,21 @@ _unload_driver() {
480443
fi
481444
fi
482445

446+
if [ -f /var/run/nvidia-topologyd/nvidia-topologyd.pid ]; then
447+
echo "Stopping NVIDIA topology daemon..."
448+
local pid=$(< /var/run/nvidia-topologyd/nvidia-topologyd.pid)
449+
450+
kill -SIGTERM "${pid}"
451+
for i in $(seq 1 50); do
452+
kill -0 "${pid}" 2> /dev/null || break
453+
sleep 0.1
454+
done
455+
if [ $i -eq 50 ]; then
456+
echo "Could not stop NVIDIA topology daemon" >&2
457+
return 1
458+
fi
459+
fi
460+
483461
if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then
484462
echo "Stopping NVIDIA fabric manager daemon..."
485463
local pid=$(< /var/run/nvidia-fabricmanager/nv-fabricmanager.pid)
@@ -570,10 +548,6 @@ _install_driver() {
570548
fi
571549

572550
IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"}
573-
# May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path
574-
# /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point
575-
# TODO: remove the -a flag. its not needed. in the new driver version, license-acceptance is implicit
576-
#nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check --no-cc-version-check --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc ${install_args[@]+"${install_args[@]}"}
577551
}
578552

579553
# Mount the driver rootfs into the run directory with the exception of sysfs.
@@ -704,6 +678,73 @@ _start_vgpu_topology_daemon() {
704678
nvidia-topologyd
705679
}
706680

681+
# Start the user-space NVIDIA daemons.
#
# Always starts nvidia-persistenced in persistence mode. When DRIVER_TYPE is
# "vgpu" it copies gridd.conf (and the ClientConfigToken files when
# VGPU_LICENSE_SERVER_TYPE is "NLS"), then starts nvidia-gridd and the virtual
# topology daemon. Finally it starts the fabric manager: through
# nvidia-fabricmanager-start.sh when _assert_nvlink5_system succeeds, or through
# nv-fabricmanager when _assert_nvswitch_system succeeds.
#
# Globals:
#   DRIVER_TYPE, VGPU_LICENSE_SERVER_TYPE (read)
# Returns:
#   1 if _ensure_nvlink5_prerequisites fails; otherwise the status of the last
#   command executed.
_start_daemons() {
    echo "Starting NVIDIA persistence daemon..."
    nvidia-persistenced --persistence-mode

    if [ "${DRIVER_TYPE}" = "vgpu" ]; then
        echo "Copying gridd.conf..."
        cp /drivers/gridd.conf /etc/nvidia/gridd.conf
        if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
            echo "Copying ClientConfigToken..."
            mkdir -p /etc/nvidia/ClientConfigToken/
            cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
        fi

        echo "Starting nvidia-gridd.."
        LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd

        # Start virtual topology daemon
        _start_vgpu_topology_daemon
    fi

    if _assert_nvlink5_system; then
        _ensure_nvlink5_prerequisites || return 1
        echo "Starting NVIDIA fabric manager daemon for NVLink5+..."

        # Keep these paths function-local so they do not leak into (or clobber)
        # the caller's global namespace.
        local fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
        local fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
        local nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
        local nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
        /usr/bin/nvidia-fabricmanager-start.sh --mode start \
            --fm-config-file "${fm_config_file}" \
            --fm-pid-file "${fm_pid_file}" \
            --nvlsm-config-file "${nvlsm_config_file}" \
            --nvlsm-pid-file "${nvlsm_pid_file}"

    # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
    elif _assert_nvswitch_system; then
        echo "Starting NVIDIA fabric manager daemon..."
        nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
    fi
}
721+
722+
# Persist the digest of the current driver configuration.
#
# Writes the value of DRIVER_CONFIG_DIGEST (an empty line when it is unset),
# followed by a newline, to ${RUN_DIR}/nvidia-driver.state.
#
# Globals:
#   RUN_DIR              (read) - directory that receives nvidia-driver.state
#   DRIVER_CONFIG_DIGEST (read) - digest to store
# Outputs:
#   Progress messages on stdout; a failure message on stderr.
# Returns:
#   0 on success, 1 if the state file could not be written.
_store_driver_digest() {
    local digest_file="${RUN_DIR}/nvidia-driver.state"

    echo "Storing driver configuration digest..."
    # printf keeps the value verbatim even if it looks like an echo option, and
    # the explicit check avoids reporting success after a failed write.
    if ! printf '%s\n' "${DRIVER_CONFIG_DIGEST:-}" > "${digest_file}"; then
        echo "Failed to write driver configuration digest to ${digest_file}" >&2
        return 1
    fi
    echo "Driver configuration digest stored at ${digest_file}"
}
728+
729+
# Block until a termination signal arrives, then shut down.
#
# Starts a background "sleep infinity" and waits on it in a loop. On
# HUP/INT/QUIT/PIPE/TERM the handler prints a message and runs _shutdown; if
# _shutdown succeeds it kills the sleeper and exits 0. The EXIT trap is reset
# to its default before waiting.
_wait_for_signal() {
    echo "Done, now waiting for signal"

    sleep infinity &

    # Double quotes are deliberate: $! is expanded here, when the trap is
    # installed, so the handler kills the sleeper started just above.
    trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
    trap - EXIT

    # wait returns whenever a trapped signal is delivered; keep waiting until
    # the handler itself exits the shell.
    while true; do
        wait $! || continue
    done

    exit 0
}
737+
738+
# Perform the userspace-only install.
#
# Runs, in this order: _unmount_rootfs, _start_daemons, _mount_rootfs,
# _write_kernel_update_hook, _store_driver_digest, framed by a start and a
# completion message on stdout.
_userspace_install() {
    local step

    echo "The NVIDIA driver is already loaded with the desired configuration, performing userspace-only install"

    # The order of the steps is significant; do not reorder.
    for step in \
        _unmount_rootfs \
        _start_daemons \
        _mount_rootfs \
        _write_kernel_update_hook \
        _store_driver_digest; do
        "${step}"
    done

    echo "Userspace-only install complete"
}
747+
707748
_prepare() {
708749
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
709750
_find_vgpu_driver_version || exit 1
@@ -740,9 +781,6 @@ _prepare_exclusive() {
740781

741782
trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
742783
trap "_shutdown" EXIT
743-
744-
_unload_driver || exit 1
745-
_unmount_rootfs
746784
}
747785

748786
_build() {
@@ -763,18 +801,21 @@ _load() {
763801
_load_driver
764802
_mount_rootfs
765803
_write_kernel_update_hook
766-
767-
echo "Done, now waiting for signal"
768-
sleep infinity &
769-
trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
770-
trap - EXIT
771-
while true; do wait $! || continue; done
772-
exit 0
804+
_store_driver_digest
805+
_wait_for_signal
773806
}
774807

775808
init() {
776809
_prepare_exclusive
777810

811+
if _should_skip_kernel_module_reload; then
812+
_userspace_install
813+
_wait_for_signal
814+
fi
815+
816+
_unload_driver || exit 1
817+
_unmount_rootfs
818+
778819
_build
779820

780821
_load
@@ -789,6 +830,9 @@ build() {
789830
# Entry point for the "load" action.
#
# Calls _prepare_exclusive, unloads the driver (exiting with status 1 if
# _unload_driver fails), unmounts the driver rootfs and then hands over to _load.
load() {
    _prepare_exclusive

    if ! _unload_driver; then
        exit 1
    fi
    _unmount_rootfs

    _load
}
794838

rhel8/ocp_dtk_entrypoint

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,13 @@ nv-ctr-run-with-dtk() {
1818
exec bash -x nvidia-driver init
1919
fi
2020

21+
if _should_skip_kernel_module_reload; then
22+
echo "The NVIDIA driver is already loaded with the desired configuration, skipping kernel module build and proceeding with userspace-only install"
23+
exec bash -x nvidia-driver init
24+
fi
25+
26+
echo "Fast path not detected: building driver and modules"
27+
2128
if [[ ! -f "$DRIVER_TOOLKIT_SHARED_DIR/dir_prepared" ]]; then
2229
cp -r \
2330
/tmp/install.sh \
@@ -79,6 +86,19 @@ dtk-build-driver() {
7986
sleep inf
8087
fi
8188

89+
# Check if fast path is being used - if so, skip building and signal completion
90+
if _should_skip_kernel_module_reload; then
91+
echo "The NVIDIA driver is already loaded with the desired configuration, skipping build"
92+
echo "Signaling driver_built to the main container and sleeping forever..."
93+
touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_build_started"
94+
touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_built"
95+
while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do
96+
sleep 5
97+
done
98+
echo "WARNING: driver_built flag disappeared"
99+
exit 0
100+
fi
101+
82102
if ! [[ -f "/lib/modules/$(uname -r)/vmlinuz" ]]; then
83103
echo "WARNING: broken Driver Toolkit image detected:"
84104
echo "- Node kernel: $(uname -r)"
@@ -99,7 +119,7 @@ dtk-build-driver() {
99119
echo "NVIDIA drivers already generated, nothing to do ..."
100120

101121
while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do
102-
sleep 30
122+
sleep 5
103123
done
104124
echo "WARNING: driver_built flag disappeared, rebuilding the drivers ..."
105125
else
@@ -249,7 +269,7 @@ dtk-build-driver() {
249269
fi
250270

251271
while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do
252-
sleep 30
272+
sleep 5
253273
done
254274

255275
echo "WARNING: driver_built flag disappeared, restart this container"

rhel9/nvidia-driver

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -755,8 +755,8 @@ _wait_for_signal() {
755755
_userspace_install() {
756756
echo "The NVIDIA driver is already loaded with the desired configuration, performing userspace-only install"
757757
_unmount_rootfs
758-
_mount_rootfs
759758
_start_daemons
759+
_mount_rootfs
760760
_write_kernel_update_hook
761761
_store_driver_digest
762762
echo "Userspace-only install complete"

rhel9/ocp_dtk_entrypoint

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ dtk-build-driver() {
9696
touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_build_started"
9797
touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_built"
9898
while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do
99-
sleep 30
99+
sleep 5
100100
done
101101
echo "WARNING: driver_built flag disappeared"
102102
exit 0
@@ -122,7 +122,7 @@ dtk-build-driver() {
122122
echo "NVIDIA drivers already generated, nothing to do ..."
123123

124124
while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do
125-
sleep 30
125+
sleep 5
126126
done
127127
echo "WARNING: driver_built flag disappeared, rebuilding the drivers ..."
128128
else
@@ -276,7 +276,7 @@ dtk-build-driver() {
276276
fi
277277

278278
while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do
279-
sleep 30
279+
sleep 5
280280
done
281281

282282
echo "WARNING: driver_built flag disappeared, restart this container"

ubuntu22.04/nvidia-driver

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -762,8 +762,8 @@ init() {
762762
_install_userspace_components
763763
_resolve_kernel_type || exit 1
764764
_move_kernel_module_sources
765-
_mount_rootfs
766765
_start_daemons
766+
_mount_rootfs
767767
_write_kernel_update_hook
768768
_store_driver_digest
769769
echo "Userspace-only install complete"

0 commit comments

Comments
 (0)