Skip to content

Commit a9bde48

Browse files
Add fast path optimization for userspace-only install when driver config digest matches
Signed-off-by: Karthik Vetrivel <kvetrivel@nvidia.com>
1 parent e4f05d3 commit a9bde48

File tree

4 files changed

+268
-127
lines changed

4 files changed

+268
-127
lines changed

rhel9/common.sh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,13 @@ _gdrcopy_enabled() {
4545
fi
4646
return 1
4747
}
48+
49+
# Decide whether the userspace-only "fast path" may be taken.
#
# Globals:
#   DRIVER_CONFIG_DIGEST (read) - digest of the requested driver configuration
# Returns:
#   0 when /sys/module/nvidia/refcnt and the stored state file both exist,
#     DRIVER_CONFIG_DIGEST is non-empty, and it equals the state file content;
#   1 in every other case.
_should_use_fast_path() {
    local state_file=/run/nvidia/nvidia-driver.state
    local wanted saved

    # Both the nvidia module's sysfs entry and a previously stored digest
    # must be present.
    if [ ! -f /sys/module/nvidia/refcnt ] || [ ! -f "${state_file}" ]; then
        return 1
    fi

    # An empty or unset digest never matches.
    wanted="${DRIVER_CONFIG_DIGEST:-}"
    if [ -z "${wanted}" ]; then
        return 1
    fi

    # A read failure is treated as an empty stored digest.
    saved=$(cat "${state_file}" 2>/dev/null || echo "")

    # The comparison result is the function's exit status.
    [ "${wanted}" = "${saved}" ]
}

rhel9/nvidia-driver

Lines changed: 100 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,13 @@ PID_FILE=${RUN_DIR}/${0##*/}.pid
88
DRIVER_VERSION=${DRIVER_VERSION:?"Missing DRIVER_VERSION env"}
99
KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver
1010
NUM_VGPU_DEVICES=0
11+
DRIVER_TYPE="${DRIVER_TYPE:-passthrough}"
12+
USE_HOST_MOFED="${USE_HOST_MOFED:-false}"
1113
NVIDIA_MODULE_PARAMS=()
1214
NVIDIA_UVM_MODULE_PARAMS=()
1315
NVIDIA_MODESET_MODULE_PARAMS=()
1416
NVIDIA_PEERMEM_MODULE_PARAMS=()
1517
TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"}
16-
USE_HOST_MOFED="${USE_HOST_MOFED:-false}"
1718
DNF_RELEASEVER=${DNF_RELEASEVER:-""}
1819
RHEL_VERSION=${RHEL_VERSION:-""}
1920
RHEL_MAJOR_VERSION=9
@@ -418,44 +419,7 @@ _load_driver() {
418419
set +o xtrace -o nounset
419420
fi
420421

421-
echo "Starting NVIDIA persistence daemon..."
422-
nvidia-persistenced --persistence-mode
423-
424-
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
425-
echo "Copying gridd.conf..."
426-
cp /drivers/gridd.conf /etc/nvidia/gridd.conf
427-
if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
428-
echo "Copying ClientConfigToken..."
429-
mkdir -p /etc/nvidia/ClientConfigToken/
430-
cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
431-
fi
432-
433-
echo "Starting nvidia-gridd.."
434-
LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd
435-
436-
# Start virtual topology daemon
437-
_start_vgpu_topology_daemon
438-
fi
439-
440-
if _assert_nvlink5_system; then
441-
_ensure_nvlink5_prerequisites || return 1
442-
echo "Starting NVIDIA fabric manager daemon for NVLink5+..."
443-
444-
fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
445-
fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
446-
nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
447-
nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
448-
/usr/bin/nvidia-fabricmanager-start.sh --mode start \
449-
--fm-config-file $fm_config_file \
450-
--fm-pid-file $fm_pid_file \
451-
--nvlsm-config-file $nvlsm_config_file \
452-
--nvlsm-pid-file $nvlsm_pid_file
453-
454-
# If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
455-
elif _assert_nvswitch_system; then
456-
echo "Starting NVIDIA fabric manager daemon..."
457-
nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
458-
fi
422+
_start_daemons
459423
}
460424

461425
# Stop persistenced and unload the kernel modules if they are currently loaded.
@@ -497,6 +461,21 @@ _unload_driver() {
497461
fi
498462
fi
499463

464+
if [ -f /var/run/nvidia-topologyd/nvidia-topologyd.pid ]; then
465+
echo "Stopping NVIDIA topology daemon..."
466+
local pid=$(< /var/run/nvidia-topologyd/nvidia-topologyd.pid)
467+
468+
kill -SIGTERM "${pid}"
469+
for i in $(seq 1 50); do
470+
kill -0 "${pid}" 2> /dev/null || break
471+
sleep 0.1
472+
done
473+
if [ $i -eq 50 ]; then
474+
echo "Could not stop NVIDIA topology daemon" >&2
475+
return 1
476+
fi
477+
fi
478+
500479
if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then
501480
echo "Stopping NVIDIA fabric manager daemon..."
502481
local pid=$(< /var/run/nvidia-fabricmanager/nv-fabricmanager.pid)
@@ -586,11 +565,7 @@ _install_driver() {
586565
install_args+=("--skip-module-load")
587566
fi
588567

589-
IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"}
590-
# May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path
591-
# /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point
592-
# TODO: remove the -a flag. its not needed. in the new driver version, license-acceptance is implicit
593-
#nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check --no-cc-version-check --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc ${install_args[@]+"${install_args[@]}"}
568+
IGNORE_CC_MISMATCH=1 nvidia-installer --silent --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"}
594569
}
595570

596571
# Mount the driver rootfs into the run directory with the exception of sysfs.
@@ -721,6 +696,73 @@ _start_vgpu_topology_daemon() {
721696
nvidia-topologyd
722697
}
723698

699+
# Start the userspace daemons that accompany the NVIDIA driver.
#
# Steps, in order:
#   1. nvidia-persistenced in persistence mode.
#   2. When DRIVER_TYPE is "vgpu": copy gridd.conf (and the ClientConfigToken
#      directory when VGPU_LICENSE_SERVER_TYPE is "NLS"), start nvidia-gridd,
#      then call _start_vgpu_topology_daemon.
#   3. Fabric manager: the NVLink5+ start script when _assert_nvlink5_system
#      succeeds, otherwise nv-fabricmanager when _assert_nvswitch_system
#      succeeds.
#
# Globals:
#   DRIVER_TYPE, VGPU_LICENSE_SERVER_TYPE (read)
# Returns:
#   1 when _ensure_nvlink5_prerequisites fails; otherwise the status of the
#   last command executed.
_start_daemons() {
    # Kept local so these paths do not leak into the script's global scope.
    local fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
    local fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
    local nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
    local nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid

    echo "Starting NVIDIA persistence daemon..."
    nvidia-persistenced --persistence-mode

    if [ "${DRIVER_TYPE}" = "vgpu" ]; then
        echo "Copying gridd.conf..."
        cp /drivers/gridd.conf /etc/nvidia/gridd.conf
        if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
            echo "Copying ClientConfigToken..."
            mkdir -p /etc/nvidia/ClientConfigToken/
            cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
        fi

        echo "Starting nvidia-gridd.."
        LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd

        # Start virtual topology daemon
        _start_vgpu_topology_daemon
    fi

    if _assert_nvlink5_system; then
        _ensure_nvlink5_prerequisites || return 1
        echo "Starting NVIDIA fabric manager daemon for NVLink5+..."

        /usr/bin/nvidia-fabricmanager-start.sh --mode start \
            --fm-config-file "${fm_config_file}" \
            --fm-pid-file "${fm_pid_file}" \
            --nvlsm-config-file "${nvlsm_config_file}" \
            --nvlsm-pid-file "${nvlsm_pid_file}"

    # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
    elif _assert_nvswitch_system; then
        echo "Starting NVIDIA fabric manager daemon..."
        nv-fabricmanager -c "${fm_config_file}"
    fi
}
739+
740+
# Persist the driver configuration digest to ${RUN_DIR}/nvidia-driver.state.
#
# Globals:
#   RUN_DIR (read)              - directory that holds the state file
#   DRIVER_CONFIG_DIGEST (read) - digest to store; may be unset or empty
# Outputs:
#   progress messages on stdout; overwrites the state file
_store_driver_digest() {
    local digest_file="${RUN_DIR}/nvidia-driver.state"

    echo "Storing driver configuration digest..."
    # The digest is optional: default to empty so that an unset variable does
    # not abort the script under `set -u`. Writing the empty value still
    # overwrites any digest left in the file by an earlier run.
    printf '%s\n' "${DRIVER_CONFIG_DIGEST:-}" > "${digest_file}"
    echo "Driver configuration digest stored at ${digest_file}"
}
746+
747+
# Block until one of HUP/INT/QUIT/PIPE/TERM arrives.
#
# A background `sleep infinity` gives `wait` something to wait on. The signal
# trap is built with double quotes, so `$!` is expanded when the trap is
# installed and names that sleep process. On a signal the trap runs _shutdown
# and, if it succeeds, kills the sleep and exits 0. The EXIT trap is reset to
# its default before waiting.
_wait_for_signal() {
    echo "Done, now waiting for signal"

    sleep infinity &

    trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
    trap - EXIT

    # A non-zero status from `wait` just re-enters the loop; leaving the
    # function is the trap handler's job.
    while :; do
        if ! wait $!; then
            continue
        fi
    done
    exit 0
}
755+
756+
# Fast-path install that leaves the loaded kernel modules untouched.
#
# Runs, in order: _unmount_rootfs, _mount_rootfs, _start_daemons,
# _write_kernel_update_hook, _store_driver_digest.
#
# Globals:
#   DRIVER_VERSION (read) - only used in the log message
_userspace_only_install() {
    local step

    echo "Detected matching loaded driver & config (${DRIVER_VERSION}); performing userspace-only install"

    for step in \
        _unmount_rootfs \
        _mount_rootfs \
        _start_daemons \
        _write_kernel_update_hook \
        _store_driver_digest; do
        "${step}"
    done

    echo "Userspace-only install complete"
}
765+
724766
_prepare() {
725767
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
726768
_find_vgpu_driver_version || exit 1
@@ -746,8 +788,6 @@ _prepare() {
746788
}
747789

748790
_prepare_exclusive() {
749-
_prepare
750-
751791
exec 3> ${PID_FILE}
752792
if ! flock -n 3; then
753793
echo "An instance of the NVIDIA driver is already running, aborting"
@@ -758,8 +798,7 @@ _prepare_exclusive() {
758798
trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
759799
trap "_shutdown" EXIT
760800

761-
_unload_driver || exit 1
762-
_unmount_rootfs
801+
_prepare
763802
}
764803

765804
_build() {
@@ -780,18 +819,21 @@ _load() {
780819
_load_driver
781820
_mount_rootfs
782821
_write_kernel_update_hook
783-
784-
echo "Done, now waiting for signal"
785-
sleep infinity &
786-
trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
787-
trap - EXIT
788-
while true; do wait $! || continue; done
789-
exit 0
822+
_store_driver_digest
823+
_wait_for_signal
790824
}
791825

792826
init() {
793827
_prepare_exclusive
794828

829+
if _should_use_fast_path; then
830+
_userspace_only_install
831+
_wait_for_signal
832+
fi
833+
834+
_unload_driver || exit 1
835+
_unmount_rootfs
836+
795837
_build
796838

797839
_load
@@ -806,6 +848,9 @@ build() {
806848
# Entry point for the "load" sub-command.
#
# Calls _prepare_exclusive, unloads any currently loaded driver (exiting with
# status 1 if that fails), unmounts the driver rootfs, then hands over to
# _load.
load() {
    _prepare_exclusive

    if ! _unload_driver; then
        exit 1
    fi
    _unmount_rootfs

    _load
}
811856

rhel9/ocp_dtk_entrypoint

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,13 @@ nv-ctr-run-with-dtk() {
1818
exec bash -x nvidia-driver init
1919
fi
2020

21+
if _should_use_fast_path; then
22+
echo "Fast path detected: skipping DTK build and module copy, proceeding with userspace-only install"
23+
exec bash -x nvidia-driver init
24+
fi
25+
26+
echo "Fast path not detected: building driver and modules"
27+
2128
if [[ ! -f "$DRIVER_TOOLKIT_SHARED_DIR/dir_prepared" ]]; then
2229
cp -r \
2330
/tmp/install.sh \
@@ -82,6 +89,19 @@ dtk-build-driver() {
8289
sleep inf
8390
fi
8491

92+
# Check if fast path is being used - if so, skip building and signal completion
93+
if _should_use_fast_path; then
94+
echo "Fast path detected in DTK container: driver already loaded with matching config, skipping build"
95+
echo "Signaling driver_built and sleeping forever..."
96+
touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_build_started"
97+
touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_built"
98+
while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do
99+
sleep 30
100+
done
101+
echo "WARNING: driver_built flag disappeared"
102+
exit 0
103+
fi
104+
85105
if ! [[ -f "/lib/modules/$(uname -r)/vmlinuz" ]]; then
86106
echo "WARNING: broken Driver Toolkit image detected:"
87107
echo "- Node kernel: $(uname -r)"

0 commit comments

Comments
 (0)