@@ -401,44 +401,7 @@ _load_driver() {
401401 set +o xtrace -o nounset
402402 fi
403403
404- echo " Starting NVIDIA persistence daemon..."
405- nvidia-persistenced --persistence-mode
406-
407- if [ " ${DRIVER_TYPE} " = " vgpu" ]; then
408- echo " Copying gridd.conf..."
409- cp /drivers/gridd.conf /etc/nvidia/gridd.conf
410- if [ " ${VGPU_LICENSE_SERVER_TYPE} " = " NLS" ]; then
411- echo " Copying ClientConfigToken..."
412- mkdir -p /etc/nvidia/ClientConfigToken/
413- cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
414- fi
415-
416- echo " Starting nvidia-gridd.."
417- LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd
418-
419- # Start virtual topology daemon
420- _start_vgpu_topology_daemon
421- fi
422-
423- if _assert_nvlink5_system; then
424- _ensure_nvlink5_prerequisites || return 1
425- echo " Starting NVIDIA fabric manager daemon for NVLink5+..."
426-
427- fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
428- fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
429- nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
430- nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
431- /usr/bin/nvidia-fabricmanager-start.sh --mode start \
432- --fm-config-file $fm_config_file \
433- --fm-pid-file $fm_pid_file \
434- --nvlsm-config-file $nvlsm_config_file \
435- --nvlsm-pid-file $nvlsm_pid_file
436-
437- # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
438- elif _assert_nvswitch_system; then
439- echo " Starting NVIDIA fabric manager daemon..."
440- nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
441- fi
404+ _start_daemons
442405}
443406
444407# Stop persistenced and unload the kernel modules if they are currently loaded.
@@ -480,6 +443,21 @@ _unload_driver() {
480443 fi
481444 fi
482445
446+ if [ -f /var/run/nvidia-topologyd/nvidia-topologyd.pid ]; then
447+ echo " Stopping NVIDIA topology daemon..."
448+ local pid=$( < /var/run/nvidia-topologyd/nvidia-topologyd.pid)
449+
450+ kill -SIGTERM " ${pid} "
451+ for i in $( seq 1 50) ; do
452+ kill -0 " ${pid} " 2> /dev/null || break
453+ sleep 0.1
454+ done
455+ if [ $i -eq 50 ]; then
456+ echo " Could not stop NVIDIA topology daemon" >&2
457+ return 1
458+ fi
459+ fi
460+
483461 if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then
484462 echo " Stopping NVIDIA fabric manager daemon..."
485463 local pid=$( < /var/run/nvidia-fabricmanager/nv-fabricmanager.pid)
@@ -570,10 +548,6 @@ _install_driver() {
570548 fi
571549
572550 IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+" ${install_args[@]} " }
573- # May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path
574- # /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point
575- # TODO: remove the -a flag. its not needed. in the new driver version, license-acceptance is implicit
576- # nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check --no-cc-version-check --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc ${install_args[@]+"${install_args[@]}"}
577551}
578552
579553# Mount the driver rootfs into the run directory with the exception of sysfs.
@@ -704,6 +678,74 @@ _start_vgpu_topology_daemon() {
704678 nvidia-topologyd
705679}
706680
# Start the NVIDIA userspace daemons that accompany a loaded driver.
#
# Steps, in order:
#   1. Start nvidia-persistenced in persistence mode.
#   2. When DRIVER_TYPE is "vgpu": copy gridd.conf (and, for the NLS license
#      server type, the ClientConfigToken directory contents) from /drivers
#      into /etc/nvidia, start nvidia-gridd, then start the vGPU topology
#      daemon.
#   3. Start the fabric manager: through nvidia-fabricmanager-start.sh when
#      _assert_nvlink5_system succeeds, otherwise through nv-fabricmanager
#      when _assert_nvswitch_system succeeds.
#
# Globals:  DRIVER_TYPE (read), VGPU_LICENSE_SERVER_TYPE (read in vgpu mode)
# Returns:  1 if _ensure_nvlink5_prerequisites fails.
_start_daemons() {
    echo "Starting NVIDIA persistence daemon..."
    nvidia-persistenced --persistence-mode

    if [ "${DRIVER_TYPE}" = "vgpu" ]; then
        echo "Copying gridd.conf..."
        cp /drivers/gridd.conf /etc/nvidia/gridd.conf
        if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
            echo "Copying ClientConfigToken..."
            mkdir -p /etc/nvidia/ClientConfigToken/
            cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
        fi

        echo "Starting nvidia-gridd.."
        LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd

        # Start virtual topology daemon
        _start_vgpu_topology_daemon
    fi

    if _assert_nvlink5_system; then
        _ensure_nvlink5_prerequisites || return 1
        echo "Starting NVIDIA fabric manager daemon for NVLink5+..."

        # Keep the helper paths function-local so they do not leak into the
        # rest of the script as globals.
        local fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
        local fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
        local nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
        local nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
        /usr/bin/nvidia-fabricmanager-start.sh --mode start \
            --fm-config-file "${fm_config_file}" \
            --fm-pid-file "${fm_pid_file}" \
            --nvlsm-config-file "${nvlsm_config_file}" \
            --nvlsm-pid-file "${nvlsm_pid_file}"

    # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
    elif _assert_nvswitch_system; then
        echo "Starting NVIDIA fabric manager daemon..."
        nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
    fi
}
721+
# Record the current driver configuration digest in the run directory.
#
# Writes the value of DRIVER_CONFIG_DIGEST (an empty line when it is unset or
# empty) to ${RUN_DIR}/nvidia-driver.state, replacing any previous content.
#
# Globals:  RUN_DIR (read), DRIVER_CONFIG_DIGEST (read, optional)
# Outputs:  progress messages on stdout; an error message on stderr on failure
# Returns:  0 on success, 1 if the state file cannot be written
_store_driver_digest() {
    local digest_file="${RUN_DIR}/nvidia-driver.state"
    local current_digest="${DRIVER_CONFIG_DIGEST:-}"

    echo "Storing driver configuration digest..."
    # printf writes the digest verbatim, whatever its first characters are.
    if ! printf '%s\n' "${current_digest}" > "${digest_file}"; then
        echo "Failed to write driver configuration digest to ${digest_file}" >&2
        return 1
    fi
    echo "Driver configuration digest stored at ${digest_file}"
}
729+
# Block until a termination signal arrives, then shut down.
#
# Starts a background "sleep infinity" as the process to wait on, installs a
# handler for HUP/INT/QUIT/PIPE/TERM that runs _shutdown and, only if that
# succeeds, kills the sleeper and exits 0. The EXIT trap is cleared first so
# that _shutdown is not triggered a second time on exit. This function does
# not return to its caller.
_wait_for_signal() {
    echo "Done, now waiting for signal"
    sleep infinity &
    # Capture the sleeper's PID once; $! would change if anything run from the
    # handler started another background job.
    local sleep_pid=$!

    # Double quotes are intentional: the PID is baked into the handler now.
    trap "echo 'Caught signal'; _shutdown && { kill ${sleep_pid}; exit 0; }" HUP INT QUIT PIPE TERM
    trap - EXIT

    # wait is interrupted by trapped signals; if the handler did not exit
    # (because _shutdown failed), go back to waiting.
    while true; do wait "${sleep_pid}" || continue; done
    # Not reached in practice; kept as a safe fallback.
    exit 0
}
738+
# Perform a userspace-only install.
#
# Runs, in order: _unmount_rootfs, _start_daemons, _mount_rootfs,
# _write_kernel_update_hook and _store_driver_digest, with a log line before
# and after. It does not unload or load kernel modules itself.
_userspace_install() {
    echo "The NVIDIA driver is already loaded with the desired configuration, performing userspace-only install"
    _unmount_rootfs
    _start_daemons
    _mount_rootfs
    _write_kernel_update_hook
    _store_driver_digest
    echo "Userspace-only install complete"
}
748+
707749_prepare () {
708750 if [ " ${DRIVER_TYPE} " = " vgpu" ]; then
709751 _find_vgpu_driver_version || exit 1
@@ -740,9 +782,6 @@ _prepare_exclusive() {
740782
741783 trap " echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
742784 trap " _shutdown" EXIT
743-
744- _unload_driver || exit 1
745- _unmount_rootfs
746785}
747786
748787_build () {
@@ -763,18 +802,21 @@ _load() {
763802 _load_driver
764803 _mount_rootfs
765804 _write_kernel_update_hook
766-
767- echo " Done, now waiting for signal"
768- sleep infinity &
769- trap " echo 'Caught signal'; _shutdown && { kill $! ; exit 0; }" HUP INT QUIT PIPE TERM
770- trap - EXIT
771- while true ; do wait $! || continue ; done
772- exit 0
805+ _store_driver_digest
806+ _wait_for_signal
773807}
774808
775809init () {
776810 _prepare_exclusive
777811
812+ if _should_skip_kernel_module_reload; then
813+ _userspace_install
814+ _wait_for_signal
815+ fi
816+
817+ _unload_driver || exit 1
818+ _unmount_rootfs
819+
778820 _build
779821
780822 _load
@@ -789,6 +831,9 @@ build() {
# Entry point: unload any currently loaded driver state, then load the driver.
#
# Calls _prepare_exclusive first, then _unload_driver (exiting with status 1
# if it fails), _unmount_rootfs, and finally _load.
load() {
    _prepare_exclusive

    # Do not attempt a load on top of a driver that could not be unloaded.
    _unload_driver || exit 1
    _unmount_rootfs

    _load
}
794839
0 commit comments