@@ -401,44 +401,7 @@ _load_driver() {
401401 set +o xtrace -o nounset
402402 fi
403403
404- echo " Starting NVIDIA persistence daemon..."
405- nvidia-persistenced --persistence-mode
406-
407- if [ " ${DRIVER_TYPE} " = " vgpu" ]; then
408- echo " Copying gridd.conf..."
409- cp /drivers/gridd.conf /etc/nvidia/gridd.conf
410- if [ " ${VGPU_LICENSE_SERVER_TYPE} " = " NLS" ]; then
411- echo " Copying ClientConfigToken..."
412- mkdir -p /etc/nvidia/ClientConfigToken/
413- cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
414- fi
415-
416- echo " Starting nvidia-gridd.."
417- LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd
418-
419- # Start virtual topology daemon
420- _start_vgpu_topology_daemon
421- fi
422-
423- if _assert_nvlink5_system; then
424- _ensure_nvlink5_prerequisites || return 1
425- echo " Starting NVIDIA fabric manager daemon for NVLink5+..."
426-
427- fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
428- fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
429- nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
430- nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
431- /usr/bin/nvidia-fabricmanager-start.sh --mode start \
432- --fm-config-file $fm_config_file \
433- --fm-pid-file $fm_pid_file \
434- --nvlsm-config-file $nvlsm_config_file \
435- --nvlsm-pid-file $nvlsm_pid_file
436-
437- # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
438- elif _assert_nvswitch_system; then
439- echo " Starting NVIDIA fabric manager daemon..."
440- nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
441- fi
404+ _start_daemons
442405}
443406
444407# Stop persistenced and unload the kernel modules if they are currently loaded.
@@ -480,6 +443,21 @@ _unload_driver() {
480443 fi
481444 fi
482445
446+ if [ -f /var/run/nvidia-topologyd/nvidia-topologyd.pid ]; then
447+ echo " Stopping NVIDIA topology daemon..."
448+ local pid=$( < /var/run/nvidia-topologyd/nvidia-topologyd.pid)
449+
450+ kill -SIGTERM " ${pid} "
451+ for i in $( seq 1 50) ; do
452+ kill -0 " ${pid} " 2> /dev/null || break
453+ sleep 0.1
454+ done
455+ if [ $i -eq 50 ]; then
456+ echo " Could not stop NVIDIA topology daemon" >&2
457+ return 1
458+ fi
459+ fi
460+
483461 if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then
484462 echo " Stopping NVIDIA fabric manager daemon..."
485463 local pid=$( < /var/run/nvidia-fabricmanager/nv-fabricmanager.pid)
@@ -570,10 +548,6 @@ _install_driver() {
570548 fi
571549
572550 IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+" ${install_args[@]} " }
573- # May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path
574- # /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point
575- # TODO: remove the -a flag. its not needed. in the new driver version, license-acceptance is implicit
576- # nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check --no-cc-version-check --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc ${install_args[@]+"${install_args[@]}"}
577551}
578552
579553# Mount the driver rootfs into the run directory with the exception of sysfs.
@@ -704,6 +678,73 @@ _start_vgpu_topology_daemon() {
704678 nvidia-topologyd
705679}
706680
# Start the userspace daemons that accompany the NVIDIA kernel modules.
#
# Always starts nvidia-persistenced. When DRIVER_TYPE is "vgpu" it also copies
# the licensing configuration out of /drivers, starts nvidia-gridd and then the
# virtual topology daemon. Afterwards it starts the fabric manager: the
# NVLink5+ start script when _assert_nvlink5_system succeeds, otherwise plain
# nv-fabricmanager when _assert_nvswitch_system succeeds, otherwise nothing.
#
# Globals:  DRIVER_TYPE (read); VGPU_LICENSE_SERVER_TYPE (read, vgpu only);
#           fm_config_file, fm_pid_file, nvlsm_config_file, nvlsm_pid_file
#           (written on the NVLink5+ path)
# Returns:  1 when the NVLink5+ prerequisites check fails; otherwise the status
#           of the last command that ran.
_start_daemons() {
  echo " Starting NVIDIA persistence daemon..."
  nvidia-persistenced --persistence-mode

  # The comparison operands must not carry padding inside the quotes,
  # otherwise the two sides can never be equal and the branch is dead.
  if [ "${DRIVER_TYPE}" = "vgpu" ]; then
    echo " Copying gridd.conf..."
    cp /drivers/gridd.conf /etc/nvidia/gridd.conf
    if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
      echo " Copying ClientConfigToken..."
      mkdir -p /etc/nvidia/ClientConfigToken/
      cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
    fi

    echo " Starting nvidia-gridd.."
    LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd

    # Start virtual topology daemon
    _start_vgpu_topology_daemon
  fi

  if _assert_nvlink5_system; then
    _ensure_nvlink5_prerequisites || return 1
    echo " Starting NVIDIA fabric manager daemon for NVLink5+..."

    # NOTE(review): these four are intentionally left global, as before, in
    # case code outside this function reads them - confirm and make local.
    fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
    fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
    nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
    nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
    /usr/bin/nvidia-fabricmanager-start.sh --mode start \
      --fm-config-file "${fm_config_file}" \
      --fm-pid-file "${fm_pid_file}" \
      --nvlsm-config-file "${nvlsm_config_file}" \
      --nvlsm-pid-file "${nvlsm_pid_file}"

  # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
  elif _assert_nvswitch_system; then
    echo " Starting NVIDIA fabric manager daemon..."
    nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
  fi
}
721+
# Persist the current driver configuration digest to the run directory.
#
# Writes DRIVER_CONFIG_DIGEST followed by a newline to
# ${RUN_DIR}/nvidia-driver.state, replacing any previous content.
#
# Globals:  RUN_DIR (read), DRIVER_CONFIG_DIGEST (read)
# Returns:  0 on success, 1 if the state file could not be written.
_store_driver_digest() {
  # No whitespace may sit between the directory and the file name, or the
  # state would land at a different path than the one readers look for.
  local digest_file="${RUN_DIR}/nvidia-driver.state"

  echo " Storing driver configuration digest..."
  # printf writes the value verbatim; echo could treat a leading "-n"/"-e"
  # in the value as an option.
  if ! printf '%s\n' "${DRIVER_CONFIG_DIGEST}" > "${digest_file}"; then
    echo "Could not write driver configuration digest to ${digest_file}" >&2
    return 1
  fi
  echo " Driver configuration digest stored at ${digest_file} "
}
728+
# Park the script until a termination signal arrives, then shut down.
#
# Starts a background "sleep infinity" to have something to wait on, installs
# a handler for HUP/INT/QUIT/PIPE/TERM that runs _shutdown and, only if that
# succeeds, kills the sleeper and exits 0, and clears the EXIT trap. The wait
# loop has no normal exit, so this function is not expected to return.
_wait_for_signal() {
  local sleeper_pid

  echo " Done, now waiting for signal"
  sleep infinity &
  # Capture the PID once. $! is shell-wide and would point at a different
  # process if anything run later (for example from the signal handler)
  # started another background job.
  sleeper_pid=$!

  # Double quotes on purpose: the sleeper PID is baked into the handler text
  # at the moment the trap is installed.
  trap "echo 'Caught signal'; _shutdown && { kill ${sleeper_pid}; exit 0; }" HUP INT QUIT PIPE TERM
  # Presumably clears an EXIT handler installed earlier so that _shutdown is
  # not run a second time on exit - confirm against the code that sets it.
  trap - EXIT

  # wait returns non-zero when a trapped signal interrupts it; if the handler
  # did not exit (because _shutdown failed) go back to waiting.
  while true; do wait "${sleeper_pid}" || continue; done
  # Not reached in normal operation; kept as a defined fallthrough.
  exit 0
}
737+
# Userspace-only install path: the kernel modules are left alone.
#
# Runs, in order: _unmount_rootfs, _start_daemons, _mount_rootfs,
# _write_kernel_update_hook and _store_driver_digest, framed by a start and a
# completion message. Step statuses are not inspected here; the function's
# status is that of the final echo.
_userspace_install() {
  local _usi_step

  echo " The NVIDIA driver is already loaded with the desired configuration, performing userspace-only install"

  # Each step is invoked as a plain command so that shell options in effect
  # (such as errexit, if the script enables it) apply exactly as they would
  # to direct calls.
  for _usi_step in \
    _unmount_rootfs \
    _start_daemons \
    _mount_rootfs \
    _write_kernel_update_hook \
    _store_driver_digest; do
    "${_usi_step}"
  done

  echo " Userspace-only install complete"
}
747+
707748_prepare () {
708749 if [ " ${DRIVER_TYPE} " = " vgpu" ]; then
709750 _find_vgpu_driver_version || exit 1
@@ -740,9 +781,6 @@ _prepare_exclusive() {
740781
741782 trap " echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
742783 trap " _shutdown" EXIT
743-
744- _unload_driver || exit 1
745- _unmount_rootfs
746784}
747785
748786_build () {
@@ -763,18 +801,21 @@ _load() {
763801 _load_driver
764802 _mount_rootfs
765803 _write_kernel_update_hook
766-
767- echo " Done, now waiting for signal"
768- sleep infinity &
769- trap " echo 'Caught signal'; _shutdown && { kill $! ; exit 0; }" HUP INT QUIT PIPE TERM
770- trap - EXIT
771- while true ; do wait $! || continue ; done
772- exit 0
804+ _store_driver_digest
805+ _wait_for_signal
773806}
774807
775808init () {
776809 _prepare_exclusive
777810
811+ if _should_skip_kernel_module_reload; then
812+ _userspace_install
813+ _wait_for_signal
814+ fi
815+
816+ _unload_driver || exit 1
817+ _unmount_rootfs
818+
778819 _build
779820
780821 _load
@@ -789,6 +830,9 @@ build() {
# Entry point: unload whatever driver is currently active, then load again.
#
# Calls _prepare_exclusive first, then _unload_driver (exiting the script with
# status 1 if it fails), then _unmount_rootfs, and finally hands over to _load.
load() {
  _prepare_exclusive

  # A driver that cannot be unloaded is fatal for this command.
  if ! _unload_driver; then
    exit 1
  fi
  _unmount_rootfs

  _load
}
794838
0 commit comments