@@ -418,44 +418,7 @@ _load_driver() {
418418 set +o xtrace -o nounset
419419 fi
420420
421- echo " Starting NVIDIA persistence daemon..."
422- nvidia-persistenced --persistence-mode
423-
424- if [ " ${DRIVER_TYPE} " = " vgpu" ]; then
425- echo " Copying gridd.conf..."
426- cp /drivers/gridd.conf /etc/nvidia/gridd.conf
427- if [ " ${VGPU_LICENSE_SERVER_TYPE} " = " NLS" ]; then
428- echo " Copying ClientConfigToken..."
429- mkdir -p /etc/nvidia/ClientConfigToken/
430- cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
431- fi
432-
433- echo " Starting nvidia-gridd.."
434- LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd
435-
436- # Start virtual topology daemon
437- _start_vgpu_topology_daemon
438- fi
439-
440- if _assert_nvlink5_system; then
441- _ensure_nvlink5_prerequisites || return 1
442- echo " Starting NVIDIA fabric manager daemon for NVLink5+..."
443-
444- fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
445- fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
446- nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
447- nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
448- /usr/bin/nvidia-fabricmanager-start.sh --mode start \
449- --fm-config-file $fm_config_file \
450- --fm-pid-file $fm_pid_file \
451- --nvlsm-config-file $nvlsm_config_file \
452- --nvlsm-pid-file $nvlsm_pid_file
453-
454- # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
455- elif _assert_nvswitch_system; then
456- echo " Starting NVIDIA fabric manager daemon..."
457- nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
458- fi
421+ _start_daemons
459422}
460423
461424# Stop persistenced and unload the kernel modules if they are currently loaded.
@@ -497,6 +460,21 @@ _unload_driver() {
497460 fi
498461 fi
499462
463+ if [ -f /var/run/nvidia-topologyd/nvidia-topologyd.pid ]; then
464+ echo " Stopping NVIDIA topology daemon..."
465+ local pid=$( < /var/run/nvidia-topologyd/nvidia-topologyd.pid)
466+
467+ kill -SIGTERM " ${pid} "
468+ for i in $( seq 1 50) ; do
469+ kill -0 " ${pid} " 2> /dev/null || break
470+ sleep 0.1
471+ done
472+ if [ $i -eq 50 ]; then
473+ echo " Could not stop NVIDIA topology daemon" >&2
474+ return 1
475+ fi
476+ fi
477+
500478 if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then
501479 echo " Stopping NVIDIA fabric manager daemon..."
502480 local pid=$( < /var/run/nvidia-fabricmanager/nv-fabricmanager.pid)
@@ -587,10 +565,6 @@ _install_driver() {
587565 fi
588566
589567 IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+" ${install_args[@]} " }
590- # May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path
591- # /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point
592- # TODO: remove the -a flag. its not needed. in the new driver version, license-acceptance is implicit
593- # nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check --no-cc-version-check --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc ${install_args[@]+"${install_args[@]}"}
594568}
595569
596570# Mount the driver rootfs into the run directory with the exception of sysfs.
@@ -721,6 +695,73 @@ _start_vgpu_topology_daemon() {
721695 nvidia-topologyd
722696}
723697
# Start the NVIDIA userspace daemons that accompany a loaded driver.
#
# Always starts nvidia-persistenced. When DRIVER_TYPE is "vgpu" it also copies
# gridd.conf into /etc/nvidia (plus the ClientConfigToken directory when
# VGPU_LICENSE_SERVER_TYPE is "NLS"), then starts nvidia-gridd and the virtual
# topology daemon. Finally it starts a fabric manager when one of the
# _assert_nvlink5_system / _assert_nvswitch_system checks succeeds.
#
# Globals:  DRIVER_TYPE (read), VGPU_LICENSE_SERVER_TYPE (read, vgpu only)
# Returns:  1 when _ensure_nvlink5_prerequisites fails; otherwise the status
#           of the last command executed.
_start_daemons() {
    # Keep the fabric manager paths out of the global namespace.
    local fm_config_file fm_pid_file nvlsm_config_file nvlsm_pid_file

    echo " Starting NVIDIA persistence daemon..."
    nvidia-persistenced --persistence-mode

    # Compare the exact values: any padding inside the quotes would make the
    # test permanently false and silently skip the whole vGPU setup.
    if [ "${DRIVER_TYPE}" = "vgpu" ]; then
        echo " Copying gridd.conf..."
        cp /drivers/gridd.conf /etc/nvidia/gridd.conf
        if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
            echo " Copying ClientConfigToken..."
            mkdir -p /etc/nvidia/ClientConfigToken/
            cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
        fi

        echo " Starting nvidia-gridd.."
        LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd

        # Start virtual topology daemon
        _start_vgpu_topology_daemon
    fi

    if _assert_nvlink5_system; then
        _ensure_nvlink5_prerequisites || return 1
        echo " Starting NVIDIA fabric manager daemon for NVLink5+..."

        fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
        fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
        nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
        nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
        /usr/bin/nvidia-fabricmanager-start.sh --mode start \
            --fm-config-file "${fm_config_file}" \
            --fm-pid-file "${fm_pid_file}" \
            --nvlsm-config-file "${nvlsm_config_file}" \
            --nvlsm-pid-file "${nvlsm_pid_file}"

    # If not an NVLink5+ system, check for the presence of NVLink4 (or below) switches
    elif _assert_nvswitch_system; then
        echo " Starting NVIDIA fabric manager daemon..."
        nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
    fi
}
738+
# Write the driver configuration digest to ${RUN_DIR}/nvidia-driver.state.
#
# The digest is written verbatim (no padding) followed by a newline, so the
# file content can be compared directly against DRIVER_CONFIG_DIGEST.
# NOTE(review): presumably read back by _should_skip_kernel_module_reload;
# confirm against that function.
#
# Globals:  RUN_DIR (read), DRIVER_CONFIG_DIGEST (read)
# Returns:  non-zero if the state file cannot be written.
_store_driver_digest() {
    local digest_file="${RUN_DIR}/nvidia-driver.state"

    echo " Storing driver configuration digest..."
    # printf keeps the value intact even if it ever starts with a dash.
    printf '%s\n' "${DRIVER_CONFIG_DIGEST}" > "${digest_file}" || return 1
    echo " Driver configuration digest stored at $digest_file "
}
745+
# Park the script until a termination signal arrives.
#
# Starts a background "sleep infinity" and installs a handler for
# HUP/INT/QUIT/PIPE/TERM that runs _shutdown and, if that succeeds, kills the
# sleeper and exits 0. The EXIT trap is reset to its default first so that
# the exit performed by the handler does not fire an EXIT handler as well.
# This function does not return.
_wait_for_signal() {
    echo " Done, now waiting for signal"

    sleep infinity &

    # The handler string is double-quoted on purpose: $! is expanded right
    # here, so the handler kills the sleeper started just above.
    trap " echo 'Caught signal'; _shutdown && { kill $! ; exit 0; }" HUP INT QUIT PIPE TERM
    trap - EXIT

    # wait returns non-zero when interrupted by a trapped signal; keep
    # waiting in that case (the handler exits when _shutdown succeeds).
    while :; do
        if ! wait $!; then
            continue
        fi
    done
    exit 0
}
754+
# Userspace-only install path, used when the driver is already loaded with
# the desired configuration.
#
# Runs, in this order: unmount the driver rootfs, start the daemons, mount
# the rootfs again, write the kernel update hook, and store the driver
# configuration digest.
_userspace_install() {
    local phase

    echo " The NVIDIA driver is already loaded with the desired configuration, performing userspace-only install"

    # The sequence is order-sensitive; each phase is a function defined
    # elsewhere in this script and is invoked without arguments.
    for phase in \
        _unmount_rootfs \
        _start_daemons \
        _mount_rootfs \
        _write_kernel_update_hook \
        _store_driver_digest; do
        "${phase}"
    done

    echo " Userspace-only install complete"
}
764+
724765_prepare () {
725766 if [ " ${DRIVER_TYPE} " = " vgpu" ]; then
726767 _find_vgpu_driver_version || exit 1
@@ -758,8 +799,6 @@ _prepare_exclusive() {
758799 trap " echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
759800 trap " _shutdown" EXIT
760801
761- _unload_driver || exit 1
762- _unmount_rootfs
763802}
764803
765804_build () {
@@ -781,17 +820,21 @@ _load() {
781820 _mount_rootfs
782821 _write_kernel_update_hook
783822
784- echo " Done, now waiting for signal"
785- sleep infinity &
786- trap " echo 'Caught signal'; _shutdown && { kill $! ; exit 0; }" HUP INT QUIT PIPE TERM
787- trap - EXIT
788- while true ; do wait $! || continue ; done
789- exit 0
823+ _store_driver_digest
824+ _wait_for_signal
790825}
791826
792827init () {
793828 _prepare_exclusive
794829
830+ if _should_skip_kernel_module_reload; then
831+ _userspace_install
832+ _wait_for_signal
833+ fi
834+
835+ _unload_driver || exit 1
836+ _unmount_rootfs
837+
795838 _build
796839
797840 _load
@@ -806,6 +849,9 @@ build() {
# "load" entry point: run the exclusive preparation step, unload any
# currently loaded driver, unmount the driver rootfs, then hand over to _load.
load() {
    _prepare_exclusive

    # Without a successful unload there is nothing safe to do; bail out.
    if ! _unload_driver; then
        exit 1
    fi
    _unmount_rootfs

    _load
}
811857
0 commit comments