@@ -8,12 +8,13 @@ PID_FILE=${RUN_DIR}/${0##*/}.pid
88DRIVER_VERSION=${DRIVER_VERSION:? " Missing DRIVER_VERSION env" }
99KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver
1010NUM_VGPU_DEVICES=0
11+ DRIVER_TYPE=" ${DRIVER_TYPE:- passthrough} "
12+ USE_HOST_MOFED=" ${USE_HOST_MOFED:- false} "
1113NVIDIA_MODULE_PARAMS=()
1214NVIDIA_UVM_MODULE_PARAMS=()
1315NVIDIA_MODESET_MODULE_PARAMS=()
1416NVIDIA_PEERMEM_MODULE_PARAMS=()
1517TARGETARCH=${TARGETARCH:? " Missing TARGETARCH env" }
16- USE_HOST_MOFED=" ${USE_HOST_MOFED:- false} "
1718DNF_RELEASEVER=${DNF_RELEASEVER:- " " }
1819RHEL_VERSION=${RHEL_VERSION:- " " }
1920RHEL_MAJOR_VERSION=9
@@ -418,44 +419,7 @@ _load_driver() {
418419 set +o xtrace -o nounset
419420 fi
420421
421- echo " Starting NVIDIA persistence daemon..."
422- nvidia-persistenced --persistence-mode
423-
424- if [ " ${DRIVER_TYPE} " = " vgpu" ]; then
425- echo " Copying gridd.conf..."
426- cp /drivers/gridd.conf /etc/nvidia/gridd.conf
427- if [ " ${VGPU_LICENSE_SERVER_TYPE} " = " NLS" ]; then
428- echo " Copying ClientConfigToken..."
429- mkdir -p /etc/nvidia/ClientConfigToken/
430- cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
431- fi
432-
433- echo " Starting nvidia-gridd.."
434- LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd
435-
436- # Start virtual topology daemon
437- _start_vgpu_topology_daemon
438- fi
439-
440- if _assert_nvlink5_system; then
441- _ensure_nvlink5_prerequisites || return 1
442- echo " Starting NVIDIA fabric manager daemon for NVLink5+..."
443-
444- fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
445- fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
446- nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
447- nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
448- /usr/bin/nvidia-fabricmanager-start.sh --mode start \
449- --fm-config-file $fm_config_file \
450- --fm-pid-file $fm_pid_file \
451- --nvlsm-config-file $nvlsm_config_file \
452- --nvlsm-pid-file $nvlsm_pid_file
453-
454- # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
455- elif _assert_nvswitch_system; then
456- echo " Starting NVIDIA fabric manager daemon..."
457- nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
458- fi
422+ _start_daemons
459423}
460424
461425# Stop persistenced and unload the kernel modules if they are currently loaded.
@@ -497,6 +461,21 @@ _unload_driver() {
497461 fi
498462 fi
499463
# Stop the NVIDIA topology daemon if its PID file is present: send SIGTERM,
# then poll for up to ~5 seconds (50 x 0.1s) for the process to disappear.
if [ -f /var/run/nvidia-topologyd/nvidia-topologyd.pid ]; then
  echo " Stopping NVIDIA topology daemon..."
  # Declare and assign separately so a failed read is not masked by `local`.
  local pid
  pid=$(< /var/run/nvidia-topologyd/nvidia-topologyd.pid)

  kill -SIGTERM "${pid}"
  for _ in $(seq 1 50); do
    kill -0 "${pid}" 2> /dev/null || break
    sleep 0.1
  done
  # Decide from the process's actual state rather than the loop counter: the
  # counter is also 50 when the daemon exits on the final poll, which would
  # otherwise be reported as a failure.
  if kill -0 "${pid}" 2> /dev/null; then
    echo " Could not stop NVIDIA topology daemon" >&2
    return 1
  fi
fi
478+
500479 if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then
501480 echo " Stopping NVIDIA fabric manager daemon..."
502481 local pid=$( < /var/run/nvidia-fabricmanager/nv-fabricmanager.pid)
@@ -586,11 +565,7 @@ _install_driver() {
586565 install_args+=(" --skip-module-load" )
587566 fi
588567
589- IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+" ${install_args[@]} " }
590- # May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path
591- # /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point
592- # TODO: remove the -a flag. its not needed. in the new driver version, license-acceptance is implicit
593- # nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check --no-cc-version-check --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc ${install_args[@]+"${install_args[@]}"}
568+ IGNORE_CC_MISMATCH=1 nvidia-installer --silent --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+" ${install_args[@]} " }
594569}
595570
596571# Mount the driver rootfs into the run directory with the exception of sysfs.
@@ -721,6 +696,73 @@ _start_vgpu_topology_daemon() {
721696 nvidia-topologyd
722697}
723698
# Start the userspace daemons that accompany the loaded driver.
#   - nvidia-persistenced is always started.
#   - When DRIVER_TYPE is "vgpu": gridd.conf is copied into /etc/nvidia (plus
#     the ClientConfigToken directory when VGPU_LICENSE_SERVER_TYPE is "NLS"),
#     then nvidia-gridd and the vGPU topology daemon are started.
#   - When _assert_nvlink5_system succeeds, the fabric manager is started via
#     nvidia-fabricmanager-start.sh; otherwise, when _assert_nvswitch_system
#     succeeds, nv-fabricmanager is started directly.
# Globals read: DRIVER_TYPE, VGPU_LICENSE_SERVER_TYPE (vgpu only).
# Returns 1 when _ensure_nvlink5_prerequisites fails; otherwise the status of
# the last command executed.
_start_daemons() {
  # Keep the fabric manager paths function-local so they do not leak into the
  # rest of the script.
  local fm_config_file fm_pid_file nvlsm_config_file nvlsm_pid_file

  echo " Starting NVIDIA persistence daemon..."
  nvidia-persistenced --persistence-mode

  # Compare the exact values: padding inside the quotes would make these
  # tests unmatchable and silently skip the vGPU setup.
  if [ "${DRIVER_TYPE}" = "vgpu" ]; then
    echo " Copying gridd.conf..."
    cp /drivers/gridd.conf /etc/nvidia/gridd.conf
    if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
      echo " Copying ClientConfigToken..."
      mkdir -p /etc/nvidia/ClientConfigToken/
      cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
    fi

    echo " Starting nvidia-gridd.."
    LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd

    # Start virtual topology daemon
    _start_vgpu_topology_daemon
  fi

  if _assert_nvlink5_system; then
    _ensure_nvlink5_prerequisites || return 1
    echo " Starting NVIDIA fabric manager daemon for NVLink5+..."

    fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
    fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
    nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
    nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
    /usr/bin/nvidia-fabricmanager-start.sh --mode start \
      --fm-config-file "${fm_config_file}" \
      --fm-pid-file "${fm_pid_file}" \
      --nvlsm-config-file "${nvlsm_config_file}" \
      --nvlsm-pid-file "${nvlsm_pid_file}"

  # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
  elif _assert_nvswitch_system; then
    echo " Starting NVIDIA fabric manager daemon..."
    nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
  fi
}
739+
# Record the current driver configuration digest in
# ${RUN_DIR}/nvidia-driver.state.
# Globals read: RUN_DIR, DRIVER_CONFIG_DIGEST.
# Returns 1 if the state file cannot be written, 0 otherwise.
_store_driver_digest() {
  # Build the path without padding so the file lands directly under RUN_DIR.
  local digest_file="${RUN_DIR}/nvidia-driver.state"
  echo " Storing driver configuration digest..."
  # printf stores the digest verbatim (no surrounding blanks, no echo option
  # parsing); the explicit check surfaces an unwritable RUN_DIR to the caller.
  if ! printf '%s\n' "${DRIVER_CONFIG_DIGEST}" > "${digest_file}"; then
    echo "Could not write driver configuration digest to ${digest_file}" >&2
    return 1
  fi
  echo " Driver configuration digest stored at $digest_file "
}
746+
# Block until one of HUP/INT/QUIT/PIPE/TERM is received, then run _shutdown
# and, if it succeeds, kill the placeholder sleep and exit 0.
_wait_for_signal() {
  local -a handled_signals=(HUP INT QUIT PIPE TERM)

  echo " Done, now waiting for signal"
  sleep infinity &
  # The handler is double-quoted on purpose: $! is expanded right here, so the
  # handler carries the PID of the sleep started just above.
  trap " echo 'Caught signal'; _shutdown && { kill $! ; exit 0; }" "${handled_signals[@]}"
  # The signal handler now owns shutdown, so drop the EXIT trap.
  trap - EXIT
  # Keep waiting on the background sleep; a non-zero wait simply re-enters
  # the loop.
  until false; do
    if ! wait $!; then
      continue
    fi
  done
  # Not reached through the loop above (it never terminates on its own).
  exit 0
}
755+
# Fast path: refresh only the userspace side of the driver. Remounts the
# driver rootfs, starts the daemons, rewrites the kernel update hook and
# stores the configuration digest, in that order.
# Globals read: DRIVER_VERSION (log message only).
_userspace_only_install() {
  local phase

  echo " Detected matching loaded driver & config (${DRIVER_VERSION} ); performing userspace-only install"
  # Each phase is a function defined elsewhere in this script; order matters.
  for phase in \
    _unmount_rootfs \
    _mount_rootfs \
    _start_daemons \
    _write_kernel_update_hook \
    _store_driver_digest; do
    "${phase}"
  done
  echo " Userspace-only install complete"
}
765+
724766_prepare () {
725767 if [ " ${DRIVER_TYPE} " = " vgpu" ]; then
726768 _find_vgpu_driver_version || exit 1
@@ -746,8 +788,6 @@ _prepare() {
746788}
747789
748790_prepare_exclusive () {
749- _prepare
750-
751791 exec 3> ${PID_FILE}
752792 if ! flock -n 3; then
753793 echo " An instance of the NVIDIA driver is already running, aborting"
@@ -758,8 +798,7 @@ _prepare_exclusive() {
758798 trap " echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
759799 trap " _shutdown" EXIT
760800
761- _unload_driver || exit 1
762- _unmount_rootfs
801+ _prepare
763802}
764803
765804_build () {
@@ -780,18 +819,21 @@ _load() {
780819 _load_driver
781820 _mount_rootfs
782821 _write_kernel_update_hook
783-
784- echo " Done, now waiting for signal"
785- sleep infinity &
786- trap " echo 'Caught signal'; _shutdown && { kill $! ; exit 0; }" HUP INT QUIT PIPE TERM
787- trap - EXIT
788- while true ; do wait $! || continue ; done
789- exit 0
822+ _store_driver_digest
823+ _wait_for_signal
790824}
791825
792826init () {
793827 _prepare_exclusive
794828
829+ if _should_use_fast_path; then
830+ _userspace_only_install
831+ _wait_for_signal
832+ fi
833+
834+ _unload_driver || exit 1
835+ _unmount_rootfs
836+
795837 _build
796838
797839 _load
@@ -806,6 +848,9 @@ build() {
# Entry point: take the exclusive lock and prepare, remove any currently
# loaded driver and its rootfs mount, then load the driver.
load() {
  _prepare_exclusive

  # A driver that cannot be unloaded aborts the whole command.
  if ! _unload_driver; then
    exit 1
  fi
  _unmount_rootfs

  _load
}
811856
0 commit comments