@@ -166,10 +166,10 @@ _install_prerequisites() (
166166 # Parse gcc version
167167 # gcc_version is expected to match x.y.z
168168 # current_gcc is expected to match 'gcc-x.y.z-rel.el8.x86_64'
169- local gcc_version=$( cat /lib/modules/${KERNEL_VERSION} /proc/version | sed -n ' s/.* gcc (GCC) \ ([0-9.]*\).*/\1/p ' )
169+ local gcc_version=$( cat /lib/modules/${KERNEL_VERSION} /proc/version | grep -Eo " gcc \ (GCC\) ([0-9\.]+) " | grep -Eo " ([0-9\.]+) " )
170170 local current_gcc=$( rpm -qa gcc)
171171 echo " kernel requires gcc version: 'gcc-${gcc_version} ', current gcc version is '${current_gcc} '"
172-
172+
173173 if ! [[ " ${current_gcc} " =~ " gcc-${gcc_version} " -.* ]]; then
174174 echo " WARNING: GCC version mismatch detected, but attempting to continue..."
175175 echo " Kernel built with: gcc-${gcc_version} "
@@ -423,44 +423,7 @@ _load_driver() {
423423 set +o xtrace -o nounset
424424 fi
425425
426- echo " Starting NVIDIA persistence daemon..."
427- nvidia-persistenced --persistence-mode
428-
429- if [ " ${DRIVER_TYPE} " = " vgpu" ]; then
430- echo " Copying gridd.conf..."
431- cp /drivers/gridd.conf /etc/nvidia/gridd.conf
432- if [ " ${VGPU_LICENSE_SERVER_TYPE} " = " NLS" ]; then
433- echo " Copying ClientConfigToken..."
434- mkdir -p /etc/nvidia/ClientConfigToken/
435- cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
436- fi
437-
438- echo " Starting nvidia-gridd.."
439- LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd
440-
441- # Start virtual topology daemon
442- _start_vgpu_topology_daemon
443- fi
444-
445- if _assert_nvlink5_system; then
446- _ensure_nvlink5_prerequisites || return 1
447- echo " Starting NVIDIA fabric manager daemon for NVLink5+..."
448-
449- fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
450- fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
451- nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
452- nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
453- /usr/bin/nvidia-fabricmanager-start.sh --mode start \
454- --fm-config-file $fm_config_file \
455- --fm-pid-file $fm_pid_file \
456- --nvlsm-config-file $nvlsm_config_file \
457- --nvlsm-pid-file $nvlsm_pid_file
458-
459- # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
460- elif _assert_nvswitch_system; then
461- echo " Starting NVIDIA fabric manager daemon..."
462- nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
463- fi
426+ _start_daemons
464427}
465428
466429# Stop persistenced and unload the kernel modules if they are currently loaded.
@@ -502,6 +465,21 @@ _unload_driver() {
502465 fi
503466 fi
504467
468+ if [ -f /var/run/nvidia-topologyd/nvidia-topologyd.pid ]; then
469+ echo " Stopping NVIDIA topology daemon..."
470+ local pid=$( < /var/run/nvidia-topologyd/nvidia-topologyd.pid)
471+
472+ kill -SIGTERM " ${pid} "
473+ for i in $( seq 1 50) ; do
474+ kill -0 " ${pid} " 2> /dev/null || break
475+ sleep 0.1
476+ done
477+ if [ $i -eq 50 ]; then
478+ echo " Could not stop NVIDIA topology daemon" >&2
479+ return 1
480+ fi
481+ fi
482+
505483 if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then
506484 echo " Stopping NVIDIA fabric manager daemon..."
507485 local pid=$( < /var/run/nvidia-fabricmanager/nv-fabricmanager.pid)
@@ -592,10 +570,6 @@ _install_driver() {
592570 fi
593571
594572 IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+" ${install_args[@]} " }
595- # May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path
596- # /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point
597- # TODO: remove the -a flag. its not needed. in the new driver version, license-acceptance is implicit
598- # nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check --no-cc-version-check --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc ${install_args[@]+"${install_args[@]}"}
599573}
600574
601575# Mount the driver rootfs into the run directory with the exception of sysfs.
@@ -726,6 +700,74 @@ _start_vgpu_topology_daemon() {
726700 nvidia-topologyd
727701}
728702
# Start the NVIDIA userspace daemons that accompany a loaded driver.
#
# Always launches nvidia-persistenced. When DRIVER_TYPE is "vgpu" it also
# installs gridd.conf (plus the ClientConfigToken files when
# VGPU_LICENSE_SERVER_TYPE is "NLS"), launches nvidia-gridd and calls
# _start_vgpu_topology_daemon. Finally it starts the fabric manager: through
# nvidia-fabricmanager-start.sh when _assert_nvlink5_system succeeds, or
# directly through nv-fabricmanager when _assert_nvswitch_system succeeds.
#
# Globals:   DRIVER_TYPE (read), VGPU_LICENSE_SERVER_TYPE (read, vgpu only)
# Returns:   1 when _ensure_nvlink5_prerequisites fails; otherwise the status
#            of the fabric manager step (0 when no fabric manager is started).
_start_daemons() {
    echo "Starting NVIDIA persistence daemon..."
    nvidia-persistenced --persistence-mode

    if [ "${DRIVER_TYPE}" = "vgpu" ]; then
        echo "Copying gridd.conf..."
        cp /drivers/gridd.conf /etc/nvidia/gridd.conf
        if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
            echo "Copying ClientConfigToken..."
            mkdir -p /etc/nvidia/ClientConfigToken/
            cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
        fi

        echo "Starting nvidia-gridd.."
        LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd

        # Start virtual topology daemon
        _start_vgpu_topology_daemon
    fi

    if _assert_nvlink5_system; then
        _ensure_nvlink5_prerequisites || return 1
        echo "Starting NVIDIA fabric manager daemon for NVLink5+..."

        # Function-local so these paths do not leak into the global scope.
        local fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
        local fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
        local nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
        local nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
        /usr/bin/nvidia-fabricmanager-start.sh --mode start \
            --fm-config-file "${fm_config_file}" \
            --fm-pid-file "${fm_pid_file}" \
            --nvlsm-config-file "${nvlsm_config_file}" \
            --nvlsm-pid-file "${nvlsm_pid_file}"

    # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
    elif _assert_nvswitch_system; then
        echo "Starting NVIDIA fabric manager daemon..."
        nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
    fi
}
743+
# Persist the driver configuration digest to ${RUN_DIR}/nvidia-driver.state.
#
# Writes the value of DRIVER_CONFIG_DIGEST (an empty line when it is unset or
# empty) followed by a newline, replacing any previous file content.
#
# Globals:   RUN_DIR (read), DRIVER_CONFIG_DIGEST (read, optional)
# Outputs:   progress messages on stdout; an error message on stderr on failure
# Returns:   0 on success, 1 if the state file could not be written
_store_driver_digest() {
    local digest_file="${RUN_DIR}/nvidia-driver.state"
    local current_digest="${DRIVER_CONFIG_DIGEST:-}"

    echo "Storing driver configuration digest..."
    # printf writes the digest verbatim; a failed write must not be reported
    # as success by the message below.
    if ! printf '%s\n' "${current_digest}" > "${digest_file}"; then
        echo "Failed to store driver configuration digest at ${digest_file}" >&2
        return 1
    fi
    echo "Driver configuration digest stored at ${digest_file}"
}
751+
# Block until a termination signal arrives, then shut down and exit.
#
# Starts a background "sleep infinity" to wait on. On HUP, INT, QUIT, PIPE or
# TERM the handler prints a message and runs _shutdown; only when _shutdown
# succeeds does it kill the sleep process and exit 0, otherwise waiting
# resumes. Any EXIT trap is cleared first so that exiting from the handler
# does not trigger it.
#
# Outputs:   status messages on stdout
# Returns:   does not return; the process exits from the signal handler
_wait_for_signal() {
    echo "Done, now waiting for signal"
    sleep infinity &
    # Capture the pid once: $! would change if anything started from the
    # signal handler launched another background job.
    local sleep_pid=$!
    trap "echo 'Caught signal'; _shutdown && { kill ${sleep_pid}; exit 0; }" HUP INT QUIT PIPE TERM
    trap - EXIT
    # wait returns non-zero when interrupted by a trapped signal; keep waiting
    # unless the handler has already exited the process.
    while true; do
        wait "${sleep_pid}" || continue
    done
    # Not reached: the loop above only ends through the handler's exit.
    exit 0
}
760+
# Userspace-only install path: used when the driver is already loaded with
# the desired configuration. Runs the steps below in order without rebuilding
# or reloading anything itself.
_userspace_install() {
    echo "The NVIDIA driver is already loaded with the desired configuration, performing userspace-only install"

    # Order matters: the rootfs is unmounted before the daemons are started
    # and mounted again afterwards; the digest is stored last.
    local step
    for step in \
        _unmount_rootfs \
        _start_daemons \
        _mount_rootfs \
        _write_kernel_update_hook \
        _store_driver_digest; do
        "${step}"
    done

    echo "Userspace-only install complete"
}
770+
729771_prepare () {
730772 if [ " ${DRIVER_TYPE} " = " vgpu" ]; then
731773 _find_vgpu_driver_version || exit 1
@@ -763,8 +805,6 @@ _prepare_exclusive() {
763805 trap " echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
764806 trap " _shutdown" EXIT
765807
766- _unload_driver || exit 1
767- _unmount_rootfs
768808}
769809
770810_build () {
@@ -786,17 +826,21 @@ _load() {
786826 _mount_rootfs
787827 _write_kernel_update_hook
788828
789- echo " Done, now waiting for signal"
790- sleep infinity &
791- trap " echo 'Caught signal'; _shutdown && { kill $! ; exit 0; }" HUP INT QUIT PIPE TERM
792- trap - EXIT
793- while true ; do wait $! || continue ; done
794- exit 0
829+ _store_driver_digest
830+ _wait_for_signal
795831}
796832
797833init () {
798834 _prepare_exclusive
799835
836+ if _should_skip_kernel_module_reload; then
837+ _userspace_install
838+ _wait_for_signal
839+ fi
840+
841+ _unload_driver || exit 1
842+ _unmount_rootfs
843+
800844 _build
801845
802846 _load
@@ -811,6 +855,9 @@ build() {
# Entry point for loading the driver: runs _prepare_exclusive, unloads the
# current driver, unmounts the rootfs and then hands over to _load.
# Exits with status 1 if the current driver cannot be unloaded.
load() {
    _prepare_exclusive

    if ! _unload_driver; then
        exit 1
    fi
    _unmount_rootfs

    _load
}
816863
@@ -882,7 +929,7 @@ reload_nvidia_peermem() {
882929 fi
883930 # get any parameters provided for nvidia-peermem
884931 _get_module_params && set +o nounset
885- if chroot /run/nvidia/driver modprobe nvidia-peermem; then
932+ if chroot /run/nvidia/driver modprobe nvidia-peermem " ${NVIDIA_PEERMEM_MODULE_PARAMS[@]} " ; then
886933 if [ -f /sys/module/nvidia_peermem/refcnt ]; then
887934 echo " successfully loaded nvidia-peermem module, now waiting for signal"
888935 sleep inf
0 commit comments