@@ -640,62 +640,79 @@ _start_vgpu_topology_daemon() {
640640}
641641
#######################################
# Make sure the NVIDIA persistence daemon is running (best effort).
# Arguments:
#   $1 - optional path to the daemon's PID file; defaults to
#        /var/run/nvidia-persistenced/nvidia-persistenced.pid
# Outputs:
#   A notice on stdout when the binary is missing; a warning on stderr
#   when the daemon could not be started.
# Returns:
#   Always 0; a missing or failing daemon is not treated as fatal.
#######################################
_ensure_persistenced() {
    local pid_file=${1:-/var/run/nvidia-persistenced/nvidia-persistenced.pid}
    local pid=""

    # Read the PID with `read` rather than `$(< file 2>/dev/null)`: the
    # `$(<file)` shortcut only applies when the substitution holds exactly one
    # redirection, so the two-redirection form expands to an empty string.
    if [ -r "${pid_file}" ]; then
        read -r pid < "${pid_file}" || true
    fi

    # Only a plain decimal PID is handed to kill; anything else is ignored.
    case "${pid}" in
        '' | *[!0-9]*) pid="" ;;
    esac

    # kill -0 delivers no signal; it only checks that the process exists.
    if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
        return 0
    fi

    if command -v nvidia-persistenced >/dev/null 2>&1; then
        # Deliberately best effort: keep going even if the daemon fails to start.
        nvidia-persistenced --persistence-mode \
            || echo "nvidia-persistenced failed to start; continuing without persistence" >&2
    else
        echo "nvidia-persistenced not found; continuing without persistence"
    fi
    return 0
}
647654
648- if command -v nvidia-persistenced > /dev/null 2>&1 ; then
649- nvidia-persistenced --persistence-mode || true
650- else
651- echo " nvidia-persistenced not found; continuing without persistence"
652- fi
#######################################
# Print the contents of a module-parameter conf file on a single line.
# Arguments:
#   $1 - path to the conf file
# Outputs:
#   The file contents on stdout with every newline replaced by a space
#   (so a newline-terminated file yields a trailing space). Prints nothing
#   when the file does not exist.
# Returns:
#   0 when the file is absent (an absent conf file is a normal case, and a
#   non-zero status would abort callers running under `set -e`); otherwise
#   the exit status of tr.
#######################################
_read_conf_file() {
    local file="${1:-}"

    if [ ! -f "${file}" ]; then
        return 0
    fi
    tr '\n' ' ' < "${file}"
}
654659
#######################################
# Emit the driver configuration snapshot as KEY=VALUE lines.
# Globals (read):
#   DRIVER_VERSION, DRIVER_TYPE (falls back to "passthrough"),
#   GPU_DIRECT_RDMA_ENABLED, USE_HOST_MOFED, KERNEL_MODULE_TYPE
# Outputs:
#   Ten lines on stdout: the globals above, the running kernel release from
#   `uname -r`, and the flattened contents of the four module conf files
#   under /drivers as returned by _read_conf_file.
#######################################
_build_driver_config() {
    # printf repeats the format for each argument, producing one line apiece.
    printf '%s\n' \
        "DRIVER_VERSION=${DRIVER_VERSION}" \
        "DRIVER_TYPE=${DRIVER_TYPE:-passthrough}" \
        "KERNEL_VERSION=$(uname -r)" \
        "GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED}" \
        "USE_HOST_MOFED=${USE_HOST_MOFED}" \
        "KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE}" \
        "NVIDIA_MODULE_PARAMS=$(_read_conf_file /drivers/nvidia.conf)" \
        "NVIDIA_UVM_MODULE_PARAMS=$(_read_conf_file /drivers/nvidia-uvm.conf)" \
        "NVIDIA_MODESET_MODULE_PARAMS=$(_read_conf_file /drivers/nvidia-modeset.conf)" \
        "NVIDIA_PEERMEM_MODULE_PARAMS=$(_read_conf_file /drivers/nvidia-peermem.conf)"
}
682674
683- # Append config file contents directly
684- for conf_file in nvidia.conf nvidia-uvm.conf nvidia-modeset.conf nvidia-peermem.conf; do
685- if [ -f " /drivers/$conf_file " ]; then
686- config=" ${config}
687- $( cat " /drivers/$conf_file " ) "
688- fi
689- done
#######################################
# Persist the current driver configuration snapshot.
# Outputs:
#   Progress messages on stdout; an error message on stderr on failure.
#   Writes the output of _build_driver_config to
#   /run/nvidia/driver-config.state.
# Returns:
#   0 on success, 1 if the snapshot could not be written.
#######################################
_store_driver_config() {
    local config_file="/run/nvidia/driver-config.state"
    local tmp_file="${config_file}.tmp.$$"

    echo "Storing driver configuration state..."
    # Write to a sibling temp file and rename it into place so that an
    # interrupted write never leaves a truncated state file behind for a
    # later comparison against _build_driver_config output.
    if ! _build_driver_config > "${tmp_file}" \
        || ! mv -f "${tmp_file}" "${config_file}"; then
        rm -f "${tmp_file}"
        echo "Failed to store driver configuration at $config_file" >&2
        return 1
    fi
    echo "Driver configuration stored at $config_file"
}
690681
691- echo " $config "
#######################################
# Extract the NVIDIA .run package under /drivers and install only its
# userspace parts (nvidia-installer is run with --no-kernel-module).
# Globals (read):
#   DRIVER_ARCH, DRIVER_VERSION
# Side effects:
#   Changes the shell's working directory to
#   /drivers/NVIDIA-Linux-<arch>-<version> and leaves it there; the steps
#   that follow this call use paths relative to that directory.
# Returns:
#   1 if /drivers cannot be entered, extraction fails, or the extracted
#   directory cannot be entered; otherwise the status of nvidia-installer.
#######################################
_install_userspace_components() {
    local installer_dir="NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}"

    echo "Installing userspace components (libraries and binaries)..."
    # Stop at the first failing step so the installer is never launched from
    # the wrong directory or against a partial extraction.
    cd /drivers || return 1
    sh "${installer_dir}.run" -x || return 1
    cd "${installer_dir}" || return 1
    ./nvidia-installer \
        --silent \
        --no-kernel-module \
        --no-nouveau-check \
        --no-nvidia-modprobe \
        --no-rpms \
        --no-backup \
        --no-check-for-alternate-installs \
        --no-libglx-indirect \
        --no-install-libglvnd \
        --x-prefix=/tmp/null \
        --x-module-path=/tmp/null \
        --x-library-path=/tmp/null \
        --x-sysconfig-path=/tmp/null
}
693702
694- _store_driver_config () {
695- local config_file=" /run/nvidia/driver-config.state"
696- echo " Storing driver configuration state..."
697- _build_driver_config > " $config_file "
698- echo " Driver configuration stored at $config_file "
#######################################
# Copy the kernel module sources from the current directory into
# /usr/src/nvidia-<version>, leaving the originals in place.
# Globals (read):
#   DRIVER_VERSION, KERNEL_TYPE
# Notes:
#   Uses relative paths (LICENSE, mkprecompiled, ${KERNEL_TYPE}, .manifest),
#   so it must be called from the extracted installer directory.
#   NOTE(review): KERNEL_TYPE is quoted on the assumption that it names a
#   single directory - confirm against _resolve_kernel_type.
# Returns:
#   0 on success; non-zero as soon as any step fails.
#######################################
_copy_kernel_module_sources() {
    local dest="/usr/src/nvidia-${DRIVER_VERSION}"

    # Chain the steps so a failed mkdir/cp does not fall through to writing
    # a manifest for sources that were never copied.
    mkdir -p "${dest}" \
        && cp -r LICENSE mkprecompiled "${KERNEL_TYPE}" "${dest}/" \
        && sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > "${dest}/.manifest"
}
708+
#######################################
# Park the script until a termination signal arrives.
# Starts a background `sleep infinity`, replaces the HUP/INT/QUIT/PIPE/TERM
# handlers with one that runs _shutdown and, if that succeeds, kills the
# sleeper and exits 0, then clears the EXIT trap and waits forever.
# Outputs:
#   "Done, now waiting for signal" on stdout.
# Returns:
#   Does not return; the process leaves via the signal handler.
#######################################
_wait_for_signal() {
    echo "Done, now waiting for signal"

    sleep infinity &

    # Double quotes are intentional: $! is expanded right now, so the handler
    # carries the sleeper's PID baked into its text.
    trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
    trap - EXIT

    # wait is interrupted (non-zero) whenever a trapped signal is delivered;
    # if the handler did not exit, simply go back to waiting.
    while :; do
        wait $! || :
    done

    # Not reached; the loop above never terminates on its own.
    exit 0
}
700717
701718init () {
@@ -716,96 +733,41 @@ init() {
716733 trap " echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
717734 trap " _shutdown" EXIT
718735
719- # Fast path: if the NVIDIA kernel modules are already loaded and driver config matches,
720- # skip kernel module build/load but install userspace components.
721- # This handles non-clean restarts where modules are in use and can't be unloaded.
722- if [ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/driver-config.state ]; then
723- current_config=$( _build_driver_config)
724- stored_config=$( cat /run/nvidia/driver-config.state)
725-
726- if [ " ${current_config} " = " ${stored_config} " ]; then
727- echo " Detected matching loaded driver & config (${DRIVER_VERSION} ); performing userspace-only install"
728-
729- # Skip kernel module unload since they're already loaded with correct version
730- # Unmount any existing rootfs
731- _unmount_rootfs
732-
733- # Update package cache for userspace install
734- _update_package_cache
735- _resolve_kernel_version || exit 1
736- _install_prerequisites
737-
738- # Install userspace components only (libraries, binaries)
739- # The --no-kernel-module flag tells nvidia-installer to skip kernel module build/install
740- echo " Installing userspace components (libraries and binaries)..."
741- cd /drivers
742- # Extract the driver first
743- sh NVIDIA-Linux-${DRIVER_ARCH} -${DRIVER_VERSION} .run -x
744- cd NVIDIA-Linux-${DRIVER_ARCH} -${DRIVER_VERSION}
745- ./nvidia-installer \
746- --silent \
747- --no-kernel-module \
748- --no-nouveau-check \
749- --no-nvidia-modprobe \
750- --no-drm \
751- --no-peermem
752-
753- # Determine the kernel module type
754- _resolve_kernel_type || exit 1
755-
756- # Copy the kernel module sources for sidecar containers (gdrcopy, nvidia-fs, etc.)
757- mkdir -p /usr/src/nvidia-${DRIVER_VERSION} && \
758- cp -r LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION} / && \
759- sed ' 9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION} /.manifest
760-
761- # Mount the driver rootfs to make components available
762- _mount_rootfs
763-
764- # Ensure persistence daemon is running
765- _ensure_persistenced
766-
767- # Write kernel update hook
768- _write_kernel_update_hook
769-
770- # Store driver configuration
771- _store_driver_config
772-
773- echo " Userspace-only install complete, now waiting for signal"
774- sleep infinity &
775- trap " echo 'Caught signal'; _shutdown && { kill $! ; exit 0; }" HUP INT QUIT PIPE TERM
776- trap - EXIT
777- while true ; do wait $! || continue ; done
778- exit 0
779- fi
780- fi
736+ # Fast path: if NVIDIA kernel modules are already loaded and config matches,
737+ # skip kernel module build/load and only reinstall userspace components.
738+ # This handles non-clean restarts where modules are in use and can't be unloaded.
739+ if [ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/driver-config.state ]; then
740+ current_config=$( _build_driver_config)
741+ stored_config=$( cat /run/nvidia/driver-config.state)
742+
743+ if [ " ${current_config} " = " ${stored_config} " ]; then
744+ echo " Detected matching loaded driver & config (${DRIVER_VERSION} ); performing userspace-only install"
745+ _unmount_rootfs
746+ _update_package_cache
747+ _resolve_kernel_version || exit 1
748+ _install_prerequisites
749+ _install_userspace_components
750+ _resolve_kernel_type || exit 1
751+ _copy_kernel_module_sources
752+ _mount_rootfs
753+ _ensure_persistenced
754+ _write_kernel_update_hook
755+ _store_driver_config
756+ echo " Userspace-only install complete"
757+ _wait_for_signal
758+ fi
759+ fi
781760
761+ # Full install path: unload existing driver and perform complete installation
782762 _unload_driver || exit 1
783763 _unmount_rootfs
784-
785- # Install the userspace components
786- sh NVIDIA-Linux-$DRIVER_ARCH -$DRIVER_VERSION .run -x && \
787- cd NVIDIA-Linux-$DRIVER_ARCH -$DRIVER_VERSION && \
788- ./nvidia-installer --silent \
789- --no-kernel-module \
790- --no-nouveau-check \
791- --no-nvidia-modprobe \
792- --no-rpms \
793- --no-backup \
794- --no-check-for-alternate-installs \
795- --no-libglx-indirect \
796- --no-install-libglvnd \
797- --x-prefix=/tmp/null \
798- --x-module-path=/tmp/null \
799- --x-library-path=/tmp/null \
800- --x-sysconfig-path=/tmp/null
801-
802- # Determine the kernel module type
764+ _install_userspace_components
803765 _resolve_kernel_type || exit 1
804766
805- # Copy the kernel module sources
806- mkdir -p /usr/src/nvidia-${DRIVER_VERSION} && \
807- mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION} && \
808- sed ' 9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION} /.manifest
767+ # Move (not copy) kernel module sources since this is the full install path
768+ mkdir -p /usr/src/nvidia-${DRIVER_VERSION}
769+ mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION} /
770+ sed ' 9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION} /.manifest
809771
810772 if _kernel_requires_package; then
811773 _update_ca_certificates
@@ -814,22 +776,14 @@ init() {
814776 _resolve_kernel_version || exit 1
815777 _install_prerequisites
816778 _create_driver_package
817- # _remove_prerequisites
818- # _cleanup_package_cache
819779 fi
820780
821781 _install_driver
822782 _load_driver || exit 1
823783 _mount_rootfs
824784 _write_kernel_update_hook
825785 _store_driver_config
826-
827- echo " Done, now waiting for signal"
828- sleep infinity &
829- trap " echo 'Caught signal'; _shutdown && { kill $! ; exit 0; }" HUP INT QUIT PIPE TERM
830- trap - EXIT
831- while true ; do wait $! || continue ; done
832- exit 0
786+ _wait_for_signal
833787}
834788
835789update () {
0 commit comments