@@ -8,6 +8,7 @@ PID_FILE=${RUN_DIR}/${0##*/}.pid
88DRIVER_VERSION=${DRIVER_VERSION:? " Missing DRIVER_VERSION env" }
99KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver
1010NUM_VGPU_DEVICES=0
11+ DRIVER_TYPE=" ${DRIVER_TYPE:- passthrough} "
1112GPU_DIRECT_RDMA_ENABLED=" ${GPU_DIRECT_RDMA_ENABLED:- false} "
1213USE_HOST_MOFED=" ${USE_HOST_MOFED:- false} "
1314NVIDIA_MODULE_PARAMS=()
@@ -344,44 +345,7 @@ _load_driver() {
344345 set +o xtrace -o nounset
345346 fi
346347
347- echo " Starting NVIDIA persistence daemon..."
348- nvidia-persistenced --persistence-mode
349-
350- if [ " ${DRIVER_TYPE} " = " vgpu" ]; then
351- echo " Copying gridd.conf..."
352- cp /drivers/gridd.conf /etc/nvidia/gridd.conf
353- if [ " ${VGPU_LICENSE_SERVER_TYPE} " = " NLS" ]; then
354- echo " Copying ClientConfigToken..."
355- mkdir -p /etc/nvidia/ClientConfigToken/
356- cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
357- fi
358-
359- echo " Starting nvidia-gridd.."
360- LD_LIBRARY_PATH=/usr/lib/$DRIVER_ARCH -linux-gnu/nvidia/gridd nvidia-gridd
361-
362- # Start virtual topology daemon
363- _start_vgpu_topology_daemon
364- fi
365-
366- if _assert_nvlink5_system; then
367- _ensure_nvlink5_prerequisites || return 1
368- echo " Starting NVIDIA fabric manager daemon for NVLink5+..."
369-
370- fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
371- fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
372- nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
373- nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
374- /usr/bin/nvidia-fabricmanager-start.sh --mode start \
375- --fm-config-file $fm_config_file \
376- --fm-pid-file $fm_pid_file \
377- --nvlsm-config-file $nvlsm_config_file \
378- --nvlsm-pid-file $nvlsm_pid_file
379-
380- # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
381- elif _assert_nvswitch_system; then
382- echo " Starting NVIDIA fabric manager daemon..."
383- nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
384- fi
348+ _start_daemons
385349
386350 return 0
387351}
@@ -425,6 +389,21 @@ _unload_driver() {
425389 fi
426390 fi
427391
392+ if [ -f /var/run/nvidia-topologyd/nvidia-topologyd.pid ]; then
393+ echo " Stopping NVIDIA topology daemon..."
394+ local pid=$( < /var/run/nvidia-topologyd/nvidia-topologyd.pid)
395+
396+ kill -SIGTERM " ${pid} "
397+ for i in $( seq 1 50) ; do
398+ kill -0 " ${pid} " 2> /dev/null || break
399+ sleep 0.1
400+ done
401+ if [ $i -eq 50 ]; then
402+ echo " Could not stop NVIDIA topology daemon" >&2
403+ return 1
404+ fi
405+ fi
406+
428407 if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then
429408 echo " Stopping NVIDIA fabric manager daemon..."
430409 local pid=$( < /var/run/nvidia-fabricmanager/nv-fabricmanager.pid)
@@ -639,16 +618,44 @@ _start_vgpu_topology_daemon() {
639618 nvidia-topologyd
640619}
641620
642- _ensure_persistenced () {
643- local pid_file=/var/run/nvidia-persistenced/nvidia-persistenced.pid pid
644- if pid=$( < " ${pid_file} " 2> /dev/null) && [ -n " ${pid} " ] && kill -0 " ${pid} " 2> /dev/null; then
645- return 0
621+ _start_daemons () {
622+ echo " Starting NVIDIA persistence daemon..."
623+ nvidia-persistenced --persistence-mode
624+
625+ if [ " ${DRIVER_TYPE} " = " vgpu" ]; then
626+ echo " Copying gridd.conf..."
627+ cp /drivers/gridd.conf /etc/nvidia/gridd.conf
628+ if [ " ${VGPU_LICENSE_SERVER_TYPE} " = " NLS" ]; then
629+ echo " Copying ClientConfigToken..."
630+ mkdir -p /etc/nvidia/ClientConfigToken/
631+ cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
632+ fi
633+
634+ echo " Starting nvidia-gridd.."
635+ LD_LIBRARY_PATH=/usr/lib/$DRIVER_ARCH -linux-gnu/nvidia/gridd nvidia-gridd
636+
637+ # Start virtual topology daemon
638+ _start_vgpu_topology_daemon
646639 fi
647640
648- if command -v nvidia-persistenced > /dev/null 2>&1 ; then
649- nvidia-persistenced --persistence-mode || true
650- else
651- echo " nvidia-persistenced not found; continuing without persistence"
641+ if _assert_nvlink5_system; then
642+ _ensure_nvlink5_prerequisites || return 1
643+ echo " Starting NVIDIA fabric manager daemon for NVLink5+..."
644+
645+ fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
646+ fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
647+ nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
648+ nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
649+ /usr/bin/nvidia-fabricmanager-start.sh --mode start \
650+ --fm-config-file $fm_config_file \
651+ --fm-pid-file $fm_pid_file \
652+ --nvlsm-config-file $nvlsm_config_file \
653+ --nvlsm-pid-file $nvlsm_pid_file
654+
655+ # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
656+ elif _assert_nvswitch_system; then
657+ echo " Starting NVIDIA fabric manager daemon..."
658+ nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
652659 fi
653660}
654661
@@ -660,7 +667,7 @@ _read_conf_file() {
660667_build_driver_config () {
661668 cat << EOF
662669DRIVER_VERSION=${DRIVER_VERSION}
663- DRIVER_TYPE=${DRIVER_TYPE:- passthrough }
670+ DRIVER_TYPE=${DRIVER_TYPE}
664671KERNEL_VERSION=$( uname -r)
665672GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED}
666673USE_HOST_MOFED=${USE_HOST_MOFED}
673680}
674681
675682_store_driver_config () {
676- local config_file=" /run/nvidia /driver-config.state"
683+ local config_file=" ${RUN_DIR} /driver-config.state"
677684 echo " Storing driver configuration state..."
678685 _build_driver_config > " $config_file "
679686 echo " Driver configuration stored at $config_file "
@@ -700,9 +707,9 @@ _install_userspace_components() {
700707 --x-sysconfig-path=/tmp/null
701708}
702709
703- _copy_kernel_module_sources () {
710+ _move_kernel_module_sources () {
704711 mkdir -p /usr/src/nvidia-${DRIVER_VERSION}
705- cp -r LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION} /
712+ mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION} /
706713 sed ' 9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION} /.manifest
707714}
708715
@@ -743,14 +750,11 @@ init() {
743750 if [ " ${current_config} " = " ${stored_config} " ]; then
744751 echo " Detected matching loaded driver & config (${DRIVER_VERSION} ); performing userspace-only install"
745752 _unmount_rootfs
746- _update_package_cache
747- _resolve_kernel_version || exit 1
748- _install_prerequisites
749753 _install_userspace_components
750754 _resolve_kernel_type || exit 1
751- _copy_kernel_module_sources
755+ _move_kernel_module_sources
752756 _mount_rootfs
753- _ensure_persistenced
757+ _start_daemons
754758 _write_kernel_update_hook
755759 _store_driver_config
756760 echo " Userspace-only install complete"
@@ -764,10 +768,7 @@ init() {
764768 _install_userspace_components
765769 _resolve_kernel_type || exit 1
766770
767- # Move (not copy) kernel module sources since this is the full install path
768- mkdir -p /usr/src/nvidia-${DRIVER_VERSION}
769- mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION} /
770- sed ' 9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION} /.manifest
771+ _move_kernel_module_sources
771772
772773 if _kernel_requires_package; then
773774 _update_ca_certificates
0 commit comments