diff --git a/rhel8/common.sh b/rhel8/common.sh index a41a14a12..f33d82de2 100755 --- a/rhel8/common.sh +++ b/rhel8/common.sh @@ -45,3 +45,13 @@ _gdrcopy_enabled() { fi return 1 } + +# Check if fast path should be used (driver already loaded with matching config) +# Compares current digest from DRIVER_CONFIG_DIGEST env var with stored digest +_should_skip_kernel_module_reload() { + [ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/nvidia-driver.state ] || return 1 + local current_digest="${DRIVER_CONFIG_DIGEST:-}" + [ -z "${current_digest}" ] && return 1 + local stored_digest=$(cat /run/nvidia/nvidia-driver.state 2>/dev/null || echo "") + [ "${current_digest}" = "${stored_digest}" ] +} diff --git a/rhel8/nvidia-driver b/rhel8/nvidia-driver index c48fadd70..5b19d1e7a 100755 --- a/rhel8/nvidia-driver +++ b/rhel8/nvidia-driver @@ -401,44 +401,7 @@ _load_driver() { set +o xtrace -o nounset fi - echo "Starting NVIDIA persistence daemon..." - nvidia-persistenced --persistence-mode - - if [ "${DRIVER_TYPE}" = "vgpu" ]; then - echo "Copying gridd.conf..." - cp /drivers/gridd.conf /etc/nvidia/gridd.conf - if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then - echo "Copying ClientConfigToken..." - mkdir -p /etc/nvidia/ClientConfigToken/ - cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/ - fi - - echo "Starting nvidia-gridd.." - LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd - - # Start virtual topology daemon - _start_vgpu_topology_daemon - fi - - if _assert_nvlink5_system; then - _ensure_nvlink5_prerequisites || return 1 - echo "Starting NVIDIA fabric manager daemon for NVLink5+..." - - fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg - fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid - nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf - nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid - /usr/bin/nvidia-fabricmanager-start.sh --mode start \ - --fm-config-file $fm_config_file \ - --fm-pid-file $fm_pid_file \ - --nvlsm-config-file $nvlsm_config_file \ - --nvlsm-pid-file $nvlsm_pid_file - - # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches - elif _assert_nvswitch_system; then - echo "Starting NVIDIA fabric manager daemon..." - nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg - fi + _start_daemons } # Stop persistenced and unload the kernel modules if they are currently loaded. @@ -480,6 +443,21 @@ _unload_driver() { fi fi + if [ -f /var/run/nvidia-topologyd/nvidia-topologyd.pid ]; then + echo "Stopping NVIDIA topology daemon..." + local pid=$(< /var/run/nvidia-topologyd/nvidia-topologyd.pid) + + kill -SIGTERM "${pid}" + for i in $(seq 1 50); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ $i -eq 50 ]; then + echo "Could not stop NVIDIA topology daemon" >&2 + return 1 + fi + fi + if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then echo "Stopping NVIDIA fabric manager daemon..." local pid=$(< /var/run/nvidia-fabricmanager/nv-fabricmanager.pid) @@ -570,10 +548,6 @@ _install_driver() { fi IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"} - # May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path - # /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point - # TODO: remove the -a flag. its not needed. in the new driver version, license-acceptance is implicit - #nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check --no-cc-version-check --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc ${install_args[@]+"${install_args[@]}"} } # Mount the driver rootfs into the run directory with the exception of sysfs. @@ -704,6 +678,73 @@ _start_vgpu_topology_daemon() { nvidia-topologyd } +_start_daemons() { + echo "Starting NVIDIA persistence daemon..." + nvidia-persistenced --persistence-mode + + if [ "${DRIVER_TYPE}" = "vgpu" ]; then + echo "Copying gridd.conf..." + cp /drivers/gridd.conf /etc/nvidia/gridd.conf + if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then + echo "Copying ClientConfigToken..." + mkdir -p /etc/nvidia/ClientConfigToken/ + cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/ + fi + + echo "Starting nvidia-gridd.." + LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd + + # Start virtual topology daemon + _start_vgpu_topology_daemon + fi + + if _assert_nvlink5_system; then + _ensure_nvlink5_prerequisites || return 1 + echo "Starting NVIDIA fabric manager daemon for NVLink5+..." + + fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg + fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid + nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf + nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid + /usr/bin/nvidia-fabricmanager-start.sh --mode start \ + --fm-config-file $fm_config_file \ + --fm-pid-file $fm_pid_file \ + --nvlsm-config-file $nvlsm_config_file \ + --nvlsm-pid-file $nvlsm_pid_file + + # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches + elif _assert_nvswitch_system; then + echo "Starting NVIDIA fabric manager daemon..." + nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg + fi +} + +_store_driver_digest() { + local digest_file="${RUN_DIR}/nvidia-driver.state" + echo "Storing driver configuration digest..." + echo "${DRIVER_CONFIG_DIGEST}" > "$digest_file" + echo "Driver configuration digest stored at $digest_file" +} + +_wait_for_signal() { + echo "Done, now waiting for signal" + sleep infinity & + trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM + trap - EXIT + while true; do wait $! || continue; done + exit 0 +} + +_userspace_install() { + echo "The NVIDIA driver is already loaded with the desired configuration, performing userspace-only install" + _unmount_rootfs + _start_daemons + _mount_rootfs + _write_kernel_update_hook + _store_driver_digest + echo "Userspace-only install complete" +} + _prepare() { if [ "${DRIVER_TYPE}" = "vgpu" ]; then _find_vgpu_driver_version || exit 1 @@ -740,9 +781,6 @@ _prepare_exclusive() { trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM trap "_shutdown" EXIT - - _unload_driver || exit 1 - _unmount_rootfs } _build() { @@ -763,18 +801,21 @@ _load() { _load_driver _mount_rootfs _write_kernel_update_hook - - echo "Done, now waiting for signal" - sleep infinity & - trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM - trap - EXIT - while true; do wait $! || continue; done - exit 0 + _store_driver_digest + _wait_for_signal } init() { _prepare_exclusive + if _should_skip_kernel_module_reload; then + _userspace_install + _wait_for_signal + fi + + _unload_driver || exit 1 + _unmount_rootfs + _build _load @@ -789,6 +830,9 @@ build() { load() { _prepare_exclusive + _unload_driver || exit 1 + _unmount_rootfs + _load } diff --git a/rhel8/ocp_dtk_entrypoint b/rhel8/ocp_dtk_entrypoint index 458ecd57a..c7101cec8 100755 --- a/rhel8/ocp_dtk_entrypoint +++ b/rhel8/ocp_dtk_entrypoint @@ -18,6 +18,13 @@ nv-ctr-run-with-dtk() { exec bash -x nvidia-driver init fi + if _should_skip_kernel_module_reload; then + echo "The NVIDIA driver is already loaded with the desired configuration, skipping kernel module build and proceeding with userspace-only install" + exec bash -x nvidia-driver init + fi + + echo "Fast path not detected: building driver and modules" + if [[ ! -f "$DRIVER_TOOLKIT_SHARED_DIR/dir_prepared" ]]; then cp -r \ /tmp/install.sh \ @@ -79,6 +86,19 @@ dtk-build-driver() { sleep inf fi + # Check if fast path is being used - if so, skip building and signal completion + if _should_skip_kernel_module_reload; then + echo "The NVIDIA driver is already loaded with the desired configuration, skipping build" + echo "Signaling driver_built to the main container and sleeping forever..." + touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_build_started" + touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" + while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do + sleep 5 + done + echo "WARNING: driver_built flag disappeared" + exit 0 + fi + if ! [[ -f "/lib/modules/$(uname -r)/vmlinuz" ]]; then echo "WARNING: broken Driver Toolkit image detected:" echo "- Node kernel: $(uname -r)" @@ -99,7 +119,7 @@ dtk-build-driver() { echo "NVIDIA drivers already generated, nothing to do ..." while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do - sleep 30 + sleep 5 done echo "WARNING: driver_built flag disappeared, rebuilding the drivers ..." else @@ -249,7 +269,7 @@ dtk-build-driver() { fi while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do - sleep 30 + sleep 5 done echo "WARNING: driver_built flag disappeared, restart this container" diff --git a/rhel9/common.sh b/rhel9/common.sh index a41a14a12..f33d82de2 100755 --- a/rhel9/common.sh +++ b/rhel9/common.sh @@ -45,3 +45,13 @@ _gdrcopy_enabled() { fi return 1 } + +# Check if fast path should be used (driver already loaded with matching config) +# Compares current digest from DRIVER_CONFIG_DIGEST env var with stored digest +_should_skip_kernel_module_reload() { + [ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/nvidia-driver.state ] || return 1 + local current_digest="${DRIVER_CONFIG_DIGEST:-}" + [ -z "${current_digest}" ] && return 1 + local stored_digest=$(cat /run/nvidia/nvidia-driver.state 2>/dev/null || echo "") + [ "${current_digest}" = "${stored_digest}" ] +} diff --git a/rhel9/nvidia-driver b/rhel9/nvidia-driver index d1a23154e..af626bca4 100755 --- a/rhel9/nvidia-driver +++ b/rhel9/nvidia-driver @@ -418,44 +418,7 @@ _load_driver() { set +o xtrace -o nounset fi - echo "Starting NVIDIA persistence daemon..." - nvidia-persistenced --persistence-mode - - if [ "${DRIVER_TYPE}" = "vgpu" ]; then - echo "Copying gridd.conf..." - cp /drivers/gridd.conf /etc/nvidia/gridd.conf - if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then - echo "Copying ClientConfigToken..." - mkdir -p /etc/nvidia/ClientConfigToken/ - cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/ - fi - - echo "Starting nvidia-gridd.." - LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd - - # Start virtual topology daemon - _start_vgpu_topology_daemon - fi - - if _assert_nvlink5_system; then - _ensure_nvlink5_prerequisites || return 1 - echo "Starting NVIDIA fabric manager daemon for NVLink5+..." - - fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg - fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid - nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf - nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid - /usr/bin/nvidia-fabricmanager-start.sh --mode start \ - --fm-config-file $fm_config_file \ - --fm-pid-file $fm_pid_file \ - --nvlsm-config-file $nvlsm_config_file \ - --nvlsm-pid-file $nvlsm_pid_file - - # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches - elif _assert_nvswitch_system; then - echo "Starting NVIDIA fabric manager daemon..." - nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg - fi + _start_daemons } # Stop persistenced and unload the kernel modules if they are currently loaded. @@ -497,6 +460,21 @@ _unload_driver() { fi fi + if [ -f /var/run/nvidia-topologyd/nvidia-topologyd.pid ]; then + echo "Stopping NVIDIA topology daemon..." + local pid=$(< /var/run/nvidia-topologyd/nvidia-topologyd.pid) + + kill -SIGTERM "${pid}" + for i in $(seq 1 50); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ $i -eq 50 ]; then + echo "Could not stop NVIDIA topology daemon" >&2 + return 1 + fi + fi + if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then echo "Stopping NVIDIA fabric manager daemon..." local pid=$(< /var/run/nvidia-fabricmanager/nv-fabricmanager.pid) @@ -587,10 +565,6 @@ _install_driver() { fi IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"} - # May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path - # /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point - # TODO: remove the -a flag. its not needed. in the new driver version, license-acceptance is implicit - #nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check --no-cc-version-check --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc ${install_args[@]+"${install_args[@]}"} } # Mount the driver rootfs into the run directory with the exception of sysfs. @@ -721,6 +695,73 @@ _start_vgpu_topology_daemon() { nvidia-topologyd } +_start_daemons() { + echo "Starting NVIDIA persistence daemon..." + nvidia-persistenced --persistence-mode + + if [ "${DRIVER_TYPE}" = "vgpu" ]; then + echo "Copying gridd.conf..." + cp /drivers/gridd.conf /etc/nvidia/gridd.conf + if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then + echo "Copying ClientConfigToken..." + mkdir -p /etc/nvidia/ClientConfigToken/ + cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/ + fi + + echo "Starting nvidia-gridd.." + LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd + + # Start virtual topology daemon + _start_vgpu_topology_daemon + fi + + if _assert_nvlink5_system; then + _ensure_nvlink5_prerequisites || return 1 + echo "Starting NVIDIA fabric manager daemon for NVLink5+..." + + fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg + fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid + nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf + nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid + /usr/bin/nvidia-fabricmanager-start.sh --mode start \ + --fm-config-file $fm_config_file \ + --fm-pid-file $fm_pid_file \ + --nvlsm-config-file $nvlsm_config_file \ + --nvlsm-pid-file $nvlsm_pid_file + + # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches + elif _assert_nvswitch_system; then + echo "Starting NVIDIA fabric manager daemon..." + nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg + fi +} + +_store_driver_digest() { + local digest_file="${RUN_DIR}/nvidia-driver.state" + echo "Storing driver configuration digest..." + echo "${DRIVER_CONFIG_DIGEST}" > "$digest_file" + echo "Driver configuration digest stored at $digest_file" +} + +_wait_for_signal() { + echo "Done, now waiting for signal" + sleep infinity & + trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM + trap - EXIT + while true; do wait $! || continue; done + exit 0 +} + +_userspace_install() { + echo "The NVIDIA driver is already loaded with the desired configuration, performing userspace-only install" + _unmount_rootfs + _start_daemons + _mount_rootfs + _write_kernel_update_hook + _store_driver_digest + echo "Userspace-only install complete" +} + _prepare() { if [ "${DRIVER_TYPE}" = "vgpu" ]; then _find_vgpu_driver_version || exit 1 @@ -757,9 +798,6 @@ _prepare_exclusive() { trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM trap "_shutdown" EXIT - - _unload_driver || exit 1 - _unmount_rootfs } _build() { @@ -780,18 +818,21 @@ _load() { _load_driver _mount_rootfs _write_kernel_update_hook - - echo "Done, now waiting for signal" - sleep infinity & - trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM - trap - EXIT - while true; do wait $! || continue; done - exit 0 + _store_driver_digest + _wait_for_signal } init() { _prepare_exclusive + if _should_skip_kernel_module_reload; then + _userspace_install + _wait_for_signal + fi + + _unload_driver || exit 1 + _unmount_rootfs + _build _load @@ -806,6 +847,9 @@ build() { load() { _prepare_exclusive + _unload_driver || exit 1 + _unmount_rootfs + _load } diff --git a/rhel9/ocp_dtk_entrypoint b/rhel9/ocp_dtk_entrypoint index 53b7ab07d..ff876384f 100755 --- a/rhel9/ocp_dtk_entrypoint +++ b/rhel9/ocp_dtk_entrypoint @@ -18,6 +18,13 @@ nv-ctr-run-with-dtk() { exec bash -x nvidia-driver init fi + if _should_skip_kernel_module_reload; then + echo "The NVIDIA driver is already loaded with the desired configuration, skipping kernel module build and proceeding with userspace-only install" + exec bash -x nvidia-driver init + fi + + echo "Fast path not detected: building driver and modules" + if [[ ! -f "$DRIVER_TOOLKIT_SHARED_DIR/dir_prepared" ]]; then cp -r \ /tmp/install.sh \ @@ -82,6 +89,19 @@ dtk-build-driver() { sleep inf fi + # Check if fast path is being used - if so, skip building and signal completion + if _should_skip_kernel_module_reload; then + echo "The NVIDIA driver is already loaded with the desired configuration, skipping build" + echo "Signaling driver_built to the main container and sleeping forever..." + touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_build_started" + touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" + while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do + sleep 5 + done + echo "WARNING: driver_built flag disappeared" + exit 0 + fi + if ! [[ -f "/lib/modules/$(uname -r)/vmlinuz" ]]; then echo "WARNING: broken Driver Toolkit image detected:" echo "- Node kernel: $(uname -r)" @@ -102,7 +122,7 @@ dtk-build-driver() { echo "NVIDIA drivers already generated, nothing to do ..." while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do - sleep 30 + sleep 5 done echo "WARNING: driver_built flag disappeared, rebuilding the drivers ..." else @@ -256,7 +276,7 @@ dtk-build-driver() { fi while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do - sleep 30 + sleep 5 done echo "WARNING: driver_built flag disappeared, restart this container" diff --git a/ubuntu22.04/nvidia-driver b/ubuntu22.04/nvidia-driver index 25169fcfb..89e74a840 100755 --- a/ubuntu22.04/nvidia-driver +++ b/ubuntu22.04/nvidia-driver @@ -364,44 +364,7 @@ _load_driver() { set +o xtrace -o nounset fi - echo "Starting NVIDIA persistence daemon..." - nvidia-persistenced --persistence-mode - - if [ "${DRIVER_TYPE}" = "vgpu" ]; then - echo "Copying gridd.conf..." - cp /drivers/gridd.conf /etc/nvidia/gridd.conf - if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then - echo "Copying ClientConfigToken..." - mkdir -p /etc/nvidia/ClientConfigToken/ - cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/ - fi - - echo "Starting nvidia-gridd.." - LD_LIBRARY_PATH=/usr/lib/$DRIVER_ARCH-linux-gnu/nvidia/gridd nvidia-gridd - - # Start virtual topology daemon - _start_vgpu_topology_daemon - fi - - if _assert_nvlink5_system; then - _ensure_nvlink5_prerequisites || return 1 - echo "Starting NVIDIA fabric manager daemon for NVLink5+..." - - fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg - fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid - nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf - nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid - /usr/bin/nvidia-fabricmanager-start.sh --mode start \ - --fm-config-file $fm_config_file \ - --fm-pid-file $fm_pid_file \ - --nvlsm-config-file $nvlsm_config_file \ - --nvlsm-pid-file $nvlsm_pid_file - - # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches - elif _assert_nvswitch_system; then - echo "Starting NVIDIA fabric manager daemon..." - nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg - fi + _start_daemons return 0 } @@ -445,6 +408,21 @@ _unload_driver() { fi fi + if [ -f /var/run/nvidia-topologyd/nvidia-topologyd.pid ]; then + echo "Stopping NVIDIA topology daemon..." + local pid=$(< /var/run/nvidia-topologyd/nvidia-topologyd.pid) + + kill -SIGTERM "${pid}" + for i in $(seq 1 50); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ $i -eq 50 ]; then + echo "Could not stop NVIDIA topology daemon" >&2 + return 1 + fi + fi + if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then echo "Stopping NVIDIA fabric manager daemon..." local pid=$(< /var/run/nvidia-fabricmanager/nv-fabricmanager.pid) @@ -658,35 +636,112 @@ _start_vgpu_topology_daemon() { nvidia-topologyd } -init() { +_start_daemons() { + echo "Starting NVIDIA persistence daemon..." + nvidia-persistenced --persistence-mode + if [ "${DRIVER_TYPE}" = "vgpu" ]; then - _find_vgpu_driver_version || exit 1 + echo "Copying gridd.conf..." + cp /drivers/gridd.conf /etc/nvidia/gridd.conf + if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then + echo "Copying ClientConfigToken..." + mkdir -p /etc/nvidia/ClientConfigToken/ + cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/ + fi + + echo "Starting nvidia-gridd.." + LD_LIBRARY_PATH=/usr/lib/$DRIVER_ARCH-linux-gnu/nvidia/gridd nvidia-gridd + + # Start virtual topology daemon + _start_vgpu_topology_daemon fi - # Install the userspace components - sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \ - cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \ - ./nvidia-installer --silent \ - --no-kernel-module \ - --no-nouveau-check \ - --no-nvidia-modprobe \ - --no-rpms \ - --no-backup \ - --no-check-for-alternate-installs \ - --no-libglx-indirect \ - --no-install-libglvnd \ - --x-prefix=/tmp/null \ - --x-module-path=/tmp/null \ - --x-library-path=/tmp/null \ - --x-sysconfig-path=/tmp/null - - # Determine the kernel module type - _resolve_kernel_type || exit 1 + if _assert_nvlink5_system; then + _ensure_nvlink5_prerequisites || return 1 + echo "Starting NVIDIA fabric manager daemon for NVLink5+..." + + fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg + fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid + nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf + nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid + /usr/bin/nvidia-fabricmanager-start.sh --mode start \ + --fm-config-file $fm_config_file \ + --fm-pid-file $fm_pid_file \ + --nvlsm-config-file $nvlsm_config_file \ + --nvlsm-pid-file $nvlsm_pid_file + + # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches + elif _assert_nvswitch_system; then + echo "Starting NVIDIA fabric manager daemon..." + nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg + fi +} + +_read_stored_digest() { + cat /run/nvidia/nvidia-driver.state 2>/dev/null || echo "" +} + +_get_current_digest() { + echo "${DRIVER_CONFIG_DIGEST:-}" +} + +# Check if fast path should be used (driver already loaded with matching config) +# Compares current digest from DRIVER_CONFIG_DIGEST env var with stored digest +_should_skip_kernel_module_reload() { + [ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/nvidia-driver.state ] || return 1 + local current_digest="${DRIVER_CONFIG_DIGEST:-}" + [ -z "${current_digest}" ] && return 1 + local stored_digest=$(cat /run/nvidia/nvidia-driver.state 2>/dev/null || echo "") + [ "${current_digest}" = "${stored_digest}" ] +} + +_store_driver_digest() { + local digest_file="${RUN_DIR}/nvidia-driver.state" + echo "Storing driver configuration digest..." + echo "${DRIVER_CONFIG_DIGEST}" > "$digest_file" + echo "Driver configuration digest stored at $digest_file" +} + +_install_userspace_components() { + echo "Installing userspace components (libraries and binaries)..." + cd /drivers + sh NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}.run -x + cd NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION} + ./nvidia-installer \ + --silent \ + --no-kernel-module \ + --no-nouveau-check \ + --no-nvidia-modprobe \ + --no-rpms \ + --no-backup \ + --no-check-for-alternate-installs \ + --no-libglx-indirect \ + --no-install-libglvnd \ + --x-prefix=/tmp/null \ + --x-module-path=/tmp/null \ + --x-library-path=/tmp/null \ + --x-sysconfig-path=/tmp/null +} + +_move_kernel_module_sources() { + mkdir -p /usr/src/nvidia-${DRIVER_VERSION} + mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION}/ + sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest +} + +_wait_for_signal() { + echo "Done, now waiting for signal" + sleep infinity & + trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM + trap - EXIT + while true; do wait $! || continue; done + exit 0 +} - # Copy the kernel module sources - mkdir -p /usr/src/nvidia-${DRIVER_VERSION} && \ - mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION} && \ - sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest +init() { + if [ "${DRIVER_TYPE}" = "vgpu" ]; then + _find_vgpu_driver_version || exit 1 + fi echo -e "\n========== NVIDIA Software Installer ==========\n" echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n" @@ -701,8 +756,26 @@ init() { trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM trap "_shutdown" EXIT + if _should_skip_kernel_module_reload; then + echo "The NVIDIA driver is already loaded with the desired configuration, performing userspace-only install" + _unmount_rootfs + _install_userspace_components + _resolve_kernel_type || exit 1 + _move_kernel_module_sources + _start_daemons + _mount_rootfs + _write_kernel_update_hook + _store_driver_digest + echo "Userspace-only install complete" + _wait_for_signal + fi + + # Full install path: unload existing driver and perform complete installation _unload_driver || exit 1 _unmount_rootfs + _install_userspace_components + _resolve_kernel_type || exit 1 + _move_kernel_module_sources if _kernel_requires_package; then _update_ca_certificates @@ -711,8 +784,6 @@ init() { _resolve_kernel_version || exit 1 _install_prerequisites _create_driver_package - #_remove_prerequisites - #_cleanup_package_cache fi _create_module_params_conf @@ -720,13 +791,8 @@ init() { _load_driver || exit 1 _mount_rootfs _write_kernel_update_hook - - echo "Done, now waiting for signal" - sleep infinity & - trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM - trap - EXIT - while true; do wait $! || continue; done - exit 0 + _store_driver_digest + _wait_for_signal } update() { diff --git a/ubuntu24.04/nvidia-driver b/ubuntu24.04/nvidia-driver index e14801fb1..3ad39667a 100755 --- a/ubuntu24.04/nvidia-driver +++ b/ubuntu24.04/nvidia-driver @@ -305,44 +305,7 @@ _load_driver() { set +o xtrace -o nounset fi - echo "Starting NVIDIA persistence daemon..." - nvidia-persistenced --persistence-mode - - if [ "${DRIVER_TYPE}" = "vgpu" ]; then - echo "Copying gridd.conf..." - cp /drivers/gridd.conf /etc/nvidia/gridd.conf - if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then - echo "Copying ClientConfigToken..." - mkdir -p /etc/nvidia/ClientConfigToken/ - cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/ - fi - - echo "Starting nvidia-gridd.." - LD_LIBRARY_PATH=/usr/lib/$DRIVER_ARCH-linux-gnu/nvidia/gridd nvidia-gridd - - # Start virtual topology daemon - _start_vgpu_topology_daemon - fi - - if _assert_nvlink5_system; then - _ensure_nvlink5_prerequisites || return 1 - echo "Starting NVIDIA fabric manager daemon for NVLink5+..." - - fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg - fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid - nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf - nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid - /usr/bin/nvidia-fabricmanager-start.sh --mode start \ - --fm-config-file $fm_config_file \ - --fm-pid-file $fm_pid_file \ - --nvlsm-config-file $nvlsm_config_file \ - --nvlsm-pid-file $nvlsm_pid_file - - # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches - elif _assert_nvswitch_system; then - echo "Starting NVIDIA fabric manager daemon..." - nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg - fi + _start_daemons return 0 } @@ -386,6 +349,21 @@ _unload_driver() { fi fi + if [ -f /var/run/nvidia-topologyd/nvidia-topologyd.pid ]; then + echo "Stopping NVIDIA topology daemon..." + local pid=$(< /var/run/nvidia-topologyd/nvidia-topologyd.pid) + + kill -SIGTERM "${pid}" + for i in $(seq 1 50); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ $i -eq 50 ]; then + echo "Could not stop NVIDIA topology daemon" >&2 + return 1 + fi + fi + if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then echo "Stopping NVIDIA fabric manager daemon..." local pid=$(< /var/run/nvidia-fabricmanager/nv-fabricmanager.pid) @@ -586,6 +564,131 @@ _start_vgpu_topology_daemon() { nvidia-topologyd } +_start_daemons() { + echo "Starting NVIDIA persistence daemon..." + nvidia-persistenced --persistence-mode + + if [ "${DRIVER_TYPE}" = "vgpu" ]; then + echo "Copying gridd.conf..." + cp /drivers/gridd.conf /etc/nvidia/gridd.conf + if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then + echo "Copying ClientConfigToken..." + mkdir -p /etc/nvidia/ClientConfigToken/ + cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/ + fi + + echo "Starting nvidia-gridd.." + LD_LIBRARY_PATH=/usr/lib/$DRIVER_ARCH-linux-gnu/nvidia/gridd nvidia-gridd + + # Start virtual topology daemon + _start_vgpu_topology_daemon + fi + + if _assert_nvlink5_system; then + _ensure_nvlink5_prerequisites || return 1 + echo "Starting NVIDIA fabric manager daemon for NVLink5+..." + + fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg + fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid + nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf + nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid + /usr/bin/nvidia-fabricmanager-start.sh --mode start \ + --fm-config-file $fm_config_file \ + --fm-pid-file $fm_pid_file \ + --nvlsm-config-file $nvlsm_config_file \ + --nvlsm-pid-file $nvlsm_pid_file + + # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches + elif _assert_nvswitch_system; then + echo "Starting NVIDIA fabric manager daemon..." + nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg + fi +} + +_read_stored_digest() { + cat /run/nvidia/nvidia-driver.state 2>/dev/null || echo "" +} + +_get_current_digest() { + echo "${DRIVER_CONFIG_DIGEST:-}" +} + +# Check if fast path should be used (driver already loaded with matching config) +# Compares current digest from DRIVER_CONFIG_DIGEST env var with stored digest +_should_skip_kernel_module_reload() { + [ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/nvidia-driver.state ] || return 1 + local current_digest="${DRIVER_CONFIG_DIGEST:-}" + [ -z "${current_digest}" ] && return 1 + local stored_digest=$(cat /run/nvidia/nvidia-driver.state 2>/dev/null || echo "") + [ "${current_digest}" = "${stored_digest}" ] +} + +_store_driver_digest() { + local digest_file="${RUN_DIR}/nvidia-driver.state" + echo "Storing driver configuration digest..." + echo "${DRIVER_CONFIG_DIGEST}" > "$digest_file" + echo "Driver configuration digest stored at $digest_file" +} + +_install_userspace_components() { + echo "Installing userspace components (libraries and binaries)..." + cd /drivers + sh NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}.run -x + cd NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION} + ./nvidia-installer \ + --silent \ + --no-kernel-module \ + --no-nouveau-check \ + --no-nvidia-modprobe \ + --no-rpms \ + --no-backup \ + --no-check-for-alternate-installs \ + --no-libglx-indirect \ + --no-install-libglvnd \ + --x-prefix=/tmp/null \ + --x-module-path=/tmp/null \ + --x-library-path=/tmp/null \ + --x-sysconfig-path=/tmp/null +} + +_resolve_kernel_type() { + if [ "${KERNEL_MODULE_TYPE}" == "proprietary" ]; then + KERNEL_TYPE=kernel + elif [ "${KERNEL_MODULE_TYPE}" == "open" ]; then + KERNEL_TYPE=kernel-open + elif [ "${KERNEL_MODULE_TYPE}" == "auto" ]; then + kernel_module_type=$(nvidia-installer --print-recommended-kernel-module-type) + if [ $? -ne 0 ]; then + echo "failed to retrieve the recommended kernel module type from nvidia-installer, falling back to using the driver branch" + _resolve_kernel_type_from_driver_branch + return 0 + fi + [[ "${kernel_module_type}" == "open" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel + else + echo "invalid value for the KERNEL_MODULE_TYPE variable: ${KERNEL_MODULE_TYPE}" + return 1 + fi +} + +_resolve_kernel_type_from_driver_branch() { + [[ "${DRIVER_BRANCH}" -lt 560 ]] && KERNEL_TYPE=kernel || KERNEL_TYPE=kernel-open +} + +_move_kernel_module_sources() { + mkdir -p /usr/src/nvidia-${DRIVER_VERSION} + mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION}/ + sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest +} + +_wait_for_signal() { + echo "Done, now waiting for signal" + sleep infinity & + trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM + trap - EXIT + while true; do wait $! || continue; done + exit 0 +} + init() { if [ "${DRIVER_TYPE}" = "vgpu" ]; then _find_vgpu_driver_version || exit 1 @@ -604,6 +707,21 @@ init() { trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM trap "_shutdown" EXIT + if _should_skip_kernel_module_reload; then + echo "The NVIDIA driver is already loaded with the desired configuration, performing userspace-only install" + _unmount_rootfs + _install_userspace_components + _resolve_kernel_type || exit 1 + _move_kernel_module_sources + _start_daemons + _mount_rootfs + _write_kernel_update_hook + _store_driver_digest + echo "Userspace-only install complete" + _wait_for_signal + fi + + # Full install path: unload existing driver and perform complete installation _unload_driver || exit 1 _unmount_rootfs @@ -613,21 +731,14 @@ init() { _resolve_kernel_version || exit 1 _install_prerequisites _link_ofa_kernel - #_remove_prerequisites - #_cleanup_package_cache _create_module_params_conf _install_driver _load_driver || exit 1 _mount_rootfs _write_kernel_update_hook - - echo "Done, now waiting for signal" - sleep infinity & - trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM - trap - EXIT - while true; do wait $! || continue; done - exit 0 + _store_driver_digest + _wait_for_signal } # Wait for MOFED drivers to be loaded and load nvidia-peermem whenever it gets unloaded during MOFED driver updates