-
Notifications
You must be signed in to change notification settings - Fork 71
Add fast-track to skip uninstall/install if NVIDIA driver modules present #454
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
7c178f6
a8dbb15
ba7e6de
0a036ed
d4a6dff
b660caa
2087341
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,12 +8,13 @@ PID_FILE=${RUN_DIR}/${0##*/}.pid | |
| DRIVER_VERSION=${DRIVER_VERSION:?"Missing DRIVER_VERSION env"} | ||
| KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver | ||
| NUM_VGPU_DEVICES=0 | ||
| GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" | ||
| USE_HOST_MOFED="${USE_HOST_MOFED:-false}" | ||
| NVIDIA_MODULE_PARAMS=() | ||
| NVIDIA_UVM_MODULE_PARAMS=() | ||
| NVIDIA_MODESET_MODULE_PARAMS=() | ||
| NVIDIA_PEERMEM_MODULE_PARAMS=() | ||
| TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"} | ||
| USE_HOST_MOFED="${USE_HOST_MOFED:-false}" | ||
| DNF_RELEASEVER=${DNF_RELEASEVER:-""} | ||
| RHEL_VERSION=${RHEL_VERSION:-""} | ||
| RHEL_MAJOR_VERSION=9 | ||
|
|
@@ -211,7 +212,10 @@ _create_driver_package() ( | |
| local nvidia_modeset_sign_args="" | ||
| local nvidia_uvm_sign_args="" | ||
|
|
||
| trap "make -s -j ${MAX_THREADS} SYSSRC=/lib/modules/${KERNEL_VERSION}/build clean > /dev/null" EXIT | ||
| # Skip cleanup trap for DTK builds - modules are copied after this function returns | ||
| if [ "${PACKAGE_TAG:-}" != "builtin" ]; then | ||
|
||
| trap "make -s -j ${MAX_THREADS} SYSSRC=/lib/modules/${KERNEL_VERSION}/build clean > /dev/null" EXIT | ||
| fi | ||
|
|
||
| echo "Compiling NVIDIA driver kernel modules..." | ||
| cd /usr/src/nvidia-${DRIVER_VERSION}/${KERNEL_TYPE} | ||
|
|
@@ -566,11 +570,7 @@ _install_driver() { | |
| install_args+=("--skip-module-load") | ||
| fi | ||
|
|
||
| IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"} | ||
| # May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path | ||
| # /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point | ||
| # TODO: remove the -a flag. its not needed. in the new driver version, license-acceptance is implicit | ||
| #nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check --no-cc-version-check --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc ${install_args[@]+"${install_args[@]}"} | ||
| IGNORE_CC_MISMATCH=1 nvidia-installer --silent --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"} | ||
| } | ||
|
|
||
| # Mount the driver rootfs into the run directory with the exception of sysfs. | ||
|
|
@@ -701,6 +701,114 @@ _start_vgpu_topology_daemon() { | |
| nvidia-topologyd | ||
| } | ||
|
|
||
# Make sure the nvidia-persistenced daemon is running (best effort).
#
# Arguments:
#   $1 - optional path to the daemon's PID file; defaults to
#        /var/run/nvidia-persistenced/nvidia-persistenced.pid
# Outputs:
#   A notice on stdout when the nvidia-persistenced binary is not available.
# Returns:
#   0 always; a failure to start the daemon is deliberately tolerated.
_ensure_persistence() {
    local pid_file="${1:-/var/run/nvidia-persistenced/nvidia-persistenced.pid}"
    local pid=""

    # Read the PID only when the file is readable. The earlier form,
    # pid=$(<"${pid_file}" 2>/dev/null), combines the "$(<file)" shortcut with
    # a second redirection; not every bash release treats that as a file read,
    # in which case pid stays empty and the daemon is relaunched on each call.
    if [ -r "${pid_file}" ]; then
        # read returns non-zero when the file lacks a trailing newline even
        # though pid has been assigned, hence the "|| true".
        read -r pid < "${pid_file}" || true
    fi

    # Only probe strictly numeric PIDs so garbage in the file can never be
    # interpreted by kill as an option or a process group.
    case "${pid}" in
        '' | *[!0-9]*)
            ;;
        *)
            if kill -0 "${pid}" 2>/dev/null; then
                return 0
            fi
            ;;
    esac

    if command -v nvidia-persistenced >/dev/null 2>&1; then
        nvidia-persistenced --persistence-mode || true
    else
        echo "nvidia-persistenced not found; continuing without persistence"
    fi
}
|
|
||
# Print the content of a module parameter file on a single line, with every
# newline replaced by a space. Prints nothing when the file does not exist.
#
# Arguments:
#   $1 - path to the .conf file
_flatten_module_conf() {
    if [ -f "$1" ]; then
        tr '\n' ' ' < "$1"
    fi
}

# Print a textual snapshot of the settings that describe the driver install:
# driver/kernel versions, RDMA/MOFED/module-type switches, the flattened
# module parameters, and finally the raw content of each parameter file.
#
# Globals:
#   DRIVER_VERSION (read), GPU_DIRECT_RDMA_ENABLED, USE_HOST_MOFED,
#   KERNEL_MODULE_TYPE (read, defaulting to false/false/auto)
# Arguments:
#   $1 - optional directory holding nvidia*.conf files; defaults to /drivers
# Outputs:
#   The snapshot on stdout.
# Returns:
#   0 on success, 1 if a parameter file or the kernel version cannot be read.
_build_driver_config() {
    local conf_dir="${1:-/drivers}"
    local nvidia_params nvidia_uvm_params nvidia_modeset_params nvidia_peermem_params
    local kernel_version conf_file config

    # Declaration and assignment are kept apart so a failing read is not
    # hidden behind the exit status of "local".
    nvidia_params=$(_flatten_module_conf "${conf_dir}/nvidia.conf") || return 1
    nvidia_uvm_params=$(_flatten_module_conf "${conf_dir}/nvidia-uvm.conf") || return 1
    nvidia_modeset_params=$(_flatten_module_conf "${conf_dir}/nvidia-modeset.conf") || return 1
    nvidia_peermem_params=$(_flatten_module_conf "${conf_dir}/nvidia-peermem.conf") || return 1
    kernel_version=$(uname -r) || return 1

    config="DRIVER_VERSION=${DRIVER_VERSION}
KERNEL_VERSION=${kernel_version}
GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED:-false}
USE_HOST_MOFED=${USE_HOST_MOFED:-false}
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}
NVIDIA_MODULE_PARAMS=${nvidia_params}
NVIDIA_UVM_MODULE_PARAMS=${nvidia_uvm_params}
NVIDIA_MODESET_MODULE_PARAMS=${nvidia_modeset_params}
NVIDIA_PEERMEM_MODULE_PARAMS=${nvidia_peermem_params}"

    # Append config file contents directly
    for conf_file in nvidia.conf nvidia-uvm.conf nvidia-modeset.conf nvidia-peermem.conf; do
        if [ -f "${conf_dir}/${conf_file}" ]; then
            config="${config}
$(cat "${conf_dir}/${conf_file}")"
        fi
    done

    printf '%s\n' "${config}"
}
|
|
||
# Persist the current driver configuration snapshot so a later start can
# compare against it.
#
# The snapshot is written to a temporary file next to the target and moved
# into place only once it has been produced successfully and is non-empty.
# Redirecting straight into the state file would truncate it before the
# snapshot is built, leaving an empty state behind on failure.
#
# Arguments:
#   $1 - optional path of the state file; defaults to
#        /run/nvidia/driver-config.state
# Outputs:
#   Progress messages on stdout, failures on stderr.
# Returns:
#   0 on success, 1 if the snapshot could not be built or stored (the
#   previous state file, if any, is left untouched in that case).
_store_driver_config() {
    local config_file="${1:-/run/nvidia/driver-config.state}"
    local tmp_file="${config_file}.tmp.$$"

    echo "Storing driver configuration state..."

    if ! mkdir -p "$(dirname "${config_file}")"; then
        echo "Failed to create directory for ${config_file}" >&2
        return 1
    fi

    # -s rejects an empty snapshot, which would otherwise compare equal to
    # any other empty snapshot later on.
    if ! _build_driver_config > "${tmp_file}" || [ ! -s "${tmp_file}" ]; then
        rm -f "${tmp_file}"
        echo "Failed to build driver configuration state" >&2
        return 1
    fi

    if ! mv -f "${tmp_file}" "${config_file}"; then
        rm -f "${tmp_file}"
        echo "Failed to store driver configuration at ${config_file}" >&2
        return 1
    fi

    echo "Driver configuration stored at $config_file"
}
|
|
||
# Decide whether the kernel-module uninstall/install cycle can be skipped.
#
# The fast path is taken only when the nvidia module's refcnt file exists,
# a stored configuration snapshot exists and is non-empty, and the snapshot
# produced by _build_driver_config right now is identical to the stored one.
#
# Arguments:
#   $1 - optional path of the stored state file; defaults to
#        /run/nvidia/driver-config.state
#   $2 - optional path of the module refcnt file; defaults to
#        /sys/module/nvidia/refcnt
# Returns:
#   0 if the fast path may be used, 1 otherwise (including when the current
#   or stored configuration cannot be read).
_should_use_fast_path() {
    local state_file="${1:-/run/nvidia/driver-config.state}"
    local refcnt_file="${2:-/sys/module/nvidia/refcnt}"
    local current_config stored_config

    if [ ! -f "${refcnt_file}" ] || [ ! -f "${state_file}" ]; then
        return 1
    fi

    # Assign separately from "local" so failures are visible; a masked
    # failure yields an empty string on both sides, which would compare
    # equal and wrongly select the fast path.
    current_config=$(_build_driver_config) || return 1
    stored_config=$(cat "${state_file}" 2>/dev/null) || return 1
    [ -n "${stored_config}" ] || return 1

    [ "${current_config}" = "${stored_config}" ]
}
|
|
||
# Install only the userspace part of the driver (libraries and binaries),
# leaving the already-loaded kernel modules untouched.
#
# Steps: unmount the driver rootfs, refresh the package cache, extract the
# .run package under /drivers if needed, run the bundled nvidia-installer
# with --no-kernel-module, copy the kernel module sources to
# /usr/src/nvidia-${DRIVER_VERSION} when absent, then remount the rootfs,
# ensure persistence, write the kernel update hook and store the config.
#
# Globals:
#   DRIVER_ARCH, DRIVER_VERSION (read); ACCEPT_LICENSE (read, optional);
#   KERNEL_TYPE (read after _resolve_kernel_type has run)
# Side effects:
#   Changes the working directory to the extracted package directory.
#   Exits the script with status 1 if a required step fails.
_userspace_only_install() {
    local pkg_dir="NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}"
    local src_dir="/usr/src/nvidia-${DRIVER_VERSION}"
    # Kept as an array (like install_args in _install_driver) so the options
    # are passed as distinct words without relying on word splitting.
    local -a install_args=(
        --silent
        --no-kernel-module
        --no-nouveau-check
        --no-nvidia-modprobe
        --no-drm
        --no-peermem
        --ui=none
    )

    echo "Detected matching loaded driver & config (${DRIVER_VERSION}); performing userspace-only install"

    _unmount_rootfs
    _update_package_cache

    # Skip kernel-related steps for userspace-only install
    # KERNEL_VERSION is already set from uname -r, no need to resolve from yum
    # Kernel headers/devel/modules are not needed for userspace-only install

    cd /drivers || exit 1
    if [ ! -d "${pkg_dir}" ]; then
        sh "${pkg_dir}.run" -x || exit 1
    fi
    cd "${pkg_dir}" || exit 1

    echo "DEBUG: Current directory: $(pwd)"
    echo "DEBUG: Checking for ./nvidia-installer:"
    ls -la ./nvidia-installer 2>&1 || echo " ./nvidia-installer NOT FOUND"
    echo "DEBUG: Checking PATH for nvidia-installer:"
    # command -v is a shell builtin; "which" may not be installed in the image.
    command -v nvidia-installer 2>&1 || echo " nvidia-installer NOT in PATH"

    echo "Installing userspace components (libraries and binaries)..."
    # ":-" keeps this safe if ACCEPT_LICENSE is unset while nounset is active.
    if [ "${ACCEPT_LICENSE:-}" = "yes" ]; then
        install_args+=(--accept-license)
    fi
    IGNORE_CC_MISMATCH=1 ./nvidia-installer "${install_args[@]}" || exit 1

    # Copy kernel module sources if not already present (needed for other containers)
    if [ ! -d "${src_dir}" ]; then
        _resolve_kernel_type || exit 1
        mkdir -p "${src_dir}" || exit 1
        cp -r LICENSE mkprecompiled "${KERNEL_TYPE}" "${src_dir}/" || exit 1
        # From line 9 of the manifest onwards, keep only the lines that
        # start with "kernel" or "LICENSE".
        sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > "${src_dir}/.manifest" || exit 1
    fi

    _mount_rootfs
    _ensure_persistence
    _write_kernel_update_hook
    _store_driver_config

    echo "Userspace-only install complete"
}
|
|
||
| _prepare() { | ||
| if [ "${DRIVER_TYPE}" = "vgpu" ]; then | ||
| _find_vgpu_driver_version || exit 1 | ||
|
|
@@ -758,6 +866,7 @@ _load() { | |
| _load_driver | ||
| _mount_rootfs | ||
| _write_kernel_update_hook | ||
| _store_driver_config | ||
|
|
||
| echo "Done, now waiting for signal" | ||
| sleep infinity & | ||
|
|
@@ -768,7 +877,49 @@ _load() { | |
| } | ||
|
|
||
| init() { | ||
| _prepare_exclusive | ||
| if [ "${DRIVER_TYPE}" = "vgpu" ]; then | ||
| _find_vgpu_driver_version || exit 1 | ||
| fi | ||
|
|
||
| echo -e "\n========== NVIDIA Software Installer ==========\n" | ||
| echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n" | ||
|
|
||
| exec 3> ${PID_FILE} | ||
| if ! flock -n 3; then | ||
| echo "An instance of the NVIDIA driver is already running, aborting" | ||
| exit 1 | ||
| fi | ||
| echo $$ >&3 | ||
|
|
||
| trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM | ||
| trap "_shutdown" EXIT | ||
|
Comment on lines
+827
to
+842
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To make this a bit easier to read, what if we reverted this change and continued to just call `_prepare_exclusive`? Feel free to push back if this complicates things. |
||
|
|
||
| if _should_use_fast_path; then | ||
| _userspace_only_install | ||
|
|
||
| echo "Userspace-only install complete, now waiting for signal" | ||
| sleep infinity & | ||
| trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM | ||
| trap - EXIT | ||
| while true; do wait $! || continue; done | ||
| exit 0 | ||
| fi | ||
|
|
||
| _unload_driver || exit 1 | ||
| _unmount_rootfs | ||
|
|
||
| # Install the userspace components and copy the kernel module sources. | ||
| sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \ | ||
| cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \ | ||
| sh /tmp/install.sh nvinstall | ||
|
|
||
| # Determine the kernel module type | ||
| _resolve_kernel_type || exit 1 | ||
|
|
||
| # Copy the kernel module sources | ||
| mkdir -p /usr/src/nvidia-$DRIVER_VERSION && \ | ||
| mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-$DRIVER_VERSION && \ | ||
| sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-$DRIVER_VERSION/.manifest | ||
|
|
||
| _build | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The permissions of this file have been changed.