Skip to content

Commit 2087341

Browse files
refactor: address PR review comments for driver install scripts
Signed-off-by: Karthik Vetrivel <[email protected]>
1 parent b660caa commit 2087341

File tree

3 files changed: 87 additions (+87) and 86 deletions (-86).

rhel9/common.sh

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -53,14 +53,15 @@ _read_conf_file() {
5353
}
5454

5555
# Build driver configuration for state comparison
56+
# Note: Variables are expected to be set by the sourcing script (nvidia-driver)
5657
_build_driver_config() {
5758
cat <<EOF
5859
DRIVER_VERSION=${DRIVER_VERSION}
59-
DRIVER_TYPE=${DRIVER_TYPE:-passthrough}
60+
DRIVER_TYPE=${DRIVER_TYPE}
6061
KERNEL_VERSION=$(uname -r)
61-
GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED:-false}
62-
USE_HOST_MOFED=${USE_HOST_MOFED:-false}
63-
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}
62+
GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED}
63+
USE_HOST_MOFED=${USE_HOST_MOFED}
64+
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE}
6465
NVIDIA_MODULE_PARAMS=$(_read_conf_file /drivers/nvidia.conf)
6566
NVIDIA_UVM_MODULE_PARAMS=$(_read_conf_file /drivers/nvidia-uvm.conf)
6667
NVIDIA_MODESET_MODULE_PARAMS=$(_read_conf_file /drivers/nvidia-modeset.conf)
@@ -70,8 +71,8 @@ EOF
7071

7172
# Check if fast path should be used (driver already loaded with matching config)
7273
_should_use_fast_path() {
73-
[ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/driver-config.state ] || return 1
74+
[ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/nvidia-driver.state ] || return 1
7475
local current_config=$(_build_driver_config)
75-
local stored_config=$(cat /run/nvidia/driver-config.state 2>/dev/null || echo "")
76+
local stored_config=$(cat /run/nvidia/nvidia-driver.state 2>/dev/null || echo "")
7677
[ "${current_config}" = "${stored_config}" ]
7778
}

rhel9/nvidia-driver

Lines changed: 18 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@ PID_FILE=${RUN_DIR}/${0##*/}.pid
88
DRIVER_VERSION=${DRIVER_VERSION:?"Missing DRIVER_VERSION env"}
99
KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver
1010
NUM_VGPU_DEVICES=0
11+
DRIVER_TYPE="${DRIVER_TYPE:-passthrough}"
1112
GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}"
1213
USE_HOST_MOFED="${USE_HOST_MOFED:-false}"
1314
NVIDIA_MODULE_PARAMS=()
@@ -212,10 +213,7 @@ _create_driver_package() (
212213
local nvidia_modeset_sign_args=""
213214
local nvidia_uvm_sign_args=""
214215

215-
# Skip cleanup trap for DTK builds - modules are copied after this function returns
216-
if [ "${PACKAGE_TAG:-}" != "builtin" ]; then
217-
trap "make -s -j ${MAX_THREADS} SYSSRC=/lib/modules/${KERNEL_VERSION}/build clean > /dev/null" EXIT
218-
fi
216+
trap "make -s -j ${MAX_THREADS} SYSSRC=/lib/modules/${KERNEL_VERSION}/build clean > /dev/null" EXIT
219217

220218
echo "Compiling NVIDIA driver kernel modules..."
221219
cd /usr/src/nvidia-${DRIVER_VERSION}/${KERNEL_TYPE}
@@ -444,6 +442,21 @@ _unload_driver() {
444442
fi
445443
fi
446444

445+
if [ -f /var/run/nvidia-topologyd/nvidia-topologyd.pid ]; then
446+
echo "Stopping NVIDIA topology daemon..."
447+
local pid=$(< /var/run/nvidia-topologyd/nvidia-topologyd.pid)
448+
449+
kill -SIGTERM "${pid}"
450+
for i in $(seq 1 50); do
451+
kill -0 "${pid}" 2> /dev/null || break
452+
sleep 0.1
453+
done
454+
if [ $i -eq 50 ]; then
455+
echo "Could not stop NVIDIA topology daemon" >&2
456+
return 1
457+
fi
458+
fi
459+
447460
if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then
448461
echo "Stopping NVIDIA fabric manager daemon..."
449462
local pid=$(< /var/run/nvidia-fabricmanager/nv-fabricmanager.pid)
@@ -664,19 +677,6 @@ _start_vgpu_topology_daemon() {
664677
nvidia-topologyd
665678
}
666679

667-
_ensure_persistence() {
668-
local pid_file=/var/run/nvidia-persistenced/nvidia-persistenced.pid pid
669-
if pid=$(<"${pid_file}" 2>/dev/null) && [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
670-
return 0
671-
fi
672-
673-
if command -v nvidia-persistenced >/dev/null 2>&1; then
674-
nvidia-persistenced --persistence-mode || true
675-
else
676-
echo "nvidia-persistenced not found; continuing without persistence"
677-
fi
678-
}
679-
680680
_start_daemons() {
681681
echo "Starting NVIDIA persistence daemon..."
682682
nvidia-persistenced --persistence-mode
@@ -719,7 +719,7 @@ _start_daemons() {
719719
}
720720

721721
_store_driver_config() {
722-
local config_file="/run/nvidia/driver-config.state"
722+
local config_file="${RUN_DIR}/nvidia-driver.state"
723723
echo "Storing driver configuration state..."
724724
_build_driver_config > "$config_file"
725725
echo "Driver configuration stored at $config_file"
@@ -737,7 +737,6 @@ _wait_for_signal() {
737737
_userspace_only_install() {
738738
echo "Detected matching loaded driver & config (${DRIVER_VERSION}); performing userspace-only install"
739739
_unmount_rootfs
740-
_update_package_cache
741740

742741
cd /drivers
743742
[ ! -d "NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}" ] && sh NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}.run -x

ubuntu22.04/nvidia-driver

Lines changed: 62 additions & 61 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@ PID_FILE=${RUN_DIR}/${0##*/}.pid
88
DRIVER_VERSION=${DRIVER_VERSION:?"Missing DRIVER_VERSION env"}
99
KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver
1010
NUM_VGPU_DEVICES=0
11+
DRIVER_TYPE="${DRIVER_TYPE:-passthrough}"
1112
GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}"
1213
USE_HOST_MOFED="${USE_HOST_MOFED:-false}"
1314
NVIDIA_MODULE_PARAMS=()
@@ -344,44 +345,7 @@ _load_driver() {
344345
set +o xtrace -o nounset
345346
fi
346347

347-
echo "Starting NVIDIA persistence daemon..."
348-
nvidia-persistenced --persistence-mode
349-
350-
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
351-
echo "Copying gridd.conf..."
352-
cp /drivers/gridd.conf /etc/nvidia/gridd.conf
353-
if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
354-
echo "Copying ClientConfigToken..."
355-
mkdir -p /etc/nvidia/ClientConfigToken/
356-
cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
357-
fi
358-
359-
echo "Starting nvidia-gridd.."
360-
LD_LIBRARY_PATH=/usr/lib/$DRIVER_ARCH-linux-gnu/nvidia/gridd nvidia-gridd
361-
362-
# Start virtual topology daemon
363-
_start_vgpu_topology_daemon
364-
fi
365-
366-
if _assert_nvlink5_system; then
367-
_ensure_nvlink5_prerequisites || return 1
368-
echo "Starting NVIDIA fabric manager daemon for NVLink5+..."
369-
370-
fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
371-
fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
372-
nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
373-
nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
374-
/usr/bin/nvidia-fabricmanager-start.sh --mode start \
375-
--fm-config-file $fm_config_file \
376-
--fm-pid-file $fm_pid_file \
377-
--nvlsm-config-file $nvlsm_config_file \
378-
--nvlsm-pid-file $nvlsm_pid_file
379-
380-
# If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
381-
elif _assert_nvswitch_system; then
382-
echo "Starting NVIDIA fabric manager daemon..."
383-
nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
384-
fi
348+
_start_daemons
385349

386350
return 0
387351
}
@@ -425,6 +389,21 @@ _unload_driver() {
425389
fi
426390
fi
427391

392+
if [ -f /var/run/nvidia-topologyd/nvidia-topologyd.pid ]; then
393+
echo "Stopping NVIDIA topology daemon..."
394+
local pid=$(< /var/run/nvidia-topologyd/nvidia-topologyd.pid)
395+
396+
kill -SIGTERM "${pid}"
397+
for i in $(seq 1 50); do
398+
kill -0 "${pid}" 2> /dev/null || break
399+
sleep 0.1
400+
done
401+
if [ $i -eq 50 ]; then
402+
echo "Could not stop NVIDIA topology daemon" >&2
403+
return 1
404+
fi
405+
fi
406+
428407
if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then
429408
echo "Stopping NVIDIA fabric manager daemon..."
430409
local pid=$(< /var/run/nvidia-fabricmanager/nv-fabricmanager.pid)
@@ -639,16 +618,44 @@ _start_vgpu_topology_daemon() {
639618
nvidia-topologyd
640619
}
641620

642-
_ensure_persistenced() {
643-
local pid_file=/var/run/nvidia-persistenced/nvidia-persistenced.pid pid
644-
if pid=$(<"${pid_file}" 2>/dev/null) && [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
645-
return 0
621+
_start_daemons() {
622+
echo "Starting NVIDIA persistence daemon..."
623+
nvidia-persistenced --persistence-mode
624+
625+
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
626+
echo "Copying gridd.conf..."
627+
cp /drivers/gridd.conf /etc/nvidia/gridd.conf
628+
if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
629+
echo "Copying ClientConfigToken..."
630+
mkdir -p /etc/nvidia/ClientConfigToken/
631+
cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
632+
fi
633+
634+
echo "Starting nvidia-gridd.."
635+
LD_LIBRARY_PATH=/usr/lib/$DRIVER_ARCH-linux-gnu/nvidia/gridd nvidia-gridd
636+
637+
# Start virtual topology daemon
638+
_start_vgpu_topology_daemon
646639
fi
647640

648-
if command -v nvidia-persistenced >/dev/null 2>&1; then
649-
nvidia-persistenced --persistence-mode || true
650-
else
651-
echo "nvidia-persistenced not found; continuing without persistence"
641+
if _assert_nvlink5_system; then
642+
_ensure_nvlink5_prerequisites || return 1
643+
echo "Starting NVIDIA fabric manager daemon for NVLink5+..."
644+
645+
fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
646+
fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
647+
nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
648+
nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
649+
/usr/bin/nvidia-fabricmanager-start.sh --mode start \
650+
--fm-config-file $fm_config_file \
651+
--fm-pid-file $fm_pid_file \
652+
--nvlsm-config-file $nvlsm_config_file \
653+
--nvlsm-pid-file $nvlsm_pid_file
654+
655+
# If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
656+
elif _assert_nvswitch_system; then
657+
echo "Starting NVIDIA fabric manager daemon..."
658+
nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
652659
fi
653660
}
654661

@@ -660,7 +667,7 @@ _read_conf_file() {
660667
_build_driver_config() {
661668
cat <<EOF
662669
DRIVER_VERSION=${DRIVER_VERSION}
663-
DRIVER_TYPE=${DRIVER_TYPE:-passthrough}
670+
DRIVER_TYPE=${DRIVER_TYPE}
664671
KERNEL_VERSION=$(uname -r)
665672
GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED}
666673
USE_HOST_MOFED=${USE_HOST_MOFED}
@@ -673,7 +680,7 @@ EOF
673680
}
674681

675682
_store_driver_config() {
676-
local config_file="/run/nvidia/driver-config.state"
683+
local config_file="${RUN_DIR}/nvidia-driver.state"
677684
echo "Storing driver configuration state..."
678685
_build_driver_config > "$config_file"
679686
echo "Driver configuration stored at $config_file"
@@ -700,9 +707,9 @@ _install_userspace_components() {
700707
--x-sysconfig-path=/tmp/null
701708
}
702709

703-
_copy_kernel_module_sources() {
710+
_move_kernel_module_sources() {
704711
mkdir -p /usr/src/nvidia-${DRIVER_VERSION}
705-
cp -r LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION}/
712+
mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION}/
706713
sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest
707714
}
708715

@@ -736,21 +743,18 @@ init() {
736743
# Fast path: if NVIDIA kernel modules are already loaded and config matches,
737744
# skip kernel module build/load and only reinstall userspace components.
738745
# This handles non-clean restarts where modules are in use and can't be unloaded.
739-
if [ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/driver-config.state ]; then
746+
if [ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/nvidia-driver.state ]; then
740747
current_config=$(_build_driver_config)
741-
stored_config=$(cat /run/nvidia/driver-config.state)
748+
stored_config=$(cat /run/nvidia/nvidia-driver.state)
742749

743750
if [ "${current_config}" = "${stored_config}" ]; then
744751
echo "Detected matching loaded driver & config (${DRIVER_VERSION}); performing userspace-only install"
745752
_unmount_rootfs
746-
_update_package_cache
747-
_resolve_kernel_version || exit 1
748-
_install_prerequisites
749753
_install_userspace_components
750754
_resolve_kernel_type || exit 1
751-
_copy_kernel_module_sources
755+
_move_kernel_module_sources
752756
_mount_rootfs
753-
_ensure_persistenced
757+
_start_daemons
754758
_write_kernel_update_hook
755759
_store_driver_config
756760
echo "Userspace-only install complete"
@@ -764,10 +768,7 @@ init() {
764768
_install_userspace_components
765769
_resolve_kernel_type || exit 1
766770

767-
# Move (not copy) kernel module sources since this is the full install path
768-
mkdir -p /usr/src/nvidia-${DRIVER_VERSION}
769-
mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION}/
770-
sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest
771+
_move_kernel_module_sources
771772

772773
if _kernel_requires_package; then
773774
_update_ca_certificates

0 commit comments

Comments
 (0)