Skip to content

Commit 0fb8195

Browse files
Refactor fast-path logic in OpenShift
Signed-off-by: Karthik Vetrivel <kvetrivel@nvidia.com>
1 parent 0a036ed commit 0fb8195

File tree

3 files changed

+102
-126
lines changed

3 files changed

+102
-126
lines changed

rhel9/common.sh

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,51 @@ _gdrcopy_enabled() {
4545
fi
4646
return 1
4747
}
48+
# Build driver configuration for state comparison.
#
# Prints a newline-separated KEY=VALUE snapshot of the driver settings,
# followed by the raw contents of every module-parameter conf file that
# exists. _should_use_fast_path compares this text against the stored state
# file, so the layout of the output must stay stable.
#
# Globals (read): DRIVER_VERSION, DRIVER_TYPE, GPU_DIRECT_RDMA_ENABLED,
#                 USE_HOST_MOFED, KERNEL_MODULE_TYPE
# Arguments:      $1 - directory holding the nvidia*.conf files
#                      (optional, defaults to /drivers)
# Outputs:        the configuration snapshot on stdout
_build_driver_config() {
    local conf_dir="${1:-/drivers}"
    local nvidia_params="" nvidia_uvm_params="" nvidia_modeset_params="" nvidia_peermem_params=""
    # Loop variable and result are declared up front so nothing leaks into the
    # caller's scope and command substitutions are not hidden behind `local`.
    local conf_file config

    # Read module parameters from conf files, flattening each file onto one line
    if [ -f "${conf_dir}/nvidia.conf" ]; then
        nvidia_params=$(tr '\n' ' ' < "${conf_dir}/nvidia.conf")
    fi
    if [ -f "${conf_dir}/nvidia-uvm.conf" ]; then
        nvidia_uvm_params=$(tr '\n' ' ' < "${conf_dir}/nvidia-uvm.conf")
    fi
    if [ -f "${conf_dir}/nvidia-modeset.conf" ]; then
        nvidia_modeset_params=$(tr '\n' ' ' < "${conf_dir}/nvidia-modeset.conf")
    fi
    if [ -f "${conf_dir}/nvidia-peermem.conf" ]; then
        nvidia_peermem_params=$(tr '\n' ' ' < "${conf_dir}/nvidia-peermem.conf")
    fi

    config="DRIVER_VERSION=${DRIVER_VERSION}
DRIVER_TYPE=${DRIVER_TYPE:-passthrough}
KERNEL_VERSION=$(uname -r)
GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED:-false}
USE_HOST_MOFED=${USE_HOST_MOFED:-false}
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}
NVIDIA_MODULE_PARAMS=${nvidia_params}
NVIDIA_UVM_MODULE_PARAMS=${nvidia_uvm_params}
NVIDIA_MODESET_MODULE_PARAMS=${nvidia_modeset_params}
NVIDIA_PEERMEM_MODULE_PARAMS=${nvidia_peermem_params}"

    # Append config file contents directly
    for conf_file in nvidia.conf nvidia-uvm.conf nvidia-modeset.conf nvidia-peermem.conf; do
        if [ -f "${conf_dir}/${conf_file}" ]; then
            config="${config}
$(cat "${conf_dir}/${conf_file}")"
        fi
    done

    printf '%s\n' "$config"
}
88+
# Check if fast path should be used (driver already loaded with matching config).
#
# Returns: 0 when /sys/module/nvidia/refcnt and the stored state file both
#          exist and the freshly built configuration equals the stored one;
#          1 otherwise (including when either side cannot be obtained).
_should_use_fast_path() {
    local state_file=/run/nvidia/driver-config.state
    local current_config stored_config

    # Without a loaded module or a stored state there is nothing to compare
    if [ ! -f /sys/module/nvidia/refcnt ] || [ ! -f "${state_file}" ]; then
        return 1
    fi

    # Assignments are kept separate from `local` so a failing command is not
    # hidden behind the always-zero exit status of the declaration.
    current_config=$(_build_driver_config) || return 1
    stored_config=$(cat "${state_file}" 2>/dev/null) || return 1

    [ "${current_config}" = "${stored_config}" ]
}

rhel9/nvidia-driver

Lines changed: 41 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -402,44 +402,7 @@ _load_driver() {
402402
set +o xtrace -o nounset
403403
fi
404404

405-
echo "Starting NVIDIA persistence daemon..."
406-
nvidia-persistenced --persistence-mode
407-
408-
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
409-
echo "Copying gridd.conf..."
410-
cp /drivers/gridd.conf /etc/nvidia/gridd.conf
411-
if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
412-
echo "Copying ClientConfigToken..."
413-
mkdir -p /etc/nvidia/ClientConfigToken/
414-
cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
415-
fi
416-
417-
echo "Starting nvidia-gridd.."
418-
LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd
419-
420-
# Start virtual topology daemon
421-
_start_vgpu_topology_daemon
422-
fi
423-
424-
if _assert_nvlink5_system; then
425-
_ensure_nvlink5_prerequisites || return 1
426-
echo "Starting NVIDIA fabric manager daemon for NVLink5+..."
427-
428-
fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
429-
fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
430-
nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
431-
nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
432-
/usr/bin/nvidia-fabricmanager-start.sh --mode start \
433-
--fm-config-file $fm_config_file \
434-
--fm-pid-file $fm_pid_file \
435-
--nvlsm-config-file $nvlsm_config_file \
436-
--nvlsm-pid-file $nvlsm_pid_file
437-
438-
# If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
439-
elif _assert_nvswitch_system; then
440-
echo "Starting NVIDIA fabric manager daemon..."
441-
nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
442-
fi
405+
_start_daemons
443406
}
444407

445408
# Stop persistenced and unload the kernel modules if they are currently loaded.
@@ -714,42 +677,45 @@ _ensure_persistence() {
714677
fi
715678
}
716679

717-
_build_driver_config() {
718-
local nvidia_params="" nvidia_uvm_params="" nvidia_modeset_params="" nvidia_peermem_params=""
719-
720-
# Read module parameters from conf files
721-
if [ -f "/drivers/nvidia.conf" ]; then
722-
nvidia_params=$(cat "/drivers/nvidia.conf" | tr '\n' ' ')
723-
fi
724-
if [ -f "/drivers/nvidia-uvm.conf" ]; then
725-
nvidia_uvm_params=$(cat "/drivers/nvidia-uvm.conf" | tr '\n' ' ')
726-
fi
727-
if [ -f "/drivers/nvidia-modeset.conf" ]; then
728-
nvidia_modeset_params=$(cat "/drivers/nvidia-modeset.conf" | tr '\n' ' ')
729-
fi
730-
if [ -f "/drivers/nvidia-peermem.conf" ]; then
731-
nvidia_peermem_params=$(cat "/drivers/nvidia-peermem.conf" | tr '\n' ' ')
732-
fi
733-
734-
local config="DRIVER_VERSION=${DRIVER_VERSION}
735-
KERNEL_VERSION=$(uname -r)
736-
GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED:-false}
737-
USE_HOST_MOFED=${USE_HOST_MOFED:-false}
738-
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}
739-
NVIDIA_MODULE_PARAMS=${nvidia_params}
740-
NVIDIA_UVM_MODULE_PARAMS=${nvidia_uvm_params}
741-
NVIDIA_MODESET_MODULE_PARAMS=${nvidia_modeset_params}
742-
NVIDIA_PEERMEM_MODULE_PARAMS=${nvidia_peermem_params}"
743-
744-
# Append config file contents directly
745-
for conf_file in nvidia.conf nvidia-uvm.conf nvidia-modeset.conf nvidia-peermem.conf; do
746-
if [ -f "/drivers/$conf_file" ]; then
747-
config="${config}
748-
$(cat "/drivers/$conf_file")"
749-
fi
750-
done
751-
752-
echo "$config"
# Start the userspace daemons that accompany the driver.
#
# Always starts nvidia-persistenced. For DRIVER_TYPE=vgpu it additionally
# installs gridd.conf (plus the ClientConfigToken when the license server type
# is NLS), starts nvidia-gridd and the virtual topology daemon. Finally it
# starts the fabric manager for NVLink5+ systems, or for NVLink4 (or below)
# switch systems, when one is detected.
#
# Globals (read): DRIVER_TYPE, VGPU_LICENSE_SERVER_TYPE
# Returns:        1 if the NVLink5 prerequisites are not met; otherwise the
#                 status of the last command executed.
_start_daemons() {
    # Kept local so the fabric manager paths do not leak into the global scope
    local fm_config_file fm_pid_file nvlsm_config_file nvlsm_pid_file

    echo "Starting NVIDIA persistence daemon..."
    nvidia-persistenced --persistence-mode

    if [ "${DRIVER_TYPE}" = "vgpu" ]; then
        echo "Copying gridd.conf..."
        cp /drivers/gridd.conf /etc/nvidia/gridd.conf
        if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
            echo "Copying ClientConfigToken..."
            mkdir -p /etc/nvidia/ClientConfigToken/
            cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
        fi

        echo "Starting nvidia-gridd.."
        LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd

        # Start virtual topology daemon
        _start_vgpu_topology_daemon
    fi

    if _assert_nvlink5_system; then
        _ensure_nvlink5_prerequisites || return 1
        echo "Starting NVIDIA fabric manager daemon for NVLink5+..."

        fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
        fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
        nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
        nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
        /usr/bin/nvidia-fabricmanager-start.sh --mode start \
            --fm-config-file "$fm_config_file" \
            --fm-pid-file "$fm_pid_file" \
            --nvlsm-config-file "$nvlsm_config_file" \
            --nvlsm-pid-file "$nvlsm_pid_file"

    # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
    elif _assert_nvswitch_system; then
        echo "Starting NVIDIA fabric manager daemon..."
        nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
    fi
}
754720

755721
_store_driver_config() {
@@ -759,13 +725,6 @@ _store_driver_config() {
759725
echo "Driver configuration stored at $config_file"
760726
}
761727

762-
_should_use_fast_path() {
763-
[ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/driver-config.state ] || return 1
764-
local current_config=$(_build_driver_config)
765-
local stored_config=$(cat /run/nvidia/driver-config.state 2>/dev/null || echo "")
766-
[ "${current_config}" = "${stored_config}" ]
767-
}
768-
769728
_userspace_only_install() {
770729
echo "Detected matching loaded driver & config (${DRIVER_VERSION}); performing userspace-only install"
771730

@@ -802,7 +761,7 @@ _userspace_only_install() {
802761
fi
803762

804763
_mount_rootfs
805-
_ensure_persistence
764+
_start_daemons
806765
_write_kernel_update_hook
807766
_store_driver_config
808767

rhel9/ocp_dtk_entrypoint

Lines changed: 13 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -10,50 +10,6 @@ echo "Running $*"
1010
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
1111
source $SCRIPT_DIR/common.sh
1212

13-
_build_driver_config() {
14-
local nvidia_params="" nvidia_uvm_params="" nvidia_modeset_params="" nvidia_peermem_params=""
15-
16-
# Read module parameters from conf files
17-
if [ -f "/drivers/nvidia.conf" ]; then
18-
nvidia_params=$(cat "/drivers/nvidia.conf" | tr '\n' ' ')
19-
fi
20-
if [ -f "/drivers/nvidia-uvm.conf" ]; then
21-
nvidia_uvm_params=$(cat "/drivers/nvidia-uvm.conf" | tr '\n' ' ')
22-
fi
23-
if [ -f "/drivers/nvidia-modeset.conf" ]; then
24-
nvidia_modeset_params=$(cat "/drivers/nvidia-modeset.conf" | tr '\n' ' ')
25-
fi
26-
if [ -f "/drivers/nvidia-peermem.conf" ]; then
27-
nvidia_peermem_params=$(cat "/drivers/nvidia-peermem.conf" | tr '\n' ' ')
28-
fi
29-
30-
local config="DRIVER_VERSION=${DRIVER_VERSION}
31-
KERNEL_VERSION=$(uname -r)
32-
GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED:-false}
33-
USE_HOST_MOFED=${USE_HOST_MOFED:-false}
34-
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}
35-
NVIDIA_MODULE_PARAMS=${nvidia_params}
36-
NVIDIA_UVM_MODULE_PARAMS=${nvidia_uvm_params}
37-
NVIDIA_MODESET_MODULE_PARAMS=${nvidia_modeset_params}
38-
NVIDIA_PEERMEM_MODULE_PARAMS=${nvidia_peermem_params}"
39-
40-
for conf_file in nvidia.conf nvidia-uvm.conf nvidia-modeset.conf nvidia-peermem.conf; do
41-
if [ -f "/drivers/$conf_file" ]; then
42-
config="${config}
43-
$(cat "/drivers/$conf_file")"
44-
fi
45-
done
46-
47-
echo "$config"
48-
}
49-
50-
_should_use_fast_path() {
51-
[ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/driver-config.state ] || return 1
52-
local current_config=$(_build_driver_config)
53-
local stored_config=$(cat /run/nvidia/driver-config.state 2>/dev/null || echo "")
54-
[ "${current_config}" = "${stored_config}" ]
55-
}
56-
5713
nv-ctr-run-with-dtk() {
5814
set -x
5915

@@ -131,6 +87,19 @@ dtk-build-driver() {
13187
sleep inf
13288
fi
13389

90+
# Check if fast path is being used - if so, skip building and signal completion
91+
if _should_use_fast_path; then
92+
echo "Fast path detected in DTK container: driver already loaded with matching config, skipping build"
93+
echo "Signaling driver_built and sleeping forever..."
94+
touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_build_started"
95+
touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_built"
96+
while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do
97+
sleep 30
98+
done
99+
echo "WARNING: driver_built flag disappeared"
100+
exit 0
101+
fi
102+
134103
if ! [[ -f "/lib/modules/$(uname -r)/vmlinuz" ]]; then
135104
echo "WARNING: broken Driver Toolkit image detected:"
136105
echo "- Node kernel: $(uname -r)"

0 commit comments

Comments
 (0)