Skip to content

Commit 84ce300

Browse files
feat: detect config changes (version, kernel, module params, RDMA) to trigger driver reinstall
Signed-off-by: Karthik Vetrivel <kvetrivel@nvidia.com>
1 parent a540c4f commit 84ce300

File tree

1 file changed

+137
-59
lines changed

1 file changed

+137
-59
lines changed

cmd/driver-manager/main.go

Lines changed: 137 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package main
2020

2121
import (
22+
"bytes"
2223
"context"
2324
"errors"
2425
"fmt"
@@ -40,14 +41,32 @@ import (
4041
)
4142

4243
// Filesystem locations, namespace and timing defaults used by the driver manager.
const (
43-
driverRoot = "/run/nvidia/driver"
44-
driverPIDFile = "/run/nvidia/nvidia-driver.pid"
45-
operatorNamespace = "gpu-operator"
46-
pausedStr = "paused-for-driver-upgrade"
47-
defaultDrainTimeout = time.Second * 0
48-
defaultGracePeriod = 5 * time.Minute
44+
driverRoot = "/run/nvidia/driver"
45+
// driverPIDFile is the file deleted by removePIDFile.
driverPIDFile = "/run/nvidia/nvidia-driver.pid"
46+
// driverConfigStateFile holds the previously recorded driver configuration;
// driverModuleBuildNeeded reads it and compares it with the current one.
driverConfigStateFile = "/run/nvidia/driver-config.state"
47+
operatorNamespace = "gpu-operator"
48+
pausedStr = "paused-for-driver-upgrade"
49+
defaultDrainTimeout = time.Second * 0
50+
defaultGracePeriod = 5 * time.Minute
4951

5052
nvidiaDomainPrefix = "nvidia.com"
53+
)
54+
55+
// Paths of the per-module kernel parameter files. Each one is read by
// readModuleParams from buildCurrentConfig, and all four are listed in
// driverConfigFiles so their raw contents are appended to the config string.
const (
56+
nvidiaModuleConfigFile = "/drivers/nvidia.conf"
57+
nvidiaUVMModuleConfigFile = "/drivers/nvidia-uvm.conf"
58+
nvidiaModsetModuleConfigFile = "/drivers/nvidia-modeset.conf"
59+
nvidiaPeermemModuleConfigFile = "/drivers/nvidia-peermem.conf"
60+
)
61+
62+
var (
63+
// Driver module config files
64+
driverConfigFiles = []string{
65+
nvidiaModuleConfigFile,
66+
nvidiaUVMModuleConfigFile,
67+
nvidiaModsetModuleConfigFile,
68+
nvidiaPeermemModuleConfigFile,
69+
}
5170

5271
nvidiaDriverDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.driver"
5372
nvidiaOperatorValidatorDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.operator-validator"
@@ -304,8 +323,21 @@ func (dm *DriverManager) uninstallDriver() error {
304323
return fmt.Errorf("failed to evict GPU operator components: %w", err)
305324
}
306325

307-
if skip, reason := dm.shouldSkipUninstall(); skip {
308-
dm.log.Infof("Skipping driver uninstall: %s", reason)
326+
if dm.shouldSkipUninstall() {
327+
dm.log.Info("Fast path activated: desired driver version and configuration already present")
328+
329+
// Clean up stale artifacts from previous container before rescheduling operands
330+
dm.log.Info("Cleaning up stale mounts and state files...")
331+
332+
// Unmount stale rootfs from previous container
333+
if err := dm.unmountRootfs(); err != nil {
334+
return fmt.Errorf("failed to unmount stale rootfs: %w", err)
335+
}
336+
337+
// Remove stale PID file from previous container
338+
dm.removePIDFile()
339+
340+
// Now safe to reschedule operands
309341
if err := dm.rescheduleGPUOperatorComponents(); err != nil {
310342
dm.log.Warnf("Failed to reschedule GPU operator components: %v", err)
311343
}
@@ -653,68 +685,113 @@ func (dm *DriverManager) isDriverLoaded() bool {
653685
return err == nil
654686
}
655687

656-
func (dm *DriverManager) shouldSkipUninstall() (bool, string) {
657-
if dm.config.forceReinstall {
658-
dm.log.Info("Force reinstall is enabled, proceeding with driver uninstall")
659-
return false, ""
688+
// getValueWithOverride returns override when it is non-empty. Otherwise it
// splits config on "\n" and returns the text that follows "key=" on the first
// line starting with that prefix. It returns "" when override is empty and no
// line of config starts with "key=".
689+
func getValueWithOverride(config, key, override string) string {
690+
if override != "" {
691+
return override
660692
}
661-
662-
if !dm.isDriverLoaded() {
663-
return false, ""
693+
// Lines are matched by exact prefix; no whitespace is trimmed from the line
// or from the returned value.
for _, line := range strings.Split(config, "\n") {
694+
if strings.HasPrefix(line, key+"=") {
695+
return strings.TrimPrefix(line, key+"=")
696+
}
664697
}
698+
return ""
699+
}
665700

666-
if dm.config.driverVersion == "" {
667-
return false, "Driver version environment variable is not set"
701+
// getKernelVersion returns the kernel release string reported by unix.Uname,
// i.e. the bytes of Utsname.Release up to the first NUL byte. It returns ""
// when the uname call fails.
702+
func getKernelVersion() string {
703+
var utsname unix.Utsname
704+
if err := unix.Uname(&utsname); err != nil {
705+
return ""
668706
}
669707

670-
version, err := dm.detectCurrentDriverVersion()
671-
if err != nil {
672-
dm.log.Warnf("Unable to determine installed driver version: %v", err)
673-
// If driver is loaded but we can't detect version, proceed with reinstall to ensure correct version
674-
dm.log.Info("Cannot verify driver version, proceeding with reinstall to ensure correct version is installed")
675-
return false, ""
676-
}
708+
release := utsname.Release[:]
709+
// NOTE(review): bytes.IndexByte returns -1 when no NUL byte is present, and
// slicing with release[:-1] below would panic. Presumably the kernel always
// NUL-terminates Release — confirm, or guard the -1 case.
nullIdx := bytes.IndexByte(release, 0)
710+
return string(release[:nullIdx])
711+
}
677712

678-
if version != dm.config.driverVersion {
679-
dm.log.Infof("Installed driver version %s does not match desired %s, proceeding with uninstall", version, dm.config.driverVersion)
680-
return false, ""
713+
// buildCurrentConfig constructs the current driver configuration string
//
// The result is a sequence of "KEY=value\n" lines (driver version and type,
// kernel version, RDMA / host MOFED flags, kernel module type, and the four
// flattened module parameter strings) followed by the raw contents of every
// readable, non-empty file in driverConfigFiles.
//
// storedConfig is the previously recorded configuration. It is only used as a
// fallback: for DRIVER_VERSION, KERNEL_VERSION and KERNEL_MODULE_TYPE the live
// value wins whenever it is non-empty, and the stored value is used otherwise.
714+
func (dm *DriverManager) buildCurrentConfig(storedConfig string) string {
715+
driverVersion := getValueWithOverride(storedConfig, "DRIVER_VERSION", dm.config.driverVersion)
716+
kernelVersion := getValueWithOverride(storedConfig, "KERNEL_VERSION", getKernelVersion())
717+
kernelModuleType := getValueWithOverride(storedConfig, "KERNEL_MODULE_TYPE", os.Getenv("KERNEL_MODULE_TYPE"))
718+
driverTypeEnv := os.Getenv("DRIVER_TYPE")
719+
if driverTypeEnv == "" {
720+
driverTypeEnv = "passthrough"
721+
}
722+
// driverTypeEnv is never empty at this point, so the stored DRIVER_TYPE is
// never consulted and the result is always driverTypeEnv.
driverType := getValueWithOverride(storedConfig, "DRIVER_TYPE", driverTypeEnv)
723+
724+
// Read module parameters from conf files
725+
nvidiaParams := readModuleParams(nvidiaModuleConfigFile)
726+
nvidiaUVMParams := readModuleParams(nvidiaUVMModuleConfigFile)
727+
nvidiaModeset := readModuleParams(nvidiaModsetModuleConfigFile)
728+
nvidiaPeermem := readModuleParams(nvidiaPeermemModuleConfigFile)
729+
730+
var config strings.Builder
731+
config.WriteString(fmt.Sprintf("DRIVER_VERSION=%s\n", driverVersion))
732+
config.WriteString(fmt.Sprintf("DRIVER_TYPE=%s\n", driverType))
733+
config.WriteString(fmt.Sprintf("KERNEL_VERSION=%s\n", kernelVersion))
734+
config.WriteString(fmt.Sprintf("GPU_DIRECT_RDMA_ENABLED=%v\n", dm.config.gpuDirectRDMAEnabled))
735+
config.WriteString(fmt.Sprintf("USE_HOST_MOFED=%v\n", dm.config.useHostMofed))
736+
config.WriteString(fmt.Sprintf("KERNEL_MODULE_TYPE=%s\n", kernelModuleType))
737+
config.WriteString(fmt.Sprintf("NVIDIA_MODULE_PARAMS=%s\n", nvidiaParams))
738+
config.WriteString(fmt.Sprintf("NVIDIA_UVM_MODULE_PARAMS=%s\n", nvidiaUVMParams))
739+
config.WriteString(fmt.Sprintf("NVIDIA_MODESET_MODULE_PARAMS=%s\n", nvidiaModeset))
740+
config.WriteString(fmt.Sprintf("NVIDIA_PEERMEM_MODULE_PARAMS=%s\n", nvidiaPeermem))
741+
742+
// Append config file contents directly
// (the same files were already summarised above as *_MODULE_PARAMS lines;
// here their bytes are written unmodified, and unreadable or empty files
// are skipped without an error).
743+
for _, file := range driverConfigFiles {
744+
if data, err := os.ReadFile(file); err == nil && len(data) > 0 {
745+
config.Write(data)
746+
}
681747
}
682748

683-
dm.log.Infof("Installed driver version %s matches desired version, skipping uninstall", version)
684-
return true, "desired version already present"
749+
return config.String()
685750
}
686751

687-
func (dm *DriverManager) detectCurrentDriverVersion() (string, error) {
688-
baseCtx := dm.ctx
689-
if baseCtx == nil {
690-
baseCtx = context.Background()
752+
// readModuleParams reads the module parameter config file at confFile and
// returns its contents flattened onto a single line.
//
// Leading and trailing whitespace is trimmed, then every remaining newline is
// replaced by exactly one space (so an interior blank line produces two
// consecutive spaces). A file that cannot be read yields ""; the read error is
// deliberately not reported, which means a missing file and an empty file are
// indistinguishable to the caller.
func readModuleParams(confFile string) string {
	data, err := os.ReadFile(confFile)
	if err != nil {
		return ""
	}
	// Convert newlines to spaces to match bash implementation
	return strings.ReplaceAll(strings.TrimSpace(string(data)), "\n", " ")
}
692761

693-
ctx, cancel := context.WithTimeout(baseCtx, 10*time.Second)
694-
defer cancel()
695-
696-
// Try chroot to /run/nvidia/driver for containerized driver
697-
cmd := exec.CommandContext(ctx, "chroot", "/run/nvidia/driver", "modinfo", "-F", "version", "nvidia")
698-
cmd.Env = append(os.Environ(), "LC_ALL=C")
699-
cmdOutput, chrootErr := cmd.Output()
700-
if chrootErr == nil {
701-
version := strings.TrimSpace(string(cmdOutput))
702-
if version != "" {
703-
dm.log.Infof("Driver version detected via chroot: %s", version)
704-
return version, nil
762+
// driverModuleBuildNeeded checks if driver modules need to be rebuilt
//
// It reads the recorded configuration from driverConfigStateFile, rebuilds the
// current configuration with buildCurrentConfig, and returns true when the two
// strings differ. It also returns true when the state file is missing or
// cannot be read, so any doubt results in a rebuild.
763+
func (dm *DriverManager) driverModuleBuildNeeded() bool {
764+
storedData, err := os.ReadFile(driverConfigStateFile)
765+
if err != nil {
766+
// A missing state file is logged at info level, any other read error as a
// warning; both cases report that a build is needed.
if os.IsNotExist(err) {
767+
dm.log.Info("No previous driver configuration found")
768+
return true
705769
}
770+
dm.log.Warnf("Failed to read driver config state file: %v", err)
771+
return true
706772
}
707773

708-
// Second try to read from /sys/module/nvidia/version if available
709-
if versionData, err := os.ReadFile("/sys/module/nvidia/version"); err == nil {
710-
version := strings.TrimSpace(string(versionData))
711-
if version != "" {
712-
dm.log.Infof("Driver version detected from /sys/module/nvidia/version: %s", version)
713-
return version, nil
714-
}
774+
storedConfig := string(storedData)
775+
currentConfig := dm.buildCurrentConfig(storedConfig)
776+
777+
// Exact string comparison: any byte difference counts as a config change.
return currentConfig != storedConfig
778+
}
779+
780+
// shouldSkipUninstall reports whether the driver uninstall can be skipped.
// It returns false when dm.config.forceReinstall is set. Otherwise it returns
// true only when isDriverLoaded reports true and driverModuleBuildNeeded
// reports false; in every other case it returns false.
func (dm *DriverManager) shouldSkipUninstall() bool {
781+
if dm.config.forceReinstall {
782+
dm.log.Info("Force reinstall is enabled, proceeding with driver uninstall")
783+
return false
715784
}
716785

717-
return "", fmt.Errorf("all version detection methods failed: chroot: %v", chrootErr)
786+
// Only skip uninstall if driver IS loaded AND config matches (fast path optimization)
// (&& short-circuits, so the state file is only read when the driver is loaded).
787+
if dm.isDriverLoaded() && !dm.driverModuleBuildNeeded() {
788+
dm.log.Info("Driver is loaded with matching config, enabling fast path")
789+
return true
790+
}
791+
792+
// Driver not loaded or config changed - proceed with cleanup
793+
dm.log.Info("Proceeding with cleanup operations")
794+
return false
718795
}
719796

720797
func (dm *DriverManager) isNouveauLoaded() bool {
@@ -727,6 +804,12 @@ func (dm *DriverManager) unloadNouveau() error {
727804
return unix.DeleteModule("nouveau", 0)
728805
}
729806

807+
func (dm *DriverManager) removePIDFile() {
808+
if err := os.Remove(driverPIDFile); err != nil && !os.IsNotExist(err) {
809+
dm.log.Warnf("Failed to remove PID file %s: %v", driverPIDFile, err)
810+
}
811+
}
812+
730813
func (dm *DriverManager) cleanupDriver() error {
731814
dm.log.Info("Cleaning up NVIDIA driver")
732815

@@ -740,12 +823,7 @@ func (dm *DriverManager) cleanupDriver() error {
740823
return fmt.Errorf("failed to unmount rootfs: %w", err)
741824
}
742825

743-
// Remove PID file
744-
if _, err := os.Stat(driverPIDFile); err == nil {
745-
if err := os.Remove(driverPIDFile); err != nil {
746-
dm.log.Warnf("Failed to remove PID file %s: %v", driverPIDFile, err)
747-
}
748-
}
826+
dm.removePIDFile()
749827

750828
return nil
751829
}

0 commit comments

Comments
 (0)