Skip to content

Commit ff04036

Browse files
feat: detect config changes (version, kernel, module params, RDMA) to trigger driver reinstall
Signed-off-by: Karthik Vetrivel <kvetrivel@nvidia.com>
1 parent 37ae9a0 commit ff04036

File tree

1 file changed

+103
-48
lines changed

1 file changed

+103
-48
lines changed

cmd/driver-manager/main.go

Lines changed: 103 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package main
2020

2121
import (
22+
"bytes"
2223
"context"
2324
"errors"
2425
"fmt"
@@ -40,14 +41,25 @@ import (
4041
)
4142

4243
const (
43-
driverRoot = "/run/nvidia/driver"
44-
driverPIDFile = "/run/nvidia/nvidia-driver.pid"
45-
operatorNamespace = "gpu-operator"
46-
pausedStr = "paused-for-driver-upgrade"
47-
defaultDrainTimeout = time.Second * 0
48-
defaultGracePeriod = 5 * time.Minute
44+
driverRoot = "/run/nvidia/driver"
45+
driverPIDFile = "/run/nvidia/nvidia-driver.pid"
46+
driverConfigStateFile = "/run/nvidia/driver-config.state"
47+
operatorNamespace = "gpu-operator"
48+
pausedStr = "paused-for-driver-upgrade"
49+
defaultDrainTimeout = time.Second * 0
50+
defaultGracePeriod = 5 * time.Minute
4951

5052
nvidiaDomainPrefix = "nvidia.com"
53+
)
54+
55+
var (
56+
// Driver module config files
57+
driverConfigFiles = []string{
58+
"/drivers/nvidia.conf",
59+
"/drivers/nvidia-uvm.conf",
60+
"/drivers/nvidia-modeset.conf",
61+
"/drivers/nvidia-peermem.conf",
62+
}
5163

5264
nvidiaDriverDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.driver"
5365
nvidiaOperatorValidatorDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.operator-validator"
@@ -653,68 +665,111 @@ func (dm *DriverManager) isDriverLoaded() bool {
653665
return err == nil
654666
}
655667

656-
func (dm *DriverManager) shouldSkipUninstall() (bool, string) {
657-
if dm.config.forceReinstall {
658-
dm.log.Info("Force reinstall is enabled, proceeding with driver uninstall")
659-
return false, ""
668+
// getConfigValueOrDefault resolves the effective value for key.
//
// Precedence is "explicit value first, stored value second":
//   - If defaultVal is non-empty it is returned unchanged and config is not
//     consulted at all.
//   - Otherwise config is scanned line by line (split on "\n") and the text
//     following "key=" on the first matching line is returned verbatim; no
//     whitespace is trimmed from the value.
//   - If defaultVal is empty and no line starts with "key=", the result is
//     the empty string.
func getConfigValueOrDefault(config, key, defaultVal string) string {
	if defaultVal != "" {
		return defaultVal
	}

	// The prefix is loop-invariant; build it once. Including the "=" keeps a
	// key such as "KERNEL" from matching a "KERNEL_VERSION=..." line.
	prefix := key + "="
	for _, line := range strings.Split(config, "\n") {
		if strings.HasPrefix(line, prefix) {
			return line[len(prefix):]
		}
	}
	return ""
}
661680

662-
if !dm.isDriverLoaded() {
663-
return false, ""
681+
// getKernelVersion returns the current kernel version
682+
func getKernelVersion() string {
683+
var utsname unix.Utsname
684+
if err := unix.Uname(&utsname); err != nil {
685+
return ""
686+
}
687+
return string(utsname.Release[:bytes.IndexByte(utsname.Release[:], 0)])
688+
}
689+
690+
// buildCurrentConfig constructs the current driver configuration string
691+
func (dm *DriverManager) buildCurrentConfig(storedConfig string) string {
692+
driverVersion := getConfigValueOrDefault(storedConfig, "DRIVER_VERSION", dm.config.driverVersion)
693+
kernelVersion := getConfigValueOrDefault(storedConfig, "KERNEL_VERSION", getKernelVersion())
694+
kernelModuleType := getConfigValueOrDefault(storedConfig, "KERNEL_MODULE_TYPE", os.Getenv("KERNEL_MODULE_TYPE"))
695+
696+
// Read module parameters from conf files
697+
nvidiaParams := readModuleParams("/drivers/nvidia.conf")
698+
nvidiaUvmParams := readModuleParams("/drivers/nvidia-uvm.conf")
699+
nvidiaModeset := readModuleParams("/drivers/nvidia-modeset.conf")
700+
nvidiaPeermem := readModuleParams("/drivers/nvidia-peermem.conf")
701+
702+
var config strings.Builder
703+
config.WriteString(fmt.Sprintf("DRIVER_VERSION=%s\n", driverVersion))
704+
config.WriteString(fmt.Sprintf("KERNEL_VERSION=%s\n", kernelVersion))
705+
config.WriteString(fmt.Sprintf("GPU_DIRECT_RDMA_ENABLED=%v\n", dm.config.gpuDirectRDMAEnabled))
706+
config.WriteString(fmt.Sprintf("USE_HOST_MOFED=%v\n", dm.config.useHostMofed))
707+
config.WriteString(fmt.Sprintf("KERNEL_MODULE_TYPE=%s\n", kernelModuleType))
708+
config.WriteString(fmt.Sprintf("NVIDIA_MODULE_PARAMS=%s\n", nvidiaParams))
709+
config.WriteString(fmt.Sprintf("NVIDIA_UVM_MODULE_PARAMS=%s\n", nvidiaUvmParams))
710+
config.WriteString(fmt.Sprintf("NVIDIA_MODESET_MODULE_PARAMS=%s\n", nvidiaModeset))
711+
config.WriteString(fmt.Sprintf("NVIDIA_PEERMEM_MODULE_PARAMS=%s\n", nvidiaPeermem))
712+
713+
// Append config file contents directly
714+
for _, file := range driverConfigFiles {
715+
if data, err := os.ReadFile(file); err == nil && len(data) > 0 {
716+
config.Write(data)
717+
}
664718
}
665719

666-
if dm.config.driverVersion == "" {
667-
return false, "Driver version environment variable is not set"
720+
return config.String()
721+
}
722+
723+
// readModuleParams loads the module parameter file at filepath and flattens
// it onto a single line: surrounding whitespace is stripped and every
// remaining newline becomes one space. An unreadable or missing file yields
// the empty string.
func readModuleParams(filepath string) string {
	raw, readErr := os.ReadFile(filepath)
	if readErr != nil {
		return ""
	}

	// One parameter per line on disk, space-separated in the result; the
	// original comment notes this mirrors the bash implementation.
	trimmed := strings.TrimSpace(string(raw))
	return strings.Join(strings.Split(trimmed, "\n"), " ")
}
669732

670-
version, err := dm.detectCurrentDriverVersion()
733+
// hasDriverConfigChanged checks if the current driver configuration differs from stored state
734+
func (dm *DriverManager) hasDriverConfigChanged() (bool, string) {
735+
storedData, err := os.ReadFile(driverConfigStateFile)
671736
if err != nil {
672-
dm.log.Warnf("Unable to determine installed driver version: %v", err)
673-
// If driver is loaded but we can't detect version, proceed with reinstall to ensure correct version
674-
dm.log.Info("Cannot verify driver version, proceeding with reinstall to ensure correct version is installed")
675-
return false, ""
737+
if os.IsNotExist(err) {
738+
return true, "no previous driver configuration found"
739+
}
740+
dm.log.Warnf("Failed to read driver config state file: %v", err)
741+
return true, "unable to read previous driver configuration"
676742
}
677743

678-
if version != dm.config.driverVersion {
679-
dm.log.Infof("Installed driver version %s does not match desired %s, proceeding with uninstall", version, dm.config.driverVersion)
744+
storedConfig := string(storedData)
745+
currentConfig := dm.buildCurrentConfig(storedConfig)
746+
747+
if currentConfig == storedConfig {
680748
return false, ""
681749
}
682750

683-
dm.log.Infof("Installed driver version %s matches desired version, skipping uninstall", version)
684-
return true, "desired version already present"
751+
return true, "driver configuration changed"
685752
}
686753

687-
func (dm *DriverManager) detectCurrentDriverVersion() (string, error) {
688-
baseCtx := dm.ctx
689-
if baseCtx == nil {
690-
baseCtx = context.Background()
754+
func (dm *DriverManager) shouldSkipUninstall() (bool, string) {
755+
if dm.config.forceReinstall {
756+
dm.log.Info("Force reinstall is enabled, proceeding with driver uninstall")
757+
return false, ""
691758
}
692759

693-
ctx, cancel := context.WithTimeout(baseCtx, 10*time.Second)
694-
defer cancel()
695-
696-
// Try chroot to /run/nvidia/driver for containerized driver
697-
cmd := exec.CommandContext(ctx, "chroot", "/run/nvidia/driver", "modinfo", "-F", "version", "nvidia")
698-
cmd.Env = append(os.Environ(), "LC_ALL=C")
699-
cmdOutput, chrootErr := cmd.Output()
700-
if chrootErr == nil {
701-
version := strings.TrimSpace(string(cmdOutput))
702-
if version != "" {
703-
dm.log.Infof("Driver version detected via chroot: %s", version)
704-
return version, nil
705-
}
760+
if !dm.isDriverLoaded() {
761+
dm.log.Info("Driver not currently loaded, proceeding with installation")
762+
return false, ""
706763
}
707764

708-
// Second try to read from /sys/module/nvidia/version if available
709-
if versionData, err := os.ReadFile("/sys/module/nvidia/version"); err == nil {
710-
version := strings.TrimSpace(string(versionData))
711-
if version != "" {
712-
dm.log.Infof("Driver version detected from /sys/module/nvidia/version: %s", version)
713-
return version, nil
714-
}
765+
// Check if driver configuration (including version) has changed
766+
if configChanged, reason := dm.hasDriverConfigChanged(); configChanged {
767+
dm.log.Infof("Driver configuration has changed: %s", reason)
768+
return false, reason
715769
}
716770

717-
return "", fmt.Errorf("all version detection methods failed: chroot: %v", chrootErr)
771+
dm.log.Info("Installed driver version and configuration match desired state, skipping uninstall")
772+
return true, "desired version and configuration already present"
718773
}
719774

720775
func (dm *DriverManager) isNouveauLoaded() bool {

0 commit comments

Comments
 (0)