1919package main
2020
2121import (
22+ "bytes"
2223 "context"
2324 "errors"
2425 "fmt"
@@ -40,14 +41,25 @@ import (
4041)
4142
4243const (
43- driverRoot = "/run/nvidia/driver"
44- driverPIDFile = "/run/nvidia/nvidia-driver.pid"
45- operatorNamespace = "gpu-operator"
46- pausedStr = "paused-for-driver-upgrade"
47- defaultDrainTimeout = time .Second * 0
48- defaultGracePeriod = 5 * time .Minute
44+ driverRoot = "/run/nvidia/driver"
45+ driverPIDFile = "/run/nvidia/nvidia-driver.pid"
46+ driverConfigStateFile = "/run/nvidia/driver-config.state"
47+ operatorNamespace = "gpu-operator"
48+ pausedStr = "paused-for-driver-upgrade"
49+ defaultDrainTimeout = time .Second * 0
50+ defaultGracePeriod = 5 * time .Minute
4951
5052 nvidiaDomainPrefix = "nvidia.com"
53+ )
54+
55+ var (
56+ // Driver module config files
57+ driverConfigFiles = []string {
58+ "/drivers/nvidia.conf" ,
59+ "/drivers/nvidia-uvm.conf" ,
60+ "/drivers/nvidia-modeset.conf" ,
61+ "/drivers/nvidia-peermem.conf" ,
62+ }
5163
5264 nvidiaDriverDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.driver"
5365 nvidiaOperatorValidatorDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.operator-validator"
@@ -305,7 +317,24 @@ func (dm *DriverManager) uninstallDriver() error {
305317 }
306318
307319 if skip , reason := dm .shouldSkipUninstall (); skip {
308- dm .log .Infof ("Skipping driver uninstall: %s" , reason )
320+ dm .log .Infof ("Fast path activated: %s" , reason )
321+
322+ // Clean up stale artifacts from previous container before rescheduling operands
323+ dm .log .Info ("Cleaning up stale mounts and state files..." )
324+
325+ // Unmount stale rootfs from previous container
326+ if err := dm .unmountRootfs (); err != nil {
327+ return fmt .Errorf ("failed to unmount stale rootfs: %w" , err )
328+ }
329+
330+ // Remove stale PID file from previous container
331+ if _ , err := os .Stat (driverPIDFile ); err == nil {
332+ if err := os .Remove (driverPIDFile ); err != nil {
333+ dm .log .Warnf ("Failed to remove PID file: %v" , err )
334+ }
335+ }
336+
337+ // Now safe to reschedule operands
309338 if err := dm .rescheduleGPUOperatorComponents (); err != nil {
310339 dm .log .Warnf ("Failed to reschedule GPU operator components: %v" , err )
311340 }
@@ -653,68 +682,115 @@ func (dm *DriverManager) isDriverLoaded() bool {
653682 return err == nil
654683}
655684
656- func ( dm * DriverManager ) shouldSkipUninstall () ( bool , string ) {
657- if dm . config . forceReinstall {
658- dm . log . Info ( "Force reinstall is enabled, proceeding with driver uninstall" )
659- return false , ""
// getConfigValueOrDefault resolves the value to use for key.
//
// Despite the name, defaultVal takes precedence: whenever it is non-empty it is
// returned as-is and config is never inspected. Only when defaultVal is empty
// is config scanned, line by line, for the first line that starts with
// "<key>="; everything after that prefix is returned. If defaultVal is empty
// and no line matches, the result is the empty string.
func getConfigValueOrDefault(config, key, defaultVal string) string {
	if defaultVal == "" {
		prefix := key + "="
		for _, entry := range strings.Split(config, "\n") {
			if strings.HasPrefix(entry, prefix) {
				return entry[len(prefix):]
			}
		}
	}
	// Either the caller supplied a value, or nothing matched and this is "".
	return defaultVal
}
661697
662- if ! dm .isDriverLoaded () {
663- return false , ""
698+ // getKernelVersion returns the current kernel version
699+ func getKernelVersion () string {
700+ var utsname unix.Utsname
701+ if err := unix .Uname (& utsname ); err != nil {
702+ return ""
664703 }
704+ return string (utsname .Release [:bytes .IndexByte (utsname .Release [:], 0 )])
705+ }
665706
666- if dm .config .driverVersion == "" {
667- return false , "Driver version environment variable is not set"
707+ // buildCurrentConfig constructs the current driver configuration string
708+ func (dm * DriverManager ) buildCurrentConfig (storedConfig string ) string {
709+ driverVersion := getConfigValueOrDefault (storedConfig , "DRIVER_VERSION" , dm .config .driverVersion )
710+ kernelVersion := getConfigValueOrDefault (storedConfig , "KERNEL_VERSION" , getKernelVersion ())
711+ kernelModuleType := getConfigValueOrDefault (storedConfig , "KERNEL_MODULE_TYPE" , os .Getenv ("KERNEL_MODULE_TYPE" ))
712+ driverTypeEnv := os .Getenv ("DRIVER_TYPE" )
713+ if driverTypeEnv == "" {
714+ driverTypeEnv = "passthrough"
715+ }
716+ driverType := getConfigValueOrDefault (storedConfig , "DRIVER_TYPE" , driverTypeEnv )
717+
718+ // Read module parameters from conf files
719+ nvidiaParams := readModuleParams ("/drivers/nvidia.conf" )
720+ nvidiaUvmParams := readModuleParams ("/drivers/nvidia-uvm.conf" )
721+ nvidiaModeset := readModuleParams ("/drivers/nvidia-modeset.conf" )
722+ nvidiaPeermem := readModuleParams ("/drivers/nvidia-peermem.conf" )
723+
724+ var config strings.Builder
725+ config .WriteString (fmt .Sprintf ("DRIVER_VERSION=%s\n " , driverVersion ))
726+ config .WriteString (fmt .Sprintf ("DRIVER_TYPE=%s\n " , driverType ))
727+ config .WriteString (fmt .Sprintf ("KERNEL_VERSION=%s\n " , kernelVersion ))
728+ config .WriteString (fmt .Sprintf ("GPU_DIRECT_RDMA_ENABLED=%v\n " , dm .config .gpuDirectRDMAEnabled ))
729+ config .WriteString (fmt .Sprintf ("USE_HOST_MOFED=%v\n " , dm .config .useHostMofed ))
730+ config .WriteString (fmt .Sprintf ("KERNEL_MODULE_TYPE=%s\n " , kernelModuleType ))
731+ config .WriteString (fmt .Sprintf ("NVIDIA_MODULE_PARAMS=%s\n " , nvidiaParams ))
732+ config .WriteString (fmt .Sprintf ("NVIDIA_UVM_MODULE_PARAMS=%s\n " , nvidiaUvmParams ))
733+ config .WriteString (fmt .Sprintf ("NVIDIA_MODESET_MODULE_PARAMS=%s\n " , nvidiaModeset ))
734+ config .WriteString (fmt .Sprintf ("NVIDIA_PEERMEM_MODULE_PARAMS=%s\n " , nvidiaPeermem ))
735+
736+ // Append config file contents directly
737+ for _ , file := range driverConfigFiles {
738+ if data , err := os .ReadFile (file ); err == nil && len (data ) > 0 {
739+ config .Write (data )
740+ }
668741 }
669742
670- version , err := dm .detectCurrentDriverVersion ()
743+ return config .String ()
744+ }
745+
// readModuleParams loads the file at filepath and flattens it to one line:
// surrounding whitespace is trimmed and every remaining newline becomes a
// single space. It returns "" when the file cannot be read.
func readModuleParams(filepath string) string {
	raw, readErr := os.ReadFile(filepath)
	if readErr != nil {
		return ""
	}

	// Joining the newline-separated pieces with spaces keeps the output on a
	// single line (the original notes this mirrors the bash implementation).
	trimmed := strings.TrimSpace(string(raw))
	return strings.Join(strings.Split(trimmed, "\n"), " ")
}
677755
678- if version != dm .config .driverVersion {
679- dm .log .Infof ("Installed driver version %s does not match desired %s, proceeding with uninstall" , version , dm .config .driverVersion )
680- return false , ""
756+ // hasDriverConfigChanged checks if the current driver configuration differs from stored state
757+ func (dm * DriverManager ) hasDriverConfigChanged () (bool , string ) {
758+ storedData , err := os .ReadFile (driverConfigStateFile )
759+ if err != nil {
760+ if os .IsNotExist (err ) {
761+ return true , "no previous driver configuration found"
762+ }
763+ dm .log .Warnf ("Failed to read driver config state file: %v" , err )
764+ return true , "unable to read previous driver configuration"
681765 }
682766
683- dm .log .Infof ("Installed driver version %s matches desired version, skipping uninstall" , version )
684- return true , "desired version already present"
685- }
767+ storedConfig := string (storedData )
768+ currentConfig := dm .buildCurrentConfig (storedConfig )
686769
687- func (dm * DriverManager ) detectCurrentDriverVersion () (string , error ) {
688- baseCtx := dm .ctx
689- if baseCtx == nil {
690- baseCtx = context .Background ()
770+ if currentConfig == storedConfig {
771+ return false , ""
691772 }
692773
693- ctx , cancel := context . WithTimeout ( baseCtx , 10 * time . Second )
694- defer cancel ()
774+ return true , "driver configuration changed"
775+ }
695776
696- // Try chroot to /run/nvidia/driver for containerized driver
697- cmd := exec .CommandContext (ctx , "chroot" , "/run/nvidia/driver" , "modinfo" , "-F" , "version" , "nvidia" )
698- cmd .Env = append (os .Environ (), "LC_ALL=C" )
699- cmdOutput , chrootErr := cmd .Output ()
700- if chrootErr == nil {
701- version := strings .TrimSpace (string (cmdOutput ))
702- if version != "" {
703- dm .log .Infof ("Driver version detected via chroot: %s" , version )
704- return version , nil
705- }
777+ func (dm * DriverManager ) shouldSkipUninstall () (bool , string ) {
778+ if dm .config .forceReinstall {
779+ dm .log .Info ("Force reinstall is enabled, proceeding with driver uninstall" )
780+ return false , ""
706781 }
707782
708- // Second try to read from /sys/module/nvidia/version if available
709- if versionData , err := os .ReadFile ("/sys/module/nvidia/version" ); err == nil {
710- version := strings .TrimSpace (string (versionData ))
711- if version != "" {
712- dm .log .Infof ("Driver version detected from /sys/module/nvidia/version: %s" , version )
713- return version , nil
783+ // Only skip uninstall if driver IS loaded AND config matches (fast path optimization)
784+ if dm .isDriverLoaded () {
785+ if configChanged , _ := dm .hasDriverConfigChanged (); ! configChanged {
786+ dm .log .Info ("Driver is loaded with matching config, enabling fast path" )
787+ return true , "desired version and configuration already present"
714788 }
715789 }
716790
717- return "" , fmt .Errorf ("all version detection methods failed: chroot: %v" , chrootErr )
791+ // Driver not loaded or config changed - proceed with cleanup
792+ dm .log .Info ("Proceeding with cleanup operations" )
793+ return false , ""
718794}
719795
720796func (dm * DriverManager ) isNouveauLoaded () bool {
0 commit comments