1919package main
2020
2121import (
22+ "bytes"
2223 "context"
2324 "errors"
2425 "fmt"
@@ -40,14 +41,32 @@ import (
4041)
4142
// Paths, namespace and timing defaults used by the driver manager.
const (
	// driverRoot is the driver root directory under /run/nvidia.
	driverRoot = "/run/nvidia/driver"

	// driverPIDFile is the PID file deleted by removePIDFile.
	driverPIDFile = "/run/nvidia/nvidia-driver.pid"

	// driverConfigStateFile holds the previously stored driver configuration
	// that driverModuleBuildNeeded compares against the current one.
	driverConfigStateFile = "/run/nvidia/driver-config.state"

	// operatorNamespace is the namespace name used for GPU operator resources.
	operatorNamespace = "gpu-operator"

	// pausedStr is the marker value "paused-for-driver-upgrade".
	pausedStr = "paused-for-driver-upgrade"

	// defaultDrainTimeout is a zero duration.
	defaultDrainTimeout = 0 * time.Second

	// defaultGracePeriod is five minutes.
	defaultGracePeriod = 5 * time.Minute

	// nvidiaDomainPrefix is the domain prefix used to build NVIDIA label keys.
	nvidiaDomainPrefix = "nvidia.com"
)
54+
// Kernel module parameter files, one per NVIDIA kernel module. Each is read by
// buildCurrentConfig when assembling the current driver configuration.
const (
	// nvidiaModuleConfigFile holds parameters for the nvidia module.
	nvidiaModuleConfigFile = "/drivers/nvidia.conf"

	// nvidiaUVMModuleConfigFile holds parameters for the nvidia-uvm module.
	nvidiaUVMModuleConfigFile = "/drivers/nvidia-uvm.conf"

	// nvidiaModsetModuleConfigFile holds parameters for the nvidia-modeset module.
	nvidiaModsetModuleConfigFile = "/drivers/nvidia-modeset.conf"

	// nvidiaPeermemModuleConfigFile holds parameters for the nvidia-peermem module.
	nvidiaPeermemModuleConfigFile = "/drivers/nvidia-peermem.conf"
)
61+
62+ var (
63+ // Driver module config files
64+ driverConfigFiles = []string {
65+ nvidiaModuleConfigFile ,
66+ nvidiaUVMModuleConfigFile ,
67+ nvidiaModsetModuleConfigFile ,
68+ nvidiaPeermemModuleConfigFile ,
69+ }
5170
5271 nvidiaDriverDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.driver"
5372 nvidiaOperatorValidatorDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.operator-validator"
@@ -304,8 +323,21 @@ func (dm *DriverManager) uninstallDriver() error {
304323 return fmt .Errorf ("failed to evict GPU operator components: %w" , err )
305324 }
306325
307- if skip , reason := dm .shouldSkipUninstall (); skip {
308- dm .log .Infof ("Skipping driver uninstall: %s" , reason )
326+ if dm .shouldSkipUninstall () {
327+ dm .log .Info ("Fast path activated: desired driver version and configuration already present" )
328+
329+ // Clean up stale artifacts from previous container before rescheduling operands
330+ dm .log .Info ("Cleaning up stale mounts and state files..." )
331+
332+ // Unmount stale rootfs from previous container
333+ if err := dm .unmountRootfs (); err != nil {
334+ return fmt .Errorf ("failed to unmount stale rootfs: %w" , err )
335+ }
336+
337+ // Remove stale PID file from previous container
338+ dm .removePIDFile ()
339+
340+ // Now safe to reschedule operands
309341 if err := dm .rescheduleGPUOperatorComponents (); err != nil {
310342 dm .log .Warnf ("Failed to reschedule GPU operator components: %v" , err )
311343 }
@@ -653,68 +685,113 @@ func (dm *DriverManager) isDriverLoaded() bool {
653685 return err == nil
654686}
655687
656- func ( dm * DriverManager ) shouldSkipUninstall () ( bool , string ) {
657- if dm . config . forceReinstall {
658- dm . log . Info ( "Force reinstall is enabled, proceeding with driver uninstall" )
659- return false , ""
// getValueWithOverride returns override when it is non-empty. Otherwise it
// scans config, a newline-separated list of KEY=VALUE lines, and returns the
// value of the first line that starts with key followed by "=". It returns ""
// when override is empty and no line matches.
func getValueWithOverride(config, key, override string) string {
	if override != "" {
		return override
	}

	// Match on "key=" rather than key alone so that a key which is a prefix of
	// another key does not match the longer one.
	prefix := key + "="
	for _, line := range strings.Split(config, "\n") {
		if strings.HasPrefix(line, prefix) {
			return strings.TrimPrefix(line, prefix)
		}
	}
	return ""
}
665700
666- if dm .config .driverVersion == "" {
667- return false , "Driver version environment variable is not set"
701+ // getKernelVersion returns the current kernel version
702+ func getKernelVersion () string {
703+ var utsname unix.Utsname
704+ if err := unix .Uname (& utsname ); err != nil {
705+ return ""
668706 }
669707
670- version , err := dm .detectCurrentDriverVersion ()
671- if err != nil {
672- dm .log .Warnf ("Unable to determine installed driver version: %v" , err )
673- // If driver is loaded but we can't detect version, proceed with reinstall to ensure correct version
674- dm .log .Info ("Cannot verify driver version, proceeding with reinstall to ensure correct version is installed" )
675- return false , ""
676- }
708+ release := utsname .Release [:]
709+ nullIdx := bytes .IndexByte (release , 0 )
710+ return string (release [:nullIdx ])
711+ }
677712
678- if version != dm .config .driverVersion {
679- dm .log .Infof ("Installed driver version %s does not match desired %s, proceeding with uninstall" , version , dm .config .driverVersion )
680- return false , ""
713+ // buildCurrentConfig constructs the current driver configuration string
714+ func (dm * DriverManager ) buildCurrentConfig (storedConfig string ) string {
715+ driverVersion := getValueWithOverride (storedConfig , "DRIVER_VERSION" , dm .config .driverVersion )
716+ kernelVersion := getValueWithOverride (storedConfig , "KERNEL_VERSION" , getKernelVersion ())
717+ kernelModuleType := getValueWithOverride (storedConfig , "KERNEL_MODULE_TYPE" , os .Getenv ("KERNEL_MODULE_TYPE" ))
718+ driverTypeEnv := os .Getenv ("DRIVER_TYPE" )
719+ if driverTypeEnv == "" {
720+ driverTypeEnv = "passthrough"
721+ }
722+ driverType := getValueWithOverride (storedConfig , "DRIVER_TYPE" , driverTypeEnv )
723+
724+ // Read module parameters from conf files
725+ nvidiaParams := readModuleParams (nvidiaModuleConfigFile )
726+ nvidiaUVMParams := readModuleParams (nvidiaUVMModuleConfigFile )
727+ nvidiaModeset := readModuleParams (nvidiaModsetModuleConfigFile )
728+ nvidiaPeermem := readModuleParams (nvidiaPeermemModuleConfigFile )
729+
730+ var config strings.Builder
731+ config .WriteString (fmt .Sprintf ("DRIVER_VERSION=%s\n " , driverVersion ))
732+ config .WriteString (fmt .Sprintf ("DRIVER_TYPE=%s\n " , driverType ))
733+ config .WriteString (fmt .Sprintf ("KERNEL_VERSION=%s\n " , kernelVersion ))
734+ config .WriteString (fmt .Sprintf ("GPU_DIRECT_RDMA_ENABLED=%v\n " , dm .config .gpuDirectRDMAEnabled ))
735+ config .WriteString (fmt .Sprintf ("USE_HOST_MOFED=%v\n " , dm .config .useHostMofed ))
736+ config .WriteString (fmt .Sprintf ("KERNEL_MODULE_TYPE=%s\n " , kernelModuleType ))
737+ config .WriteString (fmt .Sprintf ("NVIDIA_MODULE_PARAMS=%s\n " , nvidiaParams ))
738+ config .WriteString (fmt .Sprintf ("NVIDIA_UVM_MODULE_PARAMS=%s\n " , nvidiaUVMParams ))
739+ config .WriteString (fmt .Sprintf ("NVIDIA_MODESET_MODULE_PARAMS=%s\n " , nvidiaModeset ))
740+ config .WriteString (fmt .Sprintf ("NVIDIA_PEERMEM_MODULE_PARAMS=%s\n " , nvidiaPeermem ))
741+
742+ // Append config file contents directly
743+ for _ , file := range driverConfigFiles {
744+ if data , err := os .ReadFile (file ); err == nil && len (data ) > 0 {
745+ config .Write (data )
746+ }
681747 }
682748
683- dm .log .Infof ("Installed driver version %s matches desired version, skipping uninstall" , version )
684- return true , "desired version already present"
749+ return config .String ()
685750}
686751
687- func (dm * DriverManager ) detectCurrentDriverVersion () (string , error ) {
688- baseCtx := dm .ctx
689- if baseCtx == nil {
690- baseCtx = context .Background ()
// readModuleParams reads the module parameter file at filepath and returns its
// contents on a single line: surrounding whitespace is trimmed and each
// remaining newline becomes one space. It returns "" when the file cannot be
// read, which includes the file not existing.
func readModuleParams(filepath string) string {
	data, err := os.ReadFile(filepath)
	if err != nil {
		return ""
	}

	// Per the original comment, this newline-to-space conversion mirrors the
	// bash implementation.
	trimmed := strings.TrimSpace(string(data))
	return strings.ReplaceAll(trimmed, "\n", " ")
}
692761
693- ctx , cancel := context .WithTimeout (baseCtx , 10 * time .Second )
694- defer cancel ()
695-
696- // Try chroot to /run/nvidia/driver for containerized driver
697- cmd := exec .CommandContext (ctx , "chroot" , "/run/nvidia/driver" , "modinfo" , "-F" , "version" , "nvidia" )
698- cmd .Env = append (os .Environ (), "LC_ALL=C" )
699- cmdOutput , chrootErr := cmd .Output ()
700- if chrootErr == nil {
701- version := strings .TrimSpace (string (cmdOutput ))
702- if version != "" {
703- dm .log .Infof ("Driver version detected via chroot: %s" , version )
704- return version , nil
762+ // driverModuleBuildNeeded checks if driver modules need to be rebuilt
763+ func (dm * DriverManager ) driverModuleBuildNeeded () bool {
764+ storedData , err := os .ReadFile (driverConfigStateFile )
765+ if err != nil {
766+ if os .IsNotExist (err ) {
767+ dm .log .Info ("No previous driver configuration found" )
768+ return true
705769 }
770+ dm .log .Warnf ("Failed to read driver config state file: %v" , err )
771+ return true
706772 }
707773
708- // Second try to read from /sys/module/nvidia/version if available
709- if versionData , err := os .ReadFile ("/sys/module/nvidia/version" ); err == nil {
710- version := strings .TrimSpace (string (versionData ))
711- if version != "" {
712- dm .log .Infof ("Driver version detected from /sys/module/nvidia/version: %s" , version )
713- return version , nil
714- }
774+ storedConfig := string (storedData )
775+ currentConfig := dm .buildCurrentConfig (storedConfig )
776+
777+ return currentConfig != storedConfig
778+ }
779+
780+ func (dm * DriverManager ) shouldSkipUninstall () bool {
781+ if dm .config .forceReinstall {
782+ dm .log .Info ("Force reinstall is enabled, proceeding with driver uninstall" )
783+ return false
715784 }
716785
717- return "" , fmt .Errorf ("all version detection methods failed: chroot: %v" , chrootErr )
786+ // Only skip uninstall if driver IS loaded AND config matches (fast path optimization)
787+ if dm .isDriverLoaded () && ! dm .driverModuleBuildNeeded () {
788+ dm .log .Info ("Driver is loaded with matching config, enabling fast path" )
789+ return true
790+ }
791+
792+ // Driver not loaded or config changed - proceed with cleanup
793+ dm .log .Info ("Proceeding with cleanup operations" )
794+ return false
718795}
719796
720797func (dm * DriverManager ) isNouveauLoaded () bool {
@@ -727,6 +804,12 @@ func (dm *DriverManager) unloadNouveau() error {
727804 return unix .DeleteModule ("nouveau" , 0 )
728805}
729806
807+ func (dm * DriverManager ) removePIDFile () {
808+ if err := os .Remove (driverPIDFile ); err != nil && ! os .IsNotExist (err ) {
809+ dm .log .Warnf ("Failed to remove PID file %s: %v" , driverPIDFile , err )
810+ }
811+ }
812+
730813func (dm * DriverManager ) cleanupDriver () error {
731814 dm .log .Info ("Cleaning up NVIDIA driver" )
732815
@@ -740,12 +823,7 @@ func (dm *DriverManager) cleanupDriver() error {
740823 return fmt .Errorf ("failed to unmount rootfs: %w" , err )
741824 }
742825
743- // Remove PID file
744- if _ , err := os .Stat (driverPIDFile ); err == nil {
745- if err := os .Remove (driverPIDFile ); err != nil {
746- dm .log .Warnf ("Failed to remove PID file %s: %v" , driverPIDFile , err )
747- }
748- }
826+ dm .removePIDFile ()
749827
750828 return nil
751829}
0 commit comments