@@ -42,6 +42,7 @@ import (
4242const (
4343 driverRoot = "/run/nvidia/driver"
4444 driverPIDFile = "/run/nvidia/nvidia-driver.pid"
45+ driverConfigStateFile = "/run/nvidia/nvidia-driver.state"
4546 operatorNamespace = "gpu-operator"
4647 pausedStr = "paused-for-driver-upgrade"
4748 defaultDrainTimeout = time .Second * 0
@@ -77,6 +78,7 @@ type config struct {
7778 gpuDirectRDMAEnabled bool
7879 useHostMofed bool
7980 kubeconfig string
81+ forceReinstall bool
8082}
8183
8284// ComponentState tracks the deployment state of GPU operator components
@@ -208,6 +210,13 @@ func main() {
208210 EnvVars : []string {"KUBECONFIG" },
209211 Value : "" ,
210212 },
213+ & cli.BoolFlag {
214+ Name : "force-reinstall" ,
215+ Usage : "Force driver reinstall regardless of current state" ,
216+ Destination : & cfg .forceReinstall ,
217+ EnvVars : []string {"FORCE_REINSTALL" },
218+ Value : false ,
219+ },
211220 }
212221
213222 app .Commands = []* cli.Command {
@@ -288,6 +297,26 @@ func (dm *DriverManager) uninstallDriver() error {
288297 return fmt .Errorf ("failed to evict GPU operator components: %w" , err )
289298 }
290299
300+ if dm .shouldSkipUninstall () {
301+ dm .log .Info ("Fast path activated: desired driver version and configuration already present" )
302+
303+ // Clean up stale artifacts from previous container before rescheduling operands
304+ dm .log .Info ("Cleaning up stale mounts and state files..." )
305+
306+ // Unmount stale rootfs from previous container
307+ if err := dm .unmountRootfs (); err != nil {
308+ return fmt .Errorf ("failed to unmount stale rootfs: %w" , err )
309+ }
310+
311+ // Remove stale PID file from previous container
312+ dm .removePIDFile ()
313+
314+ if err := dm .rescheduleGPUOperatorComponents (); err != nil {
315+ dm .log .Warnf ("Failed to reschedule GPU operator components: %v" , err )
316+ }
317+ return nil
318+ }
319+
291320 drainOpts := kube.DrainOptions {
292321 Force : dm .config .drainUseForce ,
293322 DeleteEmptyDirData : dm .config .drainDeleteEmptyDirData ,
@@ -629,6 +658,61 @@ func (dm *DriverManager) isDriverLoaded() bool {
629658 return err == nil
630659}
631660
661+ // readStoredDigest reads the driver configuration digest from the state file
662+ func readStoredDigest () (string , error ) {
663+ data , err := os .ReadFile (driverConfigStateFile )
664+ if err != nil {
665+ return "" , err
666+ }
667+ return strings .TrimSpace (string (data )), nil
668+ }
669+
// getCurrentDigest returns the driver configuration digest supplied through
// the DRIVER_CONFIG_DIGEST environment variable.
//
// Leading and trailing whitespace is stripped so the value is normalized the
// same way as the digest returned by readStoredDigest. Without this, stray
// whitespace in the environment value (for example a trailing newline) would
// make the two digests compare as different even when they are the same.
// An unset or whitespace-only variable yields the empty string.
func getCurrentDigest() string {
	return strings.TrimSpace(os.Getenv("DRIVER_CONFIG_DIGEST"))
}
674+
675+ // shouldUpdateDriverConfig checks if the driver configuration needs to be updated
676+ func (dm * DriverManager ) shouldUpdateDriverConfig () bool {
677+ if ! dm .isDriverLoaded () {
678+ return true
679+ }
680+
681+ currentDigest := getCurrentDigest ()
682+ if currentDigest == "" {
683+ dm .log .Warn ("DRIVER_CONFIG_DIGEST env var not set, assuming config changed" )
684+ return true
685+ }
686+
687+ storedDigest , err := readStoredDigest ()
688+ if err != nil {
689+ if os .IsNotExist (err ) {
690+ dm .log .Info ("No previous driver configuration found" )
691+ } else {
692+ dm .log .Warnf ("Failed to read driver config state file: %v" , err )
693+ }
694+ return true
695+ }
696+
697+ return currentDigest != storedDigest
698+ }
699+
700+ func (dm * DriverManager ) shouldSkipUninstall () bool {
701+ if dm .config .forceReinstall {
702+ dm .log .Info ("Force reinstall is enabled, proceeding with driver uninstall" )
703+ return false
704+ }
705+
706+ if ! dm .shouldUpdateDriverConfig () {
707+ dm .log .Info ("Driver is loaded with matching config, enabling fast path" )
708+ return true
709+ }
710+
711+ // Driver not loaded or config changed - proceed with cleanup
712+ dm .log .Info ("Proceeding with cleanup operations" )
713+ return false
714+ }
715+
632716func (dm * DriverManager ) isNouveauLoaded () bool {
633717 _ , err := os .Stat ("/sys/module/nouveau/refcnt" )
634718 return err == nil
@@ -639,6 +723,12 @@ func (dm *DriverManager) unloadNouveau() error {
639723 return unix .DeleteModule ("nouveau" , 0 )
640724}
641725
726+ func (dm * DriverManager ) removePIDFile () {
727+ if err := os .Remove (driverPIDFile ); err != nil && ! os .IsNotExist (err ) {
728+ dm .log .Warnf ("Failed to remove PID file %s: %v" , driverPIDFile , err )
729+ }
730+ }
731+
642732func (dm * DriverManager ) cleanupDriver () error {
643733 dm .log .Info ("Cleaning up NVIDIA driver" )
644734
@@ -652,12 +742,7 @@ func (dm *DriverManager) cleanupDriver() error {
652742 return fmt .Errorf ("failed to unmount rootfs: %w" , err )
653743 }
654744
655- // Remove PID file
656- if _ , err := os .Stat (driverPIDFile ); err == nil {
657- if err := os .Remove (driverPIDFile ); err != nil {
658- dm .log .Warnf ("Failed to remove PID file %s: %v" , driverPIDFile , err )
659- }
660- }
745+ dm .removePIDFile ()
661746
662747 return nil
663748}
0 commit comments