Skip to content

Commit 819620b

Browse files
Add fast path optimization to skip driver reinstall when configuration digest matches
Signed-off-by: Karthik Vetrivel <kvetrivel@nvidia.com>
1 parent 69eec27 commit 819620b

File tree

1 file changed

+91
-6
lines changed

1 file changed

+91
-6
lines changed

cmd/driver-manager/main.go

Lines changed: 91 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ import (
4242
const (
4343
driverRoot = "/run/nvidia/driver"
4444
driverPIDFile = "/run/nvidia/nvidia-driver.pid"
45+
driverConfigStateFile = "/run/nvidia/nvidia-driver.state"
4546
operatorNamespace = "gpu-operator"
4647
pausedStr = "paused-for-driver-upgrade"
4748
defaultDrainTimeout = time.Second * 0
@@ -77,6 +78,7 @@ type config struct {
7778
gpuDirectRDMAEnabled bool
7879
useHostMofed bool
7980
kubeconfig string
81+
forceReinstall bool
8082
}
8183

8284
// ComponentState tracks the deployment state of GPU operator components
@@ -208,6 +210,13 @@ func main() {
208210
EnvVars: []string{"KUBECONFIG"},
209211
Value: "",
210212
},
213+
&cli.BoolFlag{
214+
Name: "force-reinstall",
215+
Usage: "Force driver reinstall regardless of current state",
216+
Destination: &cfg.forceReinstall,
217+
EnvVars: []string{"FORCE_REINSTALL"},
218+
Value: false,
219+
},
211220
}
212221

213222
app.Commands = []*cli.Command{
@@ -288,6 +297,26 @@ func (dm *DriverManager) uninstallDriver() error {
288297
return fmt.Errorf("failed to evict GPU operator components: %w", err)
289298
}
290299

300+
if dm.shouldSkipUninstall() {
301+
dm.log.Info("Fast path activated: desired driver version and configuration already present")
302+
303+
// Clean up stale artifacts from previous container before rescheduling operands
304+
dm.log.Info("Cleaning up stale mounts and state files...")
305+
306+
// Unmount stale rootfs from previous container
307+
if err := dm.unmountRootfs(); err != nil {
308+
return fmt.Errorf("failed to unmount stale rootfs: %w", err)
309+
}
310+
311+
// Remove stale PID file from previous container
312+
dm.removePIDFile()
313+
314+
if err := dm.rescheduleGPUOperatorComponents(); err != nil {
315+
dm.log.Warnf("Failed to reschedule GPU operator components: %v", err)
316+
}
317+
return nil
318+
}
319+
291320
drainOpts := kube.DrainOptions{
292321
Force: dm.config.drainUseForce,
293322
DeleteEmptyDirData: dm.config.drainDeleteEmptyDirData,
@@ -629,6 +658,61 @@ func (dm *DriverManager) isDriverLoaded() bool {
629658
return err == nil
630659
}
631660

661+
// readStoredDigest reads the driver configuration digest from the state file
662+
func readStoredDigest() (string, error) {
663+
data, err := os.ReadFile(driverConfigStateFile)
664+
if err != nil {
665+
return "", err
666+
}
667+
return strings.TrimSpace(string(data)), nil
668+
}
669+
670+
// getCurrentDigest returns the desired driver configuration digest taken from
// the DRIVER_CONFIG_DIGEST environment variable.
//
// The value is whitespace-trimmed so that it is normalized the same way as the
// digest read back from the state file (readStoredDigest trims its result);
// without this, a stray trailing newline or space in the environment value
// would make the two digests compare unequal forever. An unset, empty, or
// whitespace-only variable yields "".
func getCurrentDigest() string {
	return strings.TrimSpace(os.Getenv("DRIVER_CONFIG_DIGEST"))
}
674+
675+
// shouldUpdateDriverConfig checks if the driver configuration needs to be updated
676+
func (dm *DriverManager) shouldUpdateDriverConfig() bool {
677+
if !dm.isDriverLoaded() {
678+
return true
679+
}
680+
681+
currentDigest := getCurrentDigest()
682+
if currentDigest == "" {
683+
dm.log.Warn("DRIVER_CONFIG_DIGEST env var not set, assuming config changed")
684+
return true
685+
}
686+
687+
storedDigest, err := readStoredDigest()
688+
if err != nil {
689+
if os.IsNotExist(err) {
690+
dm.log.Info("No previous driver configuration found")
691+
} else {
692+
dm.log.Warnf("Failed to read driver config state file: %v", err)
693+
}
694+
return true
695+
}
696+
697+
return currentDigest != storedDigest
698+
}
699+
700+
func (dm *DriverManager) shouldSkipUninstall() bool {
701+
if dm.config.forceReinstall {
702+
dm.log.Info("Force reinstall is enabled, proceeding with driver uninstall")
703+
return false
704+
}
705+
706+
if !dm.shouldUpdateDriverConfig() {
707+
dm.log.Info("Driver is loaded with matching config, enabling fast path")
708+
return true
709+
}
710+
711+
// Driver not loaded or config changed - proceed with cleanup
712+
dm.log.Info("Proceeding with cleanup operations")
713+
return false
714+
}
715+
632716
func (dm *DriverManager) isNouveauLoaded() bool {
633717
_, err := os.Stat("/sys/module/nouveau/refcnt")
634718
return err == nil
@@ -639,6 +723,12 @@ func (dm *DriverManager) unloadNouveau() error {
639723
return unix.DeleteModule("nouveau", 0)
640724
}
641725

726+
func (dm *DriverManager) removePIDFile() {
727+
if err := os.Remove(driverPIDFile); err != nil && !os.IsNotExist(err) {
728+
dm.log.Warnf("Failed to remove PID file %s: %v", driverPIDFile, err)
729+
}
730+
}
731+
642732
func (dm *DriverManager) cleanupDriver() error {
643733
dm.log.Info("Cleaning up NVIDIA driver")
644734

@@ -652,12 +742,7 @@ func (dm *DriverManager) cleanupDriver() error {
652742
return fmt.Errorf("failed to unmount rootfs: %w", err)
653743
}
654744

655-
// Remove PID file
656-
if _, err := os.Stat(driverPIDFile); err == nil {
657-
if err := os.Remove(driverPIDFile); err != nil {
658-
dm.log.Warnf("Failed to remove PID file %s: %v", driverPIDFile, err)
659-
}
660-
}
745+
dm.removePIDFile()
661746

662747
return nil
663748
}

0 commit comments

Comments
 (0)