-
Notifications
You must be signed in to change notification settings - Fork 22
Add shouldSkipUninstall to avoid GPU driver teardown on restart #103
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2cabf99
a540c4f
9ad1573
2b7ad55
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -40,15 +40,23 @@ import ( | |
| ) | ||
|
|
||
| const ( | ||
| driverRoot = "/run/nvidia/driver" | ||
| driverPIDFile = "/run/nvidia/nvidia-driver.pid" | ||
| operatorNamespace = "gpu-operator" | ||
| pausedStr = "paused-for-driver-upgrade" | ||
| defaultDrainTimeout = time.Second * 0 | ||
| defaultGracePeriod = 5 * time.Minute | ||
| driverRoot = "/run/nvidia/driver" | ||
| driverPIDFile = "/run/nvidia/nvidia-driver.pid" | ||
| driverConfigStateFile = "/run/nvidia/nvidia-driver.state" | ||
| operatorNamespace = "gpu-operator" | ||
| pausedStr = "paused-for-driver-upgrade" | ||
| defaultDrainTimeout = time.Second * 0 | ||
| defaultGracePeriod = 5 * time.Minute | ||
|
|
||
| nvidiaDomainPrefix = "nvidia.com" | ||
|
|
||
| nvidiaModuleConfigFile = driverRoot + "/drivers/nvidia.conf" | ||
| nvidiaUVMModuleConfigFile = driverRoot + "/drivers/nvidia-uvm.conf" | ||
| nvidiaModesetModuleConfigFile = driverRoot + "/drivers/nvidia-modeset.conf" | ||
| nvidiaPeermemModuleConfigFile = driverRoot + "/drivers/nvidia-peermem.conf" | ||
| ) | ||
|
|
||
| var ( | ||
| nvidiaDriverDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.driver" | ||
| nvidiaOperatorValidatorDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.operator-validator" | ||
| nvidiaContainerToolkitDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.container-toolkit" | ||
|
|
@@ -77,6 +85,8 @@ type config struct { | |
| gpuDirectRDMAEnabled bool | ||
| useHostMofed bool | ||
| kubeconfig string | ||
| driverVersion string | ||
| forceReinstall bool | ||
| } | ||
|
|
||
| // ComponentState tracks the deployment state of GPU operator components | ||
|
|
@@ -208,6 +218,20 @@ func main() { | |
| EnvVars: []string{"KUBECONFIG"}, | ||
| Value: "", | ||
| }, | ||
| &cli.StringFlag{ | ||
| Name: "driver-version", | ||
| Usage: "Desired NVIDIA driver version", | ||
| Destination: &cfg.driverVersion, | ||
| EnvVars: []string{"DRIVER_VERSION"}, | ||
| Value: "", | ||
| }, | ||
| &cli.BoolFlag{ | ||
| Name: "force-reinstall", | ||
| Usage: "Force driver reinstall regardless of current state", | ||
| Destination: &cfg.forceReinstall, | ||
| EnvVars: []string{"FORCE_REINSTALL"}, | ||
| Value: false, | ||
| }, | ||
| } | ||
|
|
||
| app.Commands = []*cli.Command{ | ||
|
|
@@ -288,6 +312,26 @@ func (dm *DriverManager) uninstallDriver() error { | |
| return fmt.Errorf("failed to evict GPU operator components: %w", err) | ||
| } | ||
|
|
||
| if dm.shouldSkipUninstall() { | ||
| dm.log.Info("Fast path activated: desired driver version and configuration already present") | ||
|
|
||
| // Clean up stale artifacts from previous container before rescheduling operands | ||
| dm.log.Info("Cleaning up stale mounts and state files...") | ||
|
|
||
| // Unmount stale rootfs from previous container | ||
| if err := dm.unmountRootfs(); err != nil { | ||
| return fmt.Errorf("failed to unmount stale rootfs: %w", err) | ||
| } | ||
|
|
||
| // Remove stale PID file from previous container | ||
| dm.removePIDFile() | ||
|
|
||
| if err := dm.rescheduleGPUOperatorComponents(); err != nil { | ||
| dm.log.Warnf("Failed to reschedule GPU operator components: %v", err) | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| drainOpts := kube.DrainOptions{ | ||
| Force: dm.config.drainUseForce, | ||
| DeleteEmptyDirData: dm.config.drainDeleteEmptyDirData, | ||
|
|
@@ -629,6 +673,100 @@ func (dm *DriverManager) isDriverLoaded() bool { | |
| return err == nil | ||
| } | ||
|
|
||
| // getKernelVersion returns the current kernel version | ||
| func getKernelVersion() string { | ||
| var utsname unix.Utsname | ||
| if err := unix.Uname(&utsname); err != nil { | ||
| return "" | ||
| } | ||
|
|
||
| return strings.Trim(string(utsname.Release[:]), " \r\n\t\u0000\uffff") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's move this to a private method and call that method from Let's name this method |
||
| } | ||
|
|
||
| // buildCurrentConfig constructs the current driver configuration string | ||
| func (dm *DriverManager) buildCurrentConfig() string { | ||
| driverType := os.Getenv("DRIVER_TYPE") | ||
| if driverType == "" { | ||
| driverType = "passthrough" | ||
| } | ||
|
|
||
| // Read module parameters from conf files | ||
| nvidiaParams := readModuleParams(nvidiaModuleConfigFile) | ||
| nvidiaUVMParams := readModuleParams(nvidiaUVMModuleConfigFile) | ||
| nvidiaModeset := readModuleParams(nvidiaModesetModuleConfigFile) | ||
| nvidiaPeermem := readModuleParams(nvidiaPeermemModuleConfigFile) | ||
|
|
||
| configTemplate := `DRIVER_VERSION=%s | ||
| DRIVER_TYPE=%s | ||
| KERNEL_VERSION=%s | ||
| GPU_DIRECT_RDMA_ENABLED=%v | ||
| USE_HOST_MOFED=%v | ||
| KERNEL_MODULE_TYPE=%s | ||
| NVIDIA_MODULE_PARAMS=%s | ||
| NVIDIA_UVM_MODULE_PARAMS=%s | ||
| NVIDIA_MODESET_MODULE_PARAMS=%s | ||
| NVIDIA_PEERMEM_MODULE_PARAMS=%s | ||
| ` | ||
|
Comment on lines
+699
to
+709
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Question -- Instead of duplicating this logic (of crafting the currently applied config file) in both the k8s-driver-manager and driver container, is there an alternative solution we should consider? What if we offloaded this to the operator? When rendering the driver daemonset, the operator would craft the currently applied config, compute the digest of it, and pass this digest to the k8s-driver-manager and driver containers via an envvar, i.e. The operator would compute the value of
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There might be a few difficulties in this approach.
I think the duplication is worth it so the logic can live in the containers. Each container can independently validate its own node's actual state (kernel version via uname -r, module state via /sys/module/nvidia/refcnt) without depending on operator coordination. Let me know if I misunderstood.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
| Yes, the containers still need to check if the
Right, this is the one hole in my proposal. Taking a step back, do we really need to include the
Yes agreed. The driver container will still write to a config / state file post-install to capture the current state. My proposal not only aims to reduce code duplication, but also to reduce how tightly coupled this feature is with the relevant components. As currently implemented, if we ever add a new envvar that needs to be captured in the state file, we would always need to update the k8s-driver-manager code to ensure this envvar gets added to the state file it constructs. With my proposal, changes to k8s-driver-manager wouldn't be needed as long as the operator communicated the desired config through some envvar, like
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like the idea of decoupling the driver state management from the driver manager, but I am not sure if we should do away with persisting the
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not very familiar with live kernel patching, but from scanning https://docs.kernel.org/livepatch/livepatch.html I would not expect the actual kernel version string (as reported by
With that said, even if we determine we need to persist The k8s-driver-manager would build this file and compare it with the existing file at
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Thinking about this a bit more, I don't think the driver state calculation should be done in the operator layer as it would be a scope creep. I think this logic should reside in the components comprised by the driver daemonset. The state management is specific to the driver daemonset after all.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would also prefer if we had a single complete representation of the state as opposed to the combination of the |
||
| return fmt.Sprintf(configTemplate, | ||
| dm.config.driverVersion, | ||
| driverType, | ||
| getKernelVersion(), | ||
| dm.config.gpuDirectRDMAEnabled, | ||
| dm.config.useHostMofed, | ||
| os.Getenv("KERNEL_MODULE_TYPE"), | ||
| nvidiaParams, | ||
| nvidiaUVMParams, | ||
| nvidiaModeset, | ||
| nvidiaPeermem, | ||
| ) | ||
| } | ||
|
|
||
// readModuleParams reads a kernel module parameter config file and returns
// its contents as a single-line, space-separated string.
//
// A missing or unreadable file yields the empty string: absence of a conf
// file simply means no extra module parameters were configured, so this is
// deliberately best-effort rather than an error.
//
// Note: the parameter was renamed from "filepath" to avoid shadowing the
// standard library path/filepath package name.
func readModuleParams(configPath string) string {
	data, err := os.ReadFile(configPath)
	if err != nil {
		return ""
	}
	// Convert newlines to spaces and trim surrounding whitespace and NUL
	// bytes so the result is comparable as a single config-line value.
	return strings.Trim(strings.ReplaceAll(string(data), "\n", " "), " \r\n\t\u0000\uffff")
}
|
|
||
| // shouldUpdateDriverConfig checks if the driver configuration needs to be updated | ||
| func (dm *DriverManager) shouldUpdateDriverConfig() bool { | ||
| if !dm.isDriverLoaded() { | ||
| return true | ||
| } | ||
|
|
||
| storedConfig, err := os.ReadFile(driverConfigStateFile) | ||
| if err != nil { | ||
| if os.IsNotExist(err) { | ||
| dm.log.Info("No previous driver configuration found") | ||
| } else { | ||
| dm.log.Warnf("Failed to read driver config state file: %v", err) | ||
| } | ||
| return true | ||
| } | ||
|
|
||
| currentConfig := dm.buildCurrentConfig() | ||
| return currentConfig != string(storedConfig) | ||
| } | ||
|
|
||
| func (dm *DriverManager) shouldSkipUninstall() bool { | ||
| if dm.config.forceReinstall { | ||
karthikvetrivel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| dm.log.Info("Force reinstall is enabled, proceeding with driver uninstall") | ||
| return false | ||
| } | ||
|
|
||
| if !dm.shouldUpdateDriverConfig() { | ||
| dm.log.Info("Driver is loaded with matching config, enabling fast path") | ||
| return true | ||
| } | ||
|
|
||
| // Driver not loaded or config changed - proceed with cleanup | ||
| dm.log.Info("Proceeding with cleanup operations") | ||
| return false | ||
| } | ||
|
|
||
| func (dm *DriverManager) isNouveauLoaded() bool { | ||
| _, err := os.Stat("/sys/module/nouveau/refcnt") | ||
| return err == nil | ||
|
|
@@ -639,6 +777,12 @@ func (dm *DriverManager) unloadNouveau() error { | |
| return unix.DeleteModule("nouveau", 0) | ||
| } | ||
|
|
||
| func (dm *DriverManager) removePIDFile() { | ||
| if err := os.Remove(driverPIDFile); err != nil && !os.IsNotExist(err) { | ||
| dm.log.Warnf("Failed to remove PID file %s: %v", driverPIDFile, err) | ||
| } | ||
| } | ||
|
|
||
| func (dm *DriverManager) cleanupDriver() error { | ||
| dm.log.Info("Cleaning up NVIDIA driver") | ||
|
|
||
|
|
@@ -652,12 +796,7 @@ func (dm *DriverManager) cleanupDriver() error { | |
| return fmt.Errorf("failed to unmount rootfs: %w", err) | ||
| } | ||
|
|
||
| // Remove PID file | ||
| if _, err := os.Stat(driverPIDFile); err == nil { | ||
| if err := os.Remove(driverPIDFile); err != nil { | ||
| dm.log.Warnf("Failed to remove PID file %s: %v", driverPIDFile, err) | ||
| } | ||
| } | ||
| dm.removePIDFile() | ||
|
|
||
| return nil | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.