163 changes: 151 additions & 12 deletions cmd/driver-manager/main.go
@@ -40,15 +40,23 @@ import (
)

const (
driverRoot = "/run/nvidia/driver"
driverPIDFile = "/run/nvidia/nvidia-driver.pid"
operatorNamespace = "gpu-operator"
pausedStr = "paused-for-driver-upgrade"
defaultDrainTimeout = time.Second * 0
defaultGracePeriod = 5 * time.Minute
driverRoot = "/run/nvidia/driver"
driverPIDFile = "/run/nvidia/nvidia-driver.pid"
driverConfigStateFile = "/run/nvidia/nvidia-driver.state"
operatorNamespace = "gpu-operator"
pausedStr = "paused-for-driver-upgrade"
defaultDrainTimeout = time.Second * 0
defaultGracePeriod = 5 * time.Minute

nvidiaDomainPrefix = "nvidia.com"

nvidiaModuleConfigFile = driverRoot + "/drivers/nvidia.conf"
nvidiaUVMModuleConfigFile = driverRoot + "/drivers/nvidia-uvm.conf"
nvidiaModesetModuleConfigFile = driverRoot + "/drivers/nvidia-modeset.conf"
nvidiaPeermemModuleConfigFile = driverRoot + "/drivers/nvidia-peermem.conf"
)

var (
nvidiaDriverDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.driver"
nvidiaOperatorValidatorDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.operator-validator"
nvidiaContainerToolkitDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.container-toolkit"
@@ -77,6 +85,8 @@ type config struct {
gpuDirectRDMAEnabled bool
useHostMofed bool
kubeconfig string
driverVersion string
forceReinstall bool
}

// ComponentState tracks the deployment state of GPU operator components
Expand Down Expand Up @@ -208,6 +218,20 @@ func main() {
EnvVars: []string{"KUBECONFIG"},
Value: "",
},
&cli.StringFlag{
Name: "driver-version",
Usage: "Desired NVIDIA driver version",
Destination: &cfg.driverVersion,
EnvVars: []string{"DRIVER_VERSION"},
Value: "",
},
&cli.BoolFlag{
Name: "force-reinstall",
Usage: "Force driver reinstall regardless of current state",
Destination: &cfg.forceReinstall,
EnvVars: []string{"FORCE_REINSTALL"},
Value: false,
},
}

app.Commands = []*cli.Command{
@@ -288,6 +312,26 @@ func (dm *DriverManager) uninstallDriver() error {
return fmt.Errorf("failed to evict GPU operator components: %w", err)
}

if dm.shouldSkipUninstall() {
dm.log.Info("Fast path activated: desired driver version and configuration already present")

// Clean up stale artifacts from previous container before rescheduling operands
dm.log.Info("Cleaning up stale mounts and state files...")

// Unmount stale rootfs from previous container
if err := dm.unmountRootfs(); err != nil {
return fmt.Errorf("failed to unmount stale rootfs: %w", err)
}

// Remove stale PID file from previous container
dm.removePIDFile()

if err := dm.rescheduleGPUOperatorComponents(); err != nil {
dm.log.Warnf("Failed to reschedule GPU operator components: %v", err)
}
return nil
}

drainOpts := kube.DrainOptions{
Force: dm.config.drainUseForce,
DeleteEmptyDirData: dm.config.drainDeleteEmptyDirData,
@@ -629,6 +673,100 @@ func (dm *DriverManager) isDriverLoaded() bool {
return err == nil
}

// getKernelVersion returns the current kernel version
func getKernelVersion() string {
var utsname unix.Utsname
if err := unix.Uname(&utsname); err != nil {
return ""
}

return strings.Trim(string(utsname.Release[:]), " \r\n\t\u0000\uffff")
Contributor:
Let's move this to a private method and call that method from both getKernelVersion and readModuleParams.

Let's name this method SanitizeString, perhaps.

}
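
A sketch of the refactor suggested above (not part of this diff), assuming the file's existing strings, os, and unix imports; the unexported name sanitizeString stands in for the suggested SanitizeString, and the trim set simply mirrors the one already used in this diff:

// sanitizeString trims spaces, CR/LF/tab, NUL bytes, and U+FFFF runes,
// matching the trim set used by getKernelVersion and readModuleParams in this diff.
// Hypothetical helper following the review suggestion; names are illustrative.
func sanitizeString(s string) string {
	return strings.Trim(s, " \r\n\t\u0000\uffff")
}

// getKernelVersion and readModuleParams would then delegate to it:
func getKernelVersion() string {
	var utsname unix.Utsname
	if err := unix.Uname(&utsname); err != nil {
		return ""
	}
	return sanitizeString(string(utsname.Release[:]))
}

func readModuleParams(filepath string) string {
	data, err := os.ReadFile(filepath)
	if err != nil {
		return ""
	}
	return sanitizeString(strings.ReplaceAll(string(data), "\n", " "))
}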

// buildCurrentConfig constructs the current driver configuration string
func (dm *DriverManager) buildCurrentConfig() string {
driverType := os.Getenv("DRIVER_TYPE")
if driverType == "" {
driverType = "passthrough"
}

// Read module parameters from conf files
nvidiaParams := readModuleParams(nvidiaModuleConfigFile)
nvidiaUVMParams := readModuleParams(nvidiaUVMModuleConfigFile)
nvidiaModeset := readModuleParams(nvidiaModesetModuleConfigFile)
nvidiaPeermem := readModuleParams(nvidiaPeermemModuleConfigFile)

configTemplate := `DRIVER_VERSION=%s
DRIVER_TYPE=%s
KERNEL_VERSION=%s
GPU_DIRECT_RDMA_ENABLED=%v
USE_HOST_MOFED=%v
KERNEL_MODULE_TYPE=%s
NVIDIA_MODULE_PARAMS=%s
NVIDIA_UVM_MODULE_PARAMS=%s
NVIDIA_MODESET_MODULE_PARAMS=%s
NVIDIA_PEERMEM_MODULE_PARAMS=%s
`
Comment on lines +699 to +709
Collaborator:
Question -- Instead of duplicating this logic (of crafting the currently applied config file) in both the k8s-driver-manager and driver container, is there an alternative solution we should consider?

What if we offloaded this to the operator? When rendering the driver daemonset, the operator would craft the currently applied config, compute the digest of it, and pass this digest to the k8s-driver-manager and driver containers via an envvar, i.e. CURRENT_DRIVER_CONFIG=<....>. At runtime, the k8s-driver-manager and driver containers would compare the value of CURRENT_DRIVER_CONFIG with the digest stored in the /run/nvidia/driver/nvidia-driver.state file to determine whether the fast-path should be taken or not. After completing its install, the main driver container would write the current digest to /run/nvidia/driver/nvidia-driver.state.

The operator would compute the value of CURRENT_DRIVER_CONFIG by taking the digest of all the relevant envvars / settings + the resourceVersion of the kernel module parameters config map (if it exists).
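
[A rough sketch of this proposed check appears after the end of buildCurrentConfig, below.]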

Member Author:
There might be a few difficulties in this approach.

  1. The operator cannot know KERNEL_VERSION at render time, right? What if nodes run heterogeneous kernels that update independently?

  2. The container must check /sys/module/nvidia/refcnt to verify actual kernel state, which I think the operator cannot do since it lacks access to each node's /sys filesystem (fact-check me on this). Even with operator digests, containers still need config-building logic to write state files post-install, so we wouldn't save much code.

I think the duplication is worth having the logic live in the containers. Each container can independently validate its own node's actual state (kernel version via uname -r, module state via /sys/module/nvidia/refcnt) without depending on operator coordination.

Let me know if I misunderstood.

Collaborator:
> The container must check /sys/module/nvidia/refcnt to verify actual kernel state

Yes, the containers still need to check whether the nvidia module is loaded or not; that will always be required. This is slightly orthogonal to the config / state file content, though.

> The operator cannot know KERNEL_VERSION at render time, right?

Right, this is the one hole in my proposal. Taking a step back, do we really need to include the KERNEL_VERSION in the state file? Will the nvidia modules ever still be loaded when the kernel version has been updated? If the KERNEL_VERSION changes, it means the node was rebooted, and since the nvidia module does not persist across a reboot, it won't be loaded into the kernel anymore -- which means the fast-path install will never be an option.

> containers still need config-building logic to write state files post-install

Yes agreed. The driver container will still write to a config / state file post-install to capture the current state. My proposal not only aims to reduce code duplication, but also to reduce how tightly coupled this feature is with the relevant components. As currently implemented, if we ever add a new envvar that needs to be captured in the state file, we would always need to update the k8s-driver-manager code to ensure this envvar gets added to the state file it constructs. With my proposal, changes to k8s-driver-manager wouldn't be needed as long as the operator communicated the desired config through some envvar, like CURRENT_DRIVER_CONFIG.

Contributor:
I like the idea of decoupling the driver state management from the driver manager, but I am not sure we should do away with persisting the KERNEL_VERSION in the state. How would we account for scenarios like live kernel patching, or similar mechanisms where the kernel may be updated without a node reboot?

Collaborator:
I am not very familiar with live kernel patching, but from scanning https://docs.kernel.org/livepatch/livepatch.html I would not expect the actual kernel version string (as reported by uname) to change -- live patching allows you to patch certain functions in kernel code without regenerating the entire kernel image. SUSE has some docs that highlight this as well:

> To determine the kernel patching status, use the klp -v patches command. The uname command's output does not change for patched kernels.

With that said, even if we determine we need to persist KERNEL_VERSION in the state file, I think my proposal could still work. The state file could be two lines:

<digest of current config>
<kernel version>

The k8s-driver-manager would build this file and compare it with the existing file at /run/nvidia/nvidia-driver.state.

Contributor:
> What if we offloaded this to the operator?

Thinking about this a bit more, I don't think the driver state calculation should be done in the operator layer, as it would be scope creep. I think this logic should reside in the components that make up the driver daemonset. The state management is specific to the driver daemonset, after all.

Contributor (@tariq1890), Jan 7, 2026:
I would also prefer to have a single, complete representation of the state, as opposed to the combination of the <current-config-digest> and <kernel-version>.

return fmt.Sprintf(configTemplate,
dm.config.driverVersion,
driverType,
getKernelVersion(),
dm.config.gpuDirectRDMAEnabled,
dm.config.useHostMofed,
os.Getenv("KERNEL_MODULE_TYPE"),
nvidiaParams,
nvidiaUVMParams,
nvidiaModeset,
nvidiaPeermem,
)
}
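
A minimal sketch of the alternative proposed in the review thread above (not what this PR implements), assuming a hypothetical CURRENT_DRIVER_CONFIG envvar set by the operator and reusing the file's existing os and strings imports; the function name and single-line digest format are assumptions:

// fastPathEligible reports whether the digest supplied by the operator matches
// the digest the driver container recorded in the state file after its last
// successful install. Hypothetical; this PR instead rebuilds the full config
// string in buildCurrentConfig and compares it with the stored state file.
func (dm *DriverManager) fastPathEligible() bool {
	desired := os.Getenv("CURRENT_DRIVER_CONFIG")
	if desired == "" {
		// No digest supplied by the operator; fall back to the normal path.
		return false
	}
	stored, err := os.ReadFile(driverConfigStateFile)
	if err != nil {
		return false
	}
	// A second line carrying the kernel version could be appended here, as
	// discussed in the thread; this sketch compares only the digest.
	if strings.TrimSpace(string(stored)) != strings.TrimSpace(desired) {
		return false
	}
	dm.log.Info("Operator-provided driver config digest matches the stored state")
	return true
}

Under this scheme the k8s-driver-manager stays agnostic to which settings feed the digest, which is the decoupling benefit argued for in the thread.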

// readModuleParams reads a module parameter config file and returns its contents as a single-line space-separated string
func readModuleParams(filepath string) string {
data, err := os.ReadFile(filepath)
if err != nil {
return ""
}
// Convert newlines to spaces and trim whitespace/null bytes
return strings.Trim(strings.ReplaceAll(string(data), "\n", " "), " \r\n\t\u0000\uffff")
}

// shouldUpdateDriverConfig checks if the driver configuration needs to be updated
func (dm *DriverManager) shouldUpdateDriverConfig() bool {
if !dm.isDriverLoaded() {
return true
}

storedConfig, err := os.ReadFile(driverConfigStateFile)
if err != nil {
if os.IsNotExist(err) {
dm.log.Info("No previous driver configuration found")
} else {
dm.log.Warnf("Failed to read driver config state file: %v", err)
}
return true
}

currentConfig := dm.buildCurrentConfig()
return currentConfig != string(storedConfig)
}

func (dm *DriverManager) shouldSkipUninstall() bool {
if dm.config.forceReinstall {
dm.log.Info("Force reinstall is enabled, proceeding with driver uninstall")
return false
}

if !dm.shouldUpdateDriverConfig() {
dm.log.Info("Driver is loaded with matching config, enabling fast path")
return true
}

// Driver not loaded or config changed - proceed with cleanup
dm.log.Info("Proceeding with cleanup operations")
return false
}

func (dm *DriverManager) isNouveauLoaded() bool {
_, err := os.Stat("/sys/module/nouveau/refcnt")
return err == nil
@@ -639,6 +777,12 @@ func (dm *DriverManager) unloadNouveau() error {
return unix.DeleteModule("nouveau", 0)
}

func (dm *DriverManager) removePIDFile() {
if err := os.Remove(driverPIDFile); err != nil && !os.IsNotExist(err) {
dm.log.Warnf("Failed to remove PID file %s: %v", driverPIDFile, err)
}
}

func (dm *DriverManager) cleanupDriver() error {
dm.log.Info("Cleaning up NVIDIA driver")

@@ -652,12 +796,7 @@ func (dm *DriverManager) cleanupDriver() error {
return fmt.Errorf("failed to unmount rootfs: %w", err)
}

// Remove PID file
if _, err := os.Stat(driverPIDFile); err == nil {
if err := os.Remove(driverPIDFile); err != nil {
dm.log.Warnf("Failed to remove PID file %s: %v", driverPIDFile, err)
}
}
dm.removePIDFile()

return nil
}