Skip to content

Commit 1c1fa72

Browse files
nixprime authored and gvisor-bot committed
Work around nvidia-container-cli 1.17.7 bug
PiperOrigin-RevId: 761200938
1 parent 31a268f commit 1c1fa72

File tree

1 file changed

+41
-0
lines changed

1 file changed

+41
-0
lines changed

runsc/container/container.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2067,6 +2067,10 @@ func nvproxySetup(spec *specs.Spec, conf *config.Config, goferPid int) error {
20672067
fmt.Sprintf("--pid=%d", goferPid),
20682068
fmt.Sprintf("--device=%s", devices),
20692069
}
2070+
if nvidiaContainerCliConfigureNeedsCudaCompatModeFlag(cliPath) {
2071+
// "mount" is the flag's intended default value.
2072+
argv = append(argv, "--cuda-compat-mode=mount")
2073+
}
20702074
// Pass driver capabilities allowed by configuration as flags. See
20712075
// nvidia-container-toolkit/cmd/nvidia-container-runtime-hook/main.go:doPrestart().
20722076
driverCaps, err := specutils.NVProxyDriverCapsFromEnv(spec, conf)
@@ -2091,6 +2095,43 @@ func nvproxySetup(spec *specs.Spec, conf *config.Config, goferPid int) error {
20912095
return nil
20922096
}
20932097

2098+
func nvidiaContainerCliConfigureNeedsCudaCompatModeFlag(cliPath string) bool {
2099+
cmd := exec.Cmd{
2100+
Path: cliPath,
2101+
Args: []string{cliPath, "--version"},
2102+
}
2103+
log.Debugf("Executing %q", cmd.Args)
2104+
out, err := cmd.Output()
2105+
if err != nil {
2106+
log.Warningf("Failed to execute nvidia-container-cli --version: %v", err)
2107+
return false
2108+
}
2109+
m := regexp.MustCompile(`^cli-version: (\d+)\.(\d+)\.(\d+)`).FindSubmatch(out)
2110+
if m == nil {
2111+
log.Warningf("Failed to find version number in nvidia-container-cli --version: %s", out)
2112+
return false
2113+
}
2114+
major, err := strconv.Atoi(string(m[1]))
2115+
if err != nil {
2116+
log.Warningf("Invalid major version number in nvidia-container-cli --version: %v", err)
2117+
return false
2118+
}
2119+
minor, err := strconv.Atoi(string(m[2]))
2120+
if err != nil {
2121+
log.Warningf("Invalid minor version number in nvidia-container-cli --version: %v", err)
2122+
return false
2123+
}
2124+
release, err := strconv.Atoi(string(m[3]))
2125+
if err != nil {
2126+
log.Warningf("Invalid release version number in nvidia-container-cli --version: %v", err)
2127+
return false
2128+
}
2129+
// In nvidia-container-cli 1.17.7, in which the --cuda-compat-mode flag
2130+
// first appears, failing to pass this flag to nvidia-container-cli
2131+
// configure causes all other flags to be ignored.
2132+
return major == 1 && minor == 17 && release == 7
2133+
}
2134+
20942135
// CheckStopped checks if the container is stopped and updates its status.
20952136
func (c *Container) CheckStopped() {
20962137
if state, err := c.Sandbox.ContainerRuntimeState(c.ID); err != nil {

0 commit comments

Comments
 (0)