Skip to content

Commit bb6a01a

Browse files
committed
More fixes
Signed-off-by: Evan Lezar <elezar@nvidia.com>
1 parent f01bca6 commit bb6a01a

File tree

6 files changed

+164
-99
lines changed

6 files changed

+164
-99
lines changed

cmd/nvidia-mig-manager/main.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ package main
1919
import (
2020
"fmt"
2121
"os"
22-
"os/exec"
2322
"path/filepath"
2423
"strings"
2524
"sync"
@@ -408,10 +407,11 @@ func runScript(migConfigValue string, driverLibraryPath string, nvidiaSMIPath st
408407
reconfigure.WithShutdownHostGPUClients(withShutdownHostGPUClientsFlag),
409408
)
410409

411-
cmd := exec.Command(reconfigureScriptFlag, args...)
412-
cmd.Stdout = os.Stdout
413-
cmd.Stderr = os.Stderr
414-
return cmd.Run()
410+
reconfigurer, err := reconfigure.New(options...)
411+
if err != nil {
412+
return err
413+
}
414+
return reconfigurer.Reconfigure()
415415
}
416416

417417
func ContinuouslySyncMigConfigChanges(clientset *kubernetes.Clientset, migConfig *SyncableMigConfig) chan struct{} {

pkg/mig/reconfigure/cdi.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,11 +77,11 @@ func (opts *reconfigurer) createControlDeviceNodes() error {
7777

7878
func (opts *reconfigurer) runNvidiaSMI() error {
7979
if opts.DriverRootCtrPath == opts.DevRootCtrPath {
80-
cmd := exec.Command("chroot", opts.DriverRootCtrPath, opts.NVIDIASMIPath)
80+
cmd := exec.Command("chroot", opts.DriverRootCtrPath, opts.NVIDIASMIPath) //nolint:gosec
8181
return opts.Run(cmd)
8282
}
8383

84-
cmd := exec.Command("chroot", opts.HostRootMount, opts.NVIDIASMIPath)
84+
cmd := exec.Command("chroot", opts.HostRootMount, opts.NVIDIASMIPath) //nolint:gosec
8585
cmd.Env = append(cmd.Env, ldPreloadEnvVar+"="+opts.DriverLibraryPath)
8686
return opts.Run(cmd)
8787
}

pkg/mig/reconfigure/node-labeller_mock.go

Lines changed: 44 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/mig/reconfigure/options.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@ type reconfigureMIGOptions struct {
7272
NVIDIASMIPath string
7373
// NVIDIACDIHookPath is the path to the nvidia-cdi-hook executable on the HOST.
7474
NVIDIACDIHookPath string
75+
76+
hostNVIDIADir string
7577
}
7678

7779
// Functional options for the above members, sorted alphabetically.

pkg/mig/reconfigure/reconfigure.go

Lines changed: 107 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"context"
2222
"errors"
2323
"fmt"
24+
"io"
2425
"os"
2526
"os/exec"
2627
"path/filepath"
@@ -33,8 +34,6 @@ import (
3334
)
3435

3536
const (
36-
migPartedCliName = "nvidia-mig-parted"
37-
3837
configStatePending = "pending"
3938
configStateRebooting = "rebooting"
4039

@@ -66,20 +65,15 @@ func New(opts ...Option) (Reconfigurer, error) {
6665
return nil, err
6766
}
6867

69-
c := &commandWithOutput{}
68+
migParted, err := o.createMIGPartedCLI()
69+
if err != nil {
70+
return nil, err
71+
}
7072

71-
// TODO: If WITH_SHUTDOWN_HOST_GPU_CLIENTS is set we need to run mig-parted
72-
// on the host. We also need to update the MIG_CONFIG_FILE to refer to one
73-
// that we copy to the host.
7473
r := &reconfigurer{
7574
reconfigureMIGOptions: o,
76-
commandRunner: c,
77-
migParted: &migPartedCLI{
78-
MIGPartedConfigFile: o.MIGPartedConfigFile,
79-
SelectedMIGConfig: o.SelectedMIGConfig,
80-
DriverLibraryPath: o.DriverLibraryPath,
81-
commandRunner: c,
82-
},
75+
commandRunner: &commandWithOutput{},
76+
migParted: migParted,
8377
node: &node{
8478
clientset: o.clientset,
8579
name: o.NodeName,
@@ -89,6 +83,90 @@ func New(opts ...Option) (Reconfigurer, error) {
8983
return r, nil
9084
}
9185

86+
func (opts *reconfigureMIGOptions) createMIGPartedCLI() (*migPartedCLI, error) {
87+
c := &commandWithOutput{}
88+
89+
if !opts.WithShutdownHostGPUClients {
90+
m := &migPartedCLI{
91+
path: "/usr/bin/nvidia-mig-parted",
92+
MIGPartedConfigFile: opts.MIGPartedConfigFile,
93+
SelectedMIGConfig: opts.SelectedMIGConfig,
94+
DriverLibraryPath: opts.DriverLibraryPath,
95+
commandRunner: c,
96+
}
97+
return m, nil
98+
}
99+
100+
if opts.hostNVIDIADir == "" {
101+
return nil, fmt.Errorf("HOST_NVIDIA_DIR must be specified")
102+
}
103+
104+
hostRoot, err := os.OpenRoot(opts.HostRootMount)
105+
if err != nil {
106+
return nil, fmt.Errorf("failed to open host root: %w", err)
107+
}
108+
defer hostRoot.Close()
109+
110+
// TODO: Once we switch to go 1.25, we can use os.Root.MkdirAll.
111+
hostNVIDIADir := strings.TrimPrefix(opts.hostNVIDIADir, "/")
112+
if err := hostRoot.Mkdir(hostNVIDIADir, 0755); err != nil {
113+
return nil, fmt.Errorf("failed to create directory: %w", err)
114+
}
115+
if err := hostRoot.Mkdir(filepath.Join(hostNVIDIADir, "mig-manager"), 0755); err != nil {
116+
return nil, fmt.Errorf("failed to create directory: %w", err)
117+
}
118+
119+
migParted, err := os.Open("/usr/bin/nvidia-mig-parted")
120+
if err != nil {
121+
return nil, err
122+
}
123+
defer migParted.Close()
124+
125+
hostMigPartedPath := filepath.Join(hostNVIDIADir, "mig-manager", "nvidia-mig-parted")
126+
hostMigParted, err := hostRoot.Create(hostMigPartedPath)
127+
if err != nil {
128+
return nil, err
129+
}
130+
defer hostMigParted.Close()
131+
132+
if _, err := io.Copy(hostMigParted, migParted); err != nil {
133+
return nil, err
134+
}
135+
// Ensure that the file is executable.
136+
if err := hostMigParted.Chmod(0755); err != nil {
137+
return nil, err
138+
}
139+
140+
configFile, err := os.Open(opts.MIGPartedConfigFile)
141+
if err != nil {
142+
return nil, err
143+
}
144+
defer configFile.Close()
145+
146+
hostConfigFilePath := filepath.Join(hostNVIDIADir, "mig-manager", "config.yaml")
147+
hostConfigFile, err := hostRoot.Create(hostConfigFilePath)
148+
if err != nil {
149+
return nil, err
150+
}
151+
defer hostConfigFile.Close()
152+
153+
if _, err := io.Copy(hostConfigFile, configFile); err != nil {
154+
return nil, err
155+
}
156+
157+
m := &migPartedCLI{
158+
root: hostRoot.Name(),
159+
path: "/" + hostMigPartedPath,
160+
MIGPartedConfigFile: "/" + hostConfigFilePath,
161+
SelectedMIGConfig: opts.SelectedMIGConfig,
162+
// TODO: We may need to update this for the host.
163+
DriverLibraryPath: "",
164+
commandRunner: c,
165+
}
166+
167+
return m, nil
168+
}
169+
92170
// Reconfigure configures MIG (Multi-Instance GPU) settings on a Kubernetes
93171
// node. It validates the requested configuration, checks the current state,
94172
// applies MIG mode changes, manages host GPU client services, and handles
@@ -106,7 +184,7 @@ func (opts *reconfigurer) Reconfigure() (rerr error) {
106184
// If we're returning due to an error in restarting the services, then
107185
// don't restart them here.
108186
if restartSystemdClients {
109-
if err := systemdClients.Restart(); err != nil {
187+
if err := opts.hostStartSystemdServices(systemdClients); err != nil {
110188
rerr = errors.Join(rerr, err)
111189
}
112190

@@ -249,6 +327,8 @@ func (opts *reconfigurer) Reconfigure() (rerr error) {
249327
}
250328

251329
type migPartedCLI struct {
330+
root string
331+
path string
252332
MIGPartedConfigFile string
253333
SelectedMIGConfig string
254334
DriverLibraryPath string
@@ -358,82 +438,6 @@ func (opts *reconfigureMIGOptions) hostStartSystemdServices(systemdGPUClients gp
358438
return nil
359439
}
360440

361-
func stopSystemdService(opts *reconfigureMIGOptions, service string) (bool, error) {
362-
cmd := exec.Command("chroot", opts.HostRootMount, "systemctl", "-q", "is-active", service) // #nosec G204 -- HostRootMount validated via dirpath, service validated via systemd_service_name.
363-
if err := cmd.Run(); err == nil {
364-
log.Infof("%s %s (active, will-restart)", "stop", service)
365-
cmd = exec.Command("chroot", opts.HostRootMount, "systemctl", "stop", service) // #nosec G204 -- HostRootMount validated via dirpath, service validated via systemd_service_name, action is controlled parameter.
366-
if err := cmd.Run(); err != nil {
367-
return false, err
368-
}
369-
return true, nil
370-
}
371-
372-
cmd = exec.Command("chroot", opts.HostRootMount, "systemctl", "-q", "is-enabled", service) // #nosec G204 -- HostRootMount validated via dirpath, service validated via systemd_service_name.
373-
if err := cmd.Run(); err != nil {
374-
log.Infof("Skipping %s (no-exist)", service)
375-
return false, nil
376-
}
377-
378-
cmd = exec.Command("chroot", opts.HostRootMount, "systemctl", "-q", "is-failed", service) // #nosec G204 -- HostRootMount validated via dirpath, service validated via systemd_service_name.
379-
if err := cmd.Run(); err == nil {
380-
log.Infof("Skipping %s (is-failed, will-restart)", service)
381-
return true, nil
382-
}
383-
384-
cmd = exec.Command("chroot", opts.HostRootMount, "systemctl", "-q", "is-enabled", service) // #nosec G204 -- HostRootMount validated via dirpath, service validated via systemd_service_name.
385-
if err := cmd.Run(); err != nil {
386-
log.Infof("Skipping %s (disabled)", service)
387-
return false, nil
388-
}
389-
390-
cmd = exec.Command("chroot", opts.HostRootMount, "systemctl", "show", "--property=Type", service) // #nosec G204 -- HostRootMount validated via dirpath, service validated via systemd_service_name.
391-
output, err := cmd.Output()
392-
if err != nil {
393-
return false, err
394-
}
395-
if strings.TrimSpace(string(output)) == "Type=oneshot" {
396-
log.Infof("Skipping %s (inactive, oneshot, no-restart)", service)
397-
return false, nil
398-
}
399-
400-
log.Infof("Skipping %s (inactive, will-restart)", service)
401-
return true, nil
402-
}
403-
404-
func shouldRestartService(opts *reconfigureMIGOptions, service string) bool {
405-
cmd := exec.Command("chroot", opts.HostRootMount, "systemctl", "-q", "is-active", service) // #nosec G204 -- HostRootMount validated via dirpath, service validated via systemd_service_name.
406-
if err := cmd.Run(); err == nil {
407-
return false
408-
}
409-
410-
cmd = exec.Command("chroot", opts.HostRootMount, "systemctl", "-q", "is-enabled", service) // #nosec G204 -- HostRootMount validated via dirpath, service validated via systemd_service_name.
411-
if err := cmd.Run(); err != nil {
412-
return false
413-
}
414-
415-
cmd = exec.Command("chroot", opts.HostRootMount, "systemctl", "-q", "is-failed", service) // #nosec G204 -- HostRootMount validated via dirpath, service validated via systemd_service_name.
416-
if err := cmd.Run(); err == nil {
417-
return true
418-
}
419-
420-
cmd = exec.Command("chroot", opts.HostRootMount, "systemctl", "-q", "is-enabled", service) // #nosec G204 -- HostRootMount validated via dirpath, service validated via systemd_service_name.
421-
if err := cmd.Run(); err != nil {
422-
return false
423-
}
424-
425-
cmd = exec.Command("chroot", opts.HostRootMount, "systemctl", "show", "--property=Type", service) // #nosec G204 -- HostRootMount validated via dirpath, service validated via systemd_service_name.
426-
output, err := cmd.Output()
427-
if err != nil {
428-
return false
429-
}
430-
if strings.TrimSpace(string(output)) == "Type=oneshot" {
431-
return false
432-
}
433-
434-
return true
435-
}
436-
437441
func (c *commandWithOutput) Run(cmd *exec.Cmd) error {
438442
cmd.Stdout = os.Stdout
439443
cmd.Stderr = os.Stderr
@@ -446,8 +450,19 @@ func (opts *migPartedCLI) runMigParted(args ...string) error {
446450
}
447451

448452
func (opts *migPartedCLI) migPartedCmd(args ...string) *exec.Cmd {
449-
cmd := exec.Command(migPartedCliName, args...)
450-
cmd.Env = append(os.Environ(), fmt.Sprintf("%s=%s", ldPreloadEnvVar, opts.DriverLibraryPath))
453+
var commandAndArgs []string
454+
455+
if opts.root != "" && opts.root != "/" {
456+
commandAndArgs = append(commandAndArgs, "chroot", opts.root)
457+
}
458+
commandAndArgs = append(commandAndArgs, opts.path)
459+
commandAndArgs = append(commandAndArgs, args...)
460+
461+
cmd := exec.Command(commandAndArgs[0], commandAndArgs[1:]...) //nolint:gosec
462+
if opts.DriverLibraryPath != "" {
463+
cmd.Env = append(os.Environ(), fmt.Sprintf("%s=%s", ldPreloadEnvVar, opts.DriverLibraryPath))
464+
}
465+
451466
return cmd
452467
}
453468

pkg/mig/reconfigure/reconfigure_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ func (n *nodeWithLabels) setNodeLabelValue(label string, value string) error {
5555
return nil
5656
}
5757

58+
func (n *nodeWithLabels) getK8sGPUClients(s string) gpuClients {
59+
return n.mock.getK8sGPUClients(s)
60+
}
61+
5862
func TestReconfigure(t *testing.T) {
5963
testCases := []struct {
6064
description string

0 commit comments

Comments
 (0)