@@ -21,6 +21,7 @@ import (
2121 "context"
2222 "errors"
2323 "fmt"
24+ "io"
2425 "os"
2526 "os/exec"
2627 "path/filepath"
@@ -33,8 +34,6 @@ import (
3334)
3435
3536const (
36- migPartedCliName = "nvidia-mig-parted"
37-
3837 configStatePending = "pending"
3938 configStateRebooting = "rebooting"
4039
@@ -66,20 +65,15 @@ func New(opts ...Option) (Reconfigurer, error) {
6665 return nil , err
6766 }
6867
69- c := & commandWithOutput {}
68+ migParted , err := o .createMIGPartedCLI ()
69+ if err != nil {
70+ return nil , err
71+ }
7072
71- // TODO: If WITH_SHUTDOWN_HOST_GPU_CLIENTS is set we need to run mig-parted
72- // on the host. We also need to update the MIG_CONFIG_FILE to refer to one
73- // that we copy to the host.
7473 r := & reconfigurer {
7574 reconfigureMIGOptions : o ,
76- commandRunner : c ,
77- migParted : & migPartedCLI {
78- MIGPartedConfigFile : o .MIGPartedConfigFile ,
79- SelectedMIGConfig : o .SelectedMIGConfig ,
80- DriverLibraryPath : o .DriverLibraryPath ,
81- commandRunner : c ,
82- },
75+ commandRunner : & commandWithOutput {},
76+ migParted : migParted ,
8377 node : & node {
8478 clientset : o .clientset ,
8579 name : o .NodeName ,
@@ -89,6 +83,90 @@ func New(opts ...Option) (Reconfigurer, error) {
8983 return r , nil
9084}
9185
86+ func (opts * reconfigureMIGOptions ) createMIGPartedCLI () (* migPartedCLI , error ) {
87+ c := & commandWithOutput {}
88+
89+ if ! opts .WithShutdownHostGPUClients {
90+ m := & migPartedCLI {
91+ path : "/usr/bin/nvidia-mig-parted" ,
92+ MIGPartedConfigFile : opts .MIGPartedConfigFile ,
93+ SelectedMIGConfig : opts .SelectedMIGConfig ,
94+ DriverLibraryPath : opts .DriverLibraryPath ,
95+ commandRunner : c ,
96+ }
97+ return m , nil
98+ }
99+
100+ if opts .hostNVIDIADir == "" {
101+ return nil , fmt .Errorf ("HOST_NVIDIA_DIR must be specified" )
102+ }
103+
104+ hostRoot , err := os .OpenRoot (opts .HostRootMount )
105+ if err != nil {
106+ return nil , fmt .Errorf ("failed to open host root: %w" , err )
107+ }
108+ defer hostRoot .Close ()
109+
110+ // TODO: Once we switch to go 1.25, we can use os.Root.MkdirAll.
111+ hostNVIDIADir := strings .TrimPrefix (opts .hostNVIDIADir , "/" )
112+ if err := hostRoot .Mkdir (hostNVIDIADir , 0755 ); err != nil {
113+ return nil , fmt .Errorf ("failed to create directory: %w" , err )
114+ }
115+ if err := hostRoot .Mkdir (filepath .Join (hostNVIDIADir , "mig-manager" ), 0755 ); err != nil {
116+ return nil , fmt .Errorf ("failed to create directory: %w" , err )
117+ }
118+
119+ migParted , err := os .Open ("/usr/bin/nvidia-mig-parted" )
120+ if err != nil {
121+ return nil , err
122+ }
123+ defer migParted .Close ()
124+
125+ hostMigPartedPath := filepath .Join (hostNVIDIADir , "mig-manager" , "nvidia-mig-parted" )
126+ hostMigParted , err := hostRoot .Create (hostMigPartedPath )
127+ if err != nil {
128+ return nil , err
129+ }
130+ defer hostMigParted .Close ()
131+
132+ if _ , err := io .Copy (hostMigParted , migParted ); err != nil {
133+ return nil , err
134+ }
135+ // Ensure that the file is executable.
136+ if err := hostMigParted .Chmod (0755 ); err != nil {
137+ return nil , err
138+ }
139+
140+ configFile , err := os .Open (opts .MIGPartedConfigFile )
141+ if err != nil {
142+ return nil , err
143+ }
144+ defer configFile .Close ()
145+
146+ hostConfigFilePath := filepath .Join (hostNVIDIADir , "mig-manager" , "config.yaml" )
147+ hostConfigFile , err := hostRoot .Create (hostConfigFilePath )
148+ if err != nil {
149+ return nil , err
150+ }
151+ defer hostConfigFile .Close ()
152+
153+ if _ , err := io .Copy (hostConfigFile , configFile ); err != nil {
154+ return nil , err
155+ }
156+
157+ m := & migPartedCLI {
158+ root : hostRoot .Name (),
159+ path : "/" + hostMigPartedPath ,
160+ MIGPartedConfigFile : "/" + hostConfigFilePath ,
161+ SelectedMIGConfig : opts .SelectedMIGConfig ,
162+ // TODO: We may need to update this for the host.
163+ DriverLibraryPath : "" ,
164+ commandRunner : c ,
165+ }
166+
167+ return m , nil
168+ }
169+
92170// Reconfigure configures MIG (Multi-Instance GPU) settings on a Kubernetes
93171// node. It validates the requested configuration, checks the current state,
94172// applies MIG mode changes, manages host GPU client services, and handles
@@ -106,7 +184,7 @@ func (opts *reconfigurer) Reconfigure() (rerr error) {
106184 // If we're returning due to an error in restarting the services, then
107185 // don't restart them here.
108186 if restartSystemdClients {
109- if err := systemdClients . Restart ( ); err != nil {
187+ if err := opts . hostStartSystemdServices ( systemdClients ); err != nil {
110188 rerr = errors .Join (rerr , err )
111189 }
112190
@@ -249,6 +327,8 @@ func (opts *reconfigurer) Reconfigure() (rerr error) {
249327}
250328
251329type migPartedCLI struct {
330+ root string
331+ path string
252332 MIGPartedConfigFile string
253333 SelectedMIGConfig string
254334 DriverLibraryPath string
@@ -358,82 +438,6 @@ func (opts *reconfigureMIGOptions) hostStartSystemdServices(systemdGPUClients gp
358438 return nil
359439}
360440
361- func stopSystemdService (opts * reconfigureMIGOptions , service string ) (bool , error ) {
362- cmd := exec .Command ("chroot" , opts .HostRootMount , "systemctl" , "-q" , "is-active" , service ) // #nosec G204 -- HostRootMount validated via dirpath, service validated via systemd_service_name.
363- if err := cmd .Run (); err == nil {
364- log .Infof ("%s %s (active, will-restart)" , "stop" , service )
365- cmd = exec .Command ("chroot" , opts .HostRootMount , "systemctl" , "stop" , service ) // #nosec G204 -- HostRootMount validated via dirpath, service validated via systemd_service_name, action is controlled parameter.
366- if err := cmd .Run (); err != nil {
367- return false , err
368- }
369- return true , nil
370- }
371-
372- cmd = exec .Command ("chroot" , opts .HostRootMount , "systemctl" , "-q" , "is-enabled" , service ) // #nosec G204 -- HostRootMount validated via dirpath, service validated via systemd_service_name.
373- if err := cmd .Run (); err != nil {
374- log .Infof ("Skipping %s (no-exist)" , service )
375- return false , nil
376- }
377-
378- cmd = exec .Command ("chroot" , opts .HostRootMount , "systemctl" , "-q" , "is-failed" , service ) // #nosec G204 -- HostRootMount validated via dirpath, service validated via systemd_service_name.
379- if err := cmd .Run (); err == nil {
380- log .Infof ("Skipping %s (is-failed, will-restart)" , service )
381- return true , nil
382- }
383-
384- cmd = exec .Command ("chroot" , opts .HostRootMount , "systemctl" , "-q" , "is-enabled" , service ) // #nosec G204 -- HostRootMount validated via dirpath, service validated via systemd_service_name.
385- if err := cmd .Run (); err != nil {
386- log .Infof ("Skipping %s (disabled)" , service )
387- return false , nil
388- }
389-
390- cmd = exec .Command ("chroot" , opts .HostRootMount , "systemctl" , "show" , "--property=Type" , service ) // #nosec G204 -- HostRootMount validated via dirpath, service validated via systemd_service_name.
391- output , err := cmd .Output ()
392- if err != nil {
393- return false , err
394- }
395- if strings .TrimSpace (string (output )) == "Type=oneshot" {
396- log .Infof ("Skipping %s (inactive, oneshot, no-restart)" , service )
397- return false , nil
398- }
399-
400- log .Infof ("Skipping %s (inactive, will-restart)" , service )
401- return true , nil
402- }
403-
404- func shouldRestartService (opts * reconfigureMIGOptions , service string ) bool {
405- cmd := exec .Command ("chroot" , opts .HostRootMount , "systemctl" , "-q" , "is-active" , service ) // #nosec G204 -- HostRootMount validated via dirpath, service validated via systemd_service_name.
406- if err := cmd .Run (); err == nil {
407- return false
408- }
409-
410- cmd = exec .Command ("chroot" , opts .HostRootMount , "systemctl" , "-q" , "is-enabled" , service ) // #nosec G204 -- HostRootMount validated via dirpath, service validated via systemd_service_name.
411- if err := cmd .Run (); err != nil {
412- return false
413- }
414-
415- cmd = exec .Command ("chroot" , opts .HostRootMount , "systemctl" , "-q" , "is-failed" , service ) // #nosec G204 -- HostRootMount validated via dirpath, service validated via systemd_service_name.
416- if err := cmd .Run (); err == nil {
417- return true
418- }
419-
420- cmd = exec .Command ("chroot" , opts .HostRootMount , "systemctl" , "-q" , "is-enabled" , service ) // #nosec G204 -- HostRootMount validated via dirpath, service validated via systemd_service_name.
421- if err := cmd .Run (); err != nil {
422- return false
423- }
424-
425- cmd = exec .Command ("chroot" , opts .HostRootMount , "systemctl" , "show" , "--property=Type" , service ) // #nosec G204 -- HostRootMount validated via dirpath, service validated via systemd_service_name.
426- output , err := cmd .Output ()
427- if err != nil {
428- return false
429- }
430- if strings .TrimSpace (string (output )) == "Type=oneshot" {
431- return false
432- }
433-
434- return true
435- }
436-
437441func (c * commandWithOutput ) Run (cmd * exec.Cmd ) error {
438442 cmd .Stdout = os .Stdout
439443 cmd .Stderr = os .Stderr
@@ -446,8 +450,19 @@ func (opts *migPartedCLI) runMigParted(args ...string) error {
446450}
447451
448452func (opts * migPartedCLI ) migPartedCmd (args ... string ) * exec.Cmd {
449- cmd := exec .Command (migPartedCliName , args ... )
450- cmd .Env = append (os .Environ (), fmt .Sprintf ("%s=%s" , ldPreloadEnvVar , opts .DriverLibraryPath ))
453+ var commandAndArgs []string
454+
455+ if opts .root != "" && opts .root != "/" {
456+ commandAndArgs = append (commandAndArgs , "chroot" , opts .root )
457+ }
458+ commandAndArgs = append (commandAndArgs , opts .path )
459+ commandAndArgs = append (commandAndArgs , args ... )
460+
461+ cmd := exec .Command (commandAndArgs [0 ], commandAndArgs [1 :]... ) //nolint:gosec
462+ if opts .DriverLibraryPath != "" {
463+ cmd .Env = append (os .Environ (), fmt .Sprintf ("%s=%s" , ldPreloadEnvVar , opts .DriverLibraryPath ))
464+ }
465+
451466 return cmd
452467}
453468
0 commit comments