@@ -20,12 +20,14 @@ import (
2020 "fmt"
2121 "os"
2222 "os/exec"
23+ "strings"
2324
2425 log "github.com/sirupsen/logrus"
2526 cli "github.com/urfave/cli/v2"
2627 "k8s.io/client-go/kubernetes"
2728 "k8s.io/client-go/tools/cache"
2829 "k8s.io/client-go/tools/clientcmd"
30+ "sigs.k8s.io/yaml"
2931
3032 "context"
3133 "sync"
@@ -38,6 +40,7 @@ import (
3840 "k8s.io/apimachinery/pkg/util/wait"
3941
4042 "github.com/NVIDIA/vgpu-device-manager/internal/info"
43+ "github.com/NVIDIA/vgpu-device-manager/pkg/types"
4144)
4245
4346const (
@@ -47,6 +50,14 @@ const (
4750 vGPUConfigStateLabel = "nvidia.com/vgpu.config.state"
4851 pluginStateLabel = "nvidia.com/gpu.deploy.sandbox-device-plugin"
4952 validatorStateLabel = "nvidia.com/gpu.deploy.sandbox-validator"
53+
54+ defaultReconfigureMIGScript = "/usr/bin/reconfigure-mig.sh"
55+ defaultHostRootMount = "/host"
56+ defaultHostNvidiaDir = "/usr/local/nvidia"
57+ defaultHostMigManagerStateFile = "/etc/systemd/system/nvidia-mig-manager.service.d/override.conf"
58+ defaultHostKubeletSystemdService = "kubelet.service"
59+
60+ migConfigDisabled = "all-disabled"
5061)
5162
5263var (
@@ -56,10 +67,25 @@ var (
5667 configFileFlag string
5768 defaultVGPUConfigFlag string
5869
70+ reconfigureMIGScriptFlag string
71+ migPartedConfigFileFlag string
72+ hostRootMountFlag string
73+ hostNvidiaDirFlag string
74+ hostMigManagerStateFileFlag string
75+ hostKubeletSystemdServiceFlag string
76+ gpuClientsFileFlag string
77+ withRebootFlag bool
78+ withShutdownHostGPUClientsFlag bool
79+
5980 pluginDeployed string
6081 validatorDeployed string
6182)
6283
// GPUClients is the in-memory form of the GPU clients file. The file is
// decoded into this struct by parseGPUCLientsFile, and the resulting
// SystemdServices list is handed to the MIG reconfiguration script by
// runReconfigureMIGScript.
type GPUClients struct {
	// Version is the value stored under the file's "version" key.
	Version string `json:"version" yaml:"version"`
	// SystemdServices is the list stored under the file's "systemd-services"
	// key. It is nil when the key is absent or no file was supplied.
	SystemdServices []string `json:"systemd-services" yaml:"systemd-services"`
}
88+
6389// SyncableVGPUConfig is used to synchronize on changes to a configuration value.
6490// That is, callers of Get() will block until a call to Set() is made.
6591// Multiple calls to Set() do not queue, meaning that only calls to Get() made
@@ -148,6 +174,78 @@ func main() {
148174 Destination : & defaultVGPUConfigFlag ,
149175 EnvVars : []string {"DEFAULT_VGPU_CONFIG" },
150176 },
177+ & cli.StringFlag {
178+ Name : "reconfigure-mig-script" ,
179+ Aliases : []string {"s" },
180+ Value : defaultReconfigureMIGScript ,
181+ Usage : "script to run to do the actual MIG reconfiguration" ,
182+ Destination : & reconfigureMIGScriptFlag ,
183+ EnvVars : []string {"RECONFIGURE_MIG_SCRIPT" },
184+ },
185+ & cli.StringFlag {
186+ Name : "mig-parted-config-file" ,
187+ Aliases : []string {"mc" },
188+ Value : "" ,
189+ Usage : "the path to the mig-parted configuration file" ,
190+ Destination : & migPartedConfigFileFlag ,
191+ EnvVars : []string {"MIG_PARTED_CONFIG_FILE" },
192+ },
193+ & cli.StringFlag {
194+ Name : "host-root-mount" ,
195+ Aliases : []string {"m" },
196+ Value : defaultHostRootMount ,
197+ Usage : "container path where host root directory is mounted" ,
198+ Destination : & hostRootMountFlag ,
199+ EnvVars : []string {"HOST_ROOT_MOUNT" },
200+ },
201+ & cli.StringFlag {
202+ Name : "host-nvidia-dir" ,
203+ Aliases : []string {"i" },
204+ Value : defaultHostNvidiaDir ,
205+ Usage : "host path of the directory where NVIDIA managed software directory is typically located" ,
206+ Destination : & hostNvidiaDirFlag ,
207+ EnvVars : []string {"HOST_NVIDIA_DIR" },
208+ },
209+ & cli.StringFlag {
210+ Name : "host-mig-manager-state-file" ,
211+ Aliases : []string {"o" },
212+ Value : defaultHostMigManagerStateFile ,
213+ Usage : "host path where the host's systemd mig-manager state file is located" ,
214+ Destination : & hostMigManagerStateFileFlag ,
215+ EnvVars : []string {"HOST_MIG_MANAGER_STATE_FILE" },
216+ },
217+ & cli.StringFlag {
218+ Name : "host-kubelet-systemd-service" ,
219+ Aliases : []string {"k" },
220+ Value : defaultHostKubeletSystemdService ,
221+ Usage : "name of the host's 'kubelet' systemd service which may need to be shutdown/restarted across a MIG mode reconfiguration" ,
222+ Destination : & hostKubeletSystemdServiceFlag ,
223+ EnvVars : []string {"HOST_KUBELET_SYSTEMD_SERVICE" },
224+ },
225+ & cli.StringFlag {
226+ Name : "gpu-clients-file" ,
227+ Aliases : []string {"g" },
228+ Value : "" ,
229+ Usage : "the path to the file listing the GPU clients that need to be shutdown across a MIG configuration" ,
230+ Destination : & gpuClientsFileFlag ,
231+ EnvVars : []string {"GPU_CLIENTS_FILE" },
232+ },
233+ & cli.BoolFlag {
234+ Name : "with-reboot" ,
235+ Aliases : []string {"r" },
236+ Value : false ,
237+ Usage : "reboot the node if changing the MIG mode fails for any reason" ,
238+ Destination : & withRebootFlag ,
239+ EnvVars : []string {"WITH_REBOOT" },
240+ },
241+ & cli.BoolFlag {
242+ Name : "with-shutdown-host-gpu-clients" ,
243+ Aliases : []string {"w" },
244+ Value : false ,
245+ Usage : "shutdown/restart any required host GPU clients across a MIG configuration" ,
246+ Destination : & withShutdownHostGPUClientsFlag ,
247+ EnvVars : []string {"WITH_SHUTDOWN_HOST_GPU_CLIENTS" },
248+ },
151249 }
152250
153251 log .Infof ("version: %s" , c .Version )
@@ -296,6 +394,30 @@ func updateConfig(clientset *kubernetes.Clientset, selectedConfig string) error
296394 return fmt .Errorf ("unable to shutdown gpu operands: %v" , err )
297395 }
298396
397+ vgpuType , err := types .ParseVGPUType (selectedConfig )
398+ if err != nil {
399+ return fmt .Errorf ("unable to parse vGPU type: %s" , err )
400+ }
401+ if vgpuType .G > 0 {
402+ log .Info ("Running reconfigure MIG script" )
403+
404+ selectedMigConfig , err := convertToMIGConfigFormat (selectedConfig )
405+ if err != nil {
406+ return fmt .Errorf ("unable to convert vGPU type config to MIG config: %s" , err )
407+ }
408+
409+ err = runReconfigureMIGScript (selectedMigConfig )
410+ if err != nil {
411+ return fmt .Errorf ("unable to run reconfigure MIG script: %s" , err )
412+ }
413+ } else {
414+ log .Info ("Disabling MIG if enabled" )
415+ err = runReconfigureMIGScript (migConfigDisabled )
416+ if err != nil {
417+ return fmt .Errorf ("unable to run reconfigure MIG script: %s" , err )
418+ }
419+ }
420+
299421 log .Info ("Applying the selected vGPU device configuration to the node" )
300422 err = applyConfig (selectedConfig )
301423 if err != nil {
@@ -504,3 +626,78 @@ func setNodeLabelValue(clientset *kubernetes.Clientset, label, value string) err
504626
505627 return nil
506628}
629+
630+ func parseGPUCLientsFile (file string ) (* GPUClients , error ) {
631+ var err error
632+ var yamlBytes []byte
633+
634+ if file == "" {
635+ return & GPUClients {}, nil
636+ }
637+
638+ yamlBytes , err = os .ReadFile (file )
639+ if err != nil {
640+ return nil , fmt .Errorf ("read error: %v" , err )
641+ }
642+
643+ var clients GPUClients
644+ err = yaml .Unmarshal (yamlBytes , & clients )
645+ if err != nil {
646+ return nil , fmt .Errorf ("unmarshal error: %v" , err )
647+ }
648+
649+ return & clients , nil
650+ }
651+
652+ func runReconfigureMIGScript (migConfigValue string ) error {
653+ gpuClients , err := parseGPUCLientsFile (gpuClientsFileFlag )
654+ if err != nil {
655+ return fmt .Errorf ("error parsing host's GPU clients file: %s" , err )
656+ }
657+
658+ args := []string {
659+ "-n" , nodeNameFlag ,
660+ "-f" , migPartedConfigFileFlag ,
661+ "-c" , migConfigValue ,
662+ "-m" , hostRootMountFlag ,
663+ "-i" , hostNvidiaDirFlag ,
664+ "-o" , hostMigManagerStateFileFlag ,
665+ "-g" , strings .Join (gpuClients .SystemdServices , "," ),
666+ "-k" , hostKubeletSystemdServiceFlag ,
667+ }
668+ if withRebootFlag {
669+ args = append (args , "-r" )
670+ }
671+ if withShutdownHostGPUClientsFlag {
672+ args = append (args , "-w" )
673+ }
674+
675+ stdLogger := log .StandardLogger ()
676+
677+ cmd := exec .Command (reconfigureMIGScriptFlag , args ... )
678+ cmd .Stdout = stdLogger .Writer ()
679+ cmd .Stderr = stdLogger .WriterLevel (log .ErrorLevel )
680+ return cmd .Run ()
681+ }
682+
683+ // convertToMIGConfigFormat converts a vGPU type string to the MIG config
684+ // equivalent, e.g. "all-{g}g.{gb}gb[.me]".
685+ func convertToMIGConfigFormat (s string ) (string , error ) {
686+ vgpu , err := types .ParseVGPUType (s )
687+ if err != nil {
688+ return "" , fmt .Errorf ("failed to parse vGPU type: %v" , err )
689+ }
690+
691+ // Base format: all-{g}g.{gb}gb
692+ result := fmt .Sprintf ("all-%dg.%dgb" , vgpu .G , vgpu .GB )
693+
694+ // Add .me suffix if media extension attribute is present
695+ for _ , attr := range vgpu .Attr {
696+ if attr == types .AttributeMediaExtensions {
697+ result += ".me"
698+ break
699+ }
700+ }
701+
702+ return result , nil
703+ }
0 commit comments