@@ -26,6 +26,7 @@ import (
2626 "k8s.io/client-go/kubernetes"
2727 "k8s.io/client-go/tools/cache"
2828 "k8s.io/client-go/tools/clientcmd"
29+ "sigs.k8s.io/yaml"
2930
3031 "context"
3132 "sync"
@@ -38,6 +39,7 @@ import (
3839 "k8s.io/apimachinery/pkg/util/wait"
3940
4041 "github.com/NVIDIA/vgpu-device-manager/internal/info"
42+ "github.com/NVIDIA/vgpu-device-manager/pkg/types"
4143)
4244
4345const (
@@ -47,6 +49,13 @@ const (
4749 vGPUConfigStateLabel = "nvidia.com/vgpu.config.state"
4850 pluginStateLabel = "nvidia.com/gpu.deploy.sandbox-device-plugin"
4951 validatorStateLabel = "nvidia.com/gpu.deploy.sandbox-validator"
52+
53+ defaultHostRootMount = "/host"
54+ defaultHostNvidiaDir = "/usr/local/nvidia"
55+ defaultHostMigManagerStateFile = "/etc/systemd/system/nvidia-mig-manager.service.d/override.conf"
56+ defaultHostKubeletSystemdService = "kubelet.service"
57+
58+ migConfigDisabled = "all-disabled"
5059)
5160
5261var (
@@ -56,10 +65,24 @@ var (
5665 configFileFlag string
5766 defaultVGPUConfigFlag string
5867
68+ migPartedConfigFileFlag string
69+ hostRootMountFlag string
70+ hostNvidiaDirFlag string
71+ hostMigManagerStateFileFlag string
72+ hostKubeletSystemdServiceFlag string
73+ gpuClientsFileFlag string
74+ withRebootFlag bool
75+ withShutdownHostGPUClientsFlag bool
76+
5977 pluginDeployed string
6078 validatorDeployed string
6179)
6280
// GPUClients is the in-memory form of the GPU clients file (see the
// "gpu-clients-file" flag). It is populated by unmarshalling that file's YAML
// content.
type GPUClients struct {
	// Version is the value of the file's "version" key.
	Version string `json:"version" yaml:"version"`

	// SystemdServices is the value of the file's "systemd-services" key; it is
	// forwarded to the MIG reconfiguration as the host GPU client services.
	SystemdServices []string `json:"systemd-services" yaml:"systemd-services"`
}
85+
6386// SyncableVGPUConfig is used to synchronize on changes to a configuration value.
6487// That is, callers of Get() will block until a call to Set() is made.
6588// Multiple calls to Set() do not queue, meaning that only calls to Get() made
@@ -148,6 +171,70 @@ func main() {
148171 Destination : & defaultVGPUConfigFlag ,
149172 EnvVars : []string {"DEFAULT_VGPU_CONFIG" },
150173 },
174+ & cli.StringFlag {
175+ Name : "mig-parted-config-file" ,
176+ Aliases : []string {"mc" },
177+ Value : "" ,
178+ Usage : "the path to the mig-parted configuration file" ,
179+ Destination : & migPartedConfigFileFlag ,
180+ EnvVars : []string {"MIG_PARTED_CONFIG_FILE" },
181+ },
182+ & cli.StringFlag {
183+ Name : "host-root-mount" ,
184+ Aliases : []string {"m" },
185+ Value : defaultHostRootMount ,
186+ Usage : "container path where host root directory is mounted" ,
187+ Destination : & hostRootMountFlag ,
188+ EnvVars : []string {"HOST_ROOT_MOUNT" },
189+ },
190+ & cli.StringFlag {
191+ Name : "host-nvidia-dir" ,
192+ Aliases : []string {"i" },
193+ Value : defaultHostNvidiaDir ,
194+ Usage : "host path of the directory where NVIDIA managed software directory is typically located" ,
195+ Destination : & hostNvidiaDirFlag ,
196+ EnvVars : []string {"HOST_NVIDIA_DIR" },
197+ },
198+ & cli.StringFlag {
199+ Name : "host-mig-manager-state-file" ,
200+ Aliases : []string {"o" },
201+ Value : defaultHostMigManagerStateFile ,
202+ Usage : "host path where the host's systemd mig-manager state file is located" ,
203+ Destination : & hostMigManagerStateFileFlag ,
204+ EnvVars : []string {"HOST_MIG_MANAGER_STATE_FILE" },
205+ },
206+ & cli.StringFlag {
207+ Name : "host-kubelet-systemd-service" ,
208+ Aliases : []string {"k" },
209+ Value : defaultHostKubeletSystemdService ,
210+ Usage : "name of the host's 'kubelet' systemd service which may need to be shutdown/restarted across a MIG mode reconfiguration" ,
211+ Destination : & hostKubeletSystemdServiceFlag ,
212+ EnvVars : []string {"HOST_KUBELET_SYSTEMD_SERVICE" },
213+ },
214+ & cli.StringFlag {
215+ Name : "gpu-clients-file" ,
216+ Aliases : []string {"g" },
217+ Value : "" ,
218+ Usage : "the path to the file listing the GPU clients that need to be shutdown across a MIG configuration" ,
219+ Destination : & gpuClientsFileFlag ,
220+ EnvVars : []string {"GPU_CLIENTS_FILE" },
221+ },
222+ & cli.BoolFlag {
223+ Name : "with-reboot" ,
224+ Aliases : []string {"r" },
225+ Value : false ,
226+ Usage : "reboot the node if changing the MIG mode fails for any reason" ,
227+ Destination : & withRebootFlag ,
228+ EnvVars : []string {"WITH_REBOOT" },
229+ },
230+ & cli.BoolFlag {
231+ Name : "with-shutdown-host-gpu-clients" ,
232+ Aliases : []string {"w" },
233+ Value : false ,
234+ Usage : "shutdown/restart any required host GPU clients across a MIG configuration" ,
235+ Destination : & withShutdownHostGPUClientsFlag ,
236+ EnvVars : []string {"WITH_SHUTDOWN_HOST_GPU_CLIENTS" },
237+ },
151238 }
152239
153240 log .Infof ("version: %s" , c .Version )
@@ -296,6 +383,30 @@ func updateConfig(clientset *kubernetes.Clientset, selectedConfig string) error
296383 return fmt .Errorf ("unable to shutdown gpu operands: %v" , err )
297384 }
298385
386+ vgpuType , err := types .ParseVGPUType (selectedConfig )
387+ if err != nil {
388+ return fmt .Errorf ("unable to parse vGPU type: %s" , err )
389+ }
390+ if vgpuType .G > 0 {
391+ log .Info ("Running reconfigure MIG script" )
392+
393+ selectedMigConfig , err := convertToMIGConfigFormat (selectedConfig )
394+ if err != nil {
395+ return fmt .Errorf ("unable to convert vGPU type config to MIG config: %s" , err )
396+ }
397+
398+ err = reconfigureMIG (clientset , selectedMigConfig )
399+ if err != nil {
400+ return fmt .Errorf ("unable to run reconfigure MIG script: %s" , err )
401+ }
402+ } else {
403+ log .Info ("Disabling MIG if enabled" )
404+ err = reconfigureMIG (clientset , migConfigDisabled )
405+ if err != nil {
406+ return fmt .Errorf ("unable to run reconfigure MIG script: %s" , err )
407+ }
408+ }
409+
299410 log .Info ("Applying the selected vGPU device configuration to the node" )
300411 err = applyConfig (selectedConfig )
301412 if err != nil {
@@ -504,3 +615,69 @@ func setNodeLabelValue(clientset *kubernetes.Clientset, label, value string) err
504615
505616 return nil
506617}
618+
619+ func parseGPUCLientsFile (file string ) (* GPUClients , error ) {
620+ var err error
621+ var yamlBytes []byte
622+
623+ if file == "" {
624+ return & GPUClients {}, nil
625+ }
626+
627+ yamlBytes , err = os .ReadFile (file )
628+ if err != nil {
629+ return nil , fmt .Errorf ("read error: %v" , err )
630+ }
631+
632+ var clients GPUClients
633+ err = yaml .Unmarshal (yamlBytes , & clients )
634+ if err != nil {
635+ return nil , fmt .Errorf ("unmarshal error: %v" , err )
636+ }
637+
638+ return & clients , nil
639+ }
640+
641+ func reconfigureMIG (clientset * kubernetes.Clientset , migConfigValue string ) error {
642+ gpuClients , err := parseGPUCLientsFile (gpuClientsFileFlag )
643+ if err != nil {
644+ return fmt .Errorf ("error parsing host's GPU clients file: %s" , err )
645+ }
646+
647+ opts := & MIGReconfigOptions {
648+ NodeName : nodeNameFlag ,
649+ MIGPartedConfigFile : migPartedConfigFileFlag ,
650+ SelectedMIGConfig : migConfigValue ,
651+ WithReboot : withRebootFlag ,
652+ WithShutdownHostGPUClients : withShutdownHostGPUClientsFlag ,
653+ HostRootMount : hostRootMountFlag ,
654+ HostNvidiaDir : hostNvidiaDirFlag ,
655+ HostMIGManagerStateFile : hostMigManagerStateFileFlag ,
656+ HostGPUClientServices : gpuClients .SystemdServices ,
657+ HostKubeletService : hostKubeletSystemdServiceFlag ,
658+ }
659+
660+ return ReconfigureMIG (clientset , opts )
661+ }
662+
663+ // convertToMIGConfigFormat converts a vGPU type string to the MIG config format.
664+ // Examples: "A100-1-5C" -> "all-1g.5gb", "A100-1-5CME" -> "all-1g.5gb.me"
665+ func convertToMIGConfigFormat (s string ) (string , error ) {
666+ vgpu , err := types .ParseVGPUType (s )
667+ if err != nil {
668+ return "" , fmt .Errorf ("failed to parse vGPU type: %v" , err )
669+ }
670+
671+ // Base format: all-{g}g.{gb}gb
672+ result := fmt .Sprintf ("all-%dg.%dgb" , vgpu .G , vgpu .GB )
673+
674+ // Add .me suffix if media extension attribute is present
675+ for _ , attr := range vgpu .Attr {
676+ if attr == types .AttributeMediaExtensions {
677+ result += ".me"
678+ break
679+ }
680+ }
681+
682+ return result , nil
683+ }
0 commit comments