Skip to content

Commit 9c10da3

Browse files
committed
Add MIG config support when MIG-backed vGPU type
Signed-off-by: Michail Resvanis <mresvani@redhat.com>
1 parent 5f37569 commit 9c10da3

File tree

3 files changed

+560
-0
lines changed

3 files changed

+560
-0
lines changed

cmd/nvidia-k8s-vgpu-dm/main.go

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"k8s.io/client-go/kubernetes"
2727
"k8s.io/client-go/tools/cache"
2828
"k8s.io/client-go/tools/clientcmd"
29+
"sigs.k8s.io/yaml"
2930

3031
"context"
3132
"sync"
@@ -38,6 +39,7 @@ import (
3839
"k8s.io/apimachinery/pkg/util/wait"
3940

4041
"github.com/NVIDIA/vgpu-device-manager/internal/info"
42+
"github.com/NVIDIA/vgpu-device-manager/pkg/types"
4143
)
4244

4345
const (
@@ -47,6 +49,13 @@ const (
4749
vGPUConfigStateLabel = "nvidia.com/vgpu.config.state"
4850
pluginStateLabel = "nvidia.com/gpu.deploy.sandbox-device-plugin"
4951
validatorStateLabel = "nvidia.com/gpu.deploy.sandbox-validator"
52+
53+
defaultHostRootMount = "/host"
54+
defaultHostNvidiaDir = "/usr/local/nvidia"
55+
defaultHostMigManagerStateFile = "/etc/systemd/system/nvidia-mig-manager.service.d/override.conf"
56+
defaultHostKubeletSystemdService = "kubelet.service"
57+
58+
migConfigDisabled = "all-disabled"
5059
)
5160

5261
var (
@@ -56,10 +65,24 @@ var (
5665
configFileFlag string
5766
defaultVGPUConfigFlag string
5867

68+
migPartedConfigFileFlag string
69+
hostRootMountFlag string
70+
hostNvidiaDirFlag string
71+
hostMigManagerStateFileFlag string
72+
hostKubeletSystemdServiceFlag string
73+
gpuClientsFileFlag string
74+
withRebootFlag bool
75+
withShutdownHostGPUClientsFlag bool
76+
5977
pluginDeployed string
6078
validatorDeployed string
6179
)
6280

81+
// GPUClients is the decoded form of the GPU clients file. It is the target
// type that parseGPUCLientsFile unmarshals the file's YAML content into.
type GPUClients struct {
	// Version holds the value of the file's "version" key.
	Version string `json:"version" yaml:"version"`
	// SystemdServices holds the entries of the file's "systemd-services" key.
	// reconfigureMIG forwards this list as the host GPU client services.
	SystemdServices []string `json:"systemd-services" yaml:"systemd-services"`
}
85+
6386
// SyncableVGPUConfig is used to synchronize on changes to a configuration value.
6487
// That is, callers of Get() will block until a call to Set() is made.
6588
// Multiple calls to Set() do not queue, meaning that only calls to Get() made
@@ -148,6 +171,70 @@ func main() {
148171
Destination: &defaultVGPUConfigFlag,
149172
EnvVars: []string{"DEFAULT_VGPU_CONFIG"},
150173
},
174+
&cli.StringFlag{
175+
Name: "mig-parted-config-file",
176+
Aliases: []string{"mc"},
177+
Value: "",
178+
Usage: "the path to the mig-parted configuration file",
179+
Destination: &migPartedConfigFileFlag,
180+
EnvVars: []string{"MIG_PARTED_CONFIG_FILE"},
181+
},
182+
&cli.StringFlag{
183+
Name: "host-root-mount",
184+
Aliases: []string{"m"},
185+
Value: defaultHostRootMount,
186+
Usage: "container path where host root directory is mounted",
187+
Destination: &hostRootMountFlag,
188+
EnvVars: []string{"HOST_ROOT_MOUNT"},
189+
},
190+
&cli.StringFlag{
191+
Name: "host-nvidia-dir",
192+
Aliases: []string{"i"},
193+
Value: defaultHostNvidiaDir,
194+
Usage: "host path of the directory where NVIDIA managed software directory is typically located",
195+
Destination: &hostNvidiaDirFlag,
196+
EnvVars: []string{"HOST_NVIDIA_DIR"},
197+
},
198+
&cli.StringFlag{
199+
Name: "host-mig-manager-state-file",
200+
Aliases: []string{"o"},
201+
Value: defaultHostMigManagerStateFile,
202+
Usage: "host path where the host's systemd mig-manager state file is located",
203+
Destination: &hostMigManagerStateFileFlag,
204+
EnvVars: []string{"HOST_MIG_MANAGER_STATE_FILE"},
205+
},
206+
&cli.StringFlag{
207+
Name: "host-kubelet-systemd-service",
208+
Aliases: []string{"k"},
209+
Value: defaultHostKubeletSystemdService,
210+
Usage: "name of the host's 'kubelet' systemd service which may need to be shutdown/restarted across a MIG mode reconfiguration",
211+
Destination: &hostKubeletSystemdServiceFlag,
212+
EnvVars: []string{"HOST_KUBELET_SYSTEMD_SERVICE"},
213+
},
214+
&cli.StringFlag{
215+
Name: "gpu-clients-file",
216+
Aliases: []string{"g"},
217+
Value: "",
218+
Usage: "the path to the file listing the GPU clients that need to be shutdown across a MIG configuration",
219+
Destination: &gpuClientsFileFlag,
220+
EnvVars: []string{"GPU_CLIENTS_FILE"},
221+
},
222+
&cli.BoolFlag{
223+
Name: "with-reboot",
224+
Aliases: []string{"r"},
225+
Value: false,
226+
Usage: "reboot the node if changing the MIG mode fails for any reason",
227+
Destination: &withRebootFlag,
228+
EnvVars: []string{"WITH_REBOOT"},
229+
},
230+
&cli.BoolFlag{
231+
Name: "with-shutdown-host-gpu-clients",
232+
Aliases: []string{"w"},
233+
Value: false,
234+
Usage: "shutdown/restart any required host GPU clients across a MIG configuration",
235+
Destination: &withShutdownHostGPUClientsFlag,
236+
EnvVars: []string{"WITH_SHUTDOWN_HOST_GPU_CLIENTS"},
237+
},
151238
}
152239

153240
log.Infof("version: %s", c.Version)
@@ -296,6 +383,30 @@ func updateConfig(clientset *kubernetes.Clientset, selectedConfig string) error
296383
return fmt.Errorf("unable to shutdown gpu operands: %v", err)
297384
}
298385

386+
vgpuType, err := types.ParseVGPUType(selectedConfig)
387+
if err != nil {
388+
return fmt.Errorf("unable to parse vGPU type: %s", err)
389+
}
390+
if vgpuType.G > 0 {
391+
log.Info("Running reconfigure MIG script")
392+
393+
selectedMigConfig, err := convertToMIGConfigFormat(selectedConfig)
394+
if err != nil {
395+
return fmt.Errorf("unable to convert vGPU type config to MIG config: %s", err)
396+
}
397+
398+
err = reconfigureMIG(clientset, selectedMigConfig)
399+
if err != nil {
400+
return fmt.Errorf("unable to run reconfigure MIG script: %s", err)
401+
}
402+
} else {
403+
log.Info("Disabling MIG if enabled")
404+
err = reconfigureMIG(clientset, migConfigDisabled)
405+
if err != nil {
406+
return fmt.Errorf("unable to run reconfigure MIG script: %s", err)
407+
}
408+
}
409+
299410
log.Info("Applying the selected vGPU device configuration to the node")
300411
err = applyConfig(selectedConfig)
301412
if err != nil {
@@ -504,3 +615,69 @@ func setNodeLabelValue(clientset *kubernetes.Clientset, label, value string) err
504615

505616
return nil
506617
}
618+
619+
func parseGPUCLientsFile(file string) (*GPUClients, error) {
620+
var err error
621+
var yamlBytes []byte
622+
623+
if file == "" {
624+
return &GPUClients{}, nil
625+
}
626+
627+
yamlBytes, err = os.ReadFile(file)
628+
if err != nil {
629+
return nil, fmt.Errorf("read error: %v", err)
630+
}
631+
632+
var clients GPUClients
633+
err = yaml.Unmarshal(yamlBytes, &clients)
634+
if err != nil {
635+
return nil, fmt.Errorf("unmarshal error: %v", err)
636+
}
637+
638+
return &clients, nil
639+
}
640+
641+
func reconfigureMIG(clientset *kubernetes.Clientset, migConfigValue string) error {
642+
gpuClients, err := parseGPUCLientsFile(gpuClientsFileFlag)
643+
if err != nil {
644+
return fmt.Errorf("error parsing host's GPU clients file: %s", err)
645+
}
646+
647+
opts := &MIGReconfigOptions{
648+
NodeName: nodeNameFlag,
649+
MIGPartedConfigFile: migPartedConfigFileFlag,
650+
SelectedMIGConfig: migConfigValue,
651+
WithReboot: withRebootFlag,
652+
WithShutdownHostGPUClients: withShutdownHostGPUClientsFlag,
653+
HostRootMount: hostRootMountFlag,
654+
HostNvidiaDir: hostNvidiaDirFlag,
655+
HostMIGManagerStateFile: hostMigManagerStateFileFlag,
656+
HostGPUClientServices: gpuClients.SystemdServices,
657+
HostKubeletService: hostKubeletSystemdServiceFlag,
658+
}
659+
660+
return ReconfigureMIG(clientset, opts)
661+
}
662+
663+
// convertToMIGConfigFormat converts a vGPU type string to the MIG config format.
664+
// Examples: "A100-1-5C" -> "all-1g.5gb", "A100-1-5CME" -> "all-1g.5gb.me"
665+
func convertToMIGConfigFormat(s string) (string, error) {
666+
vgpu, err := types.ParseVGPUType(s)
667+
if err != nil {
668+
return "", fmt.Errorf("failed to parse vGPU type: %v", err)
669+
}
670+
671+
// Base format: all-{g}g.{gb}gb
672+
result := fmt.Sprintf("all-%dg.%dgb", vgpu.G, vgpu.GB)
673+
674+
// Add .me suffix if media extension attribute is present
675+
for _, attr := range vgpu.Attr {
676+
if attr == types.AttributeMediaExtensions {
677+
result += ".me"
678+
break
679+
}
680+
}
681+
682+
return result, nil
683+
}

0 commit comments

Comments
 (0)