NVIDIA
diff --git a/‎cmd/nvidia-k8s-vgpu-dm/main.go‎
Lines changed: 242 additions & 0 deletions b/‎cmd/nvidia-k8s-vgpu-dm/main.go‎
Lines changed: 242 additions & 0 deletions
diff --git a/‎cmd/nvidia-k8s-vgpu-dm/nvml.go‎
Lines changed: 37 additions & 0 deletions b/‎cmd/nvidia-k8s-vgpu-dm/nvml.go‎
Lines changed: 37 additions & 0 deletions
@@ -26,6 +26,7 @@ import (
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/tools/cache"
 	"k8s.io/client-go/tools/clientcmd"
+	"sigs.k8s.io/yaml"
 
 	"context"
 	"sync"
@@ -37,7 +38,13 @@ import (
 	"k8s.io/apimachinery/pkg/fields"
 	"k8s.io/apimachinery/pkg/util/wait"
 
+	migpartedv1 "github.com/NVIDIA/mig-parted/api/spec/v1"
+	migtypes "github.com/NVIDIA/mig-parted/pkg/types"
+
+	v1 "github.com/NVIDIA/vgpu-device-manager/api/spec/v1"
+	"github.com/NVIDIA/vgpu-device-manager/cmd/nvidia-vgpu-dm/assert"
 	"github.com/NVIDIA/vgpu-device-manager/internal/info"
+	"github.com/NVIDIA/vgpu-device-manager/pkg/types"
 )
 
 const (
@@ -47,6 +54,13 @@ const (
 	vGPUConfigStateLabel = "nvidia.com/vgpu.config.state"
 	pluginStateLabel     = "nvidia.com/gpu.deploy.sandbox-device-plugin"
 	validatorStateLabel  = "nvidia.com/gpu.deploy.sandbox-validator"
+
+	defaultHostRootMount             = "/host"
+	defaultHostNvidiaDir             = "/usr/local/nvidia"
+	defaultHostMigManagerStateFile   = "/etc/systemd/system/nvidia-mig-manager.service.d/override.conf"
+	defaultHostKubeletSystemdService = "kubelet.service"
+
+	migConfigDisabled = "all-disabled"
 )
 
 var (
@@ -56,10 +70,23 @@ var (
 	configFileFlag        string
 	defaultVGPUConfigFlag string
 
+	hostRootMountFlag              string
+	hostNvidiaDirFlag              string
+	hostMigManagerStateFileFlag    string
+	hostKubeletSystemdServiceFlag  string
+	gpuClientsFileFlag             string
+	withRebootFlag                 bool
+	withShutdownHostGPUClientsFlag bool
+
 	pluginDeployed    string
 	validatorDeployed string
 )
 
+// GPUClients is the in-memory form of the GPU clients file. It is populated
+// from YAML by parseGPUCLientsFile; the struct tags also allow JSON decoding.
+type GPUClients struct {
+	// Version holds the value of the file's "version" key.
+	Version         string   `json:"version"          yaml:"version"`
+	// SystemdServices holds the entries of the file's "systemd-services" key.
+	// updateMIGConfig forwards them as the host GPU client services.
+	SystemdServices []string `json:"systemd-services" yaml:"systemd-services"`
+}
+
 // SyncableVGPUConfig is used to synchronize on changes to a configuration value.
 // That is, callers of Get() will block until a call to Set() is made.
 // Multiple calls to Set() do not queue, meaning that only calls to Get() made
@@ -148,6 +175,62 @@ func main() {
 			Destination: &defaultVGPUConfigFlag,
 			EnvVars:     []string{"DEFAULT_VGPU_CONFIG"},
 		},
+		&cli.StringFlag{
+			Name:        "host-root-mount",
+			Aliases:     []string{"m"},
+			Value:       defaultHostRootMount,
+			Usage:       "container path where host root directory is mounted",
+			Destination: &hostRootMountFlag,
+			EnvVars:     []string{"HOST_ROOT_MOUNT"},
+		},
+		&cli.StringFlag{
+			Name:        "host-nvidia-dir",
+			Aliases:     []string{"i"},
+			Value:       defaultHostNvidiaDir,
+			Usage:       "host path of the directory where NVIDIA managed software directory is typically located",
+			Destination: &hostNvidiaDirFlag,
+			EnvVars:     []string{"HOST_NVIDIA_DIR"},
+		},
+		&cli.StringFlag{
+			Name:        "host-mig-manager-state-file",
+			Aliases:     []string{"o"},
+			Value:       defaultHostMigManagerStateFile,
+			Usage:       "host path where the host's systemd mig-manager state file is located",
+			Destination: &hostMigManagerStateFileFlag,
+			EnvVars:     []string{"HOST_MIG_MANAGER_STATE_FILE"},
+		},
+		&cli.StringFlag{
+			Name:        "host-kubelet-systemd-service",
+			Aliases:     []string{"k"},
+			Value:       defaultHostKubeletSystemdService,
+			Usage:       "name of the host's 'kubelet' systemd service which may need to be shutdown/restarted across a MIG mode reconfiguration",
+			Destination: &hostKubeletSystemdServiceFlag,
+			EnvVars:     []string{"HOST_KUBELET_SYSTEMD_SERVICE"},
+		},
+		&cli.StringFlag{
+			Name:        "gpu-clients-file",
+			Aliases:     []string{"g"},
+			Value:       "",
+			Usage:       "the path to the file listing the GPU clients that need to be shutdown across a MIG configuration",
+			Destination: &gpuClientsFileFlag,
+			EnvVars:     []string{"GPU_CLIENTS_FILE"},
+		},
+		&cli.BoolFlag{
+			Name:        "with-reboot",
+			Aliases:     []string{"r"},
+			Value:       false,
+			Usage:       "reboot the node if changing the MIG mode fails for any reason",
+			Destination: &withRebootFlag,
+			EnvVars:     []string{"WITH_REBOOT"},
+		},
+		&cli.BoolFlag{
+			Name:        "with-shutdown-host-gpu-clients",
+			Aliases:     []string{"w"},
+			Value:       false,
+			Usage:       "shutdown/restart any required host GPU clients across a MIG configuration",
+			Destination: &withShutdownHostGPUClientsFlag,
+			EnvVars:     []string{"WITH_SHUTDOWN_HOST_GPU_CLIENTS"},
+		},
 	}
 
 	log.Infof("version: %s", c.Version)
@@ -296,6 +379,10 @@ func updateConfig(clientset *kubernetes.Clientset, selectedConfig string) error
 		return fmt.Errorf("unable to shutdown gpu operands: %v", err)
 	}
 
+	if err := handleMIGConfiguration(clientset, selectedConfig); err != nil {
+		return fmt.Errorf("unable to handle MIG configuration: %v", err)
+	}
+
 	log.Info("Applying the selected vGPU device configuration to the node")
 	err = applyConfig(selectedConfig)
 	if err != nil {
@@ -504,3 +591,158 @@ func setNodeLabelValue(clientset *kubernetes.Clientset, label, value string) err
 
 	return nil
 }
+
+// handleMIGConfiguration derives a mig-parted configuration from the selected
+// vGPU configuration and applies it through updateMIGConfig.
+//
+// If NVML cannot be initialized the MIG step is skipped and nil is returned so
+// that the caller can continue with the vGPU configuration. An error is
+// returned when clientset is not a *kubernetes.Clientset, when the MIG config
+// cannot be determined or written to a temporary file, or when applying the
+// MIG configuration fails.
+func handleMIGConfiguration(clientset kubernetes.Interface, selectedConfig string) error {
+	if err := isNVMLAvailable(); err != nil {
+		log.Infof("Skipping MIG configuration due to NVML error: %v, proceeding with vGPU configuration", err)
+		return nil
+	}
+
+	// updateMIGConfig needs the concrete client type. Check it up front, and
+	// before any temporary file exists, so that an unexpected implementation
+	// yields an error instead of a panic and nothing is left behind on disk.
+	concreteClientset, ok := clientset.(*kubernetes.Clientset)
+	if !ok {
+		return fmt.Errorf("unsupported clientset type %T", clientset)
+	}
+
+	migConfig, err := determineMIGConfig(selectedConfig)
+	if err != nil {
+		return err
+	}
+
+	configFile, err := saveMIGConfigToTempFile(migConfig)
+	if err != nil {
+		return fmt.Errorf("failed to save MIG config to temporary file: %w", err)
+	}
+
+	// updateMIGConfig removes configFile when it returns.
+	return updateMIGConfig(concreteClientset, configFile, selectedConfig)
+}
+
+// determineMIGConfig parses the vGPU config file named by configFileFlag,
+// selects the entry called selectedConfig, and converts it into a mig-parted
+// spec via convertToMIGConfig.
+//
+// Parse and selection errors are wrapped with %w so callers can inspect the
+// underlying cause with errors.Is / errors.As.
+func determineMIGConfig(selectedConfig string) (*migpartedv1.Spec, error) {
+	f := &assert.Flags{
+		ConfigFile:     configFileFlag,
+		SelectedConfig: selectedConfig,
+		ValidConfig:    false, // We don't need to validate the config here, just parse it.
+	}
+
+	log.Debug("Parsing vGPU config file...")
+	spec, err := assert.ParseConfigFile(f)
+	if err != nil {
+		return nil, fmt.Errorf("error parsing config file: %w", err)
+	}
+
+	log.Debug("Selecting specific vGPU config...")
+	vgpuConfig, err := assert.GetSelectedVGPUConfig(f, spec)
+	if err != nil {
+		return nil, fmt.Errorf("error selecting VGPU config: %w", err)
+	}
+
+	return convertToMIGConfig(vgpuConfig, selectedConfig)
+}
+
+// convertToMIGConfig translates a selected vGPU configuration into a
+// mig-parted spec containing a single named config, selectedConfig.
+//
+// Each vGPU spec yields one MIG spec with the same device filter and devices.
+// Every vGPU type whose parsed G value is greater than zero contributes a MIG
+// profile named "<G>g.<GB>gb" (with ".me" appended when the media-extensions
+// attribute is present), carrying the vGPU type's configured count. MIG is
+// marked enabled on a spec only if at least one such type was found. An error
+// is returned if any vGPU type name fails to parse.
+func convertToMIGConfig(vgpuConfig v1.VGPUConfigSpecSlice, selectedConfig string) (*migpartedv1.Spec, error) {
+	var converted migpartedv1.MigConfigSpecSlice
+
+	for _, src := range vgpuConfig {
+		profiles := make(migtypes.MigConfig)
+		enabled := false
+
+		for typeName, count := range src.VGPUDevices {
+			parsed, err := types.ParseVGPUType(typeName)
+			if err != nil {
+				return nil, fmt.Errorf("failed to parse vGPU type %s: %w", typeName, err)
+			}
+
+			// Types without a G component contribute no MIG profile.
+			if parsed.G <= 0 {
+				continue
+			}
+			enabled = true
+
+			name := fmt.Sprintf("%dg.%dgb", parsed.G, parsed.GB)
+			for _, a := range parsed.Attr {
+				if a == types.AttributeMediaExtensions {
+					name += ".me"
+					break
+				}
+			}
+			profiles[name] = count
+		}
+
+		converted = append(converted, migpartedv1.MigConfigSpec{
+			DeviceFilter: src.DeviceFilter,
+			Devices:      src.Devices,
+			MigEnabled:   enabled,
+			MigDevices:   profiles,
+		})
+	}
+
+	return &migpartedv1.Spec{
+		Version: migpartedv1.Version,
+		MigConfigs: map[string]migpartedv1.MigConfigSpecSlice{
+			selectedConfig: converted,
+		},
+	}, nil
+}
+
+// saveMIGConfigToTempFile serializes migConfig as YAML into a newly created
+// temporary file and returns the file's path. The caller owns the returned
+// file and is responsible for removing it.
+//
+// On any failure an empty path is returned and no temporary file is left
+// behind.
+func saveMIGConfigToTempFile(migConfig *migpartedv1.Spec) (string, error) {
+	// Marshal first so that a serialization failure never creates a file.
+	yamlData, err := yaml.Marshal(migConfig)
+	if err != nil {
+		return "", fmt.Errorf("failed to marshal MIG config to YAML: %w", err)
+	}
+
+	tempFile, err := os.CreateTemp("", "mig-parted-config-*.yaml")
+	if err != nil {
+		return "", fmt.Errorf("failed to create temporary file: %w", err)
+	}
+	path := tempFile.Name()
+
+	if _, err := tempFile.Write(yamlData); err != nil {
+		// Best-effort cleanup; the write error is the one worth reporting.
+		_ = tempFile.Close()
+		_ = os.Remove(path)
+		return "", fmt.Errorf("failed to write YAML data to temporary file: %w", err)
+	}
+
+	// A failed Close can mean the data never reached disk, so it must not be
+	// ignored: the file is about to be handed to another component.
+	if err := tempFile.Close(); err != nil {
+		_ = os.Remove(path) // best-effort cleanup
+		return "", fmt.Errorf("failed to close temporary file: %w", err)
+	}
+
+	return path, nil
+}
+
+// updateMIGConfig applies the MIG configuration stored in migPartedConfigFile
+// by calling reconfigureMIG with options assembled from the command-line
+// flags and the parsed GPU clients file.
+//
+// The config file is removed when this function returns, whether or not the
+// reconfiguration succeeded; a failed removal is only logged.
+func updateMIGConfig(clientset *kubernetes.Clientset, migPartedConfigFile, selectedConfig string) error {
+	defer func() {
+		rmErr := os.Remove(migPartedConfigFile)
+		if rmErr == nil {
+			return
+		}
+		log.Errorf("Failed to remove temporary mig-parted config file %s: %v", migPartedConfigFile, rmErr)
+	}()
+
+	hostClients, err := parseGPUCLientsFile(gpuClientsFileFlag)
+	if err != nil {
+		return fmt.Errorf("error parsing host's GPU clients file: %w", err)
+	}
+
+	return reconfigureMIG(clientset, &reconfigureMIGOptions{
+		NodeName:                   nodeNameFlag,
+		MIGPartedConfigFile:        migPartedConfigFile,
+		SelectedMIGConfig:          selectedConfig,
+		WithReboot:                 withRebootFlag,
+		WithShutdownHostGPUClients: withShutdownHostGPUClientsFlag,
+		HostRootMount:              hostRootMountFlag,
+		HostNvidiaDir:              hostNvidiaDirFlag,
+		HostMIGManagerStateFile:    hostMigManagerStateFileFlag,
+		HostGPUClientServices:      hostClients.SystemdServices,
+		HostKubeletService:         hostKubeletSystemdServiceFlag,
+	})
+}
+
+func parseGPUCLientsFile(file string) (*GPUClients, error) {
+	var err error
+	var yamlBytes []byte
+
+	if file == "" {
+		return &GPUClients{}, nil
+	}
+
+	yamlBytes, err = os.ReadFile(file)
+	if err != nil {
+		return nil, fmt.Errorf("read error: %w", err)
+	}
+
+	var clients GPUClients
+	err = yaml.Unmarshal(yamlBytes, &clients)
+	if err != nil {
+		return nil, fmt.Errorf("unmarshal error: %w", err)
+	}
+
+	return &clients, nil
+}
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package main
+
+import (
+	log "github.com/sirupsen/logrus"
+
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+)
+
+// isNVMLAvailable probes NVML by initializing and then shutting down the
+// library. It returns the NVML return code as an error when initialization
+// does not succeed, and nil otherwise. A failed shutdown is only logged as a
+// warning and does not affect the result.
+func isNVMLAvailable() error {
+	lib := nvml.New()
+
+	initRet := lib.Init()
+	if initRet != nvml.SUCCESS {
+		return initRet
+	}
+
+	shutdownRet := lib.Shutdown()
+	if shutdownRet != nvml.SUCCESS {
+		log.Warnf("error shutting down NVML: %v", shutdownRet)
+	}
+
+	return nil
+}