Skip to content

Commit 6348f09

Browse files
committed
Add MIG config support when MIG-backed vGPU type
Signed-off-by: Michail Resvanis <mresvani@redhat.com>
1 parent 5f37569 commit 6348f09

File tree

3 files changed

+599
-0
lines changed

3 files changed

+599
-0
lines changed

cmd/nvidia-k8s-vgpu-dm/main.go

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,14 @@ import (
2020
"fmt"
2121
"os"
2222
"os/exec"
23+
"strings"
2324

2425
log "github.com/sirupsen/logrus"
2526
cli "github.com/urfave/cli/v2"
2627
"k8s.io/client-go/kubernetes"
2728
"k8s.io/client-go/tools/cache"
2829
"k8s.io/client-go/tools/clientcmd"
30+
"sigs.k8s.io/yaml"
2931

3032
"context"
3133
"sync"
@@ -38,6 +40,7 @@ import (
3840
"k8s.io/apimachinery/pkg/util/wait"
3941

4042
"github.com/NVIDIA/vgpu-device-manager/internal/info"
43+
"github.com/NVIDIA/vgpu-device-manager/pkg/types"
4144
)
4245

4346
const (
@@ -47,6 +50,14 @@ const (
4750
vGPUConfigStateLabel = "nvidia.com/vgpu.config.state"
4851
pluginStateLabel = "nvidia.com/gpu.deploy.sandbox-device-plugin"
4952
validatorStateLabel = "nvidia.com/gpu.deploy.sandbox-validator"
53+
54+
defaultReconfigureMIGScript = "/usr/bin/reconfigure-mig.sh"
55+
defaultHostRootMount = "/host"
56+
defaultHostNvidiaDir = "/usr/local/nvidia"
57+
defaultHostMigManagerStateFile = "/etc/systemd/system/nvidia-mig-manager.service.d/override.conf"
58+
defaultHostKubeletSystemdService = "kubelet.service"
59+
60+
migConfigDisabled = "all-disabled"
5061
)
5162

5263
var (
@@ -56,10 +67,25 @@ var (
5667
configFileFlag string
5768
defaultVGPUConfigFlag string
5869

70+
reconfigureMIGScriptFlag string
71+
migPartedConfigFileFlag string
72+
hostRootMountFlag string
73+
hostNvidiaDirFlag string
74+
hostMigManagerStateFileFlag string
75+
hostKubeletSystemdServiceFlag string
76+
gpuClientsFileFlag string
77+
withRebootFlag bool
78+
withShutdownHostGPUClientsFlag bool
79+
5980
pluginDeployed string
6081
validatorDeployed string
6182
)
6283

84+
// GPUClients is the decoded form of the GPU clients file. It carries a
// "version" string and a "systemd-services" list of service names; in this
// file the list is joined with commas and handed to the reconfigure MIG script
// through its "-g" argument.
type GPUClients struct {
85+
Version string `json:"version" yaml:"version"`
86+
SystemdServices []string `json:"systemd-services" yaml:"systemd-services"`
87+
}
88+
6389
// SyncableVGPUConfig is used to synchronize on changes to a configuration value.
6490
// That is, callers of Get() will block until a call to Set() is made.
6591
// Multiple calls to Set() do not queue, meaning that only calls to Get() made
@@ -148,6 +174,78 @@ func main() {
148174
Destination: &defaultVGPUConfigFlag,
149175
EnvVars: []string{"DEFAULT_VGPU_CONFIG"},
150176
},
177+
&cli.StringFlag{
178+
Name: "reconfigure-mig-script",
179+
Aliases: []string{"s"},
180+
Value: defaultReconfigureMIGScript,
181+
Usage: "script to run to do the actual MIG reconfiguration",
182+
Destination: &reconfigureMIGScriptFlag,
183+
EnvVars: []string{"RECONFIGURE_MIG_SCRIPT"},
184+
},
185+
&cli.StringFlag{
186+
Name: "mig-parted-config-file",
187+
Aliases: []string{"mc"},
188+
Value: "",
189+
Usage: "the path to the mig-parted configuration file",
190+
Destination: &migPartedConfigFileFlag,
191+
EnvVars: []string{"MIG_PARTED_CONFIG_FILE"},
192+
},
193+
&cli.StringFlag{
194+
Name: "host-root-mount",
195+
Aliases: []string{"m"},
196+
Value: defaultHostRootMount,
197+
Usage: "container path where host root directory is mounted",
198+
Destination: &hostRootMountFlag,
199+
EnvVars: []string{"HOST_ROOT_MOUNT"},
200+
},
201+
&cli.StringFlag{
202+
Name: "host-nvidia-dir",
203+
Aliases: []string{"i"},
204+
Value: defaultHostNvidiaDir,
205+
Usage: "host path of the directory where NVIDIA managed software directory is typically located",
206+
Destination: &hostNvidiaDirFlag,
207+
EnvVars: []string{"HOST_NVIDIA_DIR"},
208+
},
209+
&cli.StringFlag{
210+
Name: "host-mig-manager-state-file",
211+
Aliases: []string{"o"},
212+
Value: defaultHostMigManagerStateFile,
213+
Usage: "host path where the host's systemd mig-manager state file is located",
214+
Destination: &hostMigManagerStateFileFlag,
215+
EnvVars: []string{"HOST_MIG_MANAGER_STATE_FILE"},
216+
},
217+
&cli.StringFlag{
218+
Name: "host-kubelet-systemd-service",
219+
Aliases: []string{"k"},
220+
Value: defaultHostKubeletSystemdService,
221+
Usage: "name of the host's 'kubelet' systemd service which may need to be shutdown/restarted across a MIG mode reconfiguration",
222+
Destination: &hostKubeletSystemdServiceFlag,
223+
EnvVars: []string{"HOST_KUBELET_SYSTEMD_SERVICE"},
224+
},
225+
&cli.StringFlag{
226+
Name: "gpu-clients-file",
227+
Aliases: []string{"g"},
228+
Value: "",
229+
Usage: "the path to the file listing the GPU clients that need to be shutdown across a MIG configuration",
230+
Destination: &gpuClientsFileFlag,
231+
EnvVars: []string{"GPU_CLIENTS_FILE"},
232+
},
233+
&cli.BoolFlag{
234+
Name: "with-reboot",
235+
Aliases: []string{"r"},
236+
Value: false,
237+
Usage: "reboot the node if changing the MIG mode fails for any reason",
238+
Destination: &withRebootFlag,
239+
EnvVars: []string{"WITH_REBOOT"},
240+
},
241+
&cli.BoolFlag{
242+
Name: "with-shutdown-host-gpu-clients",
243+
Aliases: []string{"w"},
244+
Value: false,
245+
Usage: "shutdown/restart any required host GPU clients across a MIG configuration",
246+
Destination: &withShutdownHostGPUClientsFlag,
247+
EnvVars: []string{"WITH_SHUTDOWN_HOST_GPU_CLIENTS"},
248+
},
151249
}
152250

153251
log.Infof("version: %s", c.Version)
@@ -296,6 +394,30 @@ func updateConfig(clientset *kubernetes.Clientset, selectedConfig string) error
296394
return fmt.Errorf("unable to shutdown gpu operands: %v", err)
297395
}
298396

397+
vgpuType, err := types.ParseVGPUType(selectedConfig)
398+
if err != nil {
399+
return fmt.Errorf("unable to parse vGPU type: %s", err)
400+
}
401+
if vgpuType.G > 0 {
402+
log.Info("Running reconfigure MIG script")
403+
404+
selectedMigConfig, err := convertToMIGConfigFormat(selectedConfig)
405+
if err != nil {
406+
return fmt.Errorf("unable to convert vGPU type config to MIG config: %s", err)
407+
}
408+
409+
err = runReconfigureMIGScript(selectedMigConfig)
410+
if err != nil {
411+
return fmt.Errorf("unable to run reconfigure MIG script: %s", err)
412+
}
413+
} else {
414+
log.Info("Disabling MIG if enabled")
415+
err = runReconfigureMIGScript(migConfigDisabled)
416+
if err != nil {
417+
return fmt.Errorf("unable to run reconfigure MIG script: %s", err)
418+
}
419+
}
420+
299421
log.Info("Applying the selected vGPU device configuration to the node")
300422
err = applyConfig(selectedConfig)
301423
if err != nil {
@@ -504,3 +626,78 @@ func setNodeLabelValue(clientset *kubernetes.Clientset, label, value string) err
504626

505627
return nil
506628
}
629+
630+
func parseGPUCLientsFile(file string) (*GPUClients, error) {
631+
var err error
632+
var yamlBytes []byte
633+
634+
if file == "" {
635+
return &GPUClients{}, nil
636+
}
637+
638+
yamlBytes, err = os.ReadFile(file)
639+
if err != nil {
640+
return nil, fmt.Errorf("read error: %v", err)
641+
}
642+
643+
var clients GPUClients
644+
err = yaml.Unmarshal(yamlBytes, &clients)
645+
if err != nil {
646+
return nil, fmt.Errorf("unmarshal error: %v", err)
647+
}
648+
649+
return &clients, nil
650+
}
651+
652+
func runReconfigureMIGScript(migConfigValue string) error {
653+
gpuClients, err := parseGPUCLientsFile(gpuClientsFileFlag)
654+
if err != nil {
655+
return fmt.Errorf("error parsing host's GPU clients file: %s", err)
656+
}
657+
658+
args := []string{
659+
"-n", nodeNameFlag,
660+
"-f", migPartedConfigFileFlag,
661+
"-c", migConfigValue,
662+
"-m", hostRootMountFlag,
663+
"-i", hostNvidiaDirFlag,
664+
"-o", hostMigManagerStateFileFlag,
665+
"-g", strings.Join(gpuClients.SystemdServices, ","),
666+
"-k", hostKubeletSystemdServiceFlag,
667+
}
668+
if withRebootFlag {
669+
args = append(args, "-r")
670+
}
671+
if withShutdownHostGPUClientsFlag {
672+
args = append(args, "-w")
673+
}
674+
675+
stdLogger := log.StandardLogger()
676+
677+
cmd := exec.Command(reconfigureMIGScriptFlag, args...)
678+
cmd.Stdout = stdLogger.Writer()
679+
cmd.Stderr = stdLogger.WriterLevel(log.ErrorLevel)
680+
return cmd.Run()
681+
}
682+
683+
// convertToMIGConfigFormat converts a vGPU type string to the MIG config
684+
// equivalent, e.g. "all-{g}g.{gb}gb[.me]".
685+
func convertToMIGConfigFormat(s string) (string, error) {
686+
vgpu, err := types.ParseVGPUType(s)
687+
if err != nil {
688+
return "", fmt.Errorf("failed to parse vGPU type: %v", err)
689+
}
690+
691+
// Base format: all-{g}g.{gb}gb
692+
result := fmt.Sprintf("all-%dg.%dgb", vgpu.G, vgpu.GB)
693+
694+
// Add .me suffix if media extension attribute is present
695+
for _, attr := range vgpu.Attr {
696+
if attr == types.AttributeMediaExtensions {
697+
result += ".me"
698+
break
699+
}
700+
}
701+
702+
return result, nil
703+
}

deployments/container/Dockerfile.ubi9

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,21 @@ ARG VERSION="N/A"
3838
ARG GIT_COMMIT="unknown"
3939
RUN make PREFIX=/artifacts cmds
4040

41+
RUN cp ./deployments/container/reconfigure-mig.sh /artifacts/reconfigure-mig.sh
42+
43+
ARG TARGETARCH
44+
RUN OS_ARCH=${TARGETARCH/x86_64/amd64} && OS_ARCH=${OS_ARCH/aarch64/arm64} && curl -o /artifacts/kubectl -L "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/${OS_ARCH}/kubectl"
45+
RUN chmod a+x /artifacts/kubectl
4146

4247
FROM nvcr.io/nvidia/distroless/go:v3.1.8
4348

4449
ENV NVIDIA_VISIBLE_DEVICES=void
4550

4651
COPY --from=build /artifacts/nvidia-vgpu-dm /usr/bin/nvidia-vgpu-dm
4752
COPY --from=build /artifacts/nvidia-k8s-vgpu-dm /usr/bin/nvidia-k8s-vgpu-dm
53+
COPY --from=build /artifacts/reconfigure-mig.sh /usr/bin/reconfigure-mig.sh
54+
COPY --from=build /artifacts/kubectl /usr/bin/kubectl
55+
COPY --from=nvcr.io/nvidia/cloud-native/k8s-mig-manager:v0.12.1-ubi9 /usr/bin/nvidia-mig-parted /usr/bin/nvidia-mig-parted
4856

4957
LABEL version="${VERSION}"
5058
LABEL release="N/A"

0 commit comments

Comments
 (0)