Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
62 changes: 39 additions & 23 deletions cmd/nvidia-mig-manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ package main
import (
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"sync"
Expand All @@ -28,12 +27,14 @@ import (
cli "github.com/urfave/cli/v2"

"github.com/NVIDIA/mig-parted/internal/info"
"github.com/NVIDIA/mig-parted/pkg/mig/reconfigure"

v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/clientcmd"
"k8s.io/klog/v2"

"sigs.k8s.io/yaml"
)
Expand Down Expand Up @@ -280,6 +281,8 @@ func validateFlags(c *cli.Context) error {
}

func start(c *cli.Context) error {
klog.InfoS(fmt.Sprintf("Starting %s", c.App.Name), "version", c.App.Version)

config, err := clientcmd.BuildConfigFromFlags("", kubeconfigFlag)
if err != nil {
return fmt.Errorf("error building kubernetes clientcmd config: %s", err)
Expand All @@ -304,7 +307,7 @@ func start(c *cli.Context) error {
log.Infof("Waiting for change to '%s' label", MigConfigLabel)
value := migConfig.Get()
log.Infof("Updating to MIG config: %s", value)
err := runScript(value, driverLibraryPath, nvidiaSMIPath)
err := runScript(value, driverLibraryPath, nvidiaSMIPath, clientset)
if err != nil {
log.Errorf("Error: %s", err)
continue
Expand Down Expand Up @@ -371,36 +374,49 @@ func parseGPUCLientsFile(file string) (*GPUClients, error) {
return &clients, nil
}

func runScript(migConfigValue string, driverLibraryPath string, nvidiaSMIPath string) error {
func runScript(migConfigValue string, driverLibraryPath string, nvidiaSMIPath string, clientset *kubernetes.Clientset) error {
gpuClients, err := parseGPUCLientsFile(gpuClientsFileFlag)
if err != nil {
return fmt.Errorf("error parsing host's GPU clients file: %s", err)
}

args := []string{
"-n", nodeNameFlag,
"-f", configFileFlag,
"-c", migConfigValue,
"-m", hostRootMountFlag,
"-i", hostNvidiaDirFlag,
"-o", hostMigManagerStateFileFlag,
"-g", strings.Join(gpuClients.SystemdServices, ","),
"-k", hostKubeletSystemdServiceFlag,
"-p", defaultGPUClientsNamespaceFlag,
options := []reconfigure.Option{
reconfigure.WithNodeName(nodeNameFlag),
reconfigure.WithMIGPartedConfigFile(configFileFlag),
reconfigure.WithSelectedMIGConfig(migConfigValue),
reconfigure.WithHostRootMount(hostRootMountFlag),
reconfigure.WithHostNVIDIADir(hostNvidiaDirFlag),
reconfigure.WithHostMIGManagerStateFile(hostMigManagerStateFileFlag),
reconfigure.WithHostGPUClientServices(gpuClients.SystemdServices...),
reconfigure.WithHostKubeletService(hostKubeletSystemdServiceFlag),
reconfigure.WithGPUClientNamespace(defaultGPUClientsNamespaceFlag),
reconfigure.WithConfigStateLabel(reconfigure.MIGConfigStateLabel),
reconfigure.WithClientset(clientset),
}

if cdiEnabledFlag {
args = append(args, "-e", "-t", driverRoot, "-a", driverRootCtrPath, "-b", devRoot, "-j", devRootCtrPath, "-l", driverLibraryPath, "-q", nvidiaSMIPath, "-s", nvidiaCDIHookPath)
options = append(options,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question -- do we actually need the conditional here? Or can we always add these options to the options slice?

reconfigure.WithCDIEnabled(cdiEnabledFlag),
reconfigure.WithDriverRoot(driverRoot),
reconfigure.WithDriverRootCtrPath(driverRootCtrPath),
reconfigure.WithDevRoot(devRoot),
reconfigure.WithDevRootCtrPath(devRootCtrPath),
reconfigure.WithDriverLibraryPath(driverLibraryPath),
reconfigure.WithNVIDIASMIPath(nvidiaSMIPath),
reconfigure.WithNVIDIACDIHookPath(nvidiaCDIHookPath),
)
}
if withRebootFlag {
args = append(args, "-r")
}
if withShutdownHostGPUClientsFlag {
args = append(args, "-d")

options = append(options,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question -- is there any value in appending these options here? Can we add them to the list of options during initialization of the options slice?

reconfigure.WithAllowReboot(withRebootFlag),
reconfigure.WithShutdownHostGPUClients(withShutdownHostGPUClientsFlag),
)

reconfigurer, err := reconfigure.New(options...)
if err != nil {
return err
}
cmd := exec.Command(reconfigureScriptFlag, args...)
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
return cmd.Run()
return reconfigurer.Reconfigure()
}

func ContinuouslySyncMigConfigChanges(clientset *kubernetes.Clientset, migConfig *SyncableMigConfig) chan struct{} {
Expand Down
6 changes: 3 additions & 3 deletions deployments/container/Dockerfile.ubi9
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

FROM nvcr.io/nvidia/cuda:12.9.1-base-ubi9 as build
FROM nvcr.io/nvidia/cuda:12.9.1-base-ubi9 AS build

SHELL ["/bin/bash", "-c"]

Expand All @@ -34,8 +34,8 @@ RUN set -eux; \
wget -nv -O - https://storage.googleapis.com/golang/go${GOLANG_VERSION}.linux-${ARCH}.tar.gz \
| tar -C /usr/local -xz

ENV GOPATH /go
ENV PATH $GOPATH/bin:/usr/local/go/bin:$PATH
ENV GOPATH=/go
ENV PATH=$GOPATH/bin:/usr/local/go/bin:$PATH

WORKDIR /build
COPY . .
Expand Down
6 changes: 3 additions & 3 deletions deployments/container/Dockerfile.ubuntu
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

FROM nvcr.io/nvidia/cuda:12.9.1-base-ubuntu20.04 as build
FROM nvcr.io/nvidia/cuda:12.9.1-base-ubuntu20.04 AS build

SHELL ["/bin/bash", "-c"]

Expand All @@ -34,8 +34,8 @@ RUN set -eux; \
wget -nv -O - https://storage.googleapis.com/golang/go${GOLANG_VERSION}.linux-${ARCH}.tar.gz \
| tar -C /usr/local -xz

ENV GOPATH /go
ENV PATH $GOPATH/bin:/usr/local/go/bin:$PATH
ENV GOPATH=/go
ENV PATH=$GOPATH/bin:/usr/local/go/bin:$PATH

WORKDIR /build
COPY . .
Expand Down
16 changes: 15 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,41 +5,54 @@ go 1.24.0
require (
github.com/NVIDIA/go-nvlib v0.7.4
github.com/NVIDIA/go-nvml v0.12.9-0
github.com/NVIDIA/nvidia-container-toolkit v1.17.8
github.com/go-playground/validator/v10 v10.27.0
github.com/sirupsen/logrus v1.9.3
github.com/stretchr/testify v1.10.0
github.com/urfave/cli/v2 v2.27.7
gopkg.in/yaml.v2 v2.4.0
k8s.io/api v0.33.3
k8s.io/apimachinery v0.33.3
k8s.io/client-go v0.33.3
k8s.io/klog/v2 v2.130.1
sigs.k8s.io/yaml v1.4.0
)

require (
github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
github.com/fsnotify/fsnotify v1.7.0 // indirect
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
github.com/gabriel-vasile/mimetype v1.4.8 // indirect
github.com/go-logr/logr v1.4.2 // indirect
github.com/go-openapi/jsonpointer v0.21.0 // indirect
github.com/go-openapi/jsonreference v0.20.2 // indirect
github.com/go-openapi/swag v0.23.0 // indirect
github.com/go-playground/locales v0.14.1 // indirect
github.com/go-playground/universal-translator v0.18.1 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/google/gnostic-models v0.6.9 // indirect
github.com/google/go-cmp v0.7.0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/leodido/go-urn v1.4.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/opencontainers/runtime-spec v1.2.1 // indirect
github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 // indirect
github.com/x448/float16 v0.8.4 // indirect
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
golang.org/x/crypto v0.36.0 // indirect
golang.org/x/mod v0.20.0 // indirect
golang.org/x/net v0.38.0 // indirect
golang.org/x/oauth2 v0.27.0 // indirect
golang.org/x/sys v0.31.0 // indirect
Expand All @@ -50,10 +63,11 @@ require (
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/klog/v2 v2.130.1 // indirect
k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect
k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 // indirect
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect
sigs.k8s.io/randfill v1.0.0 // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect
tags.cncf.io/container-device-interface v0.8.1 // indirect
tags.cncf.io/container-device-interface/specs-go v0.8.0 // indirect
)
Loading
Loading