Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions cmd/nvidia-validator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import (
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime/serializer/json"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/rest"
Expand Down Expand Up @@ -224,6 +225,8 @@ const (
wslNvidiaSMIPath = "/usr/lib/wsl/lib/nvidia-smi"
// shell indicates what shell to use when invoking commands in a subprocess
shell = "sh"
// defaultVFWaitTimeout is the default timeout for waiting for VFs to be created
defaultVFWaitTimeout = 5 * time.Minute
)

func main() {
Expand Down Expand Up @@ -1591,6 +1594,11 @@ func (v *VGPUManager) validate() error {
return err
}

log.Info("Waiting for VFs to be available...")
if err := waitForVFs(ctx, defaultVFWaitTimeout); err != nil {
return fmt.Errorf("vGPU Manager VFs not ready: %w", err)
}

statusFile := vGPUManagerStatusFile
if hostDriver {
statusFile = hostVGPUManagerStatusFile
Expand Down Expand Up @@ -1622,6 +1630,45 @@ func (v *VGPUManager) runValidation(silent bool) (hostDriver bool, err error) {
return hostDriver, runCommand(command, args, silent)
}

// waitForVFs waits for Virtual Functions to be created on all NVIDIA GPUs.
// It polls sriov_numvfs until all GPUs have their full VF count enabled.
func waitForVFs(ctx context.Context, timeout time.Duration) error {
pollInterval := time.Duration(sleepIntervalSecondsFlag) * time.Second
nvpciLib := nvpci.New()

return wait.PollUntilContextTimeout(ctx, pollInterval, timeout, true, func(ctx context.Context) (bool, error) {
gpus, err := nvpciLib.GetGPUs()
if err != nil {
log.Warnf("Error getting GPUs: %v", err)
return false, nil
}

var totalExpected, totalEnabled uint64
var pfCount int
for _, gpu := range gpus {
sriovInfo := gpu.SriovInfo
if sriovInfo.IsPF() {
pfCount++
totalExpected += sriovInfo.PhysicalFunction.TotalVFs
totalEnabled += sriovInfo.PhysicalFunction.NumVFs
}
}

if totalExpected == 0 {
log.Info("No SR-IOV capable GPUs found, skipping VF wait")
return true, nil
}

if totalEnabled == totalExpected {
log.Infof("All %d VF(s) enabled on %d NVIDIA GPU(s)", totalEnabled, pfCount)
return true, nil
}

log.Infof("Waiting for VFs: %d/%d enabled across %d GPU(s)", totalEnabled, totalExpected, pfCount)
return false, nil
})
}

func (c *CCManager) validate() error {
// delete status file if already present
err := deleteStatusFile(outputDirFlag + "/" + ccManagerStatusFile)
Expand Down