Skip to content

Commit 4011723

Browse files
Merge pull request #2002 from karthikvetrivel/fix-vgpu-dm-wait-for-vfs
2 parents 8ca5c55 + 8097fde commit 4011723

File tree

1 file changed

+47
-0
lines changed

1 file changed

+47
-0
lines changed

cmd/nvidia-validator/main.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import (
3939
"k8s.io/apimachinery/pkg/fields"
4040
"k8s.io/apimachinery/pkg/labels"
4141
"k8s.io/apimachinery/pkg/runtime/serializer/json"
42+
"k8s.io/apimachinery/pkg/util/wait"
4243
"k8s.io/client-go/kubernetes"
4344
"k8s.io/client-go/kubernetes/scheme"
4445
"k8s.io/client-go/rest"
@@ -224,6 +225,8 @@ const (
224225
wslNvidiaSMIPath = "/usr/lib/wsl/lib/nvidia-smi"
225226
// shell indicates what shell to use when invoking commands in a subprocess
226227
shell = "sh"
228+
// defaultVFWaitTimeout is the default timeout for waiting for VFs to be created
229+
defaultVFWaitTimeout = 5 * time.Minute
227230
)
228231

229232
func main() {
@@ -1591,6 +1594,11 @@ func (v *VGPUManager) validate() error {
15911594
return err
15921595
}
15931596

1597+
log.Info("Waiting for VFs to be available...")
1598+
if err := waitForVFs(ctx, defaultVFWaitTimeout); err != nil {
1599+
return fmt.Errorf("vGPU Manager VFs not ready: %w", err)
1600+
}
1601+
15941602
statusFile := vGPUManagerStatusFile
15951603
if hostDriver {
15961604
statusFile = hostVGPUManagerStatusFile
@@ -1622,6 +1630,45 @@ func (v *VGPUManager) runValidation(silent bool) (hostDriver bool, err error) {
16221630
return hostDriver, runCommand(command, args, silent)
16231631
}
16241632

1633+
// waitForVFs waits for Virtual Functions to be created on all NVIDIA GPUs.
1634+
// It polls sriov_numvfs until all GPUs have their full VF count enabled.
1635+
func waitForVFs(ctx context.Context, timeout time.Duration) error {
1636+
pollInterval := time.Duration(sleepIntervalSecondsFlag) * time.Second
1637+
nvpciLib := nvpci.New()
1638+
1639+
return wait.PollUntilContextTimeout(ctx, pollInterval, timeout, true, func(ctx context.Context) (bool, error) {
1640+
gpus, err := nvpciLib.GetGPUs()
1641+
if err != nil {
1642+
log.Warnf("Error getting GPUs: %v", err)
1643+
return false, nil
1644+
}
1645+
1646+
var totalExpected, totalEnabled uint64
1647+
var pfCount int
1648+
for _, gpu := range gpus {
1649+
sriovInfo := gpu.SriovInfo
1650+
if sriovInfo.IsPF() {
1651+
pfCount++
1652+
totalExpected += sriovInfo.PhysicalFunction.TotalVFs
1653+
totalEnabled += sriovInfo.PhysicalFunction.NumVFs
1654+
}
1655+
}
1656+
1657+
if totalExpected == 0 {
1658+
log.Info("No SR-IOV capable GPUs found, skipping VF wait")
1659+
return true, nil
1660+
}
1661+
1662+
if totalEnabled == totalExpected {
1663+
log.Infof("All %d VF(s) enabled on %d NVIDIA GPU(s)", totalEnabled, pfCount)
1664+
return true, nil
1665+
}
1666+
1667+
log.Infof("Waiting for VFs: %d/%d enabled across %d GPU(s)", totalEnabled, totalExpected, pfCount)
1668+
return false, nil
1669+
})
1670+
}
1671+
16251672
func (c *CCManager) validate() error {
16261673
// delete status file if already present
16271674
err := deleteStatusFile(outputDirFlag + "/" + ccManagerStatusFile)

0 commit comments

Comments
 (0)