@@ -39,6 +39,7 @@ import (
3939 "k8s.io/apimachinery/pkg/fields"
4040 "k8s.io/apimachinery/pkg/labels"
4141 "k8s.io/apimachinery/pkg/runtime/serializer/json"
42+ "k8s.io/apimachinery/pkg/util/wait"
4243 "k8s.io/client-go/kubernetes"
4344 "k8s.io/client-go/kubernetes/scheme"
4445 "k8s.io/client-go/rest"
@@ -224,6 +225,8 @@ const (
224225 wslNvidiaSMIPath = "/usr/lib/wsl/lib/nvidia-smi"
225226 // shell indicates what shell to use when invoking commands in a subprocess
226227 shell = "sh"
228+ // defaultVFWaitTimeout is the default timeout for waiting for VFs to be created
229+ defaultVFWaitTimeout = 5 * time .Minute
227230)
228231
229232func main () {
@@ -1591,6 +1594,11 @@ func (v *VGPUManager) validate() error {
15911594 return err
15921595 }
15931596
1597+ log .Info ("Waiting for VFs to be available..." )
1598+ if err := waitForVFs (ctx , defaultVFWaitTimeout ); err != nil {
1599+ return fmt .Errorf ("vGPU Manager VFs not ready: %w" , err )
1600+ }
1601+
15941602 statusFile := vGPUManagerStatusFile
15951603 if hostDriver {
15961604 statusFile = hostVGPUManagerStatusFile
@@ -1622,6 +1630,44 @@ func (v *VGPUManager) runValidation(silent bool) (hostDriver bool, err error) {
16221630 return hostDriver , runCommand (command , args , silent )
16231631}
16241632
1633+ // waitForVFs waits for Virtual Functions to be created on all NVIDIA GPUs.
1634+ // It polls sriov_numvfs until all GPUs have their full VF count enabled.
1635+ func waitForVFs (ctx context.Context , timeout time.Duration ) error {
1636+ pollInterval := time .Duration (sleepIntervalSecondsFlag ) * time .Second
1637+ nvpciLib := nvpci .New ()
1638+
1639+ return wait .PollUntilContextTimeout (ctx , pollInterval , timeout , true , func (ctx context.Context ) (bool , error ) {
1640+ gpus , err := nvpciLib .GetGPUs ()
1641+ if err != nil {
1642+ log .Warnf ("Error getting GPUs: %v" , err )
1643+ return false , nil
1644+ }
1645+
1646+ var totalExpected , totalEnabled uint64
1647+ var sriovGPUCount int
1648+ for _ , gpu := range gpus {
1649+ if gpu .SriovInfo .IsPF () {
1650+ sriovGPUCount ++
1651+ totalExpected += gpu .SriovInfo .PhysicalFunction .TotalVFs
1652+ totalEnabled += gpu .SriovInfo .PhysicalFunction .NumVFs
1653+ }
1654+ }
1655+
1656+ if totalExpected == 0 {
1657+ log .Info ("No SR-IOV capable GPUs found, skipping VF wait" )
1658+ return true , nil
1659+ }
1660+
1661+ if totalEnabled == totalExpected {
1662+ log .Infof ("All %d VF(s) enabled on %d NVIDIA GPU(s)" , totalEnabled , sriovGPUCount )
1663+ return true , nil
1664+ }
1665+
1666+ log .Infof ("Waiting for VFs: %d/%d enabled across %d GPU(s)" , totalEnabled , totalExpected , sriovGPUCount )
1667+ return false , nil
1668+ })
1669+ }
1670+
16251671func (c * CCManager ) validate () error {
16261672 // delete status file if already present
16271673 err := deleteStatusFile (outputDirFlag + "/" + ccManagerStatusFile )
0 commit comments