@@ -39,6 +39,7 @@ import (
3939 "k8s.io/apimachinery/pkg/fields"
4040 "k8s.io/apimachinery/pkg/labels"
4141 "k8s.io/apimachinery/pkg/runtime/serializer/json"
42+ "k8s.io/apimachinery/pkg/util/wait"
4243 "k8s.io/client-go/kubernetes"
4344 "k8s.io/client-go/kubernetes/scheme"
4445 "k8s.io/client-go/rest"
@@ -224,6 +225,8 @@ const (
224225 wslNvidiaSMIPath = "/usr/lib/wsl/lib/nvidia-smi"
225226 // shell indicates what shell to use when invoking commands in a subprocess
226227 shell = "sh"
228+ // defaultVFWaitTimeout is the default timeout for waiting for VFs to be created
229+ defaultVFWaitTimeout = 5 * time .Minute
227230)
228231
229232func main () {
@@ -1591,6 +1594,11 @@ func (v *VGPUManager) validate() error {
15911594 return err
15921595 }
15931596
1597+ log .Info ("Waiting for VFs to be available..." )
1598+ if err := waitForVFs (ctx , defaultVFWaitTimeout ); err != nil {
1599+ return fmt .Errorf ("vGPU Manager VFs not ready: %w" , err )
1600+ }
1601+
15941602 statusFile := vGPUManagerStatusFile
15951603 if hostDriver {
15961604 statusFile = hostVGPUManagerStatusFile
@@ -1622,6 +1630,45 @@ func (v *VGPUManager) runValidation(silent bool) (hostDriver bool, err error) {
16221630 return hostDriver , runCommand (command , args , silent )
16231631}
16241632
1633+ // waitForVFs waits for Virtual Functions to be created on all NVIDIA GPUs.
1634+ // It polls sriov_numvfs until all GPUs have their full VF count enabled.
1635+ func waitForVFs (ctx context.Context , timeout time.Duration ) error {
1636+ pollInterval := time .Duration (sleepIntervalSecondsFlag ) * time .Second
1637+ nvpciLib := nvpci .New ()
1638+
1639+ return wait .PollUntilContextTimeout (ctx , pollInterval , timeout , true , func (ctx context.Context ) (bool , error ) {
1640+ gpus , err := nvpciLib .GetGPUs ()
1641+ if err != nil {
1642+ log .Warnf ("Error getting GPUs: %v" , err )
1643+ return false , nil
1644+ }
1645+
1646+ var totalExpected , totalEnabled uint64
1647+ var pfCount int
1648+ for _ , gpu := range gpus {
1649+ sriovInfo := gpu .SriovInfo
1650+ if sriovInfo .IsPF () {
1651+ pfCount ++
1652+ totalExpected += sriovInfo .PhysicalFunction .TotalVFs
1653+ totalEnabled += sriovInfo .PhysicalFunction .NumVFs
1654+ }
1655+ }
1656+
1657+ if totalExpected == 0 {
1658+ log .Info ("No SR-IOV capable GPUs found, skipping VF wait" )
1659+ return true , nil
1660+ }
1661+
1662+ if totalEnabled == totalExpected {
1663+ log .Infof ("All %d VF(s) enabled on %d NVIDIA GPU(s)" , totalEnabled , pfCount )
1664+ return true , nil
1665+ }
1666+
1667+ log .Infof ("Waiting for VFs: %d/%d enabled across %d GPU(s)" , totalEnabled , totalExpected , pfCount )
1668+ return false , nil
1669+ })
1670+ }
1671+
16251672func (c * CCManager ) validate () error {
16261673 // delete status file if already present
16271674 err := deleteStatusFile (outputDirFlag + "/" + ccManagerStatusFile )
0 commit comments