Skip to content

Commit 15a9586

Browse files
committed
added vfio framework
Signed-off-by: Arjun <agadiyar@nvidia.com>
1 parent 89a1e62 commit 15a9586

File tree

5 files changed

+328
-193
lines changed

5 files changed

+328
-193
lines changed

cmd/nvidia-vgpu-dm/apply/config.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,10 @@ import (
2828
// VGPUConfig applies the selected vGPU config to the node
2929
func VGPUConfig(c *Context) error {
3030
return assert.WalkSelectedVGPUConfigForEachGPU(c.VGPUConfig, func(vc *v1.VGPUConfigSpec, i int, d types.DeviceID) error {
31-
configManager := vgpu.NewNvlibVGPUConfigManager()
31+
configManager, err := vgpu.NewNvlibVGPUConfigManager()
32+
if err != nil {
33+
return fmt.Errorf("error creating vGPU config manager: %v", err)
34+
}
3235
current, err := configManager.GetVGPUConfig(i)
3336
if err != nil {
3437
return fmt.Errorf("error getting vGPU config: %v", err)

cmd/nvidia-vgpu-dm/assert/config.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,10 @@ func VGPUConfig(c *Context) error {
3636

3737
matched := make([]bool, len(gpus))
3838
err = WalkSelectedVGPUConfigForEachGPU(c.VGPUConfig, func(vc *v1.VGPUConfigSpec, i int, d types.DeviceID) error {
39-
configManager := vgpu.NewNvlibVGPUConfigManager()
39+
configManager, err := vgpu.NewNvlibVGPUConfigManager()
40+
if err != nil {
41+
return fmt.Errorf("error creating vGPU config manager: %v", err)
42+
}
4043
current, err := configManager.GetVGPUConfig(i)
4144
if err != nil {
4245
return fmt.Errorf("error getting vGPU config: %v", err)

internal/vfio/vfio.go

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
package vfio
2+
3+
import (
4+
"bufio"
5+
"fmt"
6+
"os"
7+
"path/filepath"
8+
"strconv"
9+
"strings"
10+
"time"
11+
12+
"github.com/NVIDIA/go-nvlib/pkg/nvpci"
13+
"github.com/NVIDIA/vgpu-device-manager/internal/nvlib"
14+
)
15+
16+
// HostPCIDevicesRoot is the directory under which per-device PCI sysfs
// entries are looked up, addressed by PCI address.
// NOTE(review): the /host prefix suggests the host's /sys is mounted into the
// container at /host — confirm against the deployment manifests.
const HostPCIDevicesRoot = "/host/sys/bus/pci/devices"
19+
20+
type VFIOManager struct {
21+
nvlib nvlib.Interface
22+
}
23+
24+
func NewVFIOManager(nvlibInstance nvlib.Interface) *VFIOManager {
25+
return &VFIOManager{nvlib: nvlibInstance}
26+
}
27+
28+
// ParentDevice represents an NVIDIA parent PCI device.
29+
type ParentDevice struct {
30+
*nvpci.NvidiaPCIDevice
31+
VirtualFunctionPaths map[string]string
32+
}
33+
34+
// Device represents an NVIDIA (vGPU) device.
35+
type Device struct {
36+
Path string
37+
Parent *ParentDevice
38+
}
39+
40+
func (m *VFIOManager) GetAllParentDevices() ([]*ParentDevice, error) {
41+
nvdevices, err := m.nvlib.Nvpci.GetGPUs()
42+
if err != nil {
43+
return nil, fmt.Errorf("unable to get all NVIDIA GPU devices: %v", err)
44+
}
45+
parentDevices := []*ParentDevice{}
46+
for _, device := range nvdevices {
47+
vfnum := 0
48+
numVF := int(device.SriovInfo.PhysicalFunction.NumVFs)
49+
virtualFunctionPaths := make(map[string]string)
50+
for vfnum < numVF {
51+
vfAddr := filepath.Join(HostPCIDevicesRoot, device.Address, "virtfn"+strconv.Itoa(vfnum), "nvidia")
52+
if _, err := os.Stat(vfAddr); err != nil {
53+
return nil, fmt.Errorf("virtual function %d at address %s does not exist", vfnum, vfAddr)
54+
}
55+
virtualFunctionPaths[strconv.Itoa(vfnum)] = vfAddr
56+
}
57+
parentDevices = append(parentDevices, &ParentDevice{
58+
NvidiaPCIDevice: device,
59+
VirtualFunctionPaths: virtualFunctionPaths,
60+
})
61+
}
62+
return parentDevices, nil
63+
}
64+
65+
func (m *VFIOManager) GetAllDevices() ([]*Device, error) {
66+
parentDevices, err := m.GetAllParentDevices()
67+
if err != nil {
68+
return nil, fmt.Errorf("unable to get all parent devices: %v", err)
69+
}
70+
devices := []*Device{}
71+
for _, parentDevice := range parentDevices {
72+
for _, vfAddr := range parentDevice.VirtualFunctionPaths {
73+
vgpuTypeNumberBytes, err := os.ReadFile(filepath.Join(vfAddr, "current_vgpu_type"))
74+
if err != nil {
75+
return nil, fmt.Errorf("unable to read current vGPU type: %v", err)
76+
}
77+
vgpuTypeNumber, err := strconv.Atoi(strings.TrimSpace(string(vgpuTypeNumberBytes)))
78+
if err != nil {
79+
return nil, fmt.Errorf("unable to convert current vGPU type number to int: %v", err)
80+
}
81+
if vgpuTypeNumber != 0 {
82+
devices = append(devices, &Device{
83+
Path: vfAddr,
84+
Parent: parentDevice,
85+
})
86+
}
87+
}
88+
}
89+
return devices, nil
90+
}
91+
92+
// GetPhysicalFunction gets the physical PCI device backing a 'parent' device.
93+
func (p *ParentDevice) GetPhysicalFunction() *nvpci.NvidiaPCIDevice {
94+
if p.SriovInfo.IsVF() {
95+
return p.SriovInfo.VirtualFunction.PhysicalFunction
96+
}
97+
// Either it is an SRIOV physical function or a non-SRIOV device, so return the device itself
98+
return p.NvidiaPCIDevice
99+
}
100+
101+
// GetPhysicalFunction gets the physical PCI device that a vGPU is created on.
102+
func (m *Device) GetPhysicalFunction() *nvpci.NvidiaPCIDevice {
103+
return m.Parent.GetPhysicalFunction()
104+
}
105+
106+
// GetIdForVGPUTypeName returns the vGPU type ID for a given type name
107+
func (p *ParentDevice) GetIdForVGPUTypeName(filePath string, vgpuTypeName string) (int, error) {
108+
file, err := os.Open(filePath)
109+
if err != nil {
110+
return 0, fmt.Errorf("unable to open file %s: %v", filePath, err)
111+
}
112+
defer file.Close()
113+
114+
scanner := bufio.NewScanner(file)
115+
for scanner.Scan() {
116+
line := scanner.Text()
117+
fields := strings.Fields(line)
118+
if len(fields) < 2 {
119+
continue
120+
}
121+
name := fields[len(fields)-1]
122+
numInt, err := strconv.Atoi(fields[0])
123+
if err == nil && name == vgpuTypeName {
124+
return numInt, nil
125+
}
126+
}
127+
return 0, fmt.Errorf("vGPU type %s not found in file %s", vgpuTypeName, filePath)
128+
}
129+
130+
// IsVFIOEnabled checks if VFIO is enabled for a specific GPU
131+
func (m *VFIOManager) IsVFIOEnabled(gpu int) (bool, error) {
132+
time.Sleep(10 * time.Second) // Wait for 10 seconds to ensure the virtual functions are ready
133+
nvdevice, err := m.nvlib.Nvpci.GetGPUByIndex(gpu)
134+
if err != nil {
135+
return false, fmt.Errorf("unable to get GPU by index %d: %v", gpu, err)
136+
}
137+
// Check if vfio exists and has entries
138+
vfioPath := filepath.Join(HostPCIDevicesRoot, nvdevice.Address, "virtfn0", "nvidia")
139+
creatableTypesFile := filepath.Join(vfioPath, "creatable_vgpu_types")
140+
141+
_, statErr := os.Stat(creatableTypesFile)
142+
if statErr == nil {
143+
return true, nil
144+
}
145+
146+
return false, fmt.Errorf("unable to stat creatable_vgpu_types file at %s: %v", creatableTypesFile, statErr)
147+
}
148+
149+
// IsVGPUTypeSupported checks if the vfioType is supported by this parent GPU
150+
func (p *ParentDevice) IsVGPUTypeAvailable(vfioType string) (bool, error) {
151+
for _, vfPath := range p.VirtualFunctionPaths {
152+
creatableTypesPath := filepath.Join(vfPath, "creatable_vgpu_types")
153+
file, err := os.Open(creatableTypesPath)
154+
if err != nil {
155+
return false, fmt.Errorf("unable to open file %s: %v", creatableTypesPath, err)
156+
}
157+
defer file.Close()
158+
scanner := bufio.NewScanner(file)
159+
for scanner.Scan() {
160+
line := scanner.Text()
161+
fields := strings.Fields(line)
162+
if len(fields) < 2 {
163+
continue
164+
}
165+
name := fields[len(fields)-1]
166+
if name == vfioType {
167+
return true, nil
168+
}
169+
}
170+
}
171+
return false, nil
172+
}
173+
174+
// Delete deletes a vGPU type from a specific GPU
175+
func (m *Device) Delete() error {
176+
currentVGPUTypePath := filepath.Join(m.Path, "current_vgpu_type")
177+
err := os.WriteFile(currentVGPUTypePath, []byte("0"), 0644)
178+
if err != nil {
179+
return fmt.Errorf("unable to write to %s: %v", currentVGPUTypePath, err)
180+
}
181+
return nil
182+
}
183+
184+
func (p *ParentDevice) CreateVGPUDevice(vfioType string, vfnum string) error {
185+
vfPath := p.VirtualFunctionPaths[vfnum]
186+
currentVGPUTypePath := filepath.Join(vfPath, "current_vgpu_type")
187+
number, err := p.GetIdForVGPUTypeName(filepath.Join(vfPath, "creatable_vgpu_types"), vfioType)
188+
if err != nil {
189+
return fmt.Errorf("unable to get vGPU type number: %v", err)
190+
}
191+
err = os.WriteFile(currentVGPUTypePath, []byte(strconv.Itoa(number)), 0644)
192+
if err != nil {
193+
return fmt.Errorf("unable to write current vGPU type: %v", err)
194+
}
195+
return nil
196+
}
197+
198+
func (p *ParentDevice) GetAvailableVGPUInstances(vfioType string) (int, error) {
199+
available, err := p.IsVGPUTypeAvailable(vfioType)
200+
if err != nil {
201+
return 0, fmt.Errorf("unable to check if vGPU type is available: %v", err)
202+
}
203+
if available {
204+
return int(p.NvidiaPCIDevice.SriovInfo.PhysicalFunction.NumVFs), nil
205+
}
206+
return 0, nil
207+
}

0 commit comments

Comments
 (0)