Skip to content
Open
5 changes: 4 additions & 1 deletion cmd/nvidia-vgpu-dm/apply/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@ import (
// VGPUConfig applies the selected vGPU config to the node
func VGPUConfig(c *Context) error {
return assert.WalkSelectedVGPUConfigForEachGPU(c.VGPUConfig, func(vc *v1.VGPUConfigSpec, i int, d types.DeviceID) error {
configManager := vgpu.NewNvlibVGPUConfigManager()
configManager, err := vgpu.NewNvlibVGPUConfigManager()
if err != nil {
return fmt.Errorf("error creating vGPU config manager: %v", err)
}
current, err := configManager.GetVGPUConfig(i)
if err != nil {
return fmt.Errorf("error getting vGPU config: %v", err)
Expand Down
5 changes: 4 additions & 1 deletion cmd/nvidia-vgpu-dm/assert/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,10 @@ func VGPUConfig(c *Context) error {

matched := make([]bool, len(gpus))
err = WalkSelectedVGPUConfigForEachGPU(c.VGPUConfig, func(vc *v1.VGPUConfigSpec, i int, d types.DeviceID) error {
configManager := vgpu.NewNvlibVGPUConfigManager()
configManager, err := vgpu.NewNvlibVGPUConfigManager()
if err != nil {
return fmt.Errorf("error creating vGPU config manager: %v", err)
}
current, err := configManager.GetVGPUConfig(i)
if err != nil {
return fmt.Errorf("error getting vGPU config: %v", err)
Expand Down
208 changes: 208 additions & 0 deletions internal/vfio/vfio.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
package vfio

import (
"bufio"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"time"

"github.com/NVIDIA/go-nvlib/pkg/nvpci"
"github.com/NVIDIA/vgpu-device-manager/internal/nvlib"
)

// HostPCIDevicesRoot is the directory under which this package looks up PCI
// devices (and their virtual functions) by PCI address.
const HostPCIDevicesRoot = "/host/sys/bus/pci/devices"

// VFIOManager discovers NVIDIA GPUs through the wrapped nvlib interface and
// inspects their SR-IOV virtual functions under HostPCIDevicesRoot.
type VFIOManager struct {
// nvlib is the library handle used for GPU enumeration; the manager's
// methods query its Nvpci member.
nvlib nvlib.Interface
}

// NewVFIOManager returns a VFIOManager that uses nvlibInstance for all of its
// GPU lookups.
func NewVFIOManager(nvlibInstance nvlib.Interface) *VFIOManager {
	manager := new(VFIOManager)
	manager.nvlib = nvlibInstance
	return manager
}

// ParentDevice represents an NVIDIA parent PCI device together with the
// filesystem paths of its virtual functions.
type ParentDevice struct {
*nvpci.NvidiaPCIDevice
// VirtualFunctionPaths maps a virtual function number (as a decimal string,
// e.g. "0") to the path that GetAllParentDevices recorded for it.
VirtualFunctionPaths map[string]string
}

// Device represents an NVIDIA (vGPU) device.
type Device struct {
// Path is the virtual function path the device was found under; it is the
// directory in which Delete writes current_vgpu_type.
Path string
// Parent is the parent device that owns the virtual function.
Parent *ParentDevice
}

func (m *VFIOManager) GetAllParentDevices() ([]*ParentDevice, error) {
nvdevices, err := m.nvlib.Nvpci.GetGPUs()
if err != nil {
return nil, fmt.Errorf("unable to get all NVIDIA GPU devices: %v", err)
}
parentDevices := []*ParentDevice{}
for _, device := range nvdevices {
vfnum := 0
numVF := int(device.SriovInfo.PhysicalFunction.NumVFs)
virtualFunctionPaths := make(map[string]string)
for vfnum < numVF {
vfAddr := filepath.Join(HostPCIDevicesRoot, device.Address, "virtfn"+strconv.Itoa(vfnum), "nvidia")
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think the nvidia directory should be included in this path. The "path to the VF" is simply /sys/bus/pci/devices/<BDF>/virtfn<N>. Other parts of the code are not intuitive to me because vfAddr includes the nvidia directory.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

local-agadiyar@ipp1-2284:/sys/bus/pci/devices/0000:41:00.0/virtfn0/nvidia$ cat current_vgpu_type
687
local-agadiyar@ipp1-2284:/sys/bus/pci/devices/0000:41:00.0/virtfn0/nvidia$ cat creatable_vgpu_types
ID : vGPU Name

The current_vgpu_type and creatable_vgpu_types files are located in the nvidia folder. This way we don't need to append nvidia onto another address variable

if _, err := os.Stat(vfAddr); err != nil {
return nil, fmt.Errorf("virtual function %d at address %s does not exist", vfnum, vfAddr)
}
virtualFunctionPaths[strconv.Itoa(vfnum)] = vfAddr
vfnum++
}
parentDevices = append(parentDevices, &ParentDevice{
NvidiaPCIDevice: device,
VirtualFunctionPaths: virtualFunctionPaths,
})
}
return parentDevices, nil
}

// GetAllDevices returns a Device for every virtual function, across all parent
// devices, whose current_vgpu_type file holds a non-zero type number.
//
// It fails if the parent devices cannot be listed, or if any
// current_vgpu_type file cannot be read or does not contain an integer.
func (m *VFIOManager) GetAllDevices() ([]*Device, error) {
	parents, err := m.GetAllParentDevices()
	if err != nil {
		return nil, fmt.Errorf("unable to get all parent devices: %v", err)
	}

	found := []*Device{}
	for _, parent := range parents {
		for _, vfPath := range parent.VirtualFunctionPaths {
			typeFile := filepath.Join(vfPath, "current_vgpu_type")
			raw, readErr := os.ReadFile(typeFile)
			if readErr != nil {
				return nil, fmt.Errorf("unable to read current vGPU type: %v", readErr)
			}

			typeID, convErr := strconv.Atoi(strings.TrimSpace(string(raw)))
			if convErr != nil {
				return nil, fmt.Errorf("unable to convert current vGPU type number to int: %v", convErr)
			}

			// 0 is the value Device.Delete writes, so virtual functions
			// reporting it are not listed as devices.
			if typeID == 0 {
				continue
			}
			found = append(found, &Device{Path: vfPath, Parent: parent})
		}
	}
	return found, nil
}

// GetPhysicalFunction returns the physical PCI device backing this parent
// device: the recorded physical function when the parent is an SR-IOV virtual
// function, otherwise the parent's own PCI device.
func (p *ParentDevice) GetPhysicalFunction() *nvpci.NvidiaPCIDevice {
	if !p.SriovInfo.IsVF() {
		// An SR-IOV physical function or a non-SR-IOV device is its own
		// physical function.
		return p.NvidiaPCIDevice
	}
	return p.SriovInfo.VirtualFunction.PhysicalFunction
}

// GetPhysicalFunction returns the physical PCI device that this vGPU was
// created on, as reported by the device's parent.
func (m *Device) GetPhysicalFunction() *nvpci.NvidiaPCIDevice {
	parent := m.Parent
	return parent.GetPhysicalFunction()
}

// GetIdForVGPUTypeName returns the numeric vGPU type ID listed for
// vgpuTypeName in the file at filePath.
//
// Each line is split on whitespace. A line matches when its last field equals
// vgpuTypeName and its first field parses as an integer; that integer is the
// result. Lines with fewer than two fields, and lines whose first field is not
// a number (such as a header line), are skipped.
//
// An error is returned if the file cannot be opened, if reading it fails, or
// if no line matches.
func (p *ParentDevice) GetIdForVGPUTypeName(filePath string, vgpuTypeName string) (int, error) {
	file, err := os.Open(filePath)
	if err != nil {
		return 0, fmt.Errorf("unable to open file %s: %v", filePath, err)
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		fields := strings.Fields(scanner.Text())
		if len(fields) < 2 || fields[len(fields)-1] != vgpuTypeName {
			continue
		}
		if id, err := strconv.Atoi(fields[0]); err == nil {
			return id, nil
		}
	}
	// Scan also stops on a read error; report that instead of claiming the
	// type is simply not listed.
	if err := scanner.Err(); err != nil {
		return 0, fmt.Errorf("unable to read file %s: %w", filePath, err)
	}
	return 0, fmt.Errorf("vGPU type %s not found in file %s", vgpuTypeName, filePath)
}

// IsVFIOEnabled reports whether the GPU at index gpu exposes a
// creatable_vgpu_types file under its first virtual function, i.e. at
// <HostPCIDevicesRoot>/<GPU address>/virtfn0/nvidia/creatable_vgpu_types.
//
// It returns (true, nil) when the file exists and (false, nil) when it does
// not exist. An error is returned when the GPU cannot be looked up or when the
// file cannot be stat'ed for any reason other than not existing.
//
// The call always blocks for 10 seconds before checking.
func (m *VFIOManager) IsVFIOEnabled(gpu int) (bool, error) {
	// Wait for 10 seconds to ensure the virtual functions are ready.
	// NOTE(review): this fixed delay is paid on every call; a bounded poll may
	// be preferable, but confirm first that nothing relies on the settle time.
	time.Sleep(10 * time.Second)

	nvdevice, err := m.nvlib.Nvpci.GetGPUByIndex(gpu)
	if err != nil {
		return false, fmt.Errorf("unable to get GPU by index %d: %v", gpu, err)
	}

	// Check if the vfio path exists and exposes the creatable types listing.
	vfioPath := filepath.Join(HostPCIDevicesRoot, nvdevice.Address, "virtfn0", "nvidia")
	creatableTypesFile := filepath.Join(vfioPath, "creatable_vgpu_types")

	_, statErr := os.Stat(creatableTypesFile)
	switch {
	case statErr == nil:
		return true, nil
	case os.IsNotExist(statErr):
		// A missing file is the normal "not enabled" answer, not a failure;
		// returning an error here would make the false result unreachable.
		return false, nil
	default:
		return false, fmt.Errorf("unable to stat creatable_vgpu_types file at %s: %v", creatableTypesFile, statErr)
	}
}

// IsVGPUTypeAvailable reports whether vfioType is listed in the
// creatable_vgpu_types file of at least one of this parent's virtual
// functions.
//
// It returns (false, nil) when no virtual function lists the type (including
// when the parent has no virtual functions), and an error as soon as one of
// the files cannot be opened or read.
func (p *ParentDevice) IsVGPUTypeAvailable(vfioType string) (bool, error) {
	for _, vfPath := range p.VirtualFunctionPaths {
		listed, err := vgpuTypeListed(filepath.Join(vfPath, "creatable_vgpu_types"), vfioType)
		if err != nil {
			return false, err
		}
		if listed {
			return true, nil
		}
	}
	return false, nil
}

// vgpuTypeListed reports whether any line of the file at path has at least two
// whitespace-separated fields and typeName as its last field. The file is
// closed before returning, so callers can invoke it in a loop without
// accumulating open file handles.
func vgpuTypeListed(path string, typeName string) (bool, error) {
	file, err := os.Open(path)
	if err != nil {
		return false, fmt.Errorf("unable to open file %s: %v", path, err)
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		fields := strings.Fields(scanner.Text())
		if len(fields) >= 2 && fields[len(fields)-1] == typeName {
			return true, nil
		}
	}
	// Scan also stops on a read error; surface it rather than answering
	// "not listed".
	if err := scanner.Err(); err != nil {
		return false, fmt.Errorf("unable to read file %s: %w", path, err)
	}
	return false, nil
}

// Delete clears the vGPU type of this device by writing "0" to the
// current_vgpu_type file under the device's path.
func (m *Device) Delete() error {
	target := filepath.Join(m.Path, "current_vgpu_type")
	if err := os.WriteFile(target, []byte("0"), 0644); err != nil {
		return fmt.Errorf("unable to write to %s: %v", target, err)
	}
	return nil
}

// CreateVGPUDevice assigns the vGPU type named vfioType to the virtual
// function numbered vfnum (a decimal string key of VirtualFunctionPaths).
//
// The type name is resolved to its numeric ID through the virtual function's
// creatable_vgpu_types file, and that ID is written to its current_vgpu_type
// file. An error is returned if vfnum is not a known virtual function, if the
// type cannot be resolved, or if the write fails.
func (p *ParentDevice) CreateVGPUDevice(vfioType string, vfnum string) error {
	vfPath, ok := p.VirtualFunctionPaths[vfnum]
	if !ok {
		// Without this check the empty path would make the file names below
		// resolve relative to the process working directory.
		return fmt.Errorf("unknown virtual function %q", vfnum)
	}

	number, err := p.GetIdForVGPUTypeName(filepath.Join(vfPath, "creatable_vgpu_types"), vfioType)
	if err != nil {
		return fmt.Errorf("unable to get vGPU type number: %v", err)
	}

	currentVGPUTypePath := filepath.Join(vfPath, "current_vgpu_type")
	if err := os.WriteFile(currentVGPUTypePath, []byte(strconv.Itoa(number)), 0644); err != nil {
		return fmt.Errorf("unable to write current vGPU type: %v", err)
	}
	return nil
}

// GetAvailableVGPUInstances returns the parent's number of virtual functions
// (SriovInfo.PhysicalFunction.NumVFs) when vfioType is available according to
// IsVGPUTypeAvailable, and 0 when it is not. An error from the availability
// check is wrapped and returned.
func (p *ParentDevice) GetAvailableVGPUInstances(vfioType string) (int, error) {
	available, err := p.IsVGPUTypeAvailable(vfioType)
	switch {
	case err != nil:
		return 0, fmt.Errorf("unable to check if vGPU type is available: %v", err)
	case !available:
		return 0, nil
	}
	return int(p.NvidiaPCIDevice.SriovInfo.PhysicalFunction.NumVFs), nil
}
126 changes: 126 additions & 0 deletions internal/vgpu-combined/vgpu-combined.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
package vgpu_combined

import (
"fmt"

"github.com/NVIDIA/go-nvlib/pkg/nvmdev"
"github.com/NVIDIA/go-nvlib/pkg/nvpci"
"github.com/NVIDIA/vgpu-device-manager/internal/nvlib"
"github.com/NVIDIA/vgpu-device-manager/internal/vfio"
)

// VGPUCombinedManager exposes parent devices and vGPU devices through common
// interfaces, backed either by the VFIO manager or by nvlib's Nvmdev
// depending on the mode chosen at construction.
type VGPUCombinedManager struct {
// isVFIOMode is decided once in NewVGPUCombinedManager; true selects the
// VFIO manager, false selects Nvmdev.
isVFIOMode bool
// vfio is the manager used when isVFIOMode is true.
vfio *vfio.VFIOManager
// nvlib provides the Nvpci and Nvmdev interfaces returned by GetNvpci and
// GetNvmdev.
nvlib nvlib.Interface
}

// NewVGPUCombinedManager builds a manager around a fresh nvlib instance and
// fixes its mode (VFIO or MDEV) by asking the VFIO manager whether VFIO is
// enabled for GPU index 0. An error from that check aborts construction.
func NewVGPUCombinedManager() (*VGPUCombinedManager, error) {
	lib := nvlib.New()
	manager := &VGPUCombinedManager{
		vfio:  vfio.NewVFIOManager(lib),
		nvlib: lib,
	}

	// Determine mode once at initialization
	vfioEnabled, err := manager.vfio.IsVFIOEnabled(0)
	if err != nil {
		return nil, fmt.Errorf("error checking if VFIO is enabled: %v", err)
	}
	manager.isVFIOMode = vfioEnabled

	return manager, nil
}

// ParentDeviceInterface represents a common interface for both VFIO and MDEV parent devices
type ParentDeviceInterface interface {
// GetPhysicalFunction returns the physical PCI device backing the parent.
GetPhysicalFunction() *nvpci.NvidiaPCIDevice
// IsVGPUTypeAvailable reports whether the named vGPU type is available on
// the parent.
IsVGPUTypeAvailable(string) (bool, error)
// CreateVGPUDevice creates a vGPU of the named type (first argument). The
// second argument identifies the instance: the virtual function number for
// VFIO parents, the id passed to CreateMDEVDevice for MDEV parents.
CreateVGPUDevice(string, string) error
// GetAvailableVGPUInstances returns how many instances of the named vGPU
// type the parent reports as available.
GetAvailableVGPUInstances(string) (int, error)
}

// DeviceInterface represents a common interface for both VFIO and MDEV vGPU device instances
type DeviceInterface interface {
// GetPhysicalFunction returns the physical PCI device the vGPU belongs to.
GetPhysicalFunction() *nvpci.NvidiaPCIDevice
// Delete removes the vGPU device instance.
Delete() error
}

// mdevParentAdapter wraps an nvmdev.ParentDevice so that it can be used as a
// ParentDeviceInterface: its methods forward the vGPU-named calls to the
// corresponding MDEV-named methods of the embedded device.
type mdevParentAdapter struct {
*nvmdev.ParentDevice
}

// IsVGPUTypeAvailable forwards to IsMDEVTypeAvailable on the wrapped MDEV
// parent device.
func (a *mdevParentAdapter) IsVGPUTypeAvailable(mdevType string) (bool, error) {
	parent := a.ParentDevice
	return parent.IsMDEVTypeAvailable(mdevType)
}

// CreateVGPUDevice forwards to CreateMDEVDevice on the wrapped MDEV parent
// device, passing the type and id through unchanged.
func (a *mdevParentAdapter) CreateVGPUDevice(mdevType string, id string) error {
	parent := a.ParentDevice
	return parent.CreateMDEVDevice(mdevType, id)
}

// GetAvailableVGPUInstances forwards to GetAvailableMDEVInstances on the
// wrapped MDEV parent device.
func (a *mdevParentAdapter) GetAvailableVGPUInstances(mdevType string) (int, error) {
	parent := a.ParentDevice
	return parent.GetAvailableMDEVInstances(mdevType)
}

// IsVFIOMode reports the mode fixed at construction: true for VFIO mode,
// false for MDEV mode.
func (m *VGPUCombinedManager) IsVFIOMode() bool {
	vfioMode := m.isVFIOMode
	return vfioMode
}

// GetNvpci exposes the nvpci interface of the manager's nvlib instance, used
// for GPU enumeration.
func (m *VGPUCombinedManager) GetNvpci() nvpci.Interface {
	pci := m.nvlib.Nvpci
	return pci
}

// GetNvmdev exposes the nvmdev interface of the manager's nvlib instance, used
// for MDEV operations.
func (m *VGPUCombinedManager) GetNvmdev() nvmdev.Interface {
	mdev := m.nvlib.Nvmdev
	return mdev
}

// GetAllParentDevices lists every parent device behind the common
// ParentDeviceInterface. In VFIO mode the devices come from the VFIO manager;
// otherwise they come from Nvmdev and are each wrapped in an
// mdevParentAdapter. Errors from the underlying lookup are returned unchanged.
func (m *VGPUCombinedManager) GetAllParentDevices() ([]ParentDeviceInterface, error) {
	if !m.isVFIOMode {
		mdevParents, err := m.nvlib.Nvmdev.GetAllParentDevices()
		if err != nil {
			return nil, err
		}
		parents := make([]ParentDeviceInterface, 0, len(mdevParents))
		for _, parent := range mdevParents {
			parents = append(parents, &mdevParentAdapter{ParentDevice: parent})
		}
		return parents, nil
	}

	vfioParents, err := m.vfio.GetAllParentDevices()
	if err != nil {
		return nil, err
	}
	parents := make([]ParentDeviceInterface, 0, len(vfioParents))
	for _, parent := range vfioParents {
		parents = append(parents, parent)
	}
	return parents, nil
}

// GetAllDevices lists every vGPU device instance behind the common
// DeviceInterface. In VFIO mode the devices come from the VFIO manager;
// otherwise they come from Nvmdev. Errors from the underlying lookup are
// returned unchanged.
func (m *VGPUCombinedManager) GetAllDevices() ([]DeviceInterface, error) {
	if !m.isVFIOMode {
		mdevDevices, err := m.nvlib.Nvmdev.GetAllDevices()
		if err != nil {
			return nil, err
		}
		devices := make([]DeviceInterface, 0, len(mdevDevices))
		for _, device := range mdevDevices {
			devices = append(devices, device)
		}
		return devices, nil
	}

	vfioDevices, err := m.vfio.GetAllDevices()
	if err != nil {
		return nil, err
	}
	devices := make([]DeviceInterface, 0, len(vfioDevices))
	for _, device := range vfioDevices {
		devices = append(devices, device)
	}
	return devices, nil
}
Loading