-
Notifications
You must be signed in to change notification settings - Fork 28
Create VGPU changes with VFIO Framework #139
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 8 commits
1c55d02
b17d50a
3f31828
29b3788
4a52cbe
89a1e62
82b9164
58c7cc6
55e063c
d49bf31
6eaa198
7e25ec0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,208 @@ | ||
| package vfio | ||
|
|
||
| import ( | ||
| "bufio" | ||
| "fmt" | ||
| "os" | ||
| "path/filepath" | ||
| "strconv" | ||
| "strings" | ||
| "time" | ||
|
|
||
| "github.com/NVIDIA/go-nvlib/pkg/nvpci" | ||
| "github.com/NVIDIA/vgpu-device-manager/internal/nvlib" | ||
| ) | ||
|
|
||
// HostPCIDevicesRoot is the sysfs directory under which this package looks up
// PCI devices. Virtual-function paths are built as
// <root>/<PCI address>/virtfn<N>/nvidia.
const HostPCIDevicesRoot = "/host/sys/bus/pci/devices"
|
|
||
| type VFIOManager struct { | ||
| nvlib nvlib.Interface | ||
| } | ||
|
|
||
| func NewVFIOManager(nvlibInstance nvlib.Interface) *VFIOManager { | ||
| return &VFIOManager{nvlib: nvlibInstance} | ||
| } | ||
|
|
||
| // ParentDevice represents an NVIDIA parent PCI device. | ||
| type ParentDevice struct { | ||
| *nvpci.NvidiaPCIDevice | ||
| VirtualFunctionPaths map[string]string | ||
| } | ||
|
|
||
| // Device represents an NVIDIA (vGPU) device. | ||
| type Device struct { | ||
| Path string | ||
| Parent *ParentDevice | ||
| } | ||
|
|
||
| func (m *VFIOManager) GetAllParentDevices() ([]*ParentDevice, error) { | ||
| nvdevices, err := m.nvlib.Nvpci.GetGPUs() | ||
| if err != nil { | ||
| return nil, fmt.Errorf("unable to get all NVIDIA GPU devices: %v", err) | ||
| } | ||
| parentDevices := []*ParentDevice{} | ||
| for _, device := range nvdevices { | ||
| vfnum := 0 | ||
| numVF := int(device.SriovInfo.PhysicalFunction.NumVFs) | ||
cdesiniotis marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| virtualFunctionPaths := make(map[string]string) | ||
| for vfnum < numVF { | ||
| vfAddr := filepath.Join(HostPCIDevicesRoot, device.Address, "virtfn"+strconv.Itoa(vfnum), "nvidia") | ||
|
||
| if _, err := os.Stat(vfAddr); err != nil { | ||
| return nil, fmt.Errorf("virtual function %d at address %s does not exist", vfnum, vfAddr) | ||
| } | ||
| virtualFunctionPaths[strconv.Itoa(vfnum)] = vfAddr | ||
| vfnum++ | ||
| } | ||
| parentDevices = append(parentDevices, &ParentDevice{ | ||
| NvidiaPCIDevice: device, | ||
| VirtualFunctionPaths: virtualFunctionPaths, | ||
| }) | ||
| } | ||
| return parentDevices, nil | ||
| } | ||
|
|
||
| func (m *VFIOManager) GetAllDevices() ([]*Device, error) { | ||
| parentDevices, err := m.GetAllParentDevices() | ||
| if err != nil { | ||
| return nil, fmt.Errorf("unable to get all parent devices: %v", err) | ||
| } | ||
| devices := []*Device{} | ||
| for _, parentDevice := range parentDevices { | ||
| for _, vfAddr := range parentDevice.VirtualFunctionPaths { | ||
| vgpuTypeNumberBytes, err := os.ReadFile(filepath.Join(vfAddr, "current_vgpu_type")) | ||
| if err != nil { | ||
| return nil, fmt.Errorf("unable to read current vGPU type: %v", err) | ||
| } | ||
| vgpuTypeNumber, err := strconv.Atoi(strings.TrimSpace(string(vgpuTypeNumberBytes))) | ||
| if err != nil { | ||
| return nil, fmt.Errorf("unable to convert current vGPU type number to int: %v", err) | ||
| } | ||
| if vgpuTypeNumber != 0 { | ||
| devices = append(devices, &Device{ | ||
| Path: vfAddr, | ||
| Parent: parentDevice, | ||
| }) | ||
| } | ||
| } | ||
| } | ||
| return devices, nil | ||
| } | ||
|
|
||
| // GetPhysicalFunction gets the physical PCI device backing a 'parent' device. | ||
| func (p *ParentDevice) GetPhysicalFunction() *nvpci.NvidiaPCIDevice { | ||
| if p.SriovInfo.IsVF() { | ||
| return p.SriovInfo.VirtualFunction.PhysicalFunction | ||
| } | ||
| // Either it is an SRIOV physical function or a non-SRIOV device, so return the device itself | ||
| return p.NvidiaPCIDevice | ||
| } | ||
|
|
||
| // GetPhysicalFunction gets the physical PCI device that a vGPU is created on. | ||
| func (m *Device) GetPhysicalFunction() *nvpci.NvidiaPCIDevice { | ||
| return m.Parent.GetPhysicalFunction() | ||
| } | ||
|
|
||
| // GetIdForVGPUTypeName returns the vGPU type ID for a given type name | ||
| func (p *ParentDevice) GetIdForVGPUTypeName(filePath string, vgpuTypeName string) (int, error) { | ||
| file, err := os.Open(filePath) | ||
| if err != nil { | ||
| return 0, fmt.Errorf("unable to open file %s: %v", filePath, err) | ||
| } | ||
| defer file.Close() | ||
|
|
||
| scanner := bufio.NewScanner(file) | ||
| for scanner.Scan() { | ||
| line := scanner.Text() | ||
| fields := strings.Fields(line) | ||
| if len(fields) < 2 { | ||
| continue | ||
| } | ||
| name := fields[len(fields)-1] | ||
| numInt, err := strconv.Atoi(fields[0]) | ||
| if err == nil && name == vgpuTypeName { | ||
| return numInt, nil | ||
| } | ||
| } | ||
| return 0, fmt.Errorf("vGPU type %s not found in file %s", vgpuTypeName, filePath) | ||
| } | ||
|
|
||
| // IsVFIOEnabled checks if VFIO is enabled for a specific GPU | ||
| func (m *VFIOManager) IsVFIOEnabled(gpu int) (bool, error) { | ||
| time.Sleep(10 * time.Second) // Wait for 10 seconds to ensure the virtual functions are ready | ||
JunAr7112 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| nvdevice, err := m.nvlib.Nvpci.GetGPUByIndex(gpu) | ||
| if err != nil { | ||
| return false, fmt.Errorf("unable to get GPU by index %d: %v", gpu, err) | ||
| } | ||
| // Check if vfio exists and has entries | ||
| vfioPath := filepath.Join(HostPCIDevicesRoot, nvdevice.Address, "virtfn0", "nvidia") | ||
JunAr7112 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| creatableTypesFile := filepath.Join(vfioPath, "creatable_vgpu_types") | ||
|
|
||
| _, statErr := os.Stat(creatableTypesFile) | ||
| if statErr == nil { | ||
| return true, nil | ||
| } | ||
|
|
||
| return false, fmt.Errorf("unable to stat creatable_vgpu_types file at %s: %v", creatableTypesFile, statErr) | ||
| } | ||
|
|
||
| // IsVGPUTypeSupported checks if the vfioType is supported by this parent GPU | ||
| func (p *ParentDevice) IsVGPUTypeAvailable(vfioType string) (bool, error) { | ||
JunAr7112 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| for _, vfPath := range p.VirtualFunctionPaths { | ||
| creatableTypesPath := filepath.Join(vfPath, "creatable_vgpu_types") | ||
| file, err := os.Open(creatableTypesPath) | ||
| if err != nil { | ||
| return false, fmt.Errorf("unable to open file %s: %v", creatableTypesPath, err) | ||
| } | ||
| defer file.Close() | ||
| scanner := bufio.NewScanner(file) | ||
| for scanner.Scan() { | ||
| line := scanner.Text() | ||
| fields := strings.Fields(line) | ||
| if len(fields) < 2 { | ||
| continue | ||
| } | ||
| name := fields[len(fields)-1] | ||
| if name == vfioType { | ||
| return true, nil | ||
| } | ||
| } | ||
| } | ||
| return false, nil | ||
| } | ||
|
|
||
| // Delete deletes a vGPU type from a specific GPU | ||
| func (m *Device) Delete() error { | ||
| currentVGPUTypePath := filepath.Join(m.Path, "current_vgpu_type") | ||
| err := os.WriteFile(currentVGPUTypePath, []byte("0"), 0644) | ||
| if err != nil { | ||
| return fmt.Errorf("unable to write to %s: %v", currentVGPUTypePath, err) | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| func (p *ParentDevice) CreateVGPUDevice(vfioType string, vfnum string) error { | ||
| vfPath := p.VirtualFunctionPaths[vfnum] | ||
| currentVGPUTypePath := filepath.Join(vfPath, "current_vgpu_type") | ||
| number, err := p.GetIdForVGPUTypeName(filepath.Join(vfPath, "creatable_vgpu_types"), vfioType) | ||
| if err != nil { | ||
| return fmt.Errorf("unable to get vGPU type number: %v", err) | ||
| } | ||
| err = os.WriteFile(currentVGPUTypePath, []byte(strconv.Itoa(number)), 0644) | ||
| if err != nil { | ||
| return fmt.Errorf("unable to write current vGPU type: %v", err) | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| func (p *ParentDevice) GetAvailableVGPUInstances(vfioType string) (int, error) { | ||
| available, err := p.IsVGPUTypeAvailable(vfioType) | ||
| if err != nil { | ||
| return 0, fmt.Errorf("unable to check if vGPU type is available: %v", err) | ||
| } | ||
| if available { | ||
| return int(p.NvidiaPCIDevice.SriovInfo.PhysicalFunction.NumVFs), nil | ||
| } | ||
| return 0, nil | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,126 @@ | ||
| package vgpu_combined | ||
|
|
||
| import ( | ||
| "fmt" | ||
|
|
||
| "github.com/NVIDIA/go-nvlib/pkg/nvmdev" | ||
| "github.com/NVIDIA/go-nvlib/pkg/nvpci" | ||
| "github.com/NVIDIA/vgpu-device-manager/internal/nvlib" | ||
| "github.com/NVIDIA/vgpu-device-manager/internal/vfio" | ||
| ) | ||
|
|
||
| type VGPUCombinedManager struct { | ||
| isVFIOMode bool | ||
| vfio *vfio.VFIOManager | ||
| nvlib nvlib.Interface | ||
| } | ||
|
|
||
| func NewVGPUCombinedManager() (*VGPUCombinedManager, error) { | ||
| nvlibInstance := nvlib.New() | ||
| vfioManager := vfio.NewVFIOManager(nvlibInstance) | ||
|
|
||
| // Determine mode once at initialization | ||
| isVFIOMode, err := vfioManager.IsVFIOEnabled(0) | ||
| if err != nil { | ||
| return nil, fmt.Errorf("error checking if VFIO is enabled: %v", err) | ||
| } | ||
|
|
||
| return &VGPUCombinedManager{ | ||
| isVFIOMode: isVFIOMode, | ||
| vfio: vfioManager, | ||
| nvlib: nvlibInstance, | ||
| }, nil | ||
| } | ||
|
|
||
| // ParentDeviceInterface represents a common interface for both VFIO and MDEV parent devices | ||
| type ParentDeviceInterface interface { | ||
| GetPhysicalFunction() *nvpci.NvidiaPCIDevice | ||
| IsVGPUTypeAvailable(string) (bool, error) | ||
| CreateVGPUDevice(string, string) error | ||
| GetAvailableVGPUInstances(string) (int, error) | ||
| } | ||
|
|
||
| // DeviceInterface represents a common interface for both VFIO and MDEV vGPU device instances | ||
| type DeviceInterface interface { | ||
| GetPhysicalFunction() *nvpci.NvidiaPCIDevice | ||
| Delete() error | ||
| } | ||
|
|
||
| type mdevParentAdapter struct { | ||
| *nvmdev.ParentDevice | ||
| } | ||
|
|
||
| func (a *mdevParentAdapter) IsVGPUTypeAvailable(mdevType string) (bool, error) { | ||
| return a.ParentDevice.IsMDEVTypeAvailable(mdevType) | ||
| } | ||
|
|
||
| func (a *mdevParentAdapter) CreateVGPUDevice(mdevType string, id string) error { | ||
| return a.ParentDevice.CreateMDEVDevice(mdevType, id) | ||
| } | ||
|
|
||
| func (a *mdevParentAdapter) GetAvailableVGPUInstances(mdevType string) (int, error) { | ||
| return a.ParentDevice.GetAvailableMDEVInstances(mdevType) | ||
| } | ||
|
|
||
| // IsVFIOMode returns true if the manager is running in VFIO mode, false for MDEV mode | ||
| func (m *VGPUCombinedManager) IsVFIOMode() bool { | ||
| return m.isVFIOMode | ||
| } | ||
|
|
||
| // GetNvpci returns the nvpci interface for GPU enumeration | ||
| func (m *VGPUCombinedManager) GetNvpci() nvpci.Interface { | ||
| return m.nvlib.Nvpci | ||
| } | ||
|
|
||
| // GetNvmdev returns the nvmdev interface for MDEV operations | ||
| func (m *VGPUCombinedManager) GetNvmdev() nvmdev.Interface { | ||
| return m.nvlib.Nvmdev | ||
| } | ||
|
|
||
| // GetAllParentDevices returns all parent devices as a common interface type | ||
| func (m *VGPUCombinedManager) GetAllParentDevices() ([]ParentDeviceInterface, error) { | ||
| if m.isVFIOMode { | ||
| vfioDevices, err := m.vfio.GetAllParentDevices() | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
| result := make([]ParentDeviceInterface, len(vfioDevices)) | ||
| for i, d := range vfioDevices { | ||
| result[i] = d | ||
| } | ||
| return result, nil | ||
| } | ||
| mdevDevices, err := m.nvlib.Nvmdev.GetAllParentDevices() | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
| result := make([]ParentDeviceInterface, len(mdevDevices)) | ||
| for i, d := range mdevDevices { | ||
| result[i] = &mdevParentAdapter{ParentDevice: d} | ||
| } | ||
| return result, nil | ||
| } | ||
|
|
||
| // GetAllDevices returns all vGPU device instances as a common interface type | ||
| func (m *VGPUCombinedManager) GetAllDevices() ([]DeviceInterface, error) { | ||
| if m.isVFIOMode { | ||
| vfioDevices, err := m.vfio.GetAllDevices() | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
| result := make([]DeviceInterface, len(vfioDevices)) | ||
| for i, d := range vfioDevices { | ||
| result[i] = d | ||
| } | ||
| return result, nil | ||
| } | ||
| mdevDevices, err := m.nvlib.Nvmdev.GetAllDevices() | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
| result := make([]DeviceInterface, len(mdevDevices)) | ||
| for i, d := range mdevDevices { | ||
| result[i] = d | ||
| } | ||
| return result, nil | ||
| } |
Uh oh!
There was an error while loading. Please reload this page.