Commit 0a7321e

Add test
1 parent a5d652a commit 0a7321e

File tree

2 files changed: +262 -3 lines

integration/vgpu_test.go
lib/hypervisor/qemu/config.go


integration/vgpu_test.go

Lines changed: 244 additions & 0 deletions
@@ -0,0 +1,244 @@
package integration

import (
    "bytes"
    "context"
    "os"
    "testing"
    "time"

    "github.com/onkernel/hypeman/cmd/api/config"
    "github.com/onkernel/hypeman/lib/devices"
    "github.com/onkernel/hypeman/lib/guest"
    "github.com/onkernel/hypeman/lib/hypervisor"
    "github.com/onkernel/hypeman/lib/images"
    "github.com/onkernel/hypeman/lib/instances"
    "github.com/onkernel/hypeman/lib/network"
    "github.com/onkernel/hypeman/lib/paths"
    "github.com/onkernel/hypeman/lib/system"
    "github.com/onkernel/hypeman/lib/volumes"
    "github.com/stretchr/testify/assert"
    "github.com/stretchr/testify/require"
)

// TestVGPU is an integration test that verifies vGPU (SR-IOV mdev) support works.
//
// This test automatically detects vGPU availability and skips if:
//   - No SR-IOV VFs are found in /sys/class/mdev_bus/
//   - No vGPU profiles are available
//   - Not running as root (required for mdev creation)
//   - KVM is not available
//
// To run manually:
//
//   sudo go test -v -run TestVGPU -timeout 5m ./integration/...
//
// Note: This test verifies mdev creation and PCI device visibility inside the VM.
// It does NOT test nvidia-smi or CUDA functionality since that requires NVIDIA
// guest drivers pre-installed in the image.
func TestVGPU(t *testing.T) {
    if testing.Short() {
        t.Skip("skipping integration test in short mode")
    }

    // Auto-detect vGPU availability - skip if prerequisites not met
    skipReason, profile := checkVGPUTestPrerequisites()
    if skipReason != "" {
        t.Skip(skipReason)
    }

    t.Logf("vGPU test prerequisites met, using profile: %s", profile)

    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
    defer cancel()

    // Set up test environment
    tmpDir := t.TempDir()
    p := paths.New(tmpDir)

    cfg := &config.Config{
        DataDir:    tmpDir,
        BridgeName: "vmbr0",
        SubnetCIDR: "10.100.0.0/16",
        DNSServer:  "1.1.1.1",
    }

    // Create managers
    imageManager, err := images.NewManager(p, 1, nil)
    require.NoError(t, err)

    systemManager := system.NewManager(p)
    networkManager := network.NewManager(p, cfg, nil)
    deviceManager := devices.NewManager(p)
    volumeManager := volumes.NewManager(p, 0, nil)

    limits := instances.ResourceLimits{
        MaxOverlaySize:       100 * 1024 * 1024 * 1024,
        MaxVcpusPerInstance:  0,
        MaxMemoryPerInstance: 0,
        MaxTotalVcpus:        0,
        MaxTotalMemory:       0,
    }

    instanceManager := instances.NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, "", nil, nil)

    // Track instance ID for cleanup
    var instanceID string

    // Cleanup any orphaned instances and mdevs
    t.Cleanup(func() {
        if instanceID != "" {
            t.Log("Cleanup: Deleting instance...")
            instanceManager.DeleteInstance(ctx, instanceID)
        }
    })

    // Step 1: Ensure system files (kernel, initrd)
    t.Log("Step 1: Ensuring system files...")
    err = systemManager.EnsureSystemFiles(ctx)
    require.NoError(t, err)
    t.Log("System files ready")

    // Step 2: Pull alpine image (lightweight for testing)
    imageName := "docker.io/library/alpine:latest"
    t.Log("Step 2: Pulling alpine image...")
    _, err = imageManager.CreateImage(ctx, images.CreateImageRequest{
        Name: imageName,
    })
    require.NoError(t, err)

    // Wait for image to be ready
    t.Log("Waiting for image build...")
    var img *images.Image
    for i := 0; i < 120; i++ {
        img, err = imageManager.GetImage(ctx, imageName)
        if err == nil && img.Status == images.StatusReady {
            break
        }
        if img != nil && img.Status == images.StatusFailed {
            errMsg := "unknown"
            if img.Error != nil {
                errMsg = *img.Error
            }
            t.Fatalf("Image build failed: %s", errMsg)
        }
        time.Sleep(1 * time.Second)
    }
    require.NotNil(t, img, "Image should exist")
    require.Equal(t, images.StatusReady, img.Status, "Image should be ready")
    t.Log("Image ready")

    // Step 3: Create instance with vGPU using QEMU hypervisor
    // QEMU is required for vGPU/mdev passthrough with NVIDIA's vGPU manager
    t.Log("Step 3: Creating instance with vGPU profile:", profile)
    inst, err := instanceManager.CreateInstance(ctx, instances.CreateInstanceRequest{
        Name:           "vgpu-test",
        Image:          imageName,
        Size:           2 * 1024 * 1024 * 1024, // 2GB
        HotplugSize:    512 * 1024 * 1024,
        OverlaySize:    1024 * 1024 * 1024,
        Vcpus:          2,
        NetworkEnabled: false, // No network needed for this test
        Hypervisor:     hypervisor.TypeQEMU,
        GPU: &instances.GPUConfig{
            Profile: profile,
        },
    })
    require.NoError(t, err)
    instanceID = inst.Id
    t.Logf("Instance created: %s", inst.Id)

    // Verify mdev UUID was assigned
    require.NotEmpty(t, inst.GPUMdevUUID, "Instance should have mdev UUID assigned")
    t.Logf("mdev UUID: %s", inst.GPUMdevUUID)

    // Step 4: Verify mdev was created in sysfs
    t.Run("MdevCreated", func(t *testing.T) {
        mdevPath := "/sys/bus/mdev/devices/" + inst.GPUMdevUUID
        _, err := os.Stat(mdevPath)
        assert.NoError(t, err, "mdev device should exist at %s", mdevPath)
        t.Logf("mdev exists at: %s", mdevPath)
    })

    // Step 5: Wait for guest agent to be ready
    t.Log("Step 5: Waiting for guest agent...")
    err = waitForGuestAgent(ctx, instanceManager, inst.Id, 60*time.Second)
    require.NoError(t, err, "guest agent should be ready")

    // Step 6: Verify GPU is visible inside VM via PCI
    t.Run("GPUVisibleInVM", func(t *testing.T) {
        actualInst, err := instanceManager.GetInstance(ctx, inst.Id)
        require.NoError(t, err)

        dialer, err := hypervisor.NewVsockDialer(actualInst.HypervisorType, actualInst.VsockSocket, actualInst.VsockCID)
        require.NoError(t, err)

        // Check for NVIDIA vendor ID (0x10de) in guest PCI devices
        var stdout, stderr bytes.Buffer
        checkGPUCmd := "cat /sys/bus/pci/devices/*/vendor 2>/dev/null | grep -i 10de && echo 'NVIDIA_FOUND' || echo 'NO_NVIDIA'"

        _, err = guest.ExecIntoInstance(ctx, dialer, guest.ExecOptions{
            Command: []string{"/bin/sh", "-c", checkGPUCmd},
            Stdout:  &stdout,
            Stderr:  &stderr,
            TTY:     false,
        })
        require.NoError(t, err, "exec should work")

        output := stdout.String()
        t.Logf("GPU check output: %s", output)

        assert.Contains(t, output, "NVIDIA_FOUND", "NVIDIA GPU (vendor 0x10de) should be visible in guest")
    })

    // Step 7: Check instance GPU info is correct
    t.Run("InstanceGPUInfo", func(t *testing.T) {
        actualInst, err := instanceManager.GetInstance(ctx, inst.Id)
        require.NoError(t, err)

        assert.Equal(t, profile, actualInst.GPUProfile, "GPU profile should match")
        assert.NotEmpty(t, actualInst.GPUMdevUUID, "mdev UUID should be set")
        t.Logf("Instance GPU: profile=%s, mdev=%s", actualInst.GPUProfile, actualInst.GPUMdevUUID)
    })

    t.Log("✅ vGPU test PASSED!")
}

// checkVGPUTestPrerequisites checks if vGPU test can run.
// Returns (skipReason, profileName) - skipReason is empty if all prerequisites are met.
func checkVGPUTestPrerequisites() (string, string) {
    // Check KVM
    if _, err := os.Stat("/dev/kvm"); os.IsNotExist(err) {
        return "vGPU test requires /dev/kvm", ""
    }

    // Check for root (required for mdev creation via sysfs)
    if os.Geteuid() != 0 {
        return "vGPU test requires root (sudo) for mdev creation", ""
    }

    // Check for vGPU mode (SR-IOV VFs present)
    mode := devices.DetectHostGPUMode()
    if mode != devices.GPUModeVGPU {
        return "vGPU test requires SR-IOV VFs in /sys/class/mdev_bus/", ""
    }

    // Check for available profiles
    profiles, err := devices.ListGPUProfiles()
    if err != nil {
        return "vGPU test failed to list profiles: " + err.Error(), ""
    }
    if len(profiles) == 0 {
        return "vGPU test requires at least one GPU profile", ""
    }

    // Find a profile with available instances
    for _, p := range profiles {
        if p.Available > 0 {
            return "", p.Name
        }
    }

    return "vGPU test requires at least one available VF (all VFs are in use)", ""
}
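
For context on the prerequisite check: the test defers to devices.DetectHostGPUMode() and /sys/class/mdev_bus/ as described in the doc comment. The following is only a rough, hypothetical sketch of what that sysfs-level detection can look like; it is not the actual devices package implementation.

// Illustrative sketch only (not devices.DetectHostGPUMode): a host is treated
// as vGPU-capable here if /sys/class/mdev_bus/ contains at least one entry,
// i.e. an SR-IOV VF registered with the mdev (vGPU) framework.
package main

import (
    "fmt"
    "os"
)

func hasMdevBusDevices() bool {
    entries, err := os.ReadDir("/sys/class/mdev_bus")
    if err != nil {
        return false // directory missing: no mdev-capable devices on this host
    }
    return len(entries) > 0
}

func main() {
    fmt.Println("mdev-capable devices present:", hasMdevBusDevices())
}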

lib/hypervisor/qemu/config.go

Lines changed: 18 additions & 3 deletions
@@ -4,6 +4,7 @@ import (
     "fmt"
     "runtime"
     "strconv"
+    "strings"
 
     "github.com/onkernel/hypeman/lib/hypervisor"
 )
@@ -64,9 +65,23 @@ func BuildArgs(cfg hypervisor.VMConfig) []string {
         args = append(args, "-device", fmt.Sprintf("vhost-vsock-pci,guest-cid=%d", cfg.VsockCID))
     }
 
-    // PCI device passthrough (GPU, etc.)
-    for _, pciAddr := range cfg.PCIDevices {
-        args = append(args, "-device", fmt.Sprintf("vfio-pci,host=%s", pciAddr))
+    // PCI device passthrough (GPU, mdev vGPU, etc.)
+    for _, devicePath := range cfg.PCIDevices {
+        var deviceArg string
+        if strings.HasPrefix(devicePath, "/sys/bus/mdev/devices/") {
+            // mdev device (vGPU) - use sysfsdev parameter
+            deviceArg = fmt.Sprintf("vfio-pci,sysfsdev=%s", devicePath)
+        } else if strings.HasPrefix(devicePath, "/sys/bus/pci/devices/") {
+            // Full sysfs path for regular PCI device - extract the PCI address
+            // Path format: /sys/bus/pci/devices/0000:82:00.4/
+            parts := strings.Split(strings.TrimSuffix(devicePath, "/"), "/")
+            pciAddr := parts[len(parts)-1]
+            deviceArg = fmt.Sprintf("vfio-pci,host=%s", pciAddr)
+        } else {
+            // Raw PCI address (e.g., "0000:82:00.4")
+            deviceArg = fmt.Sprintf("vfio-pci,host=%s", devicePath)
+        }
+        args = append(args, "-device", deviceArg)
     }
 
     // Serial console output to file
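
To make the new path handling concrete, here is a small standalone sketch that mirrors the branching added above and prints the resulting QEMU -device arguments for the three supported input forms. It duplicates the logic purely for illustration (it is not the BuildArgs implementation), and the mdev UUID below is just an example value.

// Illustration only: mirrors the vfio-pci argument selection added to BuildArgs.
package main

import (
    "fmt"
    "strings"
)

func deviceArg(devicePath string) string {
    switch {
    case strings.HasPrefix(devicePath, "/sys/bus/mdev/devices/"):
        // mdev (vGPU) devices are passed via sysfsdev
        return fmt.Sprintf("vfio-pci,sysfsdev=%s", devicePath)
    case strings.HasPrefix(devicePath, "/sys/bus/pci/devices/"):
        // full sysfs path: extract the trailing PCI address
        parts := strings.Split(strings.TrimSuffix(devicePath, "/"), "/")
        return fmt.Sprintf("vfio-pci,host=%s", parts[len(parts)-1])
    default:
        // raw PCI address
        return fmt.Sprintf("vfio-pci,host=%s", devicePath)
    }
}

func main() {
    // example mdev UUID, not a real device
    fmt.Println(deviceArg("/sys/bus/mdev/devices/4b20d080-1b54-4048-85b3-a6a62d165c01"))
    // -> vfio-pci,sysfsdev=/sys/bus/mdev/devices/4b20d080-1b54-4048-85b3-a6a62d165c01
    fmt.Println(deviceArg("/sys/bus/pci/devices/0000:82:00.4/"))
    // -> vfio-pci,host=0000:82:00.4
    fmt.Println(deviceArg("0000:82:00.4"))
    // -> vfio-pci,host=0000:82:00.4
}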
