|
| 1 | +package integration |
| 2 | + |
| 3 | +import ( |
| 4 | + "bytes" |
| 5 | + "context" |
| 6 | + "os" |
| 7 | + "testing" |
| 8 | + "time" |
| 9 | + |
| 10 | + "github.com/onkernel/hypeman/cmd/api/config" |
| 11 | + "github.com/onkernel/hypeman/lib/devices" |
| 12 | + "github.com/onkernel/hypeman/lib/guest" |
| 13 | + "github.com/onkernel/hypeman/lib/hypervisor" |
| 14 | + "github.com/onkernel/hypeman/lib/images" |
| 15 | + "github.com/onkernel/hypeman/lib/instances" |
| 16 | + "github.com/onkernel/hypeman/lib/network" |
| 17 | + "github.com/onkernel/hypeman/lib/paths" |
| 18 | + "github.com/onkernel/hypeman/lib/system" |
| 19 | + "github.com/onkernel/hypeman/lib/volumes" |
| 20 | + "github.com/stretchr/testify/assert" |
| 21 | + "github.com/stretchr/testify/require" |
| 22 | +) |
| 23 | + |
| 24 | +// TestVGPU is an integration test that verifies vGPU (SR-IOV mdev) support works. |
| 25 | +// |
| 26 | +// This test automatically detects vGPU availability and skips if: |
| 27 | +// - No SR-IOV VFs are found in /sys/class/mdev_bus/ |
| 28 | +// - No vGPU profiles are available |
| 29 | +// - Not running as root (required for mdev creation) |
| 30 | +// - KVM is not available |
| 31 | +// |
| 32 | +// To run manually: |
| 33 | +// |
| 34 | +// sudo go test -v -run TestVGPU -timeout 5m ./integration/... |
| 35 | +// |
| 36 | +// Note: This test verifies mdev creation and PCI device visibility inside the VM. |
| 37 | +// It does NOT test nvidia-smi or CUDA functionality since that requires NVIDIA |
| 38 | +// guest drivers pre-installed in the image. |
| 39 | +func TestVGPU(t *testing.T) { |
| 40 | + if testing.Short() { |
| 41 | + t.Skip("skipping integration test in short mode") |
| 42 | + } |
| 43 | + |
| 44 | + // Auto-detect vGPU availability - skip if prerequisites not met |
| 45 | + skipReason, profile := checkVGPUTestPrerequisites() |
| 46 | + if skipReason != "" { |
| 47 | + t.Skip(skipReason) |
| 48 | + } |
| 49 | + |
| 50 | + t.Logf("vGPU test prerequisites met, using profile: %s", profile) |
| 51 | + |
| 52 | + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) |
| 53 | + defer cancel() |
| 54 | + |
| 55 | + // Set up test environment |
| 56 | + tmpDir := t.TempDir() |
| 57 | + p := paths.New(tmpDir) |
| 58 | + |
| 59 | + cfg := &config.Config{ |
| 60 | + DataDir: tmpDir, |
| 61 | + BridgeName: "vmbr0", |
| 62 | + SubnetCIDR: "10.100.0.0/16", |
| 63 | + DNSServer: "1.1.1.1", |
| 64 | + } |
| 65 | + |
| 66 | + // Create managers |
| 67 | + imageManager, err := images.NewManager(p, 1, nil) |
| 68 | + require.NoError(t, err) |
| 69 | + |
| 70 | + systemManager := system.NewManager(p) |
| 71 | + networkManager := network.NewManager(p, cfg, nil) |
| 72 | + deviceManager := devices.NewManager(p) |
| 73 | + volumeManager := volumes.NewManager(p, 0, nil) |
| 74 | + |
| 75 | + limits := instances.ResourceLimits{ |
| 76 | + MaxOverlaySize: 100 * 1024 * 1024 * 1024, |
| 77 | + MaxVcpusPerInstance: 0, |
| 78 | + MaxMemoryPerInstance: 0, |
| 79 | + MaxTotalVcpus: 0, |
| 80 | + MaxTotalMemory: 0, |
| 81 | + } |
| 82 | + |
| 83 | + instanceManager := instances.NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, "", nil, nil) |
| 84 | + |
| 85 | + // Track instance ID for cleanup |
| 86 | + var instanceID string |
| 87 | + |
| 88 | + // Cleanup any orphaned instances and mdevs |
| 89 | + t.Cleanup(func() { |
| 90 | + if instanceID != "" { |
| 91 | + t.Log("Cleanup: Deleting instance...") |
| 92 | + instanceManager.DeleteInstance(ctx, instanceID) |
| 93 | + } |
| 94 | + }) |
| 95 | + |
| 96 | + // Step 1: Ensure system files (kernel, initrd) |
| 97 | + t.Log("Step 1: Ensuring system files...") |
| 98 | + err = systemManager.EnsureSystemFiles(ctx) |
| 99 | + require.NoError(t, err) |
| 100 | + t.Log("System files ready") |
| 101 | + |
| 102 | + // Step 2: Pull alpine image (lightweight for testing) |
| 103 | + imageName := "docker.io/library/alpine:latest" |
| 104 | + t.Log("Step 2: Pulling alpine image...") |
| 105 | + _, err = imageManager.CreateImage(ctx, images.CreateImageRequest{ |
| 106 | + Name: imageName, |
| 107 | + }) |
| 108 | + require.NoError(t, err) |
| 109 | + |
| 110 | + // Wait for image to be ready |
| 111 | + t.Log("Waiting for image build...") |
| 112 | + var img *images.Image |
| 113 | + for i := 0; i < 120; i++ { |
| 114 | + img, err = imageManager.GetImage(ctx, imageName) |
| 115 | + if err == nil && img.Status == images.StatusReady { |
| 116 | + break |
| 117 | + } |
| 118 | + if img != nil && img.Status == images.StatusFailed { |
| 119 | + errMsg := "unknown" |
| 120 | + if img.Error != nil { |
| 121 | + errMsg = *img.Error |
| 122 | + } |
| 123 | + t.Fatalf("Image build failed: %s", errMsg) |
| 124 | + } |
| 125 | + time.Sleep(1 * time.Second) |
| 126 | + } |
| 127 | + require.NotNil(t, img, "Image should exist") |
| 128 | + require.Equal(t, images.StatusReady, img.Status, "Image should be ready") |
| 129 | + t.Log("Image ready") |
| 130 | + |
| 131 | + // Step 3: Create instance with vGPU using QEMU hypervisor |
| 132 | + // QEMU is required for vGPU/mdev passthrough with NVIDIA's vGPU manager |
| 133 | + t.Log("Step 3: Creating instance with vGPU profile:", profile) |
| 134 | + inst, err := instanceManager.CreateInstance(ctx, instances.CreateInstanceRequest{ |
| 135 | + Name: "vgpu-test", |
| 136 | + Image: imageName, |
| 137 | + Size: 2 * 1024 * 1024 * 1024, // 2GB |
| 138 | + HotplugSize: 512 * 1024 * 1024, |
| 139 | + OverlaySize: 1024 * 1024 * 1024, |
| 140 | + Vcpus: 2, |
| 141 | + NetworkEnabled: false, // No network needed for this test |
| 142 | + Hypervisor: hypervisor.TypeQEMU, |
| 143 | + GPU: &instances.GPUConfig{ |
| 144 | + Profile: profile, |
| 145 | + }, |
| 146 | + }) |
| 147 | + require.NoError(t, err) |
| 148 | + instanceID = inst.Id |
| 149 | + t.Logf("Instance created: %s", inst.Id) |
| 150 | + |
| 151 | + // Verify mdev UUID was assigned |
| 152 | + require.NotEmpty(t, inst.GPUMdevUUID, "Instance should have mdev UUID assigned") |
| 153 | + t.Logf("mdev UUID: %s", inst.GPUMdevUUID) |
| 154 | + |
| 155 | + // Step 4: Verify mdev was created in sysfs |
| 156 | + t.Run("MdevCreated", func(t *testing.T) { |
| 157 | + mdevPath := "/sys/bus/mdev/devices/" + inst.GPUMdevUUID |
| 158 | + _, err := os.Stat(mdevPath) |
| 159 | + assert.NoError(t, err, "mdev device should exist at %s", mdevPath) |
| 160 | + t.Logf("mdev exists at: %s", mdevPath) |
| 161 | + }) |
| 162 | + |
| 163 | + // Step 5: Wait for guest agent to be ready |
| 164 | + t.Log("Step 5: Waiting for guest agent...") |
| 165 | + err = waitForGuestAgent(ctx, instanceManager, inst.Id, 60*time.Second) |
| 166 | + require.NoError(t, err, "guest agent should be ready") |
| 167 | + |
| 168 | + // Step 6: Verify GPU is visible inside VM via PCI |
| 169 | + t.Run("GPUVisibleInVM", func(t *testing.T) { |
| 170 | + actualInst, err := instanceManager.GetInstance(ctx, inst.Id) |
| 171 | + require.NoError(t, err) |
| 172 | + |
| 173 | + dialer, err := hypervisor.NewVsockDialer(actualInst.HypervisorType, actualInst.VsockSocket, actualInst.VsockCID) |
| 174 | + require.NoError(t, err) |
| 175 | + |
| 176 | + // Check for NVIDIA vendor ID (0x10de) in guest PCI devices |
| 177 | + var stdout, stderr bytes.Buffer |
| 178 | + checkGPUCmd := "cat /sys/bus/pci/devices/*/vendor 2>/dev/null | grep -i 10de && echo 'NVIDIA_FOUND' || echo 'NO_NVIDIA'" |
| 179 | + |
| 180 | + _, err = guest.ExecIntoInstance(ctx, dialer, guest.ExecOptions{ |
| 181 | + Command: []string{"/bin/sh", "-c", checkGPUCmd}, |
| 182 | + Stdout: &stdout, |
| 183 | + Stderr: &stderr, |
| 184 | + TTY: false, |
| 185 | + }) |
| 186 | + require.NoError(t, err, "exec should work") |
| 187 | + |
| 188 | + output := stdout.String() |
| 189 | + t.Logf("GPU check output: %s", output) |
| 190 | + |
| 191 | + assert.Contains(t, output, "NVIDIA_FOUND", "NVIDIA GPU (vendor 0x10de) should be visible in guest") |
| 192 | + }) |
| 193 | + |
| 194 | + // Step 7: Check instance GPU info is correct |
| 195 | + t.Run("InstanceGPUInfo", func(t *testing.T) { |
| 196 | + actualInst, err := instanceManager.GetInstance(ctx, inst.Id) |
| 197 | + require.NoError(t, err) |
| 198 | + |
| 199 | + assert.Equal(t, profile, actualInst.GPUProfile, "GPU profile should match") |
| 200 | + assert.NotEmpty(t, actualInst.GPUMdevUUID, "mdev UUID should be set") |
| 201 | + t.Logf("Instance GPU: profile=%s, mdev=%s", actualInst.GPUProfile, actualInst.GPUMdevUUID) |
| 202 | + }) |
| 203 | + |
| 204 | + t.Log("✅ vGPU test PASSED!") |
| 205 | +} |
| 206 | + |
| 207 | +// checkVGPUTestPrerequisites checks if vGPU test can run. |
| 208 | +// Returns (skipReason, profileName) - skipReason is empty if all prerequisites are met. |
| 209 | +func checkVGPUTestPrerequisites() (string, string) { |
| 210 | + // Check KVM |
| 211 | + if _, err := os.Stat("/dev/kvm"); os.IsNotExist(err) { |
| 212 | + return "vGPU test requires /dev/kvm", "" |
| 213 | + } |
| 214 | + |
| 215 | + // Check for root (required for mdev creation via sysfs) |
| 216 | + if os.Geteuid() != 0 { |
| 217 | + return "vGPU test requires root (sudo) for mdev creation", "" |
| 218 | + } |
| 219 | + |
| 220 | + // Check for vGPU mode (SR-IOV VFs present) |
| 221 | + mode := devices.DetectHostGPUMode() |
| 222 | + if mode != devices.GPUModeVGPU { |
| 223 | + return "vGPU test requires SR-IOV VFs in /sys/class/mdev_bus/", "" |
| 224 | + } |
| 225 | + |
| 226 | + // Check for available profiles |
| 227 | + profiles, err := devices.ListGPUProfiles() |
| 228 | + if err != nil { |
| 229 | + return "vGPU test failed to list profiles: " + err.Error(), "" |
| 230 | + } |
| 231 | + if len(profiles) == 0 { |
| 232 | + return "vGPU test requires at least one GPU profile", "" |
| 233 | + } |
| 234 | + |
| 235 | + // Find a profile with available instances |
| 236 | + for _, p := range profiles { |
| 237 | + if p.Available > 0 { |
| 238 | + return "", p.Name |
| 239 | + } |
| 240 | + } |
| 241 | + |
| 242 | + return "vGPU test requires at least one available VF (all VFs are in use)", "" |
| 243 | +} |
| 244 | + |
0 commit comments