Skip to content

Commit 34007ad

Browse files
committed
cleanup
1 parent 1f0e661 commit 34007ad

File tree

9 files changed

+234
-155
lines changed

9 files changed

+234
-155
lines changed

cmd/api/api/devices.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,12 @@ func (s *ApiService) ListAvailableDevices(ctx context.Context, request oapi.List
4646

4747
// CreateDevice registers a new device for passthrough
4848
func (s *ApiService) CreateDevice(ctx context.Context, request oapi.CreateDeviceRequestObject) (oapi.CreateDeviceResponseObject, error) {
49+
var name string
50+
if request.Body.Name != nil {
51+
name = *request.Body.Name
52+
}
4953
req := devices.CreateDeviceRequest{
50-
Name: request.Body.Name,
54+
Name: name,
5155
PCIAddress: request.Body.PciAddress,
5256
}
5357

@@ -136,7 +140,7 @@ func deviceToOAPI(d devices.Device) oapi.Device {
136140
deviceType := oapi.DeviceType(d.Type)
137141
return oapi.Device{
138142
Id: d.Id,
139-
Name: d.Name,
143+
Name: &d.Name,
140144
Type: deviceType,
141145
PciAddress: d.PCIAddress,
142146
VendorId: d.VendorID,

lib/devices/README.md

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ lib/devices/
3737
├── vfio.go # VFIO bind/unbind operations
3838
├── manager.go # Manager interface and implementation
3939
├── manager_test.go # Unit tests
40-
├── gpu_e2e_test.go # End-to-end GPU passthrough test
40+
├── gpu_e2e_test.go # End-to-end GPU passthrough test (auto-skips if no GPU)
4141
└── scripts/
4242
└── gpu-reset.sh # GPU recovery script (see Troubleshooting)
4343
```
@@ -285,29 +285,36 @@ Then reboot.
285285

286286
### Running the E2E Test
287287

288-
The GPU passthrough E2E test requires:
289-
- Root permissions (for driver bind/unbind)
288+
The GPU passthrough E2E test **automatically detects** GPU availability and skips if prerequisites aren't met.
289+
290+
**Why GPU tests require root**: Unlike network tests which can use Linux capabilities (`CAP_NET_ADMIN`), GPU passthrough requires writing to sysfs files (`/sys/bus/pci/drivers/*/unbind`, etc.) which are protected by standard Unix file permissions (owned by root, mode 0200). Capabilities don't bypass DAC (discretionary access control) for file writes.
291+
292+
Prerequisites for the test to run (not skip):
293+
- **Root permissions** (sudo) - required for sysfs driver operations
290294
- NVIDIA GPU on host
291-
- IOMMU enabled
295+
- IOMMU enabled (`intel_iommu=on` or `amd_iommu=on`)
292296
- `vfio_pci` and `vfio_iommu_type1` modules loaded
293297
- `/sbin` in PATH (for `mkfs.ext4`)
294298

295299
```bash
296-
# Load VFIO modules
300+
# Prepare the environment
297301
sudo modprobe vfio_pci vfio_iommu_type1
298302

299-
# Run the test
303+
# Run via make - test auto-skips if not root or no GPU
304+
make test
305+
306+
# Or run directly with sudo
300307
sudo env PATH=$PATH:/sbin:/usr/sbin \
301-
go test -v -tags=integration,gpu -run TestGPUPassthrough \
302-
-timeout 5m ./lib/devices/...
308+
go test -v -run TestGPUPassthrough -timeout 5m ./lib/devices/...
303309
```
304310

305311
The test will:
306-
1. Discover available NVIDIA GPUs
307-
2. Register the first GPU found
308-
3. Create a VM with GPU passthrough
309-
4. Verify the GPU is visible inside the VM
310-
5. Clean up (delete VM, unbind from VFIO, restore nvidia driver)
312+
1. Check prerequisites and skip if not met (not root, no GPU, no IOMMU, etc.)
313+
2. Discover available NVIDIA GPUs
314+
3. Register the first GPU found
315+
4. Create a VM with GPU passthrough
316+
5. Verify the GPU is visible inside the VM
317+
6. Clean up (delete VM, unbind from VFIO, restore nvidia driver)
311318

312319
## Future Plans: GPU Sharing Across Multiple VMs
313320

lib/devices/gpu_e2e_test.go

Lines changed: 49 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
//go:build gpu
2-
31
package devices_test
42

53
import (
@@ -25,31 +23,32 @@ import (
2523

2624
// TestGPUPassthrough is an E2E test that verifies GPU passthrough works.
2725
//
28-
// Run with:
26+
// This test automatically detects GPU availability and skips if:
27+
// - No NVIDIA GPU is found
28+
// - IOMMU is not enabled
29+
// - VFIO modules are not loaded
30+
// - Not running as root
2931
//
30-
// sudo env PATH=$PATH:/sbin:/usr/sbin go test -tags "gpu containers_image_openpgp" -v -run TestGPUPassthrough ./lib/devices/...
32+
// To run manually:
3133
//
32-
// Prerequisites:
33-
// - NVIDIA GPU available on host (bound to nvidia driver)
34-
// - IOMMU enabled (intel_iommu=on or amd_iommu=on in kernel cmdline)
35-
// - vfio-pci kernel module loaded (sudo modprobe vfio_pci vfio_iommu_type1)
36-
// - Root permissions (sudo) for driver bind/unbind operations
37-
// - /sbin in PATH for mkfs.ext4
34+
// sudo env PATH=$PATH:/sbin:/usr/sbin go test -v -run TestGPUPassthrough ./lib/devices/...
3835
//
3936
// WARNING: This test will unbind the GPU from the nvidia driver, which may
4037
// disrupt other processes using the GPU. The test attempts to restore the
4138
// nvidia driver binding on cleanup.
4239
func TestGPUPassthrough(t *testing.T) {
4340
ctx := context.Background()
4441

45-
// Check prerequisites
46-
checkPrerequisites(t)
47-
48-
// Check if running as root (needed for driver bind/unbind)
49-
if os.Geteuid() != 0 {
50-
t.Skip("Skipping GPU test: must run as root (sudo) to bind/unbind drivers")
42+
// Auto-detect GPU availability - skip if prerequisites not met
43+
skipReason := checkGPUTestPrerequisites()
44+
if skipReason != "" {
45+
t.Skip(skipReason)
5146
}
5247

48+
// Log that prerequisites passed
49+
groups, _ := os.ReadDir("/sys/kernel/iommu_groups")
50+
t.Logf("GPU test prerequisites met: %d IOMMU groups found", len(groups))
51+
5352
// Setup test infrastructure
5453
tmpDir := t.TempDir()
5554
p := paths.New(tmpDir)
@@ -267,25 +266,51 @@ func TestGPUPassthrough(t *testing.T) {
267266
t.Log("✅ GPU passthrough test PASSED!")
268267
}
269268

270-
func checkPrerequisites(t *testing.T) {
271-
t.Helper()
272-
269+
// checkGPUTestPrerequisites checks if GPU passthrough test can run.
270+
// Returns empty string if all prerequisites are met, otherwise returns skip reason.
271+
func checkGPUTestPrerequisites() string {
273272
// Check KVM
274273
if _, err := os.Stat("/dev/kvm"); os.IsNotExist(err) {
275-
t.Fatal("/dev/kvm not available - ensure KVM is enabled and user is in 'kvm' group")
274+
return "GPU passthrough test requires /dev/kvm"
276275
}
277276

278-
// Check VFIO
277+
// Check VFIO modules
279278
if _, err := os.Stat("/dev/vfio/vfio"); os.IsNotExist(err) {
280-
t.Fatal("/dev/vfio/vfio not available - ensure vfio-pci module is loaded (modprobe vfio-pci)")
279+
return "GPU passthrough test requires VFIO (modprobe vfio_pci vfio_iommu_type1)"
281280
}
282281

283282
// Check IOMMU is enabled by looking for IOMMU groups
284283
groups, err := os.ReadDir("/sys/kernel/iommu_groups")
285284
if err != nil || len(groups) == 0 {
286-
t.Fatal("No IOMMU groups found - ensure IOMMU is enabled (intel_iommu=on or amd_iommu=on in kernel cmdline)")
285+
return "GPU passthrough test requires IOMMU (intel_iommu=on or amd_iommu=on)"
286+
}
287+
288+
// Check for NVIDIA GPU
289+
available, err := devices.DiscoverAvailableDevices()
290+
if err != nil {
291+
return "GPU passthrough test failed to discover devices: " + err.Error()
292+
}
293+
294+
hasNvidiaGPU := false
295+
for _, d := range available {
296+
if strings.Contains(strings.ToLower(d.VendorName), "nvidia") {
297+
hasNvidiaGPU = true
298+
break
299+
}
300+
}
301+
if !hasNvidiaGPU {
302+
return "GPU passthrough test requires an NVIDIA GPU"
287303
}
288-
t.Logf("Found %d IOMMU groups", len(groups))
304+
305+
// GPU passthrough requires root (euid=0) for sysfs driver bind/unbind operations.
306+
// Unlike network operations which can use CAP_NET_ADMIN, sysfs file writes are
307+
// protected by standard Unix DAC (file permissions), not just capabilities.
308+
// The files in /sys/bus/pci/drivers/ are owned by root with mode 0200.
309+
if os.Geteuid() != 0 {
310+
return "GPU passthrough test requires root (sudo) for sysfs driver operations"
311+
}
312+
313+
return "" // All prerequisites met
289314
}
290315

291316
func waitForInstanceReady(ctx context.Context, t *testing.T, mgr instances.Manager, id string, timeout time.Duration) error {

lib/devices/manager.go

Lines changed: 28 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import (
55
"encoding/json"
66
"fmt"
77
"os"
8-
"path/filepath"
8+
"strings"
99
"sync"
1010
"time"
1111

@@ -62,8 +62,7 @@ func (m *manager) ListDevices(ctx context.Context) ([]Device, error) {
6262
m.mu.RLock()
6363
defer m.mu.RUnlock()
6464

65-
devicesDir := m.devicesDir()
66-
entries, err := os.ReadDir(devicesDir)
65+
entries, err := os.ReadDir(m.paths.DevicesDir())
6766
if err != nil {
6867
if os.IsNotExist(err) {
6968
return []Device{}, nil
@@ -98,12 +97,7 @@ func (m *manager) ListAvailableDevices(ctx context.Context) ([]AvailableDevice,
9897
func (m *manager) CreateDevice(ctx context.Context, req CreateDeviceRequest) (*Device, error) {
9998
log := logger.FromContext(ctx)
10099

101-
// Validate name
102-
if !ValidateDeviceName(req.Name) {
103-
return nil, ErrInvalidName
104-
}
105-
106-
// Validate PCI address format
100+
// Validate PCI address format (required)
107101
if !ValidatePCIAddress(req.PCIAddress) {
108102
return nil, ErrInvalidPCIAddress
109103
}
@@ -114,11 +108,26 @@ func (m *manager) CreateDevice(ctx context.Context, req CreateDeviceRequest) (*D
114108
return nil, fmt.Errorf("get device info: %w", err)
115109
}
116110

111+
// Generate ID
112+
id := cuid2.Generate()
113+
114+
// Handle optional name: if not provided, generate one from PCI address
115+
name := req.Name
116+
if name == "" {
117+
// Generate name from PCI address: 0000:a2:00.0 -> pci-0000-a2-00-0
118+
name = "pci-" + strings.ReplaceAll(strings.ReplaceAll(req.PCIAddress, ":", "-"), ".", "-")
119+
}
120+
121+
// Validate name format
122+
if !ValidateDeviceName(name) {
123+
return nil, ErrInvalidName
124+
}
125+
117126
m.mu.Lock()
118127
defer m.mu.Unlock()
119128

120129
// Check if name already exists
121-
if _, err := m.findByName(req.Name); err == nil {
130+
if _, err := m.findByName(name); err == nil {
122131
return nil, ErrNameExists
123132
}
124133

@@ -127,13 +136,10 @@ func (m *manager) CreateDevice(ctx context.Context, req CreateDeviceRequest) (*D
127136
return nil, ErrAlreadyExists
128137
}
129138

130-
// Generate ID
131-
id := cuid2.Generate()
132-
133139
// Create device
134140
device := &Device{
135141
Id: id,
136-
Name: req.Name,
142+
Name: name,
137143
Type: DetermineDeviceType(deviceInfo),
138144
PCIAddress: req.PCIAddress,
139145
VendorID: deviceInfo.VendorID,
@@ -145,19 +151,19 @@ func (m *manager) CreateDevice(ctx context.Context, req CreateDeviceRequest) (*D
145151
}
146152

147153
// Ensure directories exist
148-
if err := os.MkdirAll(m.deviceDir(id), 0755); err != nil {
154+
if err := os.MkdirAll(m.paths.DeviceDir(id), 0755); err != nil {
149155
return nil, fmt.Errorf("create device dir: %w", err)
150156
}
151157

152158
// Save device metadata
153159
if err := m.saveDevice(device); err != nil {
154-
os.RemoveAll(m.deviceDir(id))
160+
os.RemoveAll(m.paths.DeviceDir(id))
155161
return nil, fmt.Errorf("save device: %w", err)
156162
}
157163

158164
log.InfoContext(ctx, "registered device",
159165
"id", id,
160-
"name", req.Name,
166+
"name", name,
161167
"pci_address", req.PCIAddress,
162168
"type", device.Type,
163169
)
@@ -208,7 +214,7 @@ func (m *manager) DeleteDevice(ctx context.Context, id string) error {
208214
}
209215

210216
// Remove device directory
211-
if err := os.RemoveAll(m.deviceDir(id)); err != nil {
217+
if err := os.RemoveAll(m.paths.DeviceDir(id)); err != nil {
212218
return fmt.Errorf("remove device dir: %w", err)
213219
}
214220

@@ -339,20 +345,8 @@ func (m *manager) MarkDetached(ctx context.Context, deviceID string) error {
339345

340346
// Helper methods
341347

342-
func (m *manager) devicesDir() string {
343-
return filepath.Join(m.paths.GuestsDir(), "..", "devices")
344-
}
345-
346-
func (m *manager) deviceDir(id string) string {
347-
return filepath.Join(m.devicesDir(), id)
348-
}
349-
350-
func (m *manager) deviceMetadataPath(id string) string {
351-
return filepath.Join(m.deviceDir(id), "metadata.json")
352-
}
353-
354348
func (m *manager) loadDevice(id string) (*Device, error) {
355-
data, err := os.ReadFile(m.deviceMetadataPath(id))
349+
data, err := os.ReadFile(m.paths.DeviceMetadata(id))
356350
if err != nil {
357351
if os.IsNotExist(err) {
358352
return nil, ErrNotFound
@@ -374,11 +368,11 @@ func (m *manager) saveDevice(device *Device) error {
374368
return err
375369
}
376370

377-
return os.WriteFile(m.deviceMetadataPath(device.Id), data, 0644)
371+
return os.WriteFile(m.paths.DeviceMetadata(device.Id), data, 0644)
378372
}
379373

380374
func (m *manager) findByName(name string) (*Device, error) {
381-
entries, err := os.ReadDir(m.devicesDir())
375+
entries, err := os.ReadDir(m.paths.DevicesDir())
382376
if err != nil {
383377
return nil, ErrNotFound
384378
}
@@ -402,7 +396,7 @@ func (m *manager) findByName(name string) (*Device, error) {
402396
}
403397

404398
func (m *manager) findByPCIAddress(pciAddress string) (*Device, error) {
405-
entries, err := os.ReadDir(m.devicesDir())
399+
entries, err := os.ReadDir(m.paths.DevicesDir())
406400
if err != nil {
407401
return nil, ErrNotFound
408402
}

lib/devices/types.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@ type Device struct {
2929

3030
// CreateDeviceRequest is the request to register a new device
3131
type CreateDeviceRequest struct {
32-
Name string `json:"name"` // required: globally unique name
33-
PCIAddress string `json:"pci_address"` // required: PCI address (e.g., "0000:a2:00.0")
32+
Name string `json:"name,omitempty"` // optional: globally unique name (auto-generated if not provided)
33+
PCIAddress string `json:"pci_address"` // required: PCI address (e.g., "0000:a2:00.0")
3434
}
3535

3636
// AvailableDevice represents a PCI device discovered on the host

0 commit comments

Comments
 (0)