Skip to content

Commit 5e6ad08

Browse files
author
Rafael Garcia
committed
feat(instances): integrate devices with instance lifecycle
Wire up device management throughout the instance lifecycle:
- create.go: Validate devices, auto-bind to VFIO, pass to VM config
- delete.go: Detach devices, auto-unbind from VFIO
- configdisk.go: Add HAS_GPU config flag for GPU instances
- manager.go: Add deviceManager dependency
- providers.go: Add ProvideDeviceManager
- wire.go/wire_gen.go: Wire up DeviceManager in DI
- api.go: Add DeviceManager to ApiService struct
1 parent 4ba6fa2 commit 5e6ad08

File tree

11 files changed

+129
-31
lines changed

11 files changed

+129
-31
lines changed

cmd/api/api/api.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package api
22

33
import (
44
"github.com/onkernel/hypeman/cmd/api/config"
5+
"github.com/onkernel/hypeman/lib/devices"
56
"github.com/onkernel/hypeman/lib/images"
67
"github.com/onkernel/hypeman/lib/ingress"
78
"github.com/onkernel/hypeman/lib/instances"
@@ -17,6 +18,7 @@ type ApiService struct {
1718
InstanceManager instances.Manager
1819
VolumeManager volumes.Manager
1920
NetworkManager network.Manager
21+
DeviceManager devices.Manager
2022
IngressManager ingress.Manager
2123
}
2224

@@ -29,6 +31,7 @@ func New(
2931
instanceManager instances.Manager,
3032
volumeManager volumes.Manager,
3133
networkManager network.Manager,
34+
deviceManager devices.Manager,
3235
ingressManager ingress.Manager,
3336
) *ApiService {
3437
return &ApiService{
@@ -37,6 +40,7 @@ func New(
3740
InstanceManager: instanceManager,
3841
VolumeManager: volumeManager,
3942
NetworkManager: networkManager,
43+
DeviceManager: deviceManager,
4044
IngressManager: ingressManager,
4145
}
4246
}

cmd/api/api/api_test.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"time"
1010

1111
"github.com/onkernel/hypeman/cmd/api/config"
12+
"github.com/onkernel/hypeman/lib/devices"
1213
"github.com/onkernel/hypeman/lib/images"
1314
"github.com/onkernel/hypeman/lib/instances"
1415
mw "github.com/onkernel/hypeman/lib/middleware"
@@ -34,11 +35,12 @@ func newTestService(t *testing.T) *ApiService {
3435

3536
systemMgr := system.NewManager(p)
3637
networkMgr := network.NewManager(p, cfg, nil)
38+
deviceMgr := devices.NewManager(p)
3739
volumeMgr := volumes.NewManager(p, 0, nil) // 0 = unlimited storage
3840
limits := instances.ResourceLimits{
3941
MaxOverlaySize: 100 * 1024 * 1024 * 1024, // 100GB
4042
}
41-
instanceMgr := instances.NewManager(p, imageMgr, systemMgr, networkMgr, volumeMgr, limits, nil, nil)
43+
instanceMgr := instances.NewManager(p, imageMgr, systemMgr, networkMgr, deviceMgr, volumeMgr, limits, nil, nil)
4244

4345
// Register cleanup for orphaned Cloud Hypervisor processes
4446
t.Cleanup(func() {
@@ -50,6 +52,7 @@ func newTestService(t *testing.T) *ApiService {
5052
ImageManager: imageMgr,
5153
InstanceManager: instanceMgr,
5254
VolumeManager: volumeMgr,
55+
DeviceManager: deviceMgr,
5356
}
5457
}
5558

cmd/api/wire.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"github.com/google/wire"
1010
"github.com/onkernel/hypeman/cmd/api/api"
1111
"github.com/onkernel/hypeman/cmd/api/config"
12+
"github.com/onkernel/hypeman/lib/devices"
1213
"github.com/onkernel/hypeman/lib/images"
1314
"github.com/onkernel/hypeman/lib/ingress"
1415
"github.com/onkernel/hypeman/lib/instances"
@@ -27,6 +28,7 @@ type application struct {
2728
ImageManager images.Manager
2829
SystemManager system.Manager
2930
NetworkManager network.Manager
31+
DeviceManager devices.Manager
3032
InstanceManager instances.Manager
3133
VolumeManager volumes.Manager
3234
IngressManager ingress.Manager
@@ -44,6 +46,7 @@ func initializeApp() (*application, func(), error) {
4446
providers.ProvideImageManager,
4547
providers.ProvideSystemManager,
4648
providers.ProvideNetworkManager,
49+
providers.ProvideDeviceManager,
4750
providers.ProvideInstanceManager,
4851
providers.ProvideVolumeManager,
4952
providers.ProvideIngressManager,

cmd/api/wire_gen.go

Lines changed: 8 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

lib/instances/configdisk.go

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ func (m *manager) createConfigDisk(inst *Instance, imageInfo *images.Image, netC
5353
// Create ext4 disk with config files
5454
// Use ext4 for now (can switch to erofs when kernel supports it)
5555
diskPath := m.paths.InstanceConfigDisk(inst.Id)
56-
56+
5757
// Calculate size (config files are tiny, use 1MB minimum)
5858
_, err = images.ExportRootfs(tmpDir, diskPath, images.FormatExt4)
5959
if err != nil {
@@ -70,26 +70,26 @@ func (m *manager) generateConfigScript(inst *Instance, imageInfo *images.Image,
7070
if len(imageInfo.Entrypoint) > 0 {
7171
entrypoint = shellQuoteArray(imageInfo.Entrypoint)
7272
}
73-
73+
7474
// Prepare cmd value
7575
cmd := ""
7676
if len(imageInfo.Cmd) > 0 {
7777
cmd = shellQuoteArray(imageInfo.Cmd)
7878
}
79-
79+
8080
// Prepare workdir value
8181
workdir := shellQuote("/")
8282
if imageInfo.WorkingDir != "" {
8383
workdir = shellQuote(imageInfo.WorkingDir)
8484
}
85-
85+
8686
// Build environment variable exports
8787
var envLines strings.Builder
8888
mergedEnv := mergeEnv(imageInfo.Env, inst.Env)
8989
for key, value := range mergedEnv {
9090
envLines.WriteString(fmt.Sprintf("export %s=%s\n", key, shellQuote(value)))
9191
}
92-
92+
9393
// Build network configuration section
9494
// Use netConfig directly instead of trying to derive it (VM hasn't started yet)
9595
networkSection := ""
@@ -105,6 +105,13 @@ GUEST_DNS="%s"
105105
`, netConfig.IP, cidr, netConfig.Gateway, netConfig.DNS)
106106
}
107107

108+
// GPU passthrough configuration
109+
// When devices are attached, set HAS_GPU=1 to trigger NVIDIA module loading in init
110+
gpuSection := ""
111+
if len(inst.Devices) > 0 {
112+
gpuSection = "\n# GPU passthrough\nHAS_GPU=1\n"
113+
}
114+
108115
// Build volume mounts section
109116
// Volumes are attached as /dev/vdd, /dev/vde, etc. (after vda=rootfs, vdb=overlay, vdc=config)
110117
// For overlay volumes, two devices are used: base + overlay disk
@@ -137,7 +144,7 @@ GUEST_DNS="%s"
137144
volumeLines.WriteString("\"\n")
138145
volumeSection = volumeLines.String()
139146
}
140-
147+
141148
// Generate script as a readable template block
142149
// ENTRYPOINT and CMD contain shell-quoted arrays that will be eval'd in init
143150
script := fmt.Sprintf(`#!/bin/sh
@@ -149,16 +156,17 @@ CMD="%s"
149156
WORKDIR=%s
150157
151158
# Environment variables
152-
%s%s%s`,
159+
%s%s%s%s`,
153160
inst.Id,
154161
entrypoint,
155162
cmd,
156163
workdir,
157164
envLines.String(),
158165
networkSection,
159166
volumeSection,
167+
gpuSection,
160168
)
161-
169+
162170
return script
163171
}
164172

lib/instances/create.go

Lines changed: 54 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"time"
1010

1111
"github.com/nrednav/cuid2"
12+
"github.com/onkernel/hypeman/lib/devices"
1213
"github.com/onkernel/hypeman/lib/images"
1314
"github.com/onkernel/hypeman/lib/logger"
1415
"github.com/onkernel/hypeman/lib/network"
@@ -141,7 +142,7 @@ func (m *manager) createInstance(
141142
return nil, ErrAlreadyExists
142143
}
143144

144-
// 5. Apply defaults
145+
// 6. Apply defaults
145146
size := req.Size
146147
if size == 0 {
147148
size = 1 * 1024 * 1024 * 1024 // 1GB default
@@ -191,16 +192,42 @@ func (m *manager) createInstance(
191192
req.Env = make(map[string]string)
192193
}
193194

194-
// 6. Determine network based on NetworkEnabled flag
195+
// 7. Determine network based on NetworkEnabled flag
195196
networkName := ""
196197
if req.NetworkEnabled {
197198
networkName = "default"
198199
}
199200

200-
// 7. Get default kernel version
201+
// 8. Get default kernel version
201202
kernelVer := m.systemManager.GetDefaultKernelVersion()
202203

203-
// 8. Create instance metadata
204+
// 9. Validate, resolve, and auto-bind devices (GPU passthrough)
205+
var resolvedDeviceIDs []string
206+
if len(req.Devices) > 0 && m.deviceManager != nil {
207+
for _, deviceRef := range req.Devices {
208+
device, err := m.deviceManager.GetDevice(ctx, deviceRef)
209+
if err != nil {
210+
log.ErrorContext(ctx, "failed to get device", "device", deviceRef, "error", err)
211+
return nil, fmt.Errorf("device %s: %w", deviceRef, err)
212+
}
213+
if device.AttachedTo != nil {
214+
log.ErrorContext(ctx, "device already attached", "device", deviceRef, "instance", *device.AttachedTo)
215+
return nil, fmt.Errorf("device %s is already attached to instance %s", deviceRef, *device.AttachedTo)
216+
}
217+
// Auto-bind to VFIO if not already bound
218+
if !device.BoundToVFIO {
219+
log.InfoContext(ctx, "auto-binding device to VFIO", "device", deviceRef, "pci_address", device.PCIAddress)
220+
if err := m.deviceManager.BindToVFIO(ctx, device.Id); err != nil {
221+
log.ErrorContext(ctx, "failed to bind device to VFIO", "device", deviceRef, "error", err)
222+
return nil, fmt.Errorf("bind device %s to VFIO: %w", deviceRef, err)
223+
}
224+
}
225+
resolvedDeviceIDs = append(resolvedDeviceIDs, device.Id)
226+
}
227+
log.DebugContext(ctx, "validated devices for passthrough", "id", id, "devices", resolvedDeviceIDs)
228+
}
229+
230+
// 10. Create instance metadata
204231
stored := &StoredMetadata{
205232
Id: id,
206233
Name: req.Name,
@@ -220,6 +247,7 @@ func (m *manager) createInstance(
220247
DataDir: m.paths.InstanceDir(id),
221248
VsockCID: vsockCID,
222249
VsockSocket: vsockSocket,
250+
Devices: resolvedDeviceIDs,
223251
}
224252

225253
// Setup cleanup stack for automatic rollback on errors
@@ -243,7 +271,7 @@ func (m *manager) createInstance(
243271
return nil, fmt.Errorf("create overlay disk: %w", err)
244272
}
245273

246-
// 10. Allocate network (if network enabled)
274+
// 14. Allocate network (if network enabled)
247275
var netConfig *network.NetworkConfig
248276
if networkName != "" {
249277
log.DebugContext(ctx, "allocating network", "instance_id", id, "network", networkName)
@@ -268,7 +296,7 @@ func (m *manager) createInstance(
268296
})
269297
}
270298

271-
// 10.5. Validate and attach volumes
299+
// 15. Validate and attach volumes
272300
if len(req.Volumes) > 0 {
273301
log.DebugContext(ctx, "validating volumes", "instance_id", id, "count", len(req.Volumes))
274302
for _, volAttach := range req.Volumes {
@@ -308,7 +336,7 @@ func (m *manager) createInstance(
308336
stored.Volumes = req.Volumes
309337
}
310338

311-
// 11. Create config disk (needs Instance for buildVMConfig)
339+
// 16. Create config disk (needs Instance for buildVMConfig)
312340
inst := &Instance{StoredMetadata: *stored}
313341
log.DebugContext(ctx, "creating config disk", "instance_id", id)
314342
if err := m.createConfigDisk(inst, imageInfo, netConfig); err != nil {
@@ -487,7 +515,7 @@ func (m *manager) startAndBootVM(
487515

488516
// Build VM configuration matching Cloud Hypervisor VmConfig
489517
inst := &Instance{StoredMetadata: *stored}
490-
vmConfig, err := m.buildVMConfig(inst, imageInfo, netConfig)
518+
vmConfig, err := m.buildVMConfig(ctx, inst, imageInfo, netConfig)
491519
if err != nil {
492520
return fmt.Errorf("build vm config: %w", err)
493521
}
@@ -537,7 +565,7 @@ func (m *manager) startAndBootVM(
537565
}
538566

539567
// buildVMConfig creates the Cloud Hypervisor VmConfig
540-
func (m *manager) buildVMConfig(inst *Instance, imageInfo *images.Image, netConfig *network.NetworkConfig) (vmm.VmConfig, error) {
568+
func (m *manager) buildVMConfig(ctx context.Context, inst *Instance, imageInfo *images.Image, netConfig *network.NetworkConfig) (vmm.VmConfig, error) {
541569
// Get system file paths
542570
kernelPath, _ := m.systemManager.GetKernelPath(system.KernelVersion(inst.KernelVersion))
543571
initrdPath, _ := m.systemManager.GetInitrdPath()
@@ -644,6 +672,22 @@ func (m *manager) buildVMConfig(inst *Instance, imageInfo *images.Image, netConf
644672
Socket: inst.VsockSocket,
645673
}
646674

675+
// Device passthrough configuration (GPU, etc.)
676+
var deviceConfigs *[]vmm.DeviceConfig
677+
if len(inst.Devices) > 0 && m.deviceManager != nil {
678+
configs := make([]vmm.DeviceConfig, 0, len(inst.Devices))
679+
for _, deviceID := range inst.Devices {
680+
device, err := m.deviceManager.GetDevice(ctx, deviceID)
681+
if err != nil {
682+
return vmm.VmConfig{}, fmt.Errorf("get device %s: %w", deviceID, err)
683+
}
684+
configs = append(configs, vmm.DeviceConfig{
685+
Path: devices.GetDeviceSysfsPath(device.PCIAddress),
686+
})
687+
}
688+
deviceConfigs = &configs
689+
}
690+
647691
return vmm.VmConfig{
648692
Payload: payload,
649693
Cpus: &cpus,
@@ -653,6 +697,7 @@ func (m *manager) buildVMConfig(inst *Instance, imageInfo *images.Image, netConf
653697
Console: &console,
654698
Net: nets,
655699
Vsock: &vsock,
700+
Devices: deviceConfigs,
656701
}, nil
657702
}
658703

0 commit comments

Comments
 (0)