Skip to content

Commit d0c0bbc

Browse files
authored
Add vGPU support (#52)
* Add vGPU support
* Add logging, safer orphan cleanup
* Add test
* Fix performance issue with looking up profile types
* Run as root
* Automatically install headers
* Update inference test to use DKMS
* Address review comments
* Performance optimized /resources endpoint for mdev scanning
* Fix remaining calculation
* Fix availability math
1 parent b77bd93 commit d0c0bbc

File tree

31 files changed

+2142
-1040
lines changed

31 files changed

+2142
-1040
lines changed

.air.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,14 @@ tmp_dir = "tmp"
55
[build]
66
args_bin = []
77
bin = "./tmp/main"
8-
cmd = "go build -tags containers_image_openpgp -o ./tmp/main ./cmd/api && sudo setcap cap_net_admin,cap_net_bind_service=+eip ./tmp/main"
8+
cmd = "go build -tags containers_image_openpgp -o ./tmp/main ./cmd/api"
99
delay = 1000
1010
exclude_dir = ["assets", "tmp", "vendor", "testdata", "bin", "scripts", "data", "kernel"]
1111
exclude_file = []
1212
exclude_regex = ["_test.go"]
1313
exclude_unchanged = false
1414
follow_symlink = false
15-
full_bin = ""
15+
full_bin = "sudo ./tmp/main"
1616
include_dir = []
1717
include_ext = ["go", "tpl", "tmpl", "html", "yaml"]
1818
include_file = []

cmd/api/api/instances.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,14 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst
190190
hvType = hypervisor.Type(*request.Body.Hypervisor)
191191
}
192192

193+
// Parse GPU configuration (vGPU mode)
194+
var gpuConfig *instances.GPUConfig
195+
if request.Body.Gpu != nil && request.Body.Gpu.Profile != nil && *request.Body.Gpu.Profile != "" {
196+
gpuConfig = &instances.GPUConfig{
197+
Profile: *request.Body.Gpu.Profile,
198+
}
199+
}
200+
193201
// Calculate default resource limits when not specified (0 = auto)
194202
// Uses proportional allocation based on CPU: (vcpus / cpuCapacity) * resourceCapacity
195203
if diskIOBps == 0 {
@@ -220,6 +228,7 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst
220228
Devices: deviceRefs,
221229
Volumes: volumes,
222230
Hypervisor: hvType,
231+
GPU: gpuConfig,
223232
}
224233

225234
inst, err := s.InstanceManager.CreateInstance(ctx, domainReq)
@@ -685,5 +694,17 @@ func instanceToOAPI(inst instances.Instance) oapi.Instance {
685694
oapiInst.Volumes = &oapiVolumes
686695
}
687696

697+
// Convert GPU info
698+
if inst.GPUProfile != "" {
699+
gpu := &oapi.InstanceGPU{
700+
Profile: lo.ToPtr(inst.GPUProfile),
701+
}
702+
// Only set MdevUuid when non-empty to avoid "mdev_uuid": "" in output
703+
if inst.GPUMdevUUID != "" {
704+
gpu.MdevUuid = lo.ToPtr(inst.GPUMdevUUID)
705+
}
706+
oapiInst.Gpu = gpu
707+
}
708+
688709
return oapiInst
689710
}

cmd/api/api/resources.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,12 @@ func (s *ApiService) GetResources(ctx context.Context, _ oapi.GetResourcesReques
5656
})
5757
}
5858

59+
// Add GPU status if available
60+
if status.GPU != nil {
61+
gpuStatus := convertGPUResourceStatus(status.GPU)
62+
resp.Gpu = &gpuStatus
63+
}
64+
5965
return oapi.GetResources200JSONResponse(resp), nil
6066
}
6167

@@ -75,3 +81,38 @@ func convertResourceStatus(rs resources.ResourceStatus) oapi.ResourceStatus {
7581
Source: source,
7682
}
7783
}
84+
85+
func convertGPUResourceStatus(gs *resources.GPUResourceStatus) oapi.GPUResourceStatus {
86+
result := oapi.GPUResourceStatus{
87+
Mode: oapi.GPUResourceStatusMode(gs.Mode),
88+
TotalSlots: gs.TotalSlots,
89+
UsedSlots: gs.UsedSlots,
90+
}
91+
92+
// Convert profiles (vGPU mode)
93+
if len(gs.Profiles) > 0 {
94+
profiles := make([]oapi.GPUProfile, len(gs.Profiles))
95+
for i, p := range gs.Profiles {
96+
profiles[i] = oapi.GPUProfile{
97+
Name: p.Name,
98+
FramebufferMb: p.FramebufferMB,
99+
Available: p.Available,
100+
}
101+
}
102+
result.Profiles = &profiles
103+
}
104+
105+
// Convert devices (passthrough mode)
106+
if len(gs.Devices) > 0 {
107+
devices := make([]oapi.PassthroughDevice, len(gs.Devices))
108+
for i, d := range gs.Devices {
109+
devices[i] = oapi.PassthroughDevice{
110+
Name: d.Name,
111+
Available: d.Available,
112+
}
113+
}
114+
result.Devices = &devices
115+
}
116+
117+
return result
118+
}

cmd/api/main.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"github.com/kernel/hypeman"
2323
"github.com/kernel/hypeman/cmd/api/api"
2424
"github.com/kernel/hypeman/cmd/api/config"
25+
"github.com/kernel/hypeman/lib/devices"
2526
"github.com/kernel/hypeman/lib/guest"
2627
"github.com/kernel/hypeman/lib/hypervisor/qemu"
2728
"github.com/kernel/hypeman/lib/instances"
@@ -200,6 +201,26 @@ func run() error {
200201
return fmt.Errorf("reconcile device state: %w", err)
201202
}
202203

204+
// Reconcile mdev devices (clears orphaned vGPUs from crashed VMs)
205+
// Build mdev info from instances - only destroys mdevs tracked by hypeman
206+
logger.Info("Reconciling mdev devices...")
207+
var mdevInfos []devices.MdevReconcileInfo
208+
if allInstances != nil {
209+
for _, inst := range allInstances {
210+
if inst.GPUMdevUUID != "" {
211+
mdevInfos = append(mdevInfos, devices.MdevReconcileInfo{
212+
InstanceID: inst.Id,
213+
MdevUUID: inst.GPUMdevUUID,
214+
IsRunning: inst.State == instances.StateRunning || inst.State == instances.StateUnknown,
215+
})
216+
}
217+
}
218+
}
219+
if err := devices.ReconcileMdevs(app.Ctx, mdevInfos); err != nil {
220+
// Log but don't fail - mdev cleanup is best-effort
221+
logger.Warn("failed to reconcile mdev devices", "error", err)
222+
}
223+
203224
// Initialize ingress manager (starts Caddy daemon and DNS server for dynamic upstreams)
204225
logger.Info("Initializing ingress manager...")
205226
if err := app.IngressManager.Initialize(app.Ctx); err != nil {

0 commit comments

Comments
 (0)