Skip to content

Commit 1f0e661

Browse files
committed
gpu passthrough
1 parent 9e69646 commit 1f0e661

File tree

24 files changed

+4151
-147
lines changed

24 files changed

+4151
-147
lines changed

cmd/api/api/api.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package api
22

33
import (
44
"github.com/onkernel/hypeman/cmd/api/config"
5+
"github.com/onkernel/hypeman/lib/devices"
56
"github.com/onkernel/hypeman/lib/images"
67
"github.com/onkernel/hypeman/lib/instances"
78
"github.com/onkernel/hypeman/lib/network"
@@ -16,6 +17,7 @@ type ApiService struct {
1617
InstanceManager instances.Manager
1718
VolumeManager volumes.Manager
1819
NetworkManager network.Manager
20+
DeviceManager devices.Manager
1921
}
2022

2123
var _ oapi.StrictServerInterface = (*ApiService)(nil)
@@ -27,13 +29,15 @@ func New(
2729
instanceManager instances.Manager,
2830
volumeManager volumes.Manager,
2931
networkManager network.Manager,
32+
deviceManager devices.Manager,
3033
) *ApiService {
3134
return &ApiService{
3235
Config: config,
3336
ImageManager: imageManager,
3437
InstanceManager: instanceManager,
3538
VolumeManager: volumeManager,
3639
NetworkManager: networkManager,
40+
DeviceManager: deviceManager,
3741
}
3842
}
3943

cmd/api/api/api_test.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"testing"
99

1010
"github.com/onkernel/hypeman/cmd/api/config"
11+
"github.com/onkernel/hypeman/lib/devices"
1112
"github.com/onkernel/hypeman/lib/images"
1213
"github.com/onkernel/hypeman/lib/instances"
1314
"github.com/onkernel/hypeman/lib/network"
@@ -30,8 +31,9 @@ func newTestService(t *testing.T) *ApiService {
3031

3132
systemMgr := system.NewManager(p)
3233
networkMgr := network.NewManager(p, cfg)
34+
deviceMgr := devices.NewManager(p)
3335
maxOverlaySize := int64(100 * 1024 * 1024 * 1024) // 100GB for tests
34-
instanceMgr := instances.NewManager(p, imageMgr, systemMgr, networkMgr, maxOverlaySize)
36+
instanceMgr := instances.NewManager(p, imageMgr, systemMgr, networkMgr, deviceMgr, maxOverlaySize)
3537
volumeMgr := volumes.NewManager(p)
3638

3739
// Register cleanup for orphaned Cloud Hypervisor processes
@@ -44,6 +46,7 @@ func newTestService(t *testing.T) *ApiService {
4446
ImageManager: imageMgr,
4547
InstanceManager: instanceMgr,
4648
VolumeManager: volumeMgr,
49+
DeviceManager: deviceMgr,
4750
}
4851
}
4952

cmd/api/api/devices.go

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
package api
2+
3+
import (
4+
"context"
5+
"errors"
6+
7+
"github.com/onkernel/hypeman/lib/devices"
8+
"github.com/onkernel/hypeman/lib/oapi"
9+
)
10+
11+
// ListDevices returns all registered devices
12+
func (s *ApiService) ListDevices(ctx context.Context, request oapi.ListDevicesRequestObject) (oapi.ListDevicesResponseObject, error) {
13+
deviceList, err := s.DeviceManager.ListDevices(ctx)
14+
if err != nil {
15+
return oapi.ListDevices500JSONResponse{
16+
Code: "internal_error",
17+
Message: err.Error(),
18+
}, nil
19+
}
20+
21+
result := make([]oapi.Device, len(deviceList))
22+
for i, d := range deviceList {
23+
result[i] = deviceToOAPI(d)
24+
}
25+
26+
return oapi.ListDevices200JSONResponse(result), nil
27+
}
28+
29+
// ListAvailableDevices discovers passthrough-capable devices on the host
30+
func (s *ApiService) ListAvailableDevices(ctx context.Context, request oapi.ListAvailableDevicesRequestObject) (oapi.ListAvailableDevicesResponseObject, error) {
31+
available, err := s.DeviceManager.ListAvailableDevices(ctx)
32+
if err != nil {
33+
return oapi.ListAvailableDevices500JSONResponse{
34+
Code: "internal_error",
35+
Message: err.Error(),
36+
}, nil
37+
}
38+
39+
result := make([]oapi.AvailableDevice, len(available))
40+
for i, d := range available {
41+
result[i] = availableDeviceToOAPI(d)
42+
}
43+
44+
return oapi.ListAvailableDevices200JSONResponse(result), nil
45+
}
46+
47+
// CreateDevice registers a new device for passthrough
48+
func (s *ApiService) CreateDevice(ctx context.Context, request oapi.CreateDeviceRequestObject) (oapi.CreateDeviceResponseObject, error) {
49+
req := devices.CreateDeviceRequest{
50+
Name: request.Body.Name,
51+
PCIAddress: request.Body.PciAddress,
52+
}
53+
54+
device, err := s.DeviceManager.CreateDevice(ctx, req)
55+
if err != nil {
56+
switch {
57+
case errors.Is(err, devices.ErrInvalidName):
58+
return oapi.CreateDevice400JSONResponse{
59+
Code: "invalid_name",
60+
Message: err.Error(),
61+
}, nil
62+
case errors.Is(err, devices.ErrInvalidPCIAddress):
63+
return oapi.CreateDevice400JSONResponse{
64+
Code: "invalid_pci_address",
65+
Message: err.Error(),
66+
}, nil
67+
case errors.Is(err, devices.ErrDeviceNotFound):
68+
return oapi.CreateDevice404JSONResponse{
69+
Code: "device_not_found",
70+
Message: err.Error(),
71+
}, nil
72+
case errors.Is(err, devices.ErrAlreadyExists), errors.Is(err, devices.ErrNameExists):
73+
return oapi.CreateDevice409JSONResponse{
74+
Code: "conflict",
75+
Message: err.Error(),
76+
}, nil
77+
default:
78+
return oapi.CreateDevice500JSONResponse{
79+
Code: "internal_error",
80+
Message: err.Error(),
81+
}, nil
82+
}
83+
}
84+
85+
return oapi.CreateDevice201JSONResponse(deviceToOAPI(*device)), nil
86+
}
87+
88+
// GetDevice returns a device by ID or name
89+
func (s *ApiService) GetDevice(ctx context.Context, request oapi.GetDeviceRequestObject) (oapi.GetDeviceResponseObject, error) {
90+
device, err := s.DeviceManager.GetDevice(ctx, request.Id)
91+
if err != nil {
92+
if errors.Is(err, devices.ErrNotFound) {
93+
return oapi.GetDevice404JSONResponse{
94+
Code: "not_found",
95+
Message: "device not found",
96+
}, nil
97+
}
98+
return oapi.GetDevice500JSONResponse{
99+
Code: "internal_error",
100+
Message: err.Error(),
101+
}, nil
102+
}
103+
104+
return oapi.GetDevice200JSONResponse(deviceToOAPI(*device)), nil
105+
}
106+
107+
// DeleteDevice unregisters a device
108+
func (s *ApiService) DeleteDevice(ctx context.Context, request oapi.DeleteDeviceRequestObject) (oapi.DeleteDeviceResponseObject, error) {
109+
err := s.DeviceManager.DeleteDevice(ctx, request.Id)
110+
if err != nil {
111+
switch {
112+
case errors.Is(err, devices.ErrNotFound):
113+
return oapi.DeleteDevice404JSONResponse{
114+
Code: "not_found",
115+
Message: "device not found",
116+
}, nil
117+
case errors.Is(err, devices.ErrInUse):
118+
return oapi.DeleteDevice409JSONResponse{
119+
Code: "in_use",
120+
Message: "device is attached to an instance",
121+
}, nil
122+
default:
123+
return oapi.DeleteDevice500JSONResponse{
124+
Code: "internal_error",
125+
Message: err.Error(),
126+
}, nil
127+
}
128+
}
129+
130+
return oapi.DeleteDevice204Response{}, nil
131+
}
132+
133+
// Helper functions
134+
135+
func deviceToOAPI(d devices.Device) oapi.Device {
136+
deviceType := oapi.DeviceType(d.Type)
137+
return oapi.Device{
138+
Id: d.Id,
139+
Name: d.Name,
140+
Type: deviceType,
141+
PciAddress: d.PCIAddress,
142+
VendorId: d.VendorID,
143+
DeviceId: d.DeviceID,
144+
IommuGroup: d.IOMMUGroup,
145+
BoundToVfio: d.BoundToVFIO,
146+
AttachedTo: d.AttachedTo,
147+
CreatedAt: d.CreatedAt,
148+
}
149+
}
150+
151+
func availableDeviceToOAPI(d devices.AvailableDevice) oapi.AvailableDevice {
152+
return oapi.AvailableDevice{
153+
PciAddress: d.PCIAddress,
154+
VendorId: d.VendorID,
155+
DeviceId: d.DeviceID,
156+
VendorName: &d.VendorName,
157+
DeviceName: &d.DeviceName,
158+
IommuGroup: d.IOMMUGroup,
159+
CurrentDriver: d.CurrentDriver,
160+
}
161+
}
162+
163+

cmd/api/api/exec_test.go

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -82,8 +82,7 @@ func TestExecInstanceNonTTY(t *testing.T) {
8282
Name: "exec-test",
8383
Image: "docker.io/library/nginx:alpine",
8484
Network: &struct {
85-
Enabled *bool `json:"enabled,omitempty"`
86-
Name *string `json:"name,omitempty"`
85+
Enabled *bool `json:"enabled,omitempty"`
8786
}{
8887
Enabled: &networkDisabled,
8988
},
@@ -108,10 +107,16 @@ func TestExecInstanceNonTTY(t *testing.T) {
108107
case <-nginxTimeout:
109108
t.Fatal("Timeout waiting for nginx to start")
110109
case <-nginxTicker.C:
111-
logs, err := svc.InstanceManager.GetInstanceLogs(ctx(), inst.Id, false, 100)
112-
if err == nil && strings.Contains(logs, "start worker processes") {
113-
nginxReady = true
114-
t.Log("Nginx is ready")
110+
logChan, err := svc.InstanceManager.StreamInstanceLogs(ctx(), inst.Id, 100, false)
111+
if err == nil {
112+
var logs strings.Builder
113+
for line := range logChan {
114+
logs.WriteString(line)
115+
}
116+
if strings.Contains(logs.String(), "start worker processes") {
117+
nginxReady = true
118+
t.Log("Nginx is ready")
119+
}
115120
}
116121
}
117122
}

cmd/api/api/instances.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,12 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst
9595
networkEnabled = *request.Body.Network.Enabled
9696
}
9797

98+
// Parse devices (GPU passthrough)
99+
var deviceRefs []string
100+
if request.Body.Devices != nil {
101+
deviceRefs = *request.Body.Devices
102+
}
103+
98104
domainReq := instances.CreateInstanceRequest{
99105
Name: request.Body.Name,
100106
Image: request.Body.Image,
@@ -104,6 +110,7 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst
104110
Vcpus: vcpus,
105111
Env: env,
106112
NetworkEnabled: networkEnabled,
113+
Devices: deviceRefs,
107114
}
108115

109116
inst, err := s.InstanceManager.CreateInstance(ctx, domainReq)

cmd/api/wire.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"github.com/google/wire"
1010
"github.com/onkernel/hypeman/cmd/api/api"
1111
"github.com/onkernel/hypeman/cmd/api/config"
12+
"github.com/onkernel/hypeman/lib/devices"
1213
"github.com/onkernel/hypeman/lib/images"
1314
"github.com/onkernel/hypeman/lib/instances"
1415
"github.com/onkernel/hypeman/lib/network"
@@ -25,6 +26,7 @@ type application struct {
2526
ImageManager images.Manager
2627
SystemManager system.Manager
2728
NetworkManager network.Manager
29+
DeviceManager devices.Manager
2830
InstanceManager instances.Manager
2931
VolumeManager volumes.Manager
3032
ApiService *api.ApiService
@@ -40,6 +42,7 @@ func initializeApp() (*application, func(), error) {
4042
providers.ProvideImageManager,
4143
providers.ProvideSystemManager,
4244
providers.ProvideNetworkManager,
45+
providers.ProvideDeviceManager,
4346
providers.ProvideInstanceManager,
4447
providers.ProvideVolumeManager,
4548
api.New,

cmd/api/wire_gen.go

Lines changed: 6 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)