Skip to content

Commit f563ada

Browse files
author
Rafael Garcia
committed
feat(api): add devices API endpoints and documentation
Add REST API for device management and supporting documentation:

API endpoints:
- GET/POST /devices - List and register devices
- GET/DELETE /devices/{id} - Get and delete devices
- GET /devices/available - Discover passthrough-capable devices
- instances.go: Accept devices param in CreateInstance

Documentation:
- GPU.md: GPU passthrough architecture and driver injection
- README.md: Device management usage guide
- scripts/gpu-reset.sh: GPU reset utility

Tests and fixtures:
- gpu_e2e_test.go, gpu_inference_test.go, gpu_module_test.go
- testdata/ollama-cuda/ - CUDA test container

Also adds build-preview-cli Makefile target.
1 parent 5e6ad08 commit f563ada

File tree

18 files changed

+4316
-94
lines changed

18 files changed

+4316
-94
lines changed

Makefile

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
SHELL := /bin/bash
2-
.PHONY: oapi-generate generate-vmm-client generate-wire generate-all dev build test install-tools gen-jwt download-ch-binaries download-ch-spec ensure-ch-binaries build-caddy-binaries build-caddy ensure-caddy-binaries release-prep clean
2+
.PHONY: oapi-generate generate-vmm-client generate-wire generate-all dev build test install-tools gen-jwt download-ch-binaries download-ch-spec ensure-ch-binaries build-caddy-binaries build-caddy ensure-caddy-binaries build-preview-cli release-prep clean
33

44
# Directory where local binaries will be installed
55
BIN_DIR ?= $(CURDIR)/bin
@@ -168,6 +168,12 @@ build: ensure-ch-binaries ensure-caddy-binaries lib/system/exec_agent/exec-agent
168168
# Build all binaries
169169
build-all: build
170170

171+
# Build preview CLI from stainless-sdks/hypeman-cli
172+
# Usage: make build-preview-cli - uses preview/<current-branch>
173+
# make build-preview-cli CLI_BRANCH=preview/xyz - uses specific branch
174+
build-preview-cli:
175+
@./scripts/build-preview-cli.sh $(CLI_BRANCH)
176+
171177
# Run in development mode with hot reload
172178
dev: ensure-ch-binaries ensure-caddy-binaries lib/system/exec_agent/exec-agent $(AIR)
173179
@rm -f ./tmp/main

cmd/api/api/devices.go

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
package api
2+
3+
import (
4+
"context"
5+
"errors"
6+
7+
"github.com/onkernel/hypeman/lib/devices"
8+
"github.com/onkernel/hypeman/lib/oapi"
9+
)
10+
11+
// ListDevices returns all registered devices
12+
func (s *ApiService) ListDevices(ctx context.Context, request oapi.ListDevicesRequestObject) (oapi.ListDevicesResponseObject, error) {
13+
deviceList, err := s.DeviceManager.ListDevices(ctx)
14+
if err != nil {
15+
return oapi.ListDevices500JSONResponse{
16+
Code: "internal_error",
17+
Message: err.Error(),
18+
}, nil
19+
}
20+
21+
result := make([]oapi.Device, len(deviceList))
22+
for i, d := range deviceList {
23+
result[i] = deviceToOAPI(d)
24+
}
25+
26+
return oapi.ListDevices200JSONResponse(result), nil
27+
}
28+
29+
// ListAvailableDevices discovers passthrough-capable devices on the host
30+
func (s *ApiService) ListAvailableDevices(ctx context.Context, request oapi.ListAvailableDevicesRequestObject) (oapi.ListAvailableDevicesResponseObject, error) {
31+
available, err := s.DeviceManager.ListAvailableDevices(ctx)
32+
if err != nil {
33+
return oapi.ListAvailableDevices500JSONResponse{
34+
Code: "internal_error",
35+
Message: err.Error(),
36+
}, nil
37+
}
38+
39+
result := make([]oapi.AvailableDevice, len(available))
40+
for i, d := range available {
41+
result[i] = availableDeviceToOAPI(d)
42+
}
43+
44+
return oapi.ListAvailableDevices200JSONResponse(result), nil
45+
}
46+
47+
// CreateDevice registers a new device for passthrough
48+
func (s *ApiService) CreateDevice(ctx context.Context, request oapi.CreateDeviceRequestObject) (oapi.CreateDeviceResponseObject, error) {
49+
var name string
50+
if request.Body.Name != nil {
51+
name = *request.Body.Name
52+
}
53+
req := devices.CreateDeviceRequest{
54+
Name: name,
55+
PCIAddress: request.Body.PciAddress,
56+
}
57+
58+
device, err := s.DeviceManager.CreateDevice(ctx, req)
59+
if err != nil {
60+
switch {
61+
case errors.Is(err, devices.ErrInvalidName):
62+
return oapi.CreateDevice400JSONResponse{
63+
Code: "invalid_name",
64+
Message: err.Error(),
65+
}, nil
66+
case errors.Is(err, devices.ErrInvalidPCIAddress):
67+
return oapi.CreateDevice400JSONResponse{
68+
Code: "invalid_pci_address",
69+
Message: err.Error(),
70+
}, nil
71+
case errors.Is(err, devices.ErrDeviceNotFound):
72+
return oapi.CreateDevice404JSONResponse{
73+
Code: "device_not_found",
74+
Message: err.Error(),
75+
}, nil
76+
case errors.Is(err, devices.ErrAlreadyExists), errors.Is(err, devices.ErrNameExists):
77+
return oapi.CreateDevice409JSONResponse{
78+
Code: "conflict",
79+
Message: err.Error(),
80+
}, nil
81+
default:
82+
return oapi.CreateDevice500JSONResponse{
83+
Code: "internal_error",
84+
Message: err.Error(),
85+
}, nil
86+
}
87+
}
88+
89+
return oapi.CreateDevice201JSONResponse(deviceToOAPI(*device)), nil
90+
}
91+
92+
// GetDevice returns a device by ID or name
93+
func (s *ApiService) GetDevice(ctx context.Context, request oapi.GetDeviceRequestObject) (oapi.GetDeviceResponseObject, error) {
94+
device, err := s.DeviceManager.GetDevice(ctx, request.Id)
95+
if err != nil {
96+
if errors.Is(err, devices.ErrNotFound) {
97+
return oapi.GetDevice404JSONResponse{
98+
Code: "not_found",
99+
Message: "device not found",
100+
}, nil
101+
}
102+
return oapi.GetDevice500JSONResponse{
103+
Code: "internal_error",
104+
Message: err.Error(),
105+
}, nil
106+
}
107+
108+
return oapi.GetDevice200JSONResponse(deviceToOAPI(*device)), nil
109+
}
110+
111+
// DeleteDevice unregisters a device
112+
func (s *ApiService) DeleteDevice(ctx context.Context, request oapi.DeleteDeviceRequestObject) (oapi.DeleteDeviceResponseObject, error) {
113+
err := s.DeviceManager.DeleteDevice(ctx, request.Id)
114+
if err != nil {
115+
switch {
116+
case errors.Is(err, devices.ErrNotFound):
117+
return oapi.DeleteDevice404JSONResponse{
118+
Code: "not_found",
119+
Message: "device not found",
120+
}, nil
121+
case errors.Is(err, devices.ErrInUse):
122+
return oapi.DeleteDevice409JSONResponse{
123+
Code: "in_use",
124+
Message: "device is attached to an instance",
125+
}, nil
126+
default:
127+
return oapi.DeleteDevice500JSONResponse{
128+
Code: "internal_error",
129+
Message: err.Error(),
130+
}, nil
131+
}
132+
}
133+
134+
return oapi.DeleteDevice204Response{}, nil
135+
}
136+
137+
// Helper functions
138+
139+
func deviceToOAPI(d devices.Device) oapi.Device {
140+
deviceType := oapi.DeviceType(d.Type)
141+
return oapi.Device{
142+
Id: d.Id,
143+
Name: &d.Name,
144+
Type: deviceType,
145+
PciAddress: d.PCIAddress,
146+
VendorId: d.VendorID,
147+
DeviceId: d.DeviceID,
148+
IommuGroup: d.IOMMUGroup,
149+
BoundToVfio: d.BoundToVFIO,
150+
AttachedTo: d.AttachedTo,
151+
CreatedAt: d.CreatedAt,
152+
}
153+
}
154+
155+
func availableDeviceToOAPI(d devices.AvailableDevice) oapi.AvailableDevice {
156+
return oapi.AvailableDevice{
157+
PciAddress: d.PCIAddress,
158+
VendorId: d.VendorID,
159+
DeviceId: d.DeviceID,
160+
VendorName: &d.VendorName,
161+
DeviceName: &d.DeviceName,
162+
IommuGroup: d.IOMMUGroup,
163+
CurrentDriver: d.CurrentDriver,
164+
}
165+
}
166+
167+

cmd/api/api/instances.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,12 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst
9696
networkEnabled = *request.Body.Network.Enabled
9797
}
9898

99+
// Parse devices (GPU passthrough)
100+
var deviceRefs []string
101+
if request.Body.Devices != nil {
102+
deviceRefs = *request.Body.Devices
103+
}
104+
99105
// Parse volumes
100106
var volumes []instances.VolumeAttachment
101107
if request.Body.Volumes != nil {
@@ -139,6 +145,7 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst
139145
Vcpus: vcpus,
140146
Env: env,
141147
NetworkEnabled: networkEnabled,
148+
Devices: deviceRefs,
142149
Volumes: volumes,
143150
}
144151

cmd/api/main.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,18 @@ func run() error {
172172
}
173173
logger.Info("Network manager initialized")
174174

175+
// Reconcile device state (clears orphaned attachments from crashed VMs)
176+
// Set up liveness checker so device reconciliation can accurately detect orphaned attachments
177+
logger.Info("Reconciling device state...")
178+
livenessChecker := instances.NewLivenessChecker(app.InstanceManager)
179+
if livenessChecker != nil {
180+
app.DeviceManager.SetLivenessChecker(livenessChecker)
181+
}
182+
if err := app.DeviceManager.ReconcileDevices(app.Ctx); err != nil {
183+
logger.Error("failed to reconcile device state", "error", err)
184+
return fmt.Errorf("reconcile device state: %w", err)
185+
}
186+
175187
// Initialize ingress manager (starts Caddy daemon and DNS server for dynamic upstreams)
176188
logger.Info("Initializing ingress manager...")
177189
if err := app.IngressManager.Initialize(app.Ctx); err != nil {

go.mod

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,30 +41,37 @@ require (
4141
go.opentelemetry.io/otel/trace v1.38.0
4242
golang.org/x/sync v0.17.0
4343
golang.org/x/sys v0.38.0
44-
golang.org/x/term v0.37.0
4544
google.golang.org/grpc v1.77.0
4645
google.golang.org/protobuf v1.36.10
4746
gvisor.dev/gvisor v0.0.0-20251125014920-fc40e232ff54
4847
)
4948

5049
require (
5150
github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 // indirect
51+
github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect
52+
github.com/Microsoft/go-winio v0.6.2 // indirect
5253
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
5354
github.com/apex/log v1.9.0 // indirect
5455
github.com/blang/semver/v4 v4.0.0 // indirect
5556
github.com/cenkalti/backoff/v5 v5.0.3 // indirect
57+
github.com/containerd/errdefs v1.0.0 // indirect
58+
github.com/containerd/errdefs/pkg v0.3.0 // indirect
5659
github.com/containerd/stargz-snapshotter/estargz v0.16.3 // indirect
5760
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
5861
github.com/docker/cli v28.2.2+incompatible // indirect
5962
github.com/docker/distribution v2.8.3+incompatible // indirect
63+
github.com/docker/docker v28.2.2+incompatible // indirect
6064
github.com/docker/docker-credential-helpers v0.9.3 // indirect
65+
github.com/docker/go-connections v0.5.0 // indirect
66+
github.com/docker/go-units v0.5.0 // indirect
6167
github.com/dustin/go-humanize v1.0.1 // indirect
6268
github.com/felixge/httpsnoop v1.0.4 // indirect
6369
github.com/go-logr/logr v1.4.3 // indirect
6470
github.com/go-logr/stdr v1.2.2 // indirect
6571
github.com/go-openapi/jsonpointer v0.21.0 // indirect
6672
github.com/go-openapi/swag v0.23.0 // indirect
6773
github.com/go-test/deep v1.1.1 // indirect
74+
github.com/gogo/protobuf v1.3.2 // indirect
6875
github.com/google/uuid v1.6.0 // indirect
6976
github.com/gorilla/mux v1.8.1 // indirect
7077
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 // indirect
@@ -74,6 +81,8 @@ require (
7481
github.com/mailru/easyjson v0.7.7 // indirect
7582
github.com/mdlayher/socket v0.5.1 // indirect
7683
github.com/mitchellh/go-homedir v1.1.0 // indirect
84+
github.com/moby/docker-image-spec v1.3.1 // indirect
85+
github.com/moby/sys/sequential v0.6.0 // indirect
7786
github.com/moby/sys/user v0.4.0 // indirect
7887
github.com/moby/sys/userns v0.1.0 // indirect
7988
github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 // indirect
@@ -92,6 +101,7 @@ require (
92101
github.com/vishvananda/netns v0.0.5 // indirect
93102
github.com/woodsbury/decimal128 v1.3.0 // indirect
94103
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
104+
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect
95105
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 // indirect
96106
go.opentelemetry.io/otel/log v0.14.0 // indirect
97107
go.opentelemetry.io/proto/otlp v1.7.1 // indirect

0 commit comments

Comments
 (0)