Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
SHELL := /bin/bash
.PHONY: oapi-generate generate-vmm-client generate-wire generate-all dev build test install-tools gen-jwt download-ch-binaries download-ch-spec ensure-ch-binaries build-caddy-binaries build-caddy ensure-caddy-binaries release-prep clean
.PHONY: oapi-generate generate-vmm-client generate-wire generate-all dev build test install-tools gen-jwt download-ch-binaries download-ch-spec ensure-ch-binaries build-caddy-binaries build-caddy ensure-caddy-binaries build-preview-cli release-prep clean

# Directory where local binaries will be installed
BIN_DIR ?= $(CURDIR)/bin
Expand Down
4 changes: 4 additions & 0 deletions cmd/api/api/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package api

import (
"github.com/onkernel/hypeman/cmd/api/config"
"github.com/onkernel/hypeman/lib/devices"
"github.com/onkernel/hypeman/lib/images"
"github.com/onkernel/hypeman/lib/ingress"
"github.com/onkernel/hypeman/lib/instances"
Expand All @@ -17,6 +18,7 @@ type ApiService struct {
InstanceManager instances.Manager
VolumeManager volumes.Manager
NetworkManager network.Manager
DeviceManager devices.Manager
IngressManager ingress.Manager
}

Expand All @@ -29,6 +31,7 @@ func New(
instanceManager instances.Manager,
volumeManager volumes.Manager,
networkManager network.Manager,
deviceManager devices.Manager,
ingressManager ingress.Manager,
) *ApiService {
return &ApiService{
Expand All @@ -37,6 +40,7 @@ func New(
InstanceManager: instanceManager,
VolumeManager: volumeManager,
NetworkManager: networkManager,
DeviceManager: deviceManager,
IngressManager: ingressManager,
}
}
5 changes: 4 additions & 1 deletion cmd/api/api/api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"time"

"github.com/onkernel/hypeman/cmd/api/config"
"github.com/onkernel/hypeman/lib/devices"
"github.com/onkernel/hypeman/lib/images"
"github.com/onkernel/hypeman/lib/instances"
mw "github.com/onkernel/hypeman/lib/middleware"
Expand All @@ -34,11 +35,12 @@ func newTestService(t *testing.T) *ApiService {

systemMgr := system.NewManager(p)
networkMgr := network.NewManager(p, cfg, nil)
deviceMgr := devices.NewManager(p)
volumeMgr := volumes.NewManager(p, 0, nil) // 0 = unlimited storage
limits := instances.ResourceLimits{
MaxOverlaySize: 100 * 1024 * 1024 * 1024, // 100GB
}
instanceMgr := instances.NewManager(p, imageMgr, systemMgr, networkMgr, volumeMgr, limits, nil, nil)
instanceMgr := instances.NewManager(p, imageMgr, systemMgr, networkMgr, deviceMgr, volumeMgr, limits, nil, nil)

// Register cleanup for orphaned Cloud Hypervisor processes
t.Cleanup(func() {
Expand All @@ -50,6 +52,7 @@ func newTestService(t *testing.T) *ApiService {
ImageManager: imageMgr,
InstanceManager: instanceMgr,
VolumeManager: volumeMgr,
DeviceManager: deviceMgr,
}
}

Expand Down
167 changes: 167 additions & 0 deletions cmd/api/api/devices.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
package api

import (
"context"
"errors"

"github.com/onkernel/hypeman/lib/devices"
"github.com/onkernel/hypeman/lib/oapi"
)

// ListDevices responds with every device the device manager reports.
//
// A failure from the manager is surfaced to the client as a 500 JSON payload
// carrying the error text; the Go error return is nil on every path.
func (s *ApiService) ListDevices(ctx context.Context, request oapi.ListDevicesRequestObject) (oapi.ListDevicesResponseObject, error) {
	registered, err := s.DeviceManager.ListDevices(ctx)
	if err != nil {
		failure := oapi.ListDevices500JSONResponse{
			Code:    "internal_error",
			Message: err.Error(),
		}
		return failure, nil
	}

	// Convert each domain device into its API representation.
	converted := make([]oapi.Device, 0, len(registered))
	for _, dev := range registered {
		converted = append(converted, deviceToOAPI(dev))
	}

	return oapi.ListDevices200JSONResponse(converted), nil
}

// ListAvailableDevices responds with the devices returned by the device
// manager's ListAvailableDevices call, converted to their API form.
//
// A failure from the manager is surfaced to the client as a 500 JSON payload
// carrying the error text; the Go error return is nil on every path.
func (s *ApiService) ListAvailableDevices(ctx context.Context, request oapi.ListAvailableDevicesRequestObject) (oapi.ListAvailableDevicesResponseObject, error) {
	discovered, err := s.DeviceManager.ListAvailableDevices(ctx)
	if err != nil {
		failure := oapi.ListAvailableDevices500JSONResponse{
			Code:    "internal_error",
			Message: err.Error(),
		}
		return failure, nil
	}

	// Convert each discovered device into its API representation.
	converted := make([]oapi.AvailableDevice, 0, len(discovered))
	for _, dev := range discovered {
		converted = append(converted, availableDeviceToOAPI(dev))
	}

	return oapi.ListAvailableDevices200JSONResponse(converted), nil
}

// CreateDevice asks the device manager to create a device from the request
// body and responds with 201 and the created device on success.
//
// The name in the body is optional; when it is absent an empty name is passed
// to the manager. Manager errors are mapped to responses as follows:
//
//   - devices.ErrInvalidName       -> 400 "invalid_name"
//   - devices.ErrInvalidPCIAddress -> 400 "invalid_pci_address"
//   - devices.ErrDeviceNotFound    -> 404 "device_not_found"
//   - devices.ErrAlreadyExists or
//     devices.ErrNameExists        -> 409 "conflict"
//   - anything else                -> 500 "internal_error"
//
// The Go error return is nil on every path.
func (s *ApiService) CreateDevice(ctx context.Context, request oapi.CreateDeviceRequestObject) (oapi.CreateDeviceResponseObject, error) {
	createReq := devices.CreateDeviceRequest{
		PCIAddress: request.Body.PciAddress,
	}
	if requested := request.Body.Name; requested != nil {
		createReq.Name = *requested
	}

	created, err := s.DeviceManager.CreateDevice(ctx, createReq)
	if err == nil {
		return oapi.CreateDevice201JSONResponse(deviceToOAPI(*created)), nil
	}

	// The checks below are ordered; the first matching sentinel wins.
	msg := err.Error()
	if errors.Is(err, devices.ErrInvalidName) {
		return oapi.CreateDevice400JSONResponse{
			Code:    "invalid_name",
			Message: msg,
		}, nil
	}
	if errors.Is(err, devices.ErrInvalidPCIAddress) {
		return oapi.CreateDevice400JSONResponse{
			Code:    "invalid_pci_address",
			Message: msg,
		}, nil
	}
	if errors.Is(err, devices.ErrDeviceNotFound) {
		return oapi.CreateDevice404JSONResponse{
			Code:    "device_not_found",
			Message: msg,
		}, nil
	}
	if errors.Is(err, devices.ErrAlreadyExists) || errors.Is(err, devices.ErrNameExists) {
		return oapi.CreateDevice409JSONResponse{
			Code:    "conflict",
			Message: msg,
		}, nil
	}
	return oapi.CreateDevice500JSONResponse{
		Code:    "internal_error",
		Message: msg,
	}, nil
}

// GetDevice looks up a single device through the device manager using the
// identifier from the request and responds with 200 and the device on success.
//
// devices.ErrNotFound is mapped to a 404 "not_found" response with a fixed
// message; any other error becomes a 500 "internal_error" carrying the error
// text. The Go error return is nil on every path.
func (s *ApiService) GetDevice(ctx context.Context, request oapi.GetDeviceRequestObject) (oapi.GetDeviceResponseObject, error) {
	found, err := s.DeviceManager.GetDevice(ctx, request.Id)

	switch {
	case err == nil:
		return oapi.GetDevice200JSONResponse(deviceToOAPI(*found)), nil
	case errors.Is(err, devices.ErrNotFound):
		return oapi.GetDevice404JSONResponse{
			Code:    "not_found",
			Message: "device not found",
		}, nil
	default:
		return oapi.GetDevice500JSONResponse{
			Code:    "internal_error",
			Message: err.Error(),
		}, nil
	}
}

// DeleteDevice asks the device manager to delete the device identified in the
// request and responds with 204 on success.
//
// devices.ErrNotFound is mapped to 404 "not_found" and devices.ErrInUse to
// 409 "in_use", both with fixed messages; any other error becomes a 500
// "internal_error" carrying the error text. The Go error return is nil on
// every path.
func (s *ApiService) DeleteDevice(ctx context.Context, request oapi.DeleteDeviceRequestObject) (oapi.DeleteDeviceResponseObject, error) {
	err := s.DeviceManager.DeleteDevice(ctx, request.Id)
	if err == nil {
		return oapi.DeleteDevice204Response{}, nil
	}

	if errors.Is(err, devices.ErrNotFound) {
		return oapi.DeleteDevice404JSONResponse{
			Code:    "not_found",
			Message: "device not found",
		}, nil
	}
	if errors.Is(err, devices.ErrInUse) {
		return oapi.DeleteDevice409JSONResponse{
			Code:    "in_use",
			Message: "device is attached to an instance",
		}, nil
	}
	return oapi.DeleteDevice500JSONResponse{
		Code:    "internal_error",
		Message: err.Error(),
	}, nil
}

// Helper functions

// deviceToOAPI copies the fields of a devices.Device into the matching
// oapi.Device fields. Name is exposed as a pointer to this function's own
// copy of the device name, so the result does not alias the caller's value.
func deviceToOAPI(d devices.Device) oapi.Device {
	var out oapi.Device

	out.Id = d.Id
	out.Name = &d.Name
	out.Type = oapi.DeviceType(d.Type)
	out.PciAddress = d.PCIAddress
	out.VendorId = d.VendorID
	out.DeviceId = d.DeviceID
	out.IommuGroup = d.IOMMUGroup
	out.BoundToVfio = d.BoundToVFIO
	out.AttachedTo = d.AttachedTo
	out.CreatedAt = d.CreatedAt

	return out
}

// availableDeviceToOAPI copies the fields of a devices.AvailableDevice into
// the matching oapi.AvailableDevice fields. VendorName and DeviceName are
// exposed as pointers to this function's own copies of those strings.
func availableDeviceToOAPI(d devices.AvailableDevice) oapi.AvailableDevice {
	var out oapi.AvailableDevice

	out.PciAddress = d.PCIAddress
	out.VendorId = d.VendorID
	out.DeviceId = d.DeviceID
	out.VendorName = &d.VendorName
	out.DeviceName = &d.DeviceName
	out.IommuGroup = d.IOMMUGroup
	out.CurrentDriver = d.CurrentDriver

	return out
}


7 changes: 7 additions & 0 deletions cmd/api/api/instances.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,12 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst
networkEnabled = *request.Body.Network.Enabled
}

// Parse devices (GPU passthrough)
var deviceRefs []string
if request.Body.Devices != nil {
deviceRefs = *request.Body.Devices
}

// Parse volumes
var volumes []instances.VolumeAttachment
if request.Body.Volumes != nil {
Expand Down Expand Up @@ -139,6 +145,7 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst
Vcpus: vcpus,
Env: env,
NetworkEnabled: networkEnabled,
Devices: deviceRefs,
Volumes: volumes,
}

Expand Down
12 changes: 12 additions & 0 deletions cmd/api/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,18 @@ func run() error {
}
logger.Info("Network manager initialized")

// Reconcile device state (clears orphaned attachments from crashed VMs)
// Set up liveness checker so device reconciliation can accurately detect orphaned attachments
logger.Info("Reconciling device state...")
livenessChecker := instances.NewLivenessChecker(app.InstanceManager)
if livenessChecker != nil {
app.DeviceManager.SetLivenessChecker(livenessChecker)
}
if err := app.DeviceManager.ReconcileDevices(app.Ctx); err != nil {
logger.Error("failed to reconcile device state", "error", err)
return fmt.Errorf("reconcile device state: %w", err)
}

// Initialize ingress manager (starts Caddy daemon and DNS server for dynamic upstreams)
logger.Info("Initializing ingress manager...")
if err := app.IngressManager.Initialize(app.Ctx); err != nil {
Expand Down
3 changes: 3 additions & 0 deletions cmd/api/wire.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"github.com/google/wire"
"github.com/onkernel/hypeman/cmd/api/api"
"github.com/onkernel/hypeman/cmd/api/config"
"github.com/onkernel/hypeman/lib/devices"
"github.com/onkernel/hypeman/lib/images"
"github.com/onkernel/hypeman/lib/ingress"
"github.com/onkernel/hypeman/lib/instances"
Expand All @@ -27,6 +28,7 @@ type application struct {
ImageManager images.Manager
SystemManager system.Manager
NetworkManager network.Manager
DeviceManager devices.Manager
InstanceManager instances.Manager
VolumeManager volumes.Manager
IngressManager ingress.Manager
Expand All @@ -44,6 +46,7 @@ func initializeApp() (*application, func(), error) {
providers.ProvideImageManager,
providers.ProvideSystemManager,
providers.ProvideNetworkManager,
providers.ProvideDeviceManager,
providers.ProvideInstanceManager,
providers.ProvideVolumeManager,
providers.ProvideIngressManager,
Expand Down
13 changes: 8 additions & 5 deletions cmd/api/wire_gen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading