Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions cmd/api/api/api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ func newTestService(t *testing.T) *ApiService {
}
}

// cleanupOrphanedProcesses kills Cloud Hypervisor processes from metadata files
// cleanupOrphanedProcesses kills hypervisor processes from metadata files
func cleanupOrphanedProcesses(t *testing.T, dataDir string) {
p := paths.New(dataDir)
guestsDir := p.GuestsDir()
Expand All @@ -77,21 +77,21 @@ func cleanupOrphanedProcesses(t *testing.T, dataDir string) {
continue
}

// Parse just the CHPID field
// Parse just the HypervisorPID field
var meta struct {
CHPID *int `json:"CHPID"`
HypervisorPID *int `json:"HypervisorPID"`
}
if err := json.Unmarshal(data, &meta); err != nil {
continue
}

// If metadata has a PID, try to kill it
if meta.CHPID != nil {
pid := *meta.CHPID
if meta.HypervisorPID != nil {
pid := *meta.HypervisorPID

// Check if process exists
if err := syscall.Kill(pid, 0); err == nil {
t.Logf("Cleaning up orphaned Cloud Hypervisor process: PID %d", pid)
t.Logf("Cleaning up orphaned hypervisor process: PID %d", pid)
syscall.Kill(pid, syscall.SIGKILL)
}
}
Expand Down
35 changes: 35 additions & 0 deletions lib/hypervisor/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Hypervisor Abstraction

Provides a common interface for VM management across different hypervisors.

## Purpose

Hypeman originally supported only Cloud Hypervisor. This abstraction layer allows supporting multiple hypervisors (e.g., QEMU) through a unified interface, enabling:

- **Hypervisor choice per instance** - Different instances can use different hypervisors
- **Feature parity where possible** - Common operations work the same way
- **Graceful degradation** - Features unsupported by a hypervisor can be detected and handled

## How It Works

The abstraction defines two key interfaces:

1. **Hypervisor** - VM lifecycle operations (create, boot, pause, resume, snapshot, restore, shutdown)
2. **ProcessManager** - Hypervisor process lifecycle (start binary, get binary path)

Each hypervisor implementation translates the generic configuration and operations to its native format. For example, Cloud Hypervisor uses an HTTP API over a Unix socket, while QEMU would use QMP.

Before using optional features, callers check capabilities:

```go
if hv.Capabilities().SupportsSnapshot {
hv.Snapshot(ctx, path)
}
```

## Hypervisor Switching

Instances store their hypervisor type in metadata. An instance can switch hypervisors only when stopped (no running VM, no snapshot), since:

- Disk images are hypervisor-agnostic
- Snapshots are hypervisor-specific and cannot be restored by a different hypervisor
245 changes: 245 additions & 0 deletions lib/hypervisor/cloudhypervisor/cloudhypervisor.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
// Package cloudhypervisor implements the hypervisor.Hypervisor interface
// for Cloud Hypervisor VMM.
package cloudhypervisor

import (
"context"
"fmt"
"time"

"github.com/onkernel/hypeman/lib/hypervisor"
"github.com/onkernel/hypeman/lib/vmm"
)

// CloudHypervisor implements hypervisor.Hypervisor for Cloud Hypervisor VMM.
type CloudHypervisor struct {
client *vmm.VMM
}

// New creates a new Cloud Hypervisor client for an existing VMM socket.
func New(socketPath string) (*CloudHypervisor, error) {
client, err := vmm.NewVMM(socketPath)
if err != nil {
return nil, fmt.Errorf("create vmm client: %w", err)
}
return &CloudHypervisor{
client: client,
}, nil
}

// Capabilities returns the features supported by Cloud Hypervisor.
func (c *CloudHypervisor) Capabilities() hypervisor.Capabilities {
return hypervisor.Capabilities{
SupportsSnapshot: true,
SupportsHotplugMemory: true,
SupportsPause: true,
SupportsVsock: true,
SupportsGPUPassthrough: true,
}
}

// CreateVM configures the VM in Cloud Hypervisor.
func (c *CloudHypervisor) CreateVM(ctx context.Context, config hypervisor.VMConfig) error {
vmConfig := ToVMConfig(config)
resp, err := c.client.CreateVMWithResponse(ctx, vmConfig)
if err != nil {
return fmt.Errorf("create vm: %w", err)
}
if resp.StatusCode() != 204 {
return fmt.Errorf("create vm failed with status %d: %s", resp.StatusCode(), string(resp.Body))
}
return nil
}

// BootVM starts the configured VM.
func (c *CloudHypervisor) BootVM(ctx context.Context) error {
resp, err := c.client.BootVMWithResponse(ctx)
if err != nil {
return fmt.Errorf("boot vm: %w", err)
}
if resp.StatusCode() != 204 {
return fmt.Errorf("boot vm failed with status %d: %s", resp.StatusCode(), string(resp.Body))
}
return nil
}

// DeleteVM removes the VM configuration from Cloud Hypervisor.
func (c *CloudHypervisor) DeleteVM(ctx context.Context) error {
resp, err := c.client.DeleteVMWithResponse(ctx)
if err != nil {
return fmt.Errorf("delete vm: %w", err)
}
if resp.StatusCode() != 204 {
return fmt.Errorf("delete vm failed with status %d: %s", resp.StatusCode(), string(resp.Body))
}
return nil
}

// Shutdown stops the VMM process gracefully.
func (c *CloudHypervisor) Shutdown(ctx context.Context) error {
resp, err := c.client.ShutdownVMMWithResponse(ctx)
if err != nil {
return fmt.Errorf("shutdown vmm: %w", err)
}
// ShutdownVMM may return various codes, 204 is success
if resp.StatusCode() != 204 {
return fmt.Errorf("shutdown vmm failed with status %d", resp.StatusCode())
}
return nil
}

// GetVMInfo returns current VM state.
func (c *CloudHypervisor) GetVMInfo(ctx context.Context) (*hypervisor.VMInfo, error) {
resp, err := c.client.GetVmInfoWithResponse(ctx)
if err != nil {
return nil, fmt.Errorf("get vm info: %w", err)
}
if resp.StatusCode() != 200 || resp.JSON200 == nil {
return nil, fmt.Errorf("get vm info failed with status %d", resp.StatusCode())
}

// Map Cloud Hypervisor state to hypervisor.VMState
var state hypervisor.VMState
switch resp.JSON200.State {
case vmm.Created:
state = hypervisor.StateCreated
case vmm.Running:
state = hypervisor.StateRunning
case vmm.Paused:
state = hypervisor.StatePaused
case vmm.Shutdown:
state = hypervisor.StateShutdown
default:
return nil, fmt.Errorf("unknown vm state: %s", resp.JSON200.State)
}

return &hypervisor.VMInfo{
State: state,
MemoryActualSize: resp.JSON200.MemoryActualSize,
}, nil
}

// Pause suspends VM execution.
func (c *CloudHypervisor) Pause(ctx context.Context) error {
resp, err := c.client.PauseVMWithResponse(ctx)
if err != nil {
return fmt.Errorf("pause vm: %w", err)
}
if resp.StatusCode() != 204 {
return fmt.Errorf("pause vm failed with status %d", resp.StatusCode())
}
return nil
}

// Resume continues VM execution.
func (c *CloudHypervisor) Resume(ctx context.Context) error {
resp, err := c.client.ResumeVMWithResponse(ctx)
if err != nil {
return fmt.Errorf("resume vm: %w", err)
}
if resp.StatusCode() != 204 {
return fmt.Errorf("resume vm failed with status %d", resp.StatusCode())
}
return nil
}

// Snapshot creates a VM snapshot.
func (c *CloudHypervisor) Snapshot(ctx context.Context, destPath string) error {
snapshotURL := "file://" + destPath
snapshotConfig := vmm.VmSnapshotConfig{DestinationUrl: &snapshotURL}
resp, err := c.client.PutVmSnapshotWithResponse(ctx, snapshotConfig)
if err != nil {
return fmt.Errorf("snapshot: %w", err)
}
if resp.StatusCode() != 204 {
return fmt.Errorf("snapshot failed with status %d", resp.StatusCode())
}
return nil
}

// Restore loads a VM from snapshot.
func (c *CloudHypervisor) Restore(ctx context.Context, sourcePath string) error {
sourceURL := "file://" + sourcePath
restoreConfig := vmm.RestoreConfig{
SourceUrl: sourceURL,
Prefault: ptr(false),
}
resp, err := c.client.PutVmRestoreWithResponse(ctx, restoreConfig)
if err != nil {
return fmt.Errorf("restore: %w", err)
}
if resp.StatusCode() != 204 {
return fmt.Errorf("restore failed with status %d", resp.StatusCode())
}
return nil
}

// ResizeMemory changes the VM's memory allocation.
func (c *CloudHypervisor) ResizeMemory(ctx context.Context, bytes int64) error {
resizeConfig := vmm.VmResize{DesiredRam: &bytes}
resp, err := c.client.PutVmResizeWithResponse(ctx, resizeConfig)
if err != nil {
return fmt.Errorf("resize memory: %w", err)
}
if resp.StatusCode() != 204 {
return fmt.Errorf("resize memory failed with status %d", resp.StatusCode())
}
return nil
}

// ResizeMemoryAndWait changes the VM's memory allocation and waits for it to stabilize.
// It polls until the actual memory size stabilizes (stops changing) or timeout is reached.
func (c *CloudHypervisor) ResizeMemoryAndWait(ctx context.Context, bytes int64, timeout time.Duration) error {
// First, request the resize
if err := c.ResizeMemory(ctx, bytes); err != nil {
return err
}

// Poll until memory stabilizes
const pollInterval = 20 * time.Millisecond
deadline := time.Now().Add(timeout)

var lastSize int64 = -1
stableCount := 0
const requiredStableChecks = 3 // Require 3 consecutive stable readings

for time.Now().Before(deadline) {
select {
case <-ctx.Done():
return ctx.Err()
default:
}

info, err := c.GetVMInfo(ctx)
if err != nil {
return fmt.Errorf("poll memory size: %w", err)
}

if info.MemoryActualSize == nil {
// No memory info available, just return after resize
return nil
}

currentSize := *info.MemoryActualSize

if currentSize == lastSize {
stableCount++
if stableCount >= requiredStableChecks {
// Memory has stabilized
return nil
}
} else {
stableCount = 0
lastSize = currentSize
}

time.Sleep(pollInterval)
}

// Timeout reached, but resize was requested successfully
return nil
}

func ptr[T any](v T) *T {
return &v
}
Loading