Skip to content

Commit cb29ca4

Browse files
authored
Move VMM calls behind an interface (#41)
* Move vmm calls behind hypervisor interface * Add README * address code reviews * Fix socket name too long
1 parent 4b0c8f3 commit cb29ca4

File tree

23 files changed

+973
-417
lines changed

23 files changed

+973
-417
lines changed

cmd/api/api/api_test.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ func newTestService(t *testing.T) *ApiService {
5656
}
5757
}
5858

59-
// cleanupOrphanedProcesses kills Cloud Hypervisor processes from metadata files
59+
// cleanupOrphanedProcesses kills hypervisor processes from metadata files
6060
func cleanupOrphanedProcesses(t *testing.T, dataDir string) {
6161
p := paths.New(dataDir)
6262
guestsDir := p.GuestsDir()
@@ -77,21 +77,21 @@ func cleanupOrphanedProcesses(t *testing.T, dataDir string) {
7777
continue
7878
}
7979

80-
// Parse just the CHPID field
80+
// Parse just the HypervisorPID field
8181
var meta struct {
82-
CHPID *int `json:"CHPID"`
82+
HypervisorPID *int `json:"HypervisorPID"`
8383
}
8484
if err := json.Unmarshal(data, &meta); err != nil {
8585
continue
8686
}
8787

8888
// If metadata has a PID, try to kill it
89-
if meta.CHPID != nil {
90-
pid := *meta.CHPID
89+
if meta.HypervisorPID != nil {
90+
pid := *meta.HypervisorPID
9191

9292
// Check if process exists
9393
if err := syscall.Kill(pid, 0); err == nil {
94-
t.Logf("Cleaning up orphaned Cloud Hypervisor process: PID %d", pid)
94+
t.Logf("Cleaning up orphaned hypervisor process: PID %d", pid)
9595
syscall.Kill(pid, syscall.SIGKILL)
9696
}
9797
}

lib/hypervisor/README.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# Hypervisor Abstraction
2+
3+
Provides a common interface for VM management across different hypervisors.
4+
5+
## Purpose
6+
7+
Hypeman originally supported only Cloud Hypervisor. This abstraction layer allows supporting multiple hypervisors (e.g., QEMU) through a unified interface, enabling:
8+
9+
- **Hypervisor choice per instance** - Different instances can use different hypervisors
10+
- **Feature parity where possible** - Common operations work the same way
11+
- **Graceful degradation** - Features unsupported by a hypervisor can be detected and handled
12+
13+
## How It Works
14+
15+
The abstraction defines two key interfaces:
16+
17+
1. **Hypervisor** - VM lifecycle operations (create, boot, pause, resume, snapshot, restore, shutdown)
18+
2. **ProcessManager** - Hypervisor process lifecycle (start binary, get binary path)
19+
20+
Each hypervisor implementation translates the generic configuration and operations to its native format. For example, Cloud Hypervisor uses an HTTP API over a Unix socket, while QEMU would use QMP.
21+
22+
Before using optional features, callers check capabilities:
23+
24+
```go
25+
if hv.Capabilities().SupportsSnapshot {
26+
hv.Snapshot(ctx, path)
27+
}
28+
```
29+
30+
## Hypervisor Switching
31+
32+
Instances store their hypervisor type in metadata. An instance can switch hypervisors only when stopped (no running VM, no snapshot), since:
33+
34+
- Disk images are hypervisor-agnostic
35+
- Snapshots are hypervisor-specific and cannot be restored by a different hypervisor
Lines changed: 245 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,245 @@
1+
// Package cloudhypervisor implements the hypervisor.Hypervisor interface
2+
// for Cloud Hypervisor VMM.
3+
package cloudhypervisor
4+
5+
import (
6+
"context"
7+
"fmt"
8+
"time"
9+
10+
"github.com/onkernel/hypeman/lib/hypervisor"
11+
"github.com/onkernel/hypeman/lib/vmm"
12+
)
13+
14+
// CloudHypervisor implements hypervisor.Hypervisor for Cloud Hypervisor VMM.
15+
type CloudHypervisor struct {
16+
client *vmm.VMM
17+
}
18+
19+
// New creates a new Cloud Hypervisor client for an existing VMM socket.
20+
func New(socketPath string) (*CloudHypervisor, error) {
21+
client, err := vmm.NewVMM(socketPath)
22+
if err != nil {
23+
return nil, fmt.Errorf("create vmm client: %w", err)
24+
}
25+
return &CloudHypervisor{
26+
client: client,
27+
}, nil
28+
}
29+
30+
// Capabilities returns the features supported by Cloud Hypervisor.
31+
func (c *CloudHypervisor) Capabilities() hypervisor.Capabilities {
32+
return hypervisor.Capabilities{
33+
SupportsSnapshot: true,
34+
SupportsHotplugMemory: true,
35+
SupportsPause: true,
36+
SupportsVsock: true,
37+
SupportsGPUPassthrough: true,
38+
}
39+
}
40+
41+
// CreateVM configures the VM in Cloud Hypervisor.
42+
func (c *CloudHypervisor) CreateVM(ctx context.Context, config hypervisor.VMConfig) error {
43+
vmConfig := ToVMConfig(config)
44+
resp, err := c.client.CreateVMWithResponse(ctx, vmConfig)
45+
if err != nil {
46+
return fmt.Errorf("create vm: %w", err)
47+
}
48+
if resp.StatusCode() != 204 {
49+
return fmt.Errorf("create vm failed with status %d: %s", resp.StatusCode(), string(resp.Body))
50+
}
51+
return nil
52+
}
53+
54+
// BootVM starts the configured VM.
55+
func (c *CloudHypervisor) BootVM(ctx context.Context) error {
56+
resp, err := c.client.BootVMWithResponse(ctx)
57+
if err != nil {
58+
return fmt.Errorf("boot vm: %w", err)
59+
}
60+
if resp.StatusCode() != 204 {
61+
return fmt.Errorf("boot vm failed with status %d: %s", resp.StatusCode(), string(resp.Body))
62+
}
63+
return nil
64+
}
65+
66+
// DeleteVM removes the VM configuration from Cloud Hypervisor.
67+
func (c *CloudHypervisor) DeleteVM(ctx context.Context) error {
68+
resp, err := c.client.DeleteVMWithResponse(ctx)
69+
if err != nil {
70+
return fmt.Errorf("delete vm: %w", err)
71+
}
72+
if resp.StatusCode() != 204 {
73+
return fmt.Errorf("delete vm failed with status %d: %s", resp.StatusCode(), string(resp.Body))
74+
}
75+
return nil
76+
}
77+
78+
// Shutdown stops the VMM process gracefully.
79+
func (c *CloudHypervisor) Shutdown(ctx context.Context) error {
80+
resp, err := c.client.ShutdownVMMWithResponse(ctx)
81+
if err != nil {
82+
return fmt.Errorf("shutdown vmm: %w", err)
83+
}
84+
// ShutdownVMM may return various codes, 204 is success
85+
if resp.StatusCode() != 204 {
86+
return fmt.Errorf("shutdown vmm failed with status %d", resp.StatusCode())
87+
}
88+
return nil
89+
}
90+
91+
// GetVMInfo returns current VM state.
92+
func (c *CloudHypervisor) GetVMInfo(ctx context.Context) (*hypervisor.VMInfo, error) {
93+
resp, err := c.client.GetVmInfoWithResponse(ctx)
94+
if err != nil {
95+
return nil, fmt.Errorf("get vm info: %w", err)
96+
}
97+
if resp.StatusCode() != 200 || resp.JSON200 == nil {
98+
return nil, fmt.Errorf("get vm info failed with status %d", resp.StatusCode())
99+
}
100+
101+
// Map Cloud Hypervisor state to hypervisor.VMState
102+
var state hypervisor.VMState
103+
switch resp.JSON200.State {
104+
case vmm.Created:
105+
state = hypervisor.StateCreated
106+
case vmm.Running:
107+
state = hypervisor.StateRunning
108+
case vmm.Paused:
109+
state = hypervisor.StatePaused
110+
case vmm.Shutdown:
111+
state = hypervisor.StateShutdown
112+
default:
113+
return nil, fmt.Errorf("unknown vm state: %s", resp.JSON200.State)
114+
}
115+
116+
return &hypervisor.VMInfo{
117+
State: state,
118+
MemoryActualSize: resp.JSON200.MemoryActualSize,
119+
}, nil
120+
}
121+
122+
// Pause suspends VM execution.
123+
func (c *CloudHypervisor) Pause(ctx context.Context) error {
124+
resp, err := c.client.PauseVMWithResponse(ctx)
125+
if err != nil {
126+
return fmt.Errorf("pause vm: %w", err)
127+
}
128+
if resp.StatusCode() != 204 {
129+
return fmt.Errorf("pause vm failed with status %d", resp.StatusCode())
130+
}
131+
return nil
132+
}
133+
134+
// Resume continues VM execution.
135+
func (c *CloudHypervisor) Resume(ctx context.Context) error {
136+
resp, err := c.client.ResumeVMWithResponse(ctx)
137+
if err != nil {
138+
return fmt.Errorf("resume vm: %w", err)
139+
}
140+
if resp.StatusCode() != 204 {
141+
return fmt.Errorf("resume vm failed with status %d", resp.StatusCode())
142+
}
143+
return nil
144+
}
145+
146+
// Snapshot creates a VM snapshot.
147+
func (c *CloudHypervisor) Snapshot(ctx context.Context, destPath string) error {
148+
snapshotURL := "file://" + destPath
149+
snapshotConfig := vmm.VmSnapshotConfig{DestinationUrl: &snapshotURL}
150+
resp, err := c.client.PutVmSnapshotWithResponse(ctx, snapshotConfig)
151+
if err != nil {
152+
return fmt.Errorf("snapshot: %w", err)
153+
}
154+
if resp.StatusCode() != 204 {
155+
return fmt.Errorf("snapshot failed with status %d", resp.StatusCode())
156+
}
157+
return nil
158+
}
159+
160+
// Restore loads a VM from snapshot.
161+
func (c *CloudHypervisor) Restore(ctx context.Context, sourcePath string) error {
162+
sourceURL := "file://" + sourcePath
163+
restoreConfig := vmm.RestoreConfig{
164+
SourceUrl: sourceURL,
165+
Prefault: ptr(false),
166+
}
167+
resp, err := c.client.PutVmRestoreWithResponse(ctx, restoreConfig)
168+
if err != nil {
169+
return fmt.Errorf("restore: %w", err)
170+
}
171+
if resp.StatusCode() != 204 {
172+
return fmt.Errorf("restore failed with status %d", resp.StatusCode())
173+
}
174+
return nil
175+
}
176+
177+
// ResizeMemory changes the VM's memory allocation.
178+
func (c *CloudHypervisor) ResizeMemory(ctx context.Context, bytes int64) error {
179+
resizeConfig := vmm.VmResize{DesiredRam: &bytes}
180+
resp, err := c.client.PutVmResizeWithResponse(ctx, resizeConfig)
181+
if err != nil {
182+
return fmt.Errorf("resize memory: %w", err)
183+
}
184+
if resp.StatusCode() != 204 {
185+
return fmt.Errorf("resize memory failed with status %d", resp.StatusCode())
186+
}
187+
return nil
188+
}
189+
190+
// ResizeMemoryAndWait changes the VM's memory allocation and waits for it to stabilize.
191+
// It polls until the actual memory size stabilizes (stops changing) or timeout is reached.
192+
func (c *CloudHypervisor) ResizeMemoryAndWait(ctx context.Context, bytes int64, timeout time.Duration) error {
193+
// First, request the resize
194+
if err := c.ResizeMemory(ctx, bytes); err != nil {
195+
return err
196+
}
197+
198+
// Poll until memory stabilizes
199+
const pollInterval = 20 * time.Millisecond
200+
deadline := time.Now().Add(timeout)
201+
202+
var lastSize int64 = -1
203+
stableCount := 0
204+
const requiredStableChecks = 3 // Require 3 consecutive stable readings
205+
206+
for time.Now().Before(deadline) {
207+
select {
208+
case <-ctx.Done():
209+
return ctx.Err()
210+
default:
211+
}
212+
213+
info, err := c.GetVMInfo(ctx)
214+
if err != nil {
215+
return fmt.Errorf("poll memory size: %w", err)
216+
}
217+
218+
if info.MemoryActualSize == nil {
219+
// No memory info available, just return after resize
220+
return nil
221+
}
222+
223+
currentSize := *info.MemoryActualSize
224+
225+
if currentSize == lastSize {
226+
stableCount++
227+
if stableCount >= requiredStableChecks {
228+
// Memory has stabilized
229+
return nil
230+
}
231+
} else {
232+
stableCount = 0
233+
lastSize = currentSize
234+
}
235+
236+
time.Sleep(pollInterval)
237+
}
238+
239+
// Timeout reached, but resize was requested successfully
240+
return nil
241+
}
242+
243+
func ptr[T any](v T) *T {
244+
return &v
245+
}

0 commit comments

Comments
 (0)