Skip to content

Commit 73750f6

Browse files
authored
Qemu capabilities (#49)
* Add snap / restore for qemu * Fix restore on qemu, test passing * More observability * Move sleeps * Consolidated waiting * Deduplicate arguments building * Fix possible polling error in standby / resume
1 parent 5e522f2 commit 73750f6

File tree

7 files changed

+462
-49
lines changed

7 files changed

+462
-49
lines changed

lib/hypervisor/cloudhypervisor/process.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@ import (
44
"context"
55
"fmt"
66
"syscall"
7+
"time"
78

89
"github.com/onkernel/hypeman/lib/hypervisor"
10+
"github.com/onkernel/hypeman/lib/logger"
911
"github.com/onkernel/hypeman/lib/paths"
1012
"github.com/onkernel/hypeman/lib/vmm"
1113
"gvisor.dev/gvisor/pkg/cleanup"
@@ -100,17 +102,22 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s
100102
// RestoreVM starts Cloud Hypervisor and restores VM state from a snapshot.
101103
// The VM is in paused state after restore; caller should call Resume() to continue execution.
102104
func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string) (int, hypervisor.Hypervisor, error) {
105+
log := logger.FromContext(ctx)
106+
startTime := time.Now()
107+
103108
// Validate version
104109
chVersion := vmm.CHVersion(version)
105110
if !vmm.IsVersionSupported(chVersion) {
106111
return 0, nil, fmt.Errorf("unsupported cloud-hypervisor version: %s", version)
107112
}
108113

109114
// 1. Start the Cloud Hypervisor process
115+
processStartTime := time.Now()
110116
pid, err := vmm.StartProcess(ctx, p, chVersion, socketPath)
111117
if err != nil {
112118
return 0, nil, fmt.Errorf("start process: %w", err)
113119
}
120+
log.DebugContext(ctx, "CH process started", "pid", pid, "duration_ms", time.Since(processStartTime).Milliseconds())
114121

115122
// Setup cleanup to kill the process if subsequent steps fail
116123
cu := cleanup.Make(func() {
@@ -125,6 +132,7 @@ func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string,
125132
}
126133

127134
// 3. Restore from snapshot via HTTP API
135+
restoreAPIStart := time.Now()
128136
sourceURL := "file://" + snapshotPath
129137
restoreConfig := vmm.RestoreConfig{
130138
SourceUrl: sourceURL,
@@ -137,9 +145,11 @@ func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string,
137145
if resp.StatusCode() != 204 {
138146
return 0, nil, fmt.Errorf("restore failed with status %d: %s", resp.StatusCode(), string(resp.Body))
139147
}
148+
log.DebugContext(ctx, "CH restore API complete", "duration_ms", time.Since(restoreAPIStart).Milliseconds())
140149

141150
// Success - release cleanup to prevent killing the process
142151
cu.Release()
152+
log.DebugContext(ctx, "CH restore complete", "pid", pid, "total_duration_ms", time.Since(startTime).Milliseconds())
143153
return pid, hv, nil
144154
}
145155

lib/hypervisor/qemu/process.go

Lines changed: 146 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package qemu
33

44
import (
55
"context"
6+
"encoding/json"
67
"fmt"
78
"net"
89
"os"
@@ -14,10 +15,26 @@ import (
1415
"time"
1516

1617
"github.com/onkernel/hypeman/lib/hypervisor"
18+
"github.com/onkernel/hypeman/lib/logger"
1719
"github.com/onkernel/hypeman/lib/paths"
1820
"gvisor.dev/gvisor/pkg/cleanup"
1921
)
2022

23+
// Timeout constants for QEMU operations
24+
const (
25+
// socketWaitTimeout is how long to wait for QMP socket to become available after process start
26+
socketWaitTimeout = 10 * time.Second
27+
28+
// migrationTimeout is how long to wait for migration to complete
29+
migrationTimeout = 30 * time.Second
30+
31+
// socketPollInterval is how often to check if socket is ready
32+
socketPollInterval = 50 * time.Millisecond
33+
34+
// socketDialTimeout is timeout for individual socket connection attempts
35+
socketDialTimeout = 100 * time.Millisecond
36+
)
37+
2138
func init() {
2239
hypervisor.RegisterSocketName(hypervisor.TypeQEMU, "qemu.sock")
2340
}
@@ -88,31 +105,34 @@ func (s *Starter) GetVersion(p *paths.Paths) (string, error) {
88105
return "", fmt.Errorf("could not parse QEMU version from: %s", string(output))
89106
}
90107

91-
// StartVM launches QEMU with the VM configuration and returns a Hypervisor client.
92-
// QEMU receives all configuration via command-line arguments at process start.
93-
func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, socketPath string, config hypervisor.VMConfig) (int, hypervisor.Hypervisor, error) {
108+
// buildQMPArgs returns the base QMP socket arguments for QEMU.
109+
func buildQMPArgs(socketPath string) []string {
110+
return []string{
111+
"-chardev", fmt.Sprintf("socket,id=qmp,path=%s,server=on,wait=off", socketPath),
112+
"-mon", "chardev=qmp,mode=control",
113+
}
114+
}
115+
116+
// startQEMUProcess handles the common QEMU process startup logic.
117+
// Returns the PID, hypervisor client, and a cleanup function.
118+
// The cleanup function must be called on error; call cleanup.Release() on success.
119+
func (s *Starter) startQEMUProcess(ctx context.Context, p *paths.Paths, version string, socketPath string, args []string) (int, *QEMU, *cleanup.Cleanup, error) {
120+
log := logger.FromContext(ctx)
121+
94122
// Get binary path
95123
binaryPath, err := s.GetBinaryPath(p, version)
96124
if err != nil {
97-
return 0, nil, fmt.Errorf("get binary: %w", err)
125+
return 0, nil, nil, fmt.Errorf("get binary: %w", err)
98126
}
99127

100128
// Check if socket is already in use
101129
if isSocketInUse(socketPath) {
102-
return 0, nil, fmt.Errorf("socket already in use, QEMU may be running at %s", socketPath)
130+
return 0, nil, nil, fmt.Errorf("socket already in use, QEMU may be running at %s", socketPath)
103131
}
104132

105133
// Remove stale socket if exists
106134
os.Remove(socketPath)
107135

108-
// Build command arguments: QMP socket + VM configuration
109-
args := []string{
110-
"-chardev", fmt.Sprintf("socket,id=qmp,path=%s,server=on,wait=off", socketPath),
111-
"-mon", "chardev=qmp,mode=control",
112-
}
113-
// Append VM configuration as command-line arguments
114-
args = append(args, BuildArgs(config)...)
115-
116136
// Create command
117137
cmd := exec.Command(binaryPath, args...)
118138

@@ -125,7 +145,7 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s
125145
instanceDir := filepath.Dir(socketPath)
126146
logsDir := filepath.Join(instanceDir, "logs")
127147
if err := os.MkdirAll(logsDir, 0755); err != nil {
128-
return 0, nil, fmt.Errorf("create logs directory: %w", err)
148+
return 0, nil, nil, fmt.Errorf("create logs directory: %w", err)
129149
}
130150

131151
vmmLogFile, err := os.OpenFile(
@@ -134,49 +154,148 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s
134154
0644,
135155
)
136156
if err != nil {
137-
return 0, nil, fmt.Errorf("create vmm log: %w", err)
157+
return 0, nil, nil, fmt.Errorf("create vmm log: %w", err)
138158
}
139159
defer vmmLogFile.Close()
140160

141161
cmd.Stdout = vmmLogFile
142162
cmd.Stderr = vmmLogFile
143163

164+
processStartTime := time.Now()
144165
if err := cmd.Start(); err != nil {
145-
return 0, nil, fmt.Errorf("start qemu: %w", err)
166+
return 0, nil, nil, fmt.Errorf("start qemu: %w", err)
146167
}
147168

148169
pid := cmd.Process.Pid
170+
log.DebugContext(ctx, "QEMU process started", "pid", pid, "duration_ms", time.Since(processStartTime).Milliseconds())
149171

150172
// Setup cleanup to kill the process if subsequent steps fail
151173
cu := cleanup.Make(func() {
152174
syscall.Kill(pid, syscall.SIGKILL)
153175
})
154-
defer cu.Clean()
155176

156177
// Wait for socket to be ready
157-
if err := waitForSocket(socketPath, 10*time.Second); err != nil {
178+
socketWaitStart := time.Now()
179+
if err := waitForSocket(socketPath, socketWaitTimeout); err != nil {
180+
cu.Clean()
158181
vmmLogPath := filepath.Join(logsDir, "vmm.log")
159182
if logData, readErr := os.ReadFile(vmmLogPath); readErr == nil && len(logData) > 0 {
160-
return 0, nil, fmt.Errorf("%w; vmm.log: %s", err, string(logData))
183+
return 0, nil, nil, fmt.Errorf("%w; vmm.log: %s", err, string(logData))
161184
}
162-
return 0, nil, err
185+
return 0, nil, nil, err
163186
}
187+
log.DebugContext(ctx, "QMP socket ready", "duration_ms", time.Since(socketWaitStart).Milliseconds())
164188

165189
// Create QMP client
166190
hv, err := New(socketPath)
167191
if err != nil {
168-
return 0, nil, fmt.Errorf("create client: %w", err)
192+
cu.Clean()
193+
return 0, nil, nil, fmt.Errorf("create client: %w", err)
194+
}
195+
196+
return pid, hv, &cu, nil
197+
}
198+
199+
// StartVM launches QEMU with the VM configuration and returns a Hypervisor client.
200+
// QEMU receives all configuration via command-line arguments at process start.
201+
func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, socketPath string, config hypervisor.VMConfig) (int, hypervisor.Hypervisor, error) {
202+
log := logger.FromContext(ctx)
203+
204+
// Build command arguments: QMP socket + VM configuration
205+
args := buildQMPArgs(socketPath)
206+
args = append(args, BuildArgs(config)...)
207+
208+
pid, hv, cu, err := s.startQEMUProcess(ctx, p, version, socketPath, args)
209+
if err != nil {
210+
return 0, nil, err
211+
}
212+
defer cu.Clean()
213+
214+
// Save config for potential restore later
215+
// QEMU migration files only contain memory state, not device config
216+
instanceDir := filepath.Dir(socketPath)
217+
if err := saveVMConfig(instanceDir, config); err != nil {
218+
// Non-fatal - restore just won't work
219+
log.WarnContext(ctx, "failed to save VM config for restore", "error", err)
169220
}
170221

171-
// Success - release cleanup to prevent killing the process
172222
cu.Release()
173223
return pid, hv, nil
174224
}
175225

176226
// RestoreVM starts QEMU and restores VM state from a snapshot.
177-
// Not yet implemented for QEMU.
227+
// The VM is in paused state after restore; caller should call Resume() to continue execution.
178228
func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string) (int, hypervisor.Hypervisor, error) {
179-
return 0, nil, fmt.Errorf("restore not supported by QEMU implementation")
229+
log := logger.FromContext(ctx)
230+
startTime := time.Now()
231+
232+
// Load saved VM config from snapshot directory
233+
// QEMU requires exact same command-line args as when snapshot was taken
234+
configLoadStart := time.Now()
235+
config, err := loadVMConfig(snapshotPath)
236+
if err != nil {
237+
return 0, nil, fmt.Errorf("load vm config from snapshot: %w", err)
238+
}
239+
log.DebugContext(ctx, "loaded VM config from snapshot", "duration_ms", time.Since(configLoadStart).Milliseconds())
240+
241+
// Build command arguments: QMP socket + VM configuration + incoming migration
242+
args := buildQMPArgs(socketPath)
243+
args = append(args, BuildArgs(config)...)
244+
245+
// Add incoming migration flag to restore from snapshot
246+
// The "file:" protocol is deprecated in QEMU 7.2+, use "exec:cat < path" instead
247+
memoryFile := filepath.Join(snapshotPath, "memory")
248+
incomingURI := "exec:cat < " + memoryFile
249+
args = append(args, "-incoming", incomingURI)
250+
251+
pid, hv, cu, err := s.startQEMUProcess(ctx, p, version, socketPath, args)
252+
if err != nil {
253+
return 0, nil, err
254+
}
255+
defer cu.Clean()
256+
257+
// Wait for VM to be ready after loading migration data
258+
// QEMU transitions from "inmigrate" to "paused" when loading completes
259+
migrationWaitStart := time.Now()
260+
if err := hv.client.WaitVMReady(ctx, migrationTimeout); err != nil {
261+
return 0, nil, fmt.Errorf("wait for vm ready: %w", err)
262+
}
263+
log.DebugContext(ctx, "VM ready", "duration_ms", time.Since(migrationWaitStart).Milliseconds())
264+
265+
cu.Release()
266+
log.DebugContext(ctx, "QEMU restore complete", "pid", pid, "total_duration_ms", time.Since(startTime).Milliseconds())
267+
return pid, hv, nil
268+
}
269+
270+
// vmConfigFile is the name of the file where VM config is saved for restore.
271+
const vmConfigFile = "qemu-config.json"
272+
273+
// saveVMConfig saves the VM configuration to a file in the instance directory.
274+
// This is needed for QEMU restore since migration files only contain memory state.
275+
func saveVMConfig(instanceDir string, config hypervisor.VMConfig) error {
276+
configPath := filepath.Join(instanceDir, vmConfigFile)
277+
data, err := json.MarshalIndent(config, "", " ")
278+
if err != nil {
279+
return fmt.Errorf("marshal config: %w", err)
280+
}
281+
if err := os.WriteFile(configPath, data, 0644); err != nil {
282+
return fmt.Errorf("write config: %w", err)
283+
}
284+
return nil
285+
}
286+
287+
// loadVMConfig loads the VM configuration from the instance directory.
288+
func loadVMConfig(instanceDir string) (hypervisor.VMConfig, error) {
289+
configPath := filepath.Join(instanceDir, vmConfigFile)
290+
data, err := os.ReadFile(configPath)
291+
if err != nil {
292+
return hypervisor.VMConfig{}, fmt.Errorf("read config: %w", err)
293+
}
294+
var config hypervisor.VMConfig
295+
if err := json.Unmarshal(data, &config); err != nil {
296+
return hypervisor.VMConfig{}, fmt.Errorf("unmarshal config: %w", err)
297+
}
298+
return config, nil
180299
}
181300

182301
// qemuBinaryName returns the QEMU binary name for the host architecture.
@@ -205,7 +324,7 @@ func qemuInstallHint() string {
205324

206325
// isSocketInUse checks if a Unix socket is actively being used
207326
func isSocketInUse(socketPath string) bool {
208-
conn, err := net.DialTimeout("unix", socketPath, 100*time.Millisecond)
327+
conn, err := net.DialTimeout("unix", socketPath, socketDialTimeout)
209328
if err != nil {
210329
return false
211330
}
@@ -217,12 +336,12 @@ func isSocketInUse(socketPath string) bool {
217336
func waitForSocket(socketPath string, timeout time.Duration) error {
218337
deadline := time.Now().Add(timeout)
219338
for time.Now().Before(deadline) {
220-
conn, err := net.DialTimeout("unix", socketPath, 100*time.Millisecond)
339+
conn, err := net.DialTimeout("unix", socketPath, socketDialTimeout)
221340
if err == nil {
222341
conn.Close()
223342
return nil
224343
}
225-
time.Sleep(50 * time.Millisecond)
344+
time.Sleep(socketPollInterval)
226345
}
227346
return fmt.Errorf("timeout waiting for socket")
228347
}

lib/hypervisor/qemu/qemu.go

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ package qemu
33
import (
44
"context"
55
"fmt"
6+
"os"
7+
"path/filepath"
68
"time"
79

810
"github.com/digitalocean/go-qemu/qemu"
@@ -36,8 +38,8 @@ var _ hypervisor.Hypervisor = (*QEMU)(nil)
3638
// Capabilities returns the features supported by QEMU.
3739
func (q *QEMU) Capabilities() hypervisor.Capabilities {
3840
return hypervisor.Capabilities{
39-
SupportsSnapshot: false, // Not implemented in first pass
40-
SupportsHotplugMemory: false, // Not implemented in first pass
41+
SupportsSnapshot: true, // Uses QMP migrate file:// for snapshot
42+
SupportsHotplugMemory: false, // Not implemented - balloon not configured
4143
SupportsPause: true,
4244
SupportsVsock: true,
4345
SupportsGPUPassthrough: true,
@@ -119,10 +121,40 @@ func (q *QEMU) Resume(ctx context.Context) error {
119121
return nil
120122
}
121123

122-
// Snapshot creates a VM snapshot.
123-
// Not implemented in first pass.
124+
// Snapshot creates a VM snapshot using QEMU's migrate-to-file mechanism.
125+
// The VM state is saved to destPath/memory file.
126+
// The VM config is copied to destPath for restore (QEMU requires exact arg match).
124127
func (q *QEMU) Snapshot(ctx context.Context, destPath string) error {
125-
return fmt.Errorf("snapshot not supported by QEMU implementation")
128+
// QEMU uses migrate to file for snapshots
129+
// The "file:" protocol is deprecated in QEMU 7.2+, use "exec:cat > path" instead
130+
memoryFile := destPath + "/memory"
131+
uri := "exec:cat > " + memoryFile
132+
if err := q.client.Migrate(uri); err != nil {
133+
Remove(q.socketPath)
134+
return fmt.Errorf("migrate: %w", err)
135+
}
136+
137+
// Wait for migration to complete
138+
if err := q.client.WaitMigration(ctx, migrationTimeout); err != nil {
139+
Remove(q.socketPath)
140+
return fmt.Errorf("wait migration: %w", err)
141+
}
142+
143+
// Copy VM config from instance dir to snapshot dir
144+
// QEMU restore requires exact same command-line args as when snapshot was taken
145+
instanceDir := filepath.Dir(q.socketPath)
146+
srcConfig := filepath.Join(instanceDir, vmConfigFile)
147+
dstConfig := filepath.Join(destPath, vmConfigFile)
148+
149+
configData, err := os.ReadFile(srcConfig)
150+
if err != nil {
151+
return fmt.Errorf("read vm config for snapshot: %w", err)
152+
}
153+
if err := os.WriteFile(dstConfig, configData, 0644); err != nil {
154+
return fmt.Errorf("write vm config to snapshot: %w", err)
155+
}
156+
157+
return nil
126158
}
127159

128160
// ResizeMemory changes the VM's memory allocation.

0 commit comments

Comments
 (0)