Skip to content

Commit ccac7d7

Browse files
committed
Add snap / restore for qemu
1 parent 5e522f2 commit ccac7d7

File tree

5 files changed

+353
-22
lines changed

5 files changed

+353
-22
lines changed

lib/hypervisor/qemu/process.go

Lines changed: 133 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package qemu
33

44
import (
55
"context"
6+
"encoding/json"
67
"fmt"
78
"net"
89
"os"
@@ -168,15 +169,145 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s
168169
return 0, nil, fmt.Errorf("create client: %w", err)
169170
}
170171

172+
// Save config for potential restore later
173+
// QEMU migration files only contain memory state, not device config
174+
if err := saveVMConfig(instanceDir, config); err != nil {
175+
// Non-fatal - restore just won't work
176+
// Log would be nice but we don't have logger here
177+
}
178+
171179
// Success - release cleanup to prevent killing the process
172180
cu.Release()
173181
return pid, hv, nil
174182
}
175183

176184
// RestoreVM starts QEMU and restores VM state from a snapshot.
177-
// Not yet implemented for QEMU.
185+
// The VM is in paused state after restore; caller should call Resume() to continue execution.
178186
func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string) (int, hypervisor.Hypervisor, error) {
179-
return 0, nil, fmt.Errorf("restore not supported by QEMU implementation")
187+
// Get binary path
188+
binaryPath, err := s.GetBinaryPath(p, version)
189+
if err != nil {
190+
return 0, nil, fmt.Errorf("get binary: %w", err)
191+
}
192+
193+
// Check if socket is already in use
194+
if isSocketInUse(socketPath) {
195+
return 0, nil, fmt.Errorf("socket already in use, QEMU may be running at %s", socketPath)
196+
}
197+
198+
// Remove stale socket if exists
199+
os.Remove(socketPath)
200+
201+
// Load saved VM config from snapshot directory
202+
// QEMU requires exact same command-line args as when snapshot was taken
203+
config, err := loadVMConfig(snapshotPath)
204+
if err != nil {
205+
return 0, nil, fmt.Errorf("load vm config from snapshot: %w", err)
206+
}
207+
208+
instanceDir := filepath.Dir(socketPath)
209+
210+
// Build command arguments: QMP socket + VM configuration + incoming migration
211+
args := []string{
212+
"-chardev", fmt.Sprintf("socket,id=qmp,path=%s,server=on,wait=off", socketPath),
213+
"-mon", "chardev=qmp,mode=control",
214+
}
215+
// Append VM configuration as command-line arguments
216+
args = append(args, BuildArgs(config)...)
217+
218+
// Add incoming migration flag to restore from snapshot
219+
// The snapshot file is named "memory" in the snapshot directory
220+
incomingURI := "file://" + filepath.Join(snapshotPath, "memory")
221+
args = append(args, "-incoming", incomingURI)
222+
223+
// Create command
224+
cmd := exec.Command(binaryPath, args...)
225+
226+
// Daemonize: detach from parent process group
227+
cmd.SysProcAttr = &syscall.SysProcAttr{
228+
Setpgid: true,
229+
}
230+
231+
// Redirect stdout/stderr to VMM log file
232+
logsDir := filepath.Join(instanceDir, "logs")
233+
if err := os.MkdirAll(logsDir, 0755); err != nil {
234+
return 0, nil, fmt.Errorf("create logs directory: %w", err)
235+
}
236+
237+
vmmLogFile, err := os.OpenFile(
238+
filepath.Join(logsDir, "vmm.log"),
239+
os.O_CREATE|os.O_WRONLY|os.O_APPEND,
240+
0644,
241+
)
242+
if err != nil {
243+
return 0, nil, fmt.Errorf("create vmm log: %w", err)
244+
}
245+
defer vmmLogFile.Close()
246+
247+
cmd.Stdout = vmmLogFile
248+
cmd.Stderr = vmmLogFile
249+
250+
if err := cmd.Start(); err != nil {
251+
return 0, nil, fmt.Errorf("start qemu: %w", err)
252+
}
253+
254+
pid := cmd.Process.Pid
255+
256+
// Setup cleanup to kill the process if subsequent steps fail
257+
cu := cleanup.Make(func() {
258+
syscall.Kill(pid, syscall.SIGKILL)
259+
})
260+
defer cu.Clean()
261+
262+
// Wait for socket to be ready
263+
if err := waitForSocket(socketPath, 10*time.Second); err != nil {
264+
vmmLogPath := filepath.Join(logsDir, "vmm.log")
265+
if logData, readErr := os.ReadFile(vmmLogPath); readErr == nil && len(logData) > 0 {
266+
return 0, nil, fmt.Errorf("%w; vmm.log: %s", err, string(logData))
267+
}
268+
return 0, nil, err
269+
}
270+
271+
// Create QMP client
272+
hv, err := New(socketPath)
273+
if err != nil {
274+
return 0, nil, fmt.Errorf("create client: %w", err)
275+
}
276+
277+
// Success - release cleanup to prevent killing the process
278+
cu.Release()
279+
return pid, hv, nil
280+
}
281+
282+
// vmConfigFile is the name of the file where VM config is saved for restore.
283+
const vmConfigFile = "qemu-config.json"
284+
285+
// saveVMConfig saves the VM configuration to a file in the instance directory.
286+
// This is needed for QEMU restore since migration files only contain memory state.
287+
func saveVMConfig(instanceDir string, config hypervisor.VMConfig) error {
288+
configPath := filepath.Join(instanceDir, vmConfigFile)
289+
data, err := json.MarshalIndent(config, "", " ")
290+
if err != nil {
291+
return fmt.Errorf("marshal config: %w", err)
292+
}
293+
if err := os.WriteFile(configPath, data, 0644); err != nil {
294+
return fmt.Errorf("write config: %w", err)
295+
}
296+
return nil
297+
}
298+
299+
// loadVMConfig loads the VM configuration from the instance directory.
300+
func loadVMConfig(instanceDir string) (hypervisor.VMConfig, error) {
301+
configPath := filepath.Join(instanceDir, vmConfigFile)
302+
data, err := os.ReadFile(configPath)
303+
if err != nil {
304+
return hypervisor.VMConfig{}, fmt.Errorf("read config: %w", err)
305+
}
306+
var config hypervisor.VMConfig
307+
if err := json.Unmarshal(data, &config); err != nil {
308+
return hypervisor.VMConfig{}, fmt.Errorf("unmarshal config: %w", err)
309+
}
310+
return config, nil
180311
}
181312

182313
// qemuBinaryName returns the QEMU binary name for the host architecture.

lib/hypervisor/qemu/qemu.go

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ package qemu
33
import (
44
"context"
55
"fmt"
6+
"os"
7+
"path/filepath"
68
"time"
79

810
"github.com/digitalocean/go-qemu/qemu"
@@ -36,8 +38,8 @@ var _ hypervisor.Hypervisor = (*QEMU)(nil)
3638
// Capabilities returns the features supported by QEMU.
3739
func (q *QEMU) Capabilities() hypervisor.Capabilities {
3840
return hypervisor.Capabilities{
39-
SupportsSnapshot: false, // Not implemented in first pass
40-
SupportsHotplugMemory: false, // Not implemented in first pass
41+
SupportsSnapshot: true, // Uses QMP migrate file:// for snapshot
42+
SupportsHotplugMemory: false, // Not implemented - balloon not configured
4143
SupportsPause: true,
4244
SupportsVsock: true,
4345
SupportsGPUPassthrough: true,
@@ -119,10 +121,39 @@ func (q *QEMU) Resume(ctx context.Context) error {
119121
return nil
120122
}
121123

122-
// Snapshot creates a VM snapshot.
123-
// Not implemented in first pass.
124+
// Snapshot creates a VM snapshot using QEMU's migrate-to-file mechanism.
125+
// The VM state is saved to destPath/memory file.
126+
// The VM config is copied to destPath for restore (QEMU requires exact arg match).
124127
func (q *QEMU) Snapshot(ctx context.Context, destPath string) error {
125-
return fmt.Errorf("snapshot not supported by QEMU implementation")
128+
// QEMU uses migrate to file for snapshots
129+
// The file URI must be absolute path
130+
uri := "file://" + destPath + "/memory"
131+
if err := q.client.Migrate(uri); err != nil {
132+
Remove(q.socketPath)
133+
return fmt.Errorf("migrate: %w", err)
134+
}
135+
136+
// Wait for migration to complete
137+
if err := q.client.WaitMigration(ctx, 30*time.Second); err != nil {
138+
Remove(q.socketPath)
139+
return fmt.Errorf("wait migration: %w", err)
140+
}
141+
142+
// Copy VM config from instance dir to snapshot dir
143+
// QEMU restore requires exact same command-line args as when snapshot was taken
144+
instanceDir := filepath.Dir(q.socketPath)
145+
srcConfig := filepath.Join(instanceDir, vmConfigFile)
146+
dstConfig := filepath.Join(destPath, vmConfigFile)
147+
148+
configData, err := os.ReadFile(srcConfig)
149+
if err != nil {
150+
return fmt.Errorf("read vm config for snapshot: %w", err)
151+
}
152+
if err := os.WriteFile(dstConfig, configData, 0644); err != nil {
153+
return fmt.Errorf("write vm config to snapshot: %w", err)
154+
}
155+
156+
return nil
126157
}
127158

128159
// ResizeMemory changes the VM's memory allocation.

lib/hypervisor/qemu/qmp.go

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package qemu
22

33
import (
4+
"context"
45
"fmt"
56
"time"
67

@@ -94,3 +95,59 @@ func (c *Client) Events() (chan qmp.Event, chan struct{}, error) {
9495
func (c *Client) Run(cmd qmp.Command) ([]byte, error) {
9596
return c.domain.Run(cmd)
9697
}
98+
99+
// Migrate initiates a migration to the given URI (typically "file:///path").
100+
// This is used for saving VM state to a file for snapshot/standby.
101+
func (c *Client) Migrate(uri string) error {
102+
// Migrate(uri, blk, inc, detach) - we use nil for optional params
103+
return c.raw.Migrate(uri, nil, nil, nil)
104+
}
105+
106+
// QueryMigration returns the current migration status.
107+
func (c *Client) QueryMigration() (raw.MigrationInfo, error) {
108+
return c.raw.QueryMigrate()
109+
}
110+
111+
// WaitMigration polls until migration completes or times out.
112+
// Returns nil if migration completed successfully, error otherwise.
113+
func (c *Client) WaitMigration(ctx context.Context, timeout time.Duration) error {
114+
deadline := time.Now().Add(timeout)
115+
pollInterval := 50 * time.Millisecond
116+
117+
for time.Now().Before(deadline) {
118+
select {
119+
case <-ctx.Done():
120+
return ctx.Err()
121+
default:
122+
}
123+
124+
info, err := c.QueryMigration()
125+
if err != nil {
126+
return fmt.Errorf("query migration: %w", err)
127+
}
128+
129+
// Check migration status (Status is a pointer in MigrationInfo)
130+
if info.Status == nil {
131+
// Status not available yet, continue polling
132+
time.Sleep(pollInterval)
133+
continue
134+
}
135+
136+
switch *info.Status {
137+
case raw.MigrationStatusCompleted:
138+
return nil
139+
case raw.MigrationStatusFailed:
140+
return fmt.Errorf("migration failed")
141+
case raw.MigrationStatusCancelled:
142+
return fmt.Errorf("migration cancelled")
143+
case raw.MigrationStatusActive, raw.MigrationStatusSetup, raw.MigrationStatusPreSwitchover, raw.MigrationStatusDevice:
144+
// Still in progress, continue polling
145+
default:
146+
// Unknown or "none" status - might not have started yet
147+
}
148+
149+
time.Sleep(pollInterval)
150+
}
151+
152+
return fmt.Errorf("migration timeout after %v", timeout)
153+
}

0 commit comments

Comments
 (0)