Skip to content

Commit c998b64

Browse files
committed
Fix possible polling error in standby / resume
1 parent 6bbb7a7 commit c998b64

File tree

2 files changed

+58
-18
lines changed

2 files changed

+58
-18
lines changed

lib/hypervisor/qemu/process.go

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -254,14 +254,13 @@ func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string,
254254
}
255255
defer cu.Clean()
256256

257-
// Wait for incoming migration to complete
258-
// QEMU loads the migration data from the exec subprocess
259-
// After loading, VM is in paused state and ready for 'cont'
257+
// Wait for VM to be ready after loading migration data
258+
// QEMU transitions from "inmigrate" to "paused" when loading completes
260259
migrationWaitStart := time.Now()
261-
if err := hv.client.WaitMigration(ctx, migrationTimeout); err != nil {
262-
return 0, nil, fmt.Errorf("wait for migration: %w", err)
260+
if err := hv.client.WaitVMReady(ctx, migrationTimeout); err != nil {
261+
return 0, nil, fmt.Errorf("wait for vm ready: %w", err)
263262
}
264-
log.DebugContext(ctx, "migration complete", "duration_ms", time.Since(migrationWaitStart).Milliseconds())
263+
log.DebugContext(ctx, "VM ready", "duration_ms", time.Since(migrationWaitStart).Milliseconds())
265264

266265
cu.Release()
267266
log.DebugContext(ctx, "QEMU restore complete", "pid", pid, "total_duration_ms", time.Since(startTime).Milliseconds())

lib/hypervisor/qemu/qmp.go

Lines changed: 53 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ const (
1515
// qmpConnectTimeout is the timeout for connecting to the QMP socket
1616
qmpConnectTimeout = 1 * time.Second
1717

18-
// qmpMigrationPollInterval is how often to poll migration status in WaitMigration
19-
qmpMigrationPollInterval = 50 * time.Millisecond
18+
// qmpPollInterval is how often to poll status in WaitMigration and WaitVMReady
19+
qmpPollInterval = 50 * time.Millisecond
2020
)
2121

2222
// Client wraps go-qemu's Domain and raw.Monitor with convenience methods.
@@ -117,8 +117,8 @@ func (c *Client) QueryMigration() (raw.MigrationInfo, error) {
117117
return c.raw.QueryMigrate()
118118
}
119119

120-
// WaitMigration polls until migration completes or times out.
121-
// Works for both outgoing (snapshot) and incoming (restore) migrations.
120+
// WaitMigration polls until an outgoing migration completes or times out.
121+
// Used for snapshot/standby operations where we initiate the migration.
122122
// Returns nil if migration completed successfully, error otherwise.
123123
func (c *Client) WaitMigration(ctx context.Context, timeout time.Duration) error {
124124
deadline := time.Now().Add(timeout)
@@ -133,36 +133,77 @@ func (c *Client) WaitMigration(ctx context.Context, timeout time.Duration) error
133133
info, err := c.QueryMigration()
134134
if err != nil {
135135
// Ignore transient errors during migration, keep polling
136-
time.Sleep(qmpMigrationPollInterval)
136+
time.Sleep(qmpPollInterval)
137137
continue
138138
}
139139

140140
// Check migration status (Status is a pointer in MigrationInfo)
141141
if info.Status == nil {
142142
// Status not available yet, continue polling
143-
time.Sleep(qmpMigrationPollInterval)
143+
time.Sleep(qmpPollInterval)
144144
continue
145145
}
146146

147147
switch *info.Status {
148148
case raw.MigrationStatusCompleted:
149149
return nil
150-
case raw.MigrationStatusNone:
151-
// No active migration - for incoming this means complete, for outgoing it transitions quickly
152-
return nil
153150
case raw.MigrationStatusFailed:
154151
if info.ErrorDesc != nil && *info.ErrorDesc != "" {
155152
return fmt.Errorf("migration failed: %s", *info.ErrorDesc)
156153
}
157154
return fmt.Errorf("migration failed")
158155
case raw.MigrationStatusCancelled:
159156
return fmt.Errorf("migration cancelled")
160-
case raw.MigrationStatusActive, raw.MigrationStatusSetup, raw.MigrationStatusPreSwitchover, raw.MigrationStatusDevice:
161-
// Still in progress, continue polling
157+
case raw.MigrationStatusNone, raw.MigrationStatusActive, raw.MigrationStatusSetup, raw.MigrationStatusPreSwitchover, raw.MigrationStatusDevice:
158+
// Still in progress or not started yet, continue polling
162159
}
163160

164-
time.Sleep(qmpMigrationPollInterval)
161+
time.Sleep(qmpPollInterval)
165162
}
166163

167164
return fmt.Errorf("migration timeout after %v", timeout)
168165
}
166+
167+
// WaitVMReady polls until the VM is ready after an incoming migration.
168+
// Used for restore operations where QEMU was started with -incoming.
169+
// The VM transitions from "inmigrate" to "paused" when migration data is loaded.
170+
// Returns nil when VM is ready for resume, error on timeout or failure.
171+
func (c *Client) WaitVMReady(ctx context.Context, timeout time.Duration) error {
172+
deadline := time.Now().Add(timeout)
173+
174+
for time.Now().Before(deadline) {
175+
select {
176+
case <-ctx.Done():
177+
return ctx.Err()
178+
default:
179+
}
180+
181+
status, err := c.Status()
182+
if err != nil {
183+
// Ignore transient errors, keep polling
184+
time.Sleep(qmpPollInterval)
185+
continue
186+
}
187+
188+
switch status {
189+
case qemu.StatusPaused, qemu.StatusPostMigrate:
190+
// VM has finished loading migration data and is ready for resume
191+
return nil
192+
case qemu.StatusInMigrate, qemu.StatusRestoreVM:
193+
// Still loading migration data, continue polling
194+
case qemu.StatusRunning:
195+
// Already running (shouldn't happen, but not an error)
196+
return nil
197+
case qemu.StatusGuestPanicked, qemu.StatusInternalError, qemu.StatusIOError:
198+
return fmt.Errorf("VM in error state: %v", status)
199+
case qemu.StatusShutdown:
200+
return fmt.Errorf("VM shut down during migration")
201+
default:
202+
// Other states - keep polling
203+
}
204+
205+
time.Sleep(qmpPollInterval)
206+
}
207+
208+
return fmt.Errorf("timeout waiting for VM ready after %v", timeout)
209+
}

0 commit comments

Comments
 (0)