Skip to content

Commit a522d0b

Browse files
committed
More observability
1 parent bc51ba8 commit a522d0b

File tree

3 files changed

+51
-1
lines changed

3 files changed

+51
-1
lines changed

lib/hypervisor/cloudhypervisor/process.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@ import (
44
"context"
55
"fmt"
66
"syscall"
7+
"time"
78

89
"github.com/onkernel/hypeman/lib/hypervisor"
10+
"github.com/onkernel/hypeman/lib/logger"
911
"github.com/onkernel/hypeman/lib/paths"
1012
"github.com/onkernel/hypeman/lib/vmm"
1113
"gvisor.dev/gvisor/pkg/cleanup"
@@ -100,17 +102,22 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s
100102
// RestoreVM starts Cloud Hypervisor and restores VM state from a snapshot.
101103
// The VM is in paused state after restore; caller should call Resume() to continue execution.
102104
func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string) (int, hypervisor.Hypervisor, error) {
105+
log := logger.FromContext(ctx)
106+
startTime := time.Now()
107+
103108
// Validate version
104109
chVersion := vmm.CHVersion(version)
105110
if !vmm.IsVersionSupported(chVersion) {
106111
return 0, nil, fmt.Errorf("unsupported cloud-hypervisor version: %s", version)
107112
}
108113

109114
// 1. Start the Cloud Hypervisor process
115+
processStartTime := time.Now()
110116
pid, err := vmm.StartProcess(ctx, p, chVersion, socketPath)
111117
if err != nil {
112118
return 0, nil, fmt.Errorf("start process: %w", err)
113119
}
120+
log.DebugContext(ctx, "CH process started", "pid", pid, "duration_ms", time.Since(processStartTime).Milliseconds())
114121

115122
// Setup cleanup to kill the process if subsequent steps fail
116123
cu := cleanup.Make(func() {
@@ -125,6 +132,7 @@ func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string,
125132
}
126133

127134
// 3. Restore from snapshot via HTTP API
135+
restoreAPIStart := time.Now()
128136
sourceURL := "file://" + snapshotPath
129137
restoreConfig := vmm.RestoreConfig{
130138
SourceUrl: sourceURL,
@@ -137,9 +145,11 @@ func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string,
137145
if resp.StatusCode() != 204 {
138146
return 0, nil, fmt.Errorf("restore failed with status %d: %s", resp.StatusCode(), string(resp.Body))
139147
}
148+
log.DebugContext(ctx, "CH restore API complete", "duration_ms", time.Since(restoreAPIStart).Milliseconds())
140149

141150
// Success - release cleanup to prevent killing the process
142151
cu.Release()
152+
log.DebugContext(ctx, "CH restore complete", "pid", pid, "total_duration_ms", time.Since(startTime).Milliseconds())
143153
return pid, hv, nil
144154
}
145155

lib/hypervisor/qemu/process.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616

1717
"github.com/digitalocean/go-qemu/qmp/raw"
1818
"github.com/onkernel/hypeman/lib/hypervisor"
19+
"github.com/onkernel/hypeman/lib/logger"
1920
"github.com/onkernel/hypeman/lib/paths"
2021
"gvisor.dev/gvisor/pkg/cleanup"
2122
)
@@ -185,6 +186,9 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s
185186
// RestoreVM starts QEMU and restores VM state from a snapshot.
186187
// The VM is in paused state after restore; caller should call Resume() to continue execution.
187188
func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string) (int, hypervisor.Hypervisor, error) {
189+
log := logger.FromContext(ctx)
190+
startTime := time.Now()
191+
188192
// Get binary path
189193
binaryPath, err := s.GetBinaryPath(p, version)
190194
if err != nil {
@@ -201,10 +205,12 @@ func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string,
201205

202206
// Load saved VM config from snapshot directory
203207
// QEMU requires exact same command-line args as when snapshot was taken
208+
configLoadStart := time.Now()
204209
config, err := loadVMConfig(snapshotPath)
205210
if err != nil {
206211
return 0, nil, fmt.Errorf("load vm config from snapshot: %w", err)
207212
}
213+
log.DebugContext(ctx, "loaded VM config from snapshot", "duration_ms", time.Since(configLoadStart).Milliseconds())
208214

209215
instanceDir := filepath.Dir(socketPath)
210216

@@ -249,11 +255,13 @@ func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string,
249255
cmd.Stdout = vmmLogFile
250256
cmd.Stderr = vmmLogFile
251257

258+
processStartTime := time.Now()
252259
if err := cmd.Start(); err != nil {
253260
return 0, nil, fmt.Errorf("start qemu: %w", err)
254261
}
255262

256263
pid := cmd.Process.Pid
264+
log.DebugContext(ctx, "QEMU process started", "pid", pid, "duration_ms", time.Since(processStartTime).Milliseconds())
257265

258266
// Setup cleanup to kill the process if subsequent steps fail
259267
cu := cleanup.Make(func() {
@@ -262,13 +270,15 @@ func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string,
262270
defer cu.Clean()
263271

264272
// Wait for socket to be ready
273+
socketWaitStart := time.Now()
265274
if err := waitForSocket(socketPath, 10*time.Second); err != nil {
266275
vmmLogPath := filepath.Join(logsDir, "vmm.log")
267276
if logData, readErr := os.ReadFile(vmmLogPath); readErr == nil && len(logData) > 0 {
268277
return 0, nil, fmt.Errorf("%w; vmm.log: %s", err, string(logData))
269278
}
270279
return 0, nil, err
271280
}
281+
log.DebugContext(ctx, "QMP socket ready", "duration_ms", time.Since(socketWaitStart).Milliseconds())
272282

273283
// Create QMP client
274284
hv, err := New(socketPath)
@@ -279,12 +289,15 @@ func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string,
279289
// Wait for incoming migration to complete
280290
// QEMU loads the migration data from the exec subprocess
281291
// After loading, VM is in paused state and ready for 'cont'
292+
migrationWaitStart := time.Now()
282293
if err := waitForMigrationComplete(hv.client, 30*time.Second); err != nil {
283294
return 0, nil, fmt.Errorf("wait for migration: %w", err)
284295
}
296+
log.DebugContext(ctx, "migration complete", "duration_ms", time.Since(migrationWaitStart).Milliseconds())
285297

286298
// Success - release cleanup to prevent killing the process
287299
cu.Release()
300+
log.DebugContext(ctx, "QEMU restore complete", "pid", pid, "total_duration_ms", time.Since(startTime).Milliseconds())
288301
return pid, hv, nil
289302
}
290303

lib/instances/restore.go

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,16 +56,33 @@ func (m *manager) restoreInstance(
5656

5757
// 4. Recreate TAP device if network enabled
5858
if stored.NetworkEnabled {
59+
var networkSpan trace.Span
60+
if m.metrics != nil && m.metrics.tracer != nil {
61+
ctx, networkSpan = m.metrics.tracer.Start(ctx, "RestoreNetwork")
62+
}
5963
log.DebugContext(ctx, "recreating network for restore", "instance_id", id, "network", "default")
6064
if err := m.networkManager.RecreateAllocation(ctx, id); err != nil {
65+
if networkSpan != nil {
66+
networkSpan.End()
67+
}
6168
log.ErrorContext(ctx, "failed to recreate network", "instance_id", id, "error", err)
6269
return nil, fmt.Errorf("recreate network: %w", err)
6370
}
71+
if networkSpan != nil {
72+
networkSpan.End()
73+
}
6474
}
6575

6676
// 5. Transition: Standby → Paused (start hypervisor + restore)
67-
log.DebugContext(ctx, "restoring from snapshot", "instance_id", id, "snapshot_dir", snapshotDir)
77+
var restoreSpan trace.Span
78+
if m.metrics != nil && m.metrics.tracer != nil {
79+
ctx, restoreSpan = m.metrics.tracer.Start(ctx, "RestoreFromSnapshot")
80+
}
81+
log.DebugContext(ctx, "restoring from snapshot", "instance_id", id, "snapshot_dir", snapshotDir, "hypervisor", stored.HypervisorType)
6882
pid, hv, err := m.restoreFromSnapshot(ctx, stored, snapshotDir)
83+
if restoreSpan != nil {
84+
restoreSpan.End()
85+
}
6986
if err != nil {
7087
log.ErrorContext(ctx, "failed to restore from snapshot", "instance_id", id, "error", err)
7188
// Cleanup network on failure
@@ -80,8 +97,15 @@ func (m *manager) restoreInstance(
8097
stored.HypervisorPID = &pid
8198

8299
// 6. Transition: Paused → Running (resume)
100+
var resumeSpan trace.Span
101+
if m.metrics != nil && m.metrics.tracer != nil {
102+
ctx, resumeSpan = m.metrics.tracer.Start(ctx, "ResumeVM")
103+
}
83104
log.DebugContext(ctx, "resuming VM", "instance_id", id)
84105
if err := hv.Resume(ctx); err != nil {
106+
if resumeSpan != nil {
107+
resumeSpan.End()
108+
}
85109
log.ErrorContext(ctx, "failed to resume VM", "instance_id", id, "error", err)
86110
// Cleanup on failure
87111
hv.Shutdown(ctx)
@@ -91,6 +115,9 @@ func (m *manager) restoreInstance(
91115
}
92116
return nil, fmt.Errorf("resume vm failed: %w", err)
93117
}
118+
if resumeSpan != nil {
119+
resumeSpan.End()
120+
}
94121

95122
// 8. Delete snapshot after successful restore
96123
log.DebugContext(ctx, "deleting snapshot after successful restore", "instance_id", id)

0 commit comments

Comments
 (0)