Skip to content

Commit 498360c

Browse files
EtiennePerotgvisor-bot
authored andcommitted
Measure and report the time taken during the sandbox restore process.
PiperOrigin-RevId: 758919824
1 parent 8db5d40 commit 498360c

File tree

11 files changed

+141
-21
lines changed

11 files changed

+141
-21
lines changed

pkg/sentry/kernel/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,7 @@ go_library(
382382
"//pkg/syserr",
383383
"//pkg/tcpip",
384384
"//pkg/tcpip/stack",
385+
"//pkg/timing",
385386
"//pkg/usermem",
386387
"//pkg/waiter",
387388
"@org_golang_google_protobuf//proto:go_default_library",

pkg/sentry/kernel/kernel_restore.go

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
2828
"gvisor.dev/gvisor/pkg/state"
2929
"gvisor.dev/gvisor/pkg/sync"
30+
"gvisor.dev/gvisor/pkg/timing"
3031
)
3132

3233
// Saver is an interface for saving the kernel.
@@ -267,17 +268,21 @@ type AsyncMFLoader struct {
267268
// pagesMetadata and pagesFile. It creates a background goroutine that will
268269
// load all the MemoryFiles. The background goroutine immediately starts
269270
// loading the main MemoryFile.
270-
func NewAsyncMFLoader(pagesMetadata, pagesFile *fd.FD, mainMF *pgalloc.MemoryFile) *AsyncMFLoader {
271+
// If timeline is provided, it will be used to track async page loading.
272+
// It takes ownership of the timeline, and will end it when done loading all
273+
// pages.
274+
func NewAsyncMFLoader(pagesMetadata, pagesFile *fd.FD, mainMF *pgalloc.MemoryFile, timeline *timing.Timeline) *AsyncMFLoader {
271275
mfl := &AsyncMFLoader{
272276
privateMFsChan: make(chan map[string]*pgalloc.MemoryFile, 1),
273277
}
274278
mfl.metadataWg.Add(1)
275279
mfl.loadWg.Add(1)
276-
go mfl.backgroundGoroutine(pagesMetadata, pagesFile, mainMF)
280+
go mfl.backgroundGoroutine(pagesMetadata, pagesFile, mainMF, timeline)
277281
return mfl
278282
}
279283

280-
func (mfl *AsyncMFLoader) backgroundGoroutine(pagesMetadataFD, pagesFileFD *fd.FD, mainMF *pgalloc.MemoryFile) {
284+
func (mfl *AsyncMFLoader) backgroundGoroutine(pagesMetadataFD, pagesFileFD *fd.FD, mainMF *pgalloc.MemoryFile, timeline *timing.Timeline) {
285+
defer timeline.End()
281286
defer pagesMetadataFD.Close()
282287
defer pagesFileFD.Close()
283288
cu := cleanup.Make(func() {
@@ -307,17 +312,20 @@ func (mfl *AsyncMFLoader) backgroundGoroutine(pagesMetadataFD, pagesFileFD *fd.F
307312
mfl.loadErrsMu.Unlock()
308313
}
309314
},
315+
Timeline: timeline,
310316
}
311317

318+
timeline.Reached("loading mainMF")
312319
log.Infof("Loading metadata for main MemoryFile: %p", mainMF)
313320
ctx := context.Background()
314321
if err := mainMF.LoadFrom(ctx, pagesMetadata, &opts); err != nil {
315322
log.Warningf("Failed to load main MemoryFile %p: %v", mainMF, err)
316323
mfl.metadataErr = err
317324
return
318325
}
319-
326+
timeline.Reached("waiting for privateMF info")
320327
privateMFs := <-mfl.privateMFsChan
328+
timeline.Reached("received privateMFs info")
321329
log.Infof("Loading metadata for %d private MemoryFiles", len(privateMFs))
322330
if err := loadPrivateMemoryFiles(ctx, pagesMetadata, privateMFs, &opts); err != nil {
323331
log.Warningf("Failed to load private MemoryFiles: %v", err)
@@ -326,12 +334,14 @@ func (mfl *AsyncMFLoader) backgroundGoroutine(pagesMetadataFD, pagesFileFD *fd.F
326334
}
327335

328336
// Report metadata load completion.
337+
timeline.Reached("metadata load done")
329338
log.Infof("All MemoryFile metadata has been loaded")
330339
cu.Release()()
331340

332341
// Wait for page loads to complete and report errors.
333342
mfl.loadWg.Wait()
334343
if loadErr := errors.Join(mfl.loadErrs...); loadErr != nil {
344+
timeline.Invalidate("page load failed")
335345
log.Warningf("Failed to load MemoryFile pages: %v", loadErr)
336346
return
337347
}

pkg/sentry/pgalloc/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ go_library(
166166
"//pkg/sync",
167167
"//pkg/sync/locking",
168168
"//pkg/syncevent",
169+
"//pkg/timing",
169170
"//pkg/usermem",
170171
"@org_golang_x_sys//unix:go_default_library",
171172
],

pkg/sentry/pgalloc/save_restore.go

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ import (
4040
"gvisor.dev/gvisor/pkg/state/wire"
4141
"gvisor.dev/gvisor/pkg/sync"
4242
"gvisor.dev/gvisor/pkg/syncevent"
43+
"gvisor.dev/gvisor/pkg/timing"
4344
)
4445

4546
// SaveOpts provides options to MemoryFile.SaveTo().
@@ -219,7 +220,7 @@ func (f *MemoryFile) SaveTo(ctx context.Context, w io.Writer, pw io.Writer, opts
219220
savedBytes += maseg.Range().Length()
220221
}
221222
durPages := time.Since(timePagesStart)
222-
log.Infof("MemoryFile(%p): saved pages in %s (%d bytes, %f bytes/second)", f, durPages, savedBytes, float64(savedBytes)/durPages.Seconds())
223+
log.Infof("MemoryFile(%p): saved pages in %s (%d bytes, %.3f MiB/s)", f, durPages, savedBytes, float64(savedBytes)/durPages.Seconds()/(1024.0*1024.0))
223224

224225
return nil
225226
}
@@ -257,10 +258,18 @@ type LoadOpts struct {
257258
PagesFileOffset uint64
258259
OnAsyncPageLoadStart func(*MemoryFile)
259260
OnAsyncPageLoadDone func(*MemoryFile, error)
261+
262+
// Optional timeline for the restore process.
263+
// If async page loading is enabled, a forked timeline will be created for
264+
// that async goroutine, so ownership of this timeline remains in the hands
265+
// of the caller of LoadFrom.
266+
Timeline *timing.Timeline
260267
}
261268

262269
// LoadFrom loads MemoryFile state from the given stream.
263270
func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader, opts *LoadOpts) error {
271+
mfTimeline := opts.Timeline.Fork(fmt.Sprintf("mf:%p", f)).Lease()
272+
defer mfTimeline.End()
264273
timeMetadataStart := time.Now()
265274

266275
// Clear sets since non-empty sets will panic if loaded into.
@@ -300,6 +309,7 @@ func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader, opts *LoadOpts)
300309
return err
301310
}
302311
f.chunks.Store(&chunks)
312+
mfTimeline.Reached("metadata loaded")
303313
log.Infof("MemoryFile(%p): loaded metadata in %s", f, time.Since(timeMetadataStart))
304314
if err := f.file.Truncate(int64(len(chunks)) * chunkSize); err != nil {
305315
return fmt.Errorf("failed to truncate MemoryFile: %w", err)
@@ -322,6 +332,7 @@ func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader, opts *LoadOpts)
322332
if errno != 0 {
323333
return fmt.Errorf("failed to mmap MemoryFile: %w", errno)
324334
}
335+
mfTimeline.Reached("mmaped chunks")
325336
for i := range chunks {
326337
chunk := &chunks[i]
327338
chunk.mapping = m
@@ -373,6 +384,7 @@ func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader, opts *LoadOpts)
373384
qavail: aplQueueCapacity,
374385
fd: int32(opts.PagesFile.FD()),
375386
opsBusy: bitmap.New(aplQueueCapacity),
387+
timeline: mfTimeline.Transfer(),
376388
}
377389
apl = &aplg.apl
378390
// Mark ops in opsBusy that don't actually exist as permanently busy.
@@ -665,6 +677,9 @@ type aplGoroutine struct {
665677

666678
// ops stores all aplOps.
667679
ops [aplQueueCapacity]aplOp
680+
681+
// Optional timeline for tracking async page loading.
682+
timeline *timing.Timeline
668683
}
669684

670685
// Possible events in aplGoroutine.lfStatus:
@@ -855,6 +870,9 @@ func (g *aplGoroutine) main() {
855870
apl := &g.apl
856871
f := g.f
857872
q := g.q
873+
t := g.timeline
874+
defer t.End()
875+
t.Reached("aplGoroutine.main")
858876
defer func() {
859877
// Destroy q first since this synchronously stops inflight I/O.
860878
q.Destroy()
@@ -932,7 +950,9 @@ func (g *aplGoroutine) main() {
932950
}
933951
durDelta := time.Duration(now - timeLast)
934952
bytesLoadedDelta := bytesLoaded - bytesLoadedLast
935-
log.Infof("MemoryFile(%p): async page loading in progress for %s (%d bytes, %.3f MB/s); since last update %s ago: %d bytes, %.3f MB/s; %d waiters waited %v~%v for %d bytes", f, durTotal.Round(time.Millisecond), bytesLoaded, float64(bytesLoaded)*1e-6/durTotal.Seconds(), durDelta.Round(time.Millisecond), bytesLoadedDelta, float64(bytesLoadedDelta)*1e-6/durDelta.Seconds(), totalWaiters, durWaitedOne.Round(time.Millisecond), durWaitedTotal.Round(time.Millisecond), bytesWaited)
953+
bandwidthSinceLast := float64(bytesLoadedDelta) * 1e-6 / durDelta.Seconds()
954+
t.Reached(fmt.Sprintf("%.3f MB/s", bandwidthSinceLast))
955+
log.Infof("MemoryFile(%p): async page loading in progress for %s (%d bytes, %.3f MB/s); since last update %s ago: %d bytes, %.3f MB/s; %d waiters waited %v~%v for %d bytes", f, durTotal.Round(time.Millisecond), bytesLoaded, float64(bytesLoaded)*1e-6/durTotal.Seconds(), durDelta.Round(time.Millisecond), bytesLoadedDelta, bandwidthSinceLast, totalWaiters, durWaitedOne.Round(time.Millisecond), durWaitedTotal.Round(time.Millisecond), bytesWaited)
936956
timeLast = now
937957
bytesLoadedLast = bytesLoaded
938958
}

pkg/timing/timing.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,10 @@ type Timeline struct {
8686
// children is a list of forked timelines that are children of this one.
8787
// Note that children do not necessarily need to end before the parent does.
8888
children []*Timeline
89+
90+
// invalidReason, if non-empty, is the reason why this Timeline is invalid.
91+
// A Timeline is also invalid if any of its children are invalid.
92+
invalidReason string
8993
}
9094

9195
// MidPoint is a named point in time on a Timeline.
@@ -243,6 +247,18 @@ func (s *Timeline) End() {
243247
}
244248
}
245249

250+
// Invalidate marks the Timeline as invalid.
251+
func (s *Timeline) Invalidate(reason string) {
252+
if s == nil {
253+
return
254+
}
255+
if s.invalidReason != "" {
256+
log.Warningf("Timer for %s: Timeline %s was already invalid (%v), but tried to invalidate again (%v)", s.timer.root.name, s.fullName, s.invalidReason, reason)
257+
return
258+
}
259+
s.invalidReason = reason
260+
}
261+
246262
// A Lease is a reference to a Timeline that is valid until the Lease is
247263
// canceled. After calling Lease on a Timeline, the caller should no longer
248264
// use the Timeline directly, and should instead use the Lease exclusively.
@@ -342,6 +358,15 @@ func (l *Lease) End() {
342358
l.timeline.End()
343359
}
344360

361+
// Invalidate invalidates the Timeline if the Lease is valid.
362+
// See `Timeline.Invalidate` for more details.
363+
func (l *Lease) Invalidate(reason string) {
364+
if l == nil || !l.valid {
365+
return
366+
}
367+
l.timeline.Invalidate(reason)
368+
}
369+
345370
// Transfer invalidates the current Lease and returns the underlying Timeline.
346371
// Typically useful when transferring ownership of a Timeline to a different
347372
// goroutine while giving up ownership in the current one.
@@ -608,6 +633,16 @@ func (t *Timer) Log() {
608633
t.root.traverse(nil, func(_, child *Timeline) {
609634
flatTimelines = append(flatTimelines, child)
610635
})
636+
var invalidReasons []string
637+
for _, timeline := range flatTimelines {
638+
if timeline.invalidReason != "" {
639+
invalidReasons = append(invalidReasons, timeline.invalidReason)
640+
}
641+
}
642+
if len(invalidReasons) > 0 {
643+
log.Warningf("Timer for %s: Timeline was invalidated, so not displaying timing data: %v", t.root.name, invalidReasons)
644+
return
645+
}
611646
var events []event
612647
for _, timeline := range flatTimelines {
613648
events = append(events, event{when: timeline.start, point: point{timeline: timeline, pointType: pointTypeStart}})

runsc/boot/BUILD

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ go_library(
125125
"//pkg/tcpip/transport/raw",
126126
"//pkg/tcpip/transport/tcp",
127127
"//pkg/tcpip/transport/udp",
128+
"//pkg/timing",
128129
"//pkg/urpc",
129130
"//runsc/boot/filter",
130131
"//runsc/boot/portforward",
@@ -134,6 +135,7 @@ go_library(
134135
"//runsc/profile",
135136
"//runsc/specutils",
136137
"//runsc/specutils/seccomp",
138+
"//runsc/starttime",
137139
"//runsc/version",
138140
"@com_github_moby_sys_capability//:go_default_library",
139141
"@com_github_opencontainers_runtime_spec//specs-go:go_default_library",

runsc/boot/controller.go

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,12 @@ import (
3838
"gvisor.dev/gvisor/pkg/sentry/socket/plugin"
3939
"gvisor.dev/gvisor/pkg/sentry/state"
4040
"gvisor.dev/gvisor/pkg/sentry/vfs"
41+
"gvisor.dev/gvisor/pkg/timing"
4142
"gvisor.dev/gvisor/pkg/urpc"
4243
"gvisor.dev/gvisor/runsc/boot/procfs"
4344
"gvisor.dev/gvisor/runsc/config"
4445
"gvisor.dev/gvisor/runsc/specutils"
46+
"gvisor.dev/gvisor/runsc/starttime"
4547
"gvisor.dev/gvisor/runsc/version"
4648
)
4749

@@ -496,9 +498,12 @@ type RestoreOpts struct {
496498
// created, and the kernel is recreated with the restore state file. The
497499
// container then sends the signal to start.
498500
func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
501+
timer := starttime.Timer("Restore")
502+
timer.Reached("cm.Restore RPC")
499503
log.Debugf("containerManager.Restore")
500504

501505
cm.l.mu.Lock()
506+
timer.Reached("cm.l.mu.Lock")
502507
cu := cleanup.Make(cm.l.mu.Unlock)
503508
defer cu.Clean()
504509

@@ -538,10 +543,12 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
538543
onRestoreDone: cm.onRestoreDone,
539544
stateFile: reader,
540545
background: o.Background,
546+
timer: timer,
541547
mainMF: mf,
542548
}
543549
cm.l.restoreDone = sync.NewCond(&cm.l.mu)
544550
cm.l.state = restoringUnstarted
551+
545552
// Release `cm.l.mu`.
546553
cu.Clean()
547554

@@ -560,7 +567,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
560567
fileIdx++
561568

562569
// This immediately starts loading the main MemoryFile asynchronously.
563-
cm.restorer.asyncMFLoader = kernel.NewAsyncMFLoader(pagesMetadata, pagesFile, cm.restorer.mainMF)
570+
cm.restorer.asyncMFLoader = kernel.NewAsyncMFLoader(pagesMetadata, pagesFile, cm.restorer.mainMF, timer.Fork("PagesFileLoader"))
564571
}
565572

566573
if o.HaveDeviceFile {
@@ -574,9 +581,11 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
574581
if fileIdx < len(o.Files) {
575582
return fmt.Errorf("more files passed to Restore than expected")
576583
}
584+
timer.Reached("restorer ok")
577585

578586
// Pause the kernel while we build a new one.
579587
cm.l.k.Pause()
588+
timer.Reached("kernel paused")
580589

581590
var count int
582591
countStr, ok := metadata[ContainerCountKey]
@@ -614,7 +623,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
614623
if checkpointVersion != currentVersion {
615624
return fmt.Errorf("runsc version does not match across checkpoint restore, checkpoint: %v current: %v", checkpointVersion, currentVersion)
616625
}
617-
return cm.restorer.restoreContainerInfo(cm.l, &cm.l.root)
626+
return cm.restorer.restoreContainerInfo(cm.l, &cm.l.root, timer.Fork("cont:root"))
618627
}
619628

620629
func (cm *containerManager) onRestoreDone() {
@@ -626,9 +635,11 @@ func (cm *containerManager) onRestoreDone() {
626635
}
627636

628637
func (cm *containerManager) RestoreSubcontainer(args *StartArgs, _ *struct{}) error {
638+
timeline := timing.OrphanTimeline(fmt.Sprintf("cont:%s", args.CID[0:min(8, len(args.CID))]), gtime.Now()).Lease()
639+
defer timeline.End()
629640
log.Debugf("containerManager.RestoreSubcontainer, cid: %s, args: %+v", args.CID, args)
630-
631641
cm.l.mu.Lock()
642+
timeline.Reached("cm.l.mu.Lock")
632643
state := cm.l.state
633644
cm.l.mu.Unlock()
634645
if state != restoringUnstarted {
@@ -656,6 +667,7 @@ func (cm *containerManager) RestoreSubcontainer(args *StartArgs, _ *struct{}) er
656667

657668
// All validation passed, logs the spec for debugging.
658669
specutils.LogSpecDebug(args.Spec, args.Conf.OCISeccomp)
670+
timeline.Reached("spec validated")
659671

660672
goferFiles := args.Files
661673
var stdios []*fd.FD
@@ -695,7 +707,8 @@ func (cm *containerManager) RestoreSubcontainer(args *StartArgs, _ *struct{}) er
695707
return fmt.Errorf("error dup'ing gofer files: %w", err)
696708
}
697709

698-
if err := cm.restorer.restoreSubcontainer(args.Spec, args.Conf, cm.l, args.CID, stdios, goferFDs, goferFilestoreFDs, devGoferFD, args.GoferMountConfs); err != nil {
710+
err = cm.restorer.restoreSubcontainer(args.Spec, args.Conf, cm.l, args.CID, stdios, goferFDs, goferFilestoreFDs, devGoferFD, args.GoferMountConfs, timeline.Transfer())
711+
if err != nil {
699712
log.Debugf("containerManager.RestoreSubcontainer failed, cid: %s, args: %+v, err: %v", args.CID, args, err)
700713
return err
701714
}
@@ -712,7 +725,7 @@ func (cm *containerManager) Pause(_, _ *struct{}) error {
712725
// Resume resumes all tasks.
713726
func (cm *containerManager) Resume(_, _ *struct{}) error {
714727
cm.l.k.Unpause()
715-
return postResumeImpl(cm.l)
728+
return postResumeImpl(cm.l, nil)
716729
}
717730

718731
// Wait waits for the init process in the given container.

0 commit comments

Comments
 (0)