Skip to content

Commit 974e3c5

Browse files
committed
add improved snapshotting functionality
Signed-off-by: Amory Hoste <[email protected]>
1 parent ea44fe2 commit 974e3c5

33 files changed

+1073
-771
lines changed
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:73f90aa2f7016202b3cc81077989137e39ceaccd1f9cb820fc442a876b8f5f7e
3-
size 23718874
2+
oid sha256:e290046f2e24c117ef450a3bef6c8f8e3b1ec387decc76ccc936e1f54c827327
3+
size 26355405

bin/firecracker

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:bc78ce90c66c599af63bd32c883202fc73fe6ec08f8db8ba688b6218d2eb5d1c
3-
size 3678288
2+
oid sha256:561cff75b2e1d768d2a4e7dad01cffb3eaff194e1b1696ad3ede5284c404fb0c
3+
size 4010736

bin/firecracker-containerd

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:1f724d742b66ecb6e1523b211fb5f4917ebdc500f08a0fc869c8184bc1d14f92
3-
size 43283896
2+
oid sha256:89c20c096978dafa7f3ba3b1d66a9e574f2fd89f3781ee0537da30120aea6455
3+
size 46999272

bin/firecracker-ctr

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:a3892fdae25a2f53274a0e597b28e03418898273b1fba06e6b87cd58da87e6e9
3-
size 31858944
2+
oid sha256:1b0bab69371a224e9eaed86edb26dd57e2a0b04eaa7e9b4da7e3e8c7c38e0016
3+
size 34476496

bin/jailer

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:88b8d09513ee03a872096fc5671a9dd82c218dd85754f4890e5ad021ee494b2b
3-
size 2828176
2+
oid sha256:375abd369c55ad8057ec6cd39ee77e8f68933fd7a97e1d1901881805f22815f8
3+
size 3060760
Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
11
{
22
"firecracker_binary_path": "/usr/local/bin/firecracker",
33
"kernel_image_path": "/var/lib/firecracker-containerd/runtime/hello-vmlinux.bin",
4-
"kernel_args": "console=ttyS0 noapic reboot=k panic=1 pci=off nomodules ro systemd.journald.forward_to_console systemd.unit=firecracker.target init=/sbin/overlay-init",
54
"root_drive": "/var/lib/firecracker-containerd/runtime/default-rootfs.img",
5+
"cpu_count": 1,
66
"cpu_template": "T2",
7-
"log_levels": ["info"]
7+
"log_fifo": "fc-logs.fifo",
8+
"log_levels": ["info"],
9+
"metrics_fifo": "fc-metrics.fifo",
10+
"kernel_args": "console=ttyS0 noapic reboot=k panic=1 pci=off nomodules ro systemd.journald.forward_to_console systemd.unit=firecracker.target init=/sbin/overlay-init",
11+
"jailer": {
12+
"runc_binary_path": "/usr/bin/runc"
13+
}
814
}

cri/firecracker/coordinator.go

Lines changed: 116 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,10 @@ package firecracker
2424

2525
import (
2626
"context"
27-
"errors"
27+
"fmt"
28+
"github.com/ease-lab/vhive/metrics"
29+
"github.com/ease-lab/vhive/snapshotting"
30+
"github.com/pkg/errors"
2831
"strconv"
2932
"sync"
3033
"sync/atomic"
@@ -34,13 +37,16 @@ import (
3437
log "github.com/sirupsen/logrus"
3538
)
3639

40+
const snapshotsDir = "/fccd/snapshots"
41+
3742
type coordinator struct {
3843
sync.Mutex
3944
orch *ctriface.Orchestrator
4045
nextID uint64
46+
isSparseSnaps bool
4147

42-
activeInstances map[string]*funcInstance
43-
idleInstances map[string][]*funcInstance
48+
activeInstances map[string]*FuncInstance
49+
snapshotManager *snapshotting.SnapshotManager
4450
withoutOrchestrator bool
4551
}
4652

@@ -53,11 +59,12 @@ func withoutOrchestrator() coordinatorOption {
5359
}
5460
}
5561

56-
func newFirecrackerCoordinator(orch *ctriface.Orchestrator, opts ...coordinatorOption) *coordinator {
62+
func newFirecrackerCoordinator(orch *ctriface.Orchestrator, snapsCapacityMiB int64, isSparseSnaps bool, opts ...coordinatorOption) *coordinator {
5763
c := &coordinator{
58-
activeInstances: make(map[string]*funcInstance),
59-
idleInstances: make(map[string][]*funcInstance),
64+
activeInstances: make(map[string]*FuncInstance),
6065
orch: orch,
66+
snapshotManager: snapshotting.NewSnapshotManager(snapshotsDir, snapsCapacityMiB),
67+
isSparseSnaps: isSparseSnaps,
6168
}
6269

6370
for _, opt := range opts {
@@ -67,60 +74,46 @@ func newFirecrackerCoordinator(orch *ctriface.Orchestrator, opts ...coordinatorO
6774
return c
6875
}
6976

70-
func (c *coordinator) getIdleInstance(image string) *funcInstance {
71-
c.Lock()
72-
defer c.Unlock()
73-
74-
idles, ok := c.idleInstances[image]
75-
if !ok {
76-
c.idleInstances[image] = []*funcInstance{}
77-
return nil
78-
}
79-
80-
if len(idles) != 0 {
81-
fi := idles[0]
82-
c.idleInstances[image] = idles[1:]
83-
return fi
84-
}
85-
86-
return nil
87-
}
88-
89-
func (c *coordinator) setIdleInstance(fi *funcInstance) {
90-
c.Lock()
91-
defer c.Unlock()
92-
93-
_, ok := c.idleInstances[fi.Image]
94-
if !ok {
95-
c.idleInstances[fi.Image] = []*funcInstance{}
96-
}
97-
98-
c.idleInstances[fi.Image] = append(c.idleInstances[fi.Image], fi)
99-
}
100-
101-
func (c *coordinator) startVM(ctx context.Context, image string, memSizeMib, vCPUCount uint32) (*funcInstance, error) {
102-
if fi := c.getIdleInstance(image); c.orch != nil && c.orch.GetSnapshotsEnabled() && fi != nil {
103-
err := c.orchLoadInstance(ctx, fi)
104-
return fi, err
77+
func (c *coordinator) startVM(ctx context.Context, image string, revision string, memSizeMib, vCPUCount uint32) (*FuncInstance, error) {
78+
if c.orch != nil && c.orch.GetSnapshotsEnabled() {
79+
// Check if snapshot is available
80+
if snap, err := c.snapshotManager.AcquireSnapshot(revision); err == nil {
81+
if snap.MemSizeMib != memSizeMib || snap.VCPUCount != vCPUCount {
82+
return nil, errors.New("Please create a new revision when updating uVM memory size or vCPU count")
83+
} else {
84+
return c.orchStartVMSnapshot(ctx, snap, memSizeMib, vCPUCount)
85+
}
86+
} else {
87+
return c.orchStartVM(ctx, image, revision, memSizeMib, vCPUCount)
88+
}
10589
}
10690

107-
return c.orchStartVM(ctx, image, memSizeMib, vCPUCount)
91+
return c.orchStartVM(ctx, image, revision, memSizeMib, vCPUCount)
10892
}
10993

11094
func (c *coordinator) stopVM(ctx context.Context, containerID string) error {
11195
c.Lock()
11296

113-
fi, ok := c.activeInstances[containerID]
114-
delete(c.activeInstances, containerID)
97+
fi, present := c.activeInstances[containerID]
98+
if present {
99+
delete(c.activeInstances, containerID)
100+
}
115101

116102
c.Unlock()
117103

118-
if !ok {
104+
// Not a request to remove vm container
105+
if !present {
119106
return nil
120107
}
121108

122-
if c.orch != nil && c.orch.GetSnapshotsEnabled() {
123-
return c.orchOffloadInstance(ctx, fi)
109+
if fi.snapBooted {
110+
defer c.snapshotManager.ReleaseSnapshot(fi.revisionId)
111+
} else if c.orch != nil && c.orch.GetSnapshotsEnabled() {
112+
// Create snapshot
113+
err := c.orchCreateSnapshot(ctx, fi)
114+
if err != nil {
115+
log.Printf("Err creating snapshot %s\n", err)
116+
}
124117
}
125118

126119
return c.orchStopVM(ctx, fi)
@@ -135,22 +128,23 @@ func (c *coordinator) isActive(containerID string) bool {
135128
return ok
136129
}
137130

138-
func (c *coordinator) insertActive(containerID string, fi *funcInstance) error {
131+
// insertActive registers fi as the instance serving containerID.
//
// It returns an error, and leaves the existing entry untouched, when the
// container ID is already registered. The map is updated under the
// coordinator lock.
func (c *coordinator) insertActive(containerID string, fi *FuncInstance) error {
	c.Lock()
	defer c.Unlock()

	logger := log.WithFields(log.Fields{"containerID": containerID, "vmID": fi.vmID})

	// Use a distinct name for the existing entry so the fi parameter is not
	// shadowed, and pass its vmID as a format argument rather than
	// concatenating it onto the format string.
	if existing, present := c.activeInstances[containerID]; present {
		logger.Errorf("entry for container already exists with vmID %s", existing.vmID)
		return errors.New("entry for container already exists")
	}

	c.activeInstances[containerID] = fi
	return nil
}
152145

153-
func (c *coordinator) orchStartVM(ctx context.Context, image string, memSizeMib, vCPUCount uint32) (*funcInstance, error) {
146+
func (c *coordinator) orchStartVM(ctx context.Context, image, revision string, memSizeMib, vCPUCount uint32) (*FuncInstance, error) {
147+
tStartCold := time.Now()
154148
vmID := strconv.Itoa(int(atomic.AddUint64(&c.nextID, 1)))
155149
logger := log.WithFields(
156150
log.Fields{
@@ -170,90 +164,114 @@ func (c *coordinator) orchStartVM(ctx context.Context, image string, memSizeMib,
170164
defer cancel()
171165

172166
if !c.withoutOrchestrator {
173-
resp, _, err = c.orch.StartVM(ctxTimeout, vmID, image, memSizeMib, vCPUCount)
167+
trackDirtyPages := c.isSparseSnaps
168+
resp, _, err = c.orch.StartVM(ctxTimeout, vmID, image, memSizeMib, vCPUCount, trackDirtyPages)
174169
if err != nil {
175170
logger.WithError(err).Error("coordinator failed to start VM")
176171
}
177172
}
178173

179-
fi := newFuncInstance(vmID, image, resp)
174+
coldStartTimeMs := metrics.ToMs(time.Since(tStartCold))
175+
176+
fi := NewFuncInstance(vmID, image, revision, resp, false, memSizeMib, vCPUCount, coldStartTimeMs)
180177
logger.Debug("successfully created fresh instance")
181178
return fi, err
182179
}
183180

184-
func (c *coordinator) orchLoadInstance(ctx context.Context, fi *funcInstance) error {
185-
fi.Logger.Debug("found idle instance to load")
181+
func (c *coordinator) orchStartVMSnapshot(ctx context.Context, snap *snapshotting.Snapshot, memSizeMib, vCPUCount uint32) (*FuncInstance, error) {
182+
tStartCold := time.Now()
183+
vmID := strconv.Itoa(int(atomic.AddUint64(&c.nextID, 1)))
184+
logger := log.WithFields(
185+
log.Fields{
186+
"vmID": vmID,
187+
"image": snap.GetImage(),
188+
},
189+
)
190+
191+
logger.Debug("loading instance from snapshot")
192+
193+
var (
194+
resp *ctriface.StartVMResponse
195+
err error
196+
)
186197

187198
ctxTimeout, cancel := context.WithTimeout(ctx, time.Second*30)
188199
defer cancel()
189200

190-
if _, err := c.orch.LoadSnapshot(ctxTimeout, fi.VmID); err != nil {
191-
fi.Logger.WithError(err).Error("failed to load VM")
192-
return err
201+
resp, _, err = c.orch.LoadSnapshot(ctxTimeout, vmID, snap)
202+
if err != nil {
203+
logger.WithError(err).Error("failed to load VM")
204+
return nil, err
193205
}
194206

195-
if _, err := c.orch.ResumeVM(ctxTimeout, fi.VmID); err != nil {
196-
fi.Logger.WithError(err).Error("failed to load VM")
197-
return err
207+
if _, err := c.orch.ResumeVM(ctxTimeout, vmID); err != nil {
208+
logger.WithError(err).Error("failed to load VM")
209+
return nil, err
198210
}
199211

200-
fi.Logger.Debug("successfully loaded idle instance")
201-
return nil
202-
}
203-
204-
func (c *coordinator) orchCreateSnapshot(ctx context.Context, fi *funcInstance) error {
205-
var err error
206-
207-
fi.OnceCreateSnapInstance.Do(
208-
func() {
209-
ctxTimeout, cancel := context.WithTimeout(ctx, time.Second*60)
210-
defer cancel()
212+
coldStartTimeMs := metrics.ToMs(time.Since(tStartCold))
213+
fi := NewFuncInstance(vmID, snap.GetImage(), snap.GetRevisionId(), resp, true, memSizeMib, vCPUCount, coldStartTimeMs)
214+
logger.Debug("successfully loaded instance from snapshot")
211215

212-
fi.Logger.Debug("creating instance snapshot on first time offloading")
213-
214-
err = c.orch.PauseVM(ctxTimeout, fi.VmID)
215-
if err != nil {
216-
fi.Logger.WithError(err).Error("failed to pause VM")
217-
return
218-
}
216+
return fi, err
217+
}
219218

220-
err = c.orch.CreateSnapshot(ctxTimeout, fi.VmID)
221-
if err != nil {
222-
fi.Logger.WithError(err).Error("failed to create snapshot")
223-
return
224-
}
219+
func (c *coordinator) orchCreateSnapshot(ctx context.Context, fi *FuncInstance) error {
220+
logger := log.WithFields(
221+
log.Fields{
222+
"vmID": fi.vmID,
223+
"image": fi.image,
225224
},
226225
)
227226

228-
return err
229-
}
230-
231-
func (c *coordinator) orchOffloadInstance(ctx context.Context, fi *funcInstance) error {
232-
fi.Logger.Debug("offloading instance")
227+
removeContainerSnaps, snap, err := c.snapshotManager.InitSnapshot(fi.revisionId, fi.image, fi.coldStartTimeMs, fi.memSizeMib, fi.vCPUCount, c.isSparseSnaps)
228+
if err != nil {
229+
if fmt.Sprint(err) == "There is not enough free space available" {
230+
fi.logger.Info(fmt.Sprintf("There is not enough space available for snapshots of %s", fi.revisionId))
231+
}
232+
return nil
233+
}
233234

234-
if err := c.orchCreateSnapshot(ctx, fi); err != nil {
235-
return err
235+
if removeContainerSnaps != nil {
236+
for _, cleanupSnapId := range *removeContainerSnaps {
237+
if err := c.orch.CleanupRevisionSnapshot(ctx, cleanupSnapId); err != nil {
238+
return errors.Wrap(err, "removing devmapper revision snapshot")
239+
}
240+
}
236241
}
237242

238-
ctxTimeout, cancel := context.WithTimeout(ctx, time.Second*10)
243+
ctxTimeout, cancel := context.WithTimeout(ctx, time.Second*60)
239244
defer cancel()
240245

241-
if err := c.orch.Offload(ctxTimeout, fi.VmID); err != nil {
242-
fi.Logger.WithError(err).Error("failed to offload instance")
246+
logger.Debug("creating instance snapshot before stopping")
247+
248+
err = c.orch.PauseVM(ctxTimeout, fi.vmID)
249+
if err != nil {
250+
logger.WithError(err).Error("failed to pause VM")
251+
return nil
252+
}
253+
254+
err = c.orch.CreateSnapshot(ctxTimeout, fi.vmID, snap)
255+
if err != nil {
256+
fi.logger.WithError(err).Error("failed to create snapshot")
257+
return nil
243258
}
244259

245-
c.setIdleInstance(fi)
260+
if err := c.snapshotManager.CommitSnapshot(fi.revisionId); err != nil {
261+
fi.logger.WithError(err).Error("failed to commit snapshot")
262+
return err
263+
}
246264

247265
return nil
248266
}
249267

250-
func (c *coordinator) orchStopVM(ctx context.Context, fi *funcInstance) error {
268+
func (c *coordinator) orchStopVM(ctx context.Context, fi *FuncInstance) error {
251269
if c.withoutOrchestrator {
252270
return nil
253271
}
254272

255-
if err := c.orch.StopSingleVM(ctx, fi.VmID); err != nil {
256-
fi.Logger.WithError(err).Error("failed to stop VM for instance")
273+
if err := c.orch.StopSingleVM(ctx, fi.vmID); err != nil {
274+
fi.logger.WithError(err).Error("failed to stop VM for instance")
257275
return err
258276
}
259277

0 commit comments

Comments
 (0)