@@ -24,7 +24,10 @@ package firecracker
2424
2525import (
2626 "context"
27- "errors"
27+ "fmt"
28+ "github.com/ease-lab/vhive/metrics"
29+ "github.com/ease-lab/vhive/snapshotting"
30+ "github.com/pkg/errors"
2831 "strconv"
2932 "sync"
3033 "sync/atomic"
@@ -34,13 +37,16 @@ import (
3437 log "github.com/sirupsen/logrus"
3538)
3639
40+ const snapshotsDir = "/fccd/snapshots"
41+
3742type coordinator struct {
3843 sync.Mutex
3944 orch * ctriface.Orchestrator
4045 nextID uint64
46+ isSparseSnaps bool
4147
42- activeInstances map [string ]* funcInstance
43- idleInstances map [ string ][] * funcInstance
48+ activeInstances map [string ]* FuncInstance
49+ snapshotManager * snapshotting. SnapshotManager
4450 withoutOrchestrator bool
4551}
4652
@@ -53,11 +59,12 @@ func withoutOrchestrator() coordinatorOption {
5359 }
5460}
5561
56- func newFirecrackerCoordinator (orch * ctriface.Orchestrator , opts ... coordinatorOption ) * coordinator {
62+ func newFirecrackerCoordinator (orch * ctriface.Orchestrator , snapsCapacityMiB int64 , isSparseSnaps bool , opts ... coordinatorOption ) * coordinator {
5763 c := & coordinator {
58- activeInstances : make (map [string ]* funcInstance ),
59- idleInstances : make (map [string ][]* funcInstance ),
64+ activeInstances : make (map [string ]* FuncInstance ),
6065 orch : orch ,
66+ snapshotManager : snapshotting .NewSnapshotManager (snapshotsDir , snapsCapacityMiB ),
67+ isSparseSnaps : isSparseSnaps ,
6168 }
6269
6370 for _ , opt := range opts {
@@ -67,60 +74,46 @@ func newFirecrackerCoordinator(orch *ctriface.Orchestrator, opts ...coordinatorO
6774 return c
6875}
6976
70- func (c * coordinator ) getIdleInstance (image string ) * funcInstance {
71- c .Lock ()
72- defer c .Unlock ()
73-
74- idles , ok := c .idleInstances [image ]
75- if ! ok {
76- c .idleInstances [image ] = []* funcInstance {}
77- return nil
78- }
79-
80- if len (idles ) != 0 {
81- fi := idles [0 ]
82- c .idleInstances [image ] = idles [1 :]
83- return fi
84- }
85-
86- return nil
87- }
88-
89- func (c * coordinator ) setIdleInstance (fi * funcInstance ) {
90- c .Lock ()
91- defer c .Unlock ()
92-
93- _ , ok := c .idleInstances [fi .Image ]
94- if ! ok {
95- c .idleInstances [fi .Image ] = []* funcInstance {}
96- }
97-
98- c .idleInstances [fi .Image ] = append (c .idleInstances [fi .Image ], fi )
99- }
100-
101- func (c * coordinator ) startVM (ctx context.Context , image string , memSizeMib , vCPUCount uint32 ) (* funcInstance , error ) {
102- if fi := c .getIdleInstance (image ); c .orch != nil && c .orch .GetSnapshotsEnabled () && fi != nil {
103- err := c .orchLoadInstance (ctx , fi )
104- return fi , err
77+ func (c * coordinator ) startVM (ctx context.Context , image string , revision string , memSizeMib , vCPUCount uint32 ) (* FuncInstance , error ) {
78+ if c .orch != nil && c .orch .GetSnapshotsEnabled () {
79+ // Check if snapshot is available
80+ if snap , err := c .snapshotManager .AcquireSnapshot (revision ); err == nil {
81+ if snap .MemSizeMib != memSizeMib || snap .VCPUCount != vCPUCount {
82+ return nil , errors .New ("Please create a new revision when updating uVM memory size or vCPU count" )
83+ } else {
84+ return c .orchStartVMSnapshot (ctx , snap , memSizeMib , vCPUCount )
85+ }
86+ } else {
87+ return c .orchStartVM (ctx , image , revision , memSizeMib , vCPUCount )
88+ }
10589 }
10690
107- return c .orchStartVM (ctx , image , memSizeMib , vCPUCount )
91+ return c .orchStartVM (ctx , image , revision , memSizeMib , vCPUCount )
10892}
10993
11094func (c * coordinator ) stopVM (ctx context.Context , containerID string ) error {
11195 c .Lock ()
11296
113- fi , ok := c .activeInstances [containerID ]
114- delete (c .activeInstances , containerID )
97+ fi , present := c .activeInstances [containerID ]
98+ if present {
99+ delete (c .activeInstances , containerID )
100+ }
115101
116102 c .Unlock ()
117103
118- if ! ok {
104+ // Not a request to remove vm container
105+ if ! present {
119106 return nil
120107 }
121108
122- if c .orch != nil && c .orch .GetSnapshotsEnabled () {
123- return c .orchOffloadInstance (ctx , fi )
109+ if fi .snapBooted {
110+ defer c .snapshotManager .ReleaseSnapshot (fi .revisionId )
111+ } else if c .orch != nil && c .orch .GetSnapshotsEnabled () {
112+ // Create snapshot
113+ err := c .orchCreateSnapshot (ctx , fi )
114+ if err != nil {
115+ log .Printf ("Err creating snapshot %s\n " , err )
116+ }
124117 }
125118
126119 return c .orchStopVM (ctx , fi )
@@ -135,22 +128,23 @@ func (c *coordinator) isActive(containerID string) bool {
135128 return ok
136129}
137130
138- func (c * coordinator ) insertActive (containerID string , fi * funcInstance ) error {
131+ func (c * coordinator ) insertActive (containerID string , fi * FuncInstance ) error {
139132 c .Lock ()
140133 defer c .Unlock ()
141134
142- logger := log .WithFields (log.Fields {"containerID" : containerID , "vmID" : fi .VmID })
135+ logger := log .WithFields (log.Fields {"containerID" : containerID , "vmID" : fi .vmID })
143136
144137 if fi , present := c .activeInstances [containerID ]; present {
145- logger .Errorf ("entry for container already exists with vmID %s" + fi .VmID )
138+ logger .Errorf ("entry for container already exists with vmID %s" + fi .vmID )
146139 return errors .New ("entry for container already exists" )
147140 }
148141
149142 c .activeInstances [containerID ] = fi
150143 return nil
151144}
152145
153- func (c * coordinator ) orchStartVM (ctx context.Context , image string , memSizeMib , vCPUCount uint32 ) (* funcInstance , error ) {
146+ func (c * coordinator ) orchStartVM (ctx context.Context , image , revision string , memSizeMib , vCPUCount uint32 ) (* FuncInstance , error ) {
147+ tStartCold := time .Now ()
154148 vmID := strconv .Itoa (int (atomic .AddUint64 (& c .nextID , 1 )))
155149 logger := log .WithFields (
156150 log.Fields {
@@ -170,90 +164,114 @@ func (c *coordinator) orchStartVM(ctx context.Context, image string, memSizeMib,
170164 defer cancel ()
171165
172166 if ! c .withoutOrchestrator {
173- resp , _ , err = c .orch .StartVM (ctxTimeout , vmID , image , memSizeMib , vCPUCount )
167+ trackDirtyPages := c .isSparseSnaps
168+ resp , _ , err = c .orch .StartVM (ctxTimeout , vmID , image , memSizeMib , vCPUCount , trackDirtyPages )
174169 if err != nil {
175170 logger .WithError (err ).Error ("coordinator failed to start VM" )
176171 }
177172 }
178173
179- fi := newFuncInstance (vmID , image , resp )
174+ coldStartTimeMs := metrics .ToMs (time .Since (tStartCold ))
175+
176+ fi := NewFuncInstance (vmID , image , revision , resp , false , memSizeMib , vCPUCount , coldStartTimeMs )
180177 logger .Debug ("successfully created fresh instance" )
181178 return fi , err
182179}
183180
184- func (c * coordinator ) orchLoadInstance (ctx context.Context , fi * funcInstance ) error {
185- fi .Logger .Debug ("found idle instance to load" )
181+ func (c * coordinator ) orchStartVMSnapshot (ctx context.Context , snap * snapshotting.Snapshot , memSizeMib , vCPUCount uint32 ) (* FuncInstance , error ) {
182+ tStartCold := time .Now ()
183+ vmID := strconv .Itoa (int (atomic .AddUint64 (& c .nextID , 1 )))
184+ logger := log .WithFields (
185+ log.Fields {
186+ "vmID" : vmID ,
187+ "image" : snap .GetImage (),
188+ },
189+ )
190+
191+ logger .Debug ("loading instance from snapshot" )
192+
193+ var (
194+ resp * ctriface.StartVMResponse
195+ err error
196+ )
186197
187198 ctxTimeout , cancel := context .WithTimeout (ctx , time .Second * 30 )
188199 defer cancel ()
189200
190- if _ , err := c .orch .LoadSnapshot (ctxTimeout , fi .VmID ); err != nil {
191- fi .Logger .WithError (err ).Error ("failed to load VM" )
192- return err
201+ resp , _ , err = c .orch .LoadSnapshot (ctxTimeout , vmID , snap )
202+ if err != nil {
203+ logger .WithError (err ).Error ("failed to load VM" )
204+ return nil , err
193205 }
194206
195- if _ , err := c .orch .ResumeVM (ctxTimeout , fi . VmID ); err != nil {
196- fi . Logger .WithError (err ).Error ("failed to load VM" )
197- return err
207+ if _ , err := c .orch .ResumeVM (ctxTimeout , vmID ); err != nil {
208+ logger .WithError (err ).Error ("failed to load VM" )
209+ return nil , err
198210 }
199211
200- fi .Logger .Debug ("successfully loaded idle instance" )
201- return nil
202- }
203-
204- func (c * coordinator ) orchCreateSnapshot (ctx context.Context , fi * funcInstance ) error {
205- var err error
206-
207- fi .OnceCreateSnapInstance .Do (
208- func () {
209- ctxTimeout , cancel := context .WithTimeout (ctx , time .Second * 60 )
210- defer cancel ()
212+ coldStartTimeMs := metrics .ToMs (time .Since (tStartCold ))
213+ fi := NewFuncInstance (vmID , snap .GetImage (), snap .GetRevisionId (), resp , true , memSizeMib , vCPUCount , coldStartTimeMs )
214+ logger .Debug ("successfully loaded instance from snapshot" )
211215
212- fi .Logger .Debug ("creating instance snapshot on first time offloading" )
213-
214- err = c .orch .PauseVM (ctxTimeout , fi .VmID )
215- if err != nil {
216- fi .Logger .WithError (err ).Error ("failed to pause VM" )
217- return
218- }
216+ return fi , err
217+ }
219218
220- err = c . orch . CreateSnapshot ( ctxTimeout , fi . VmID )
221- if err != nil {
222- fi . Logger . WithError ( err ). Error ( "failed to create snapshot" )
223- return
224- }
219+ func ( c * coordinator ) orchCreateSnapshot ( ctx context. Context , fi * FuncInstance ) error {
220+ logger := log . WithFields (
221+ log. Fields {
222+ "vmID" : fi . vmID ,
223+ "image" : fi . image ,
225224 },
226225 )
227226
228- return err
229- }
230-
231- func (c * coordinator ) orchOffloadInstance (ctx context.Context , fi * funcInstance ) error {
232- fi .Logger .Debug ("offloading instance" )
227+ removeContainerSnaps , snap , err := c .snapshotManager .InitSnapshot (fi .revisionId , fi .image , fi .coldStartTimeMs , fi .memSizeMib , fi .vCPUCount , c .isSparseSnaps )
228+ if err != nil {
229+ if fmt .Sprint (err ) == "There is not enough free space available" {
230+ fi .logger .Info (fmt .Sprintf ("There is not enough space available for snapshots of %s" , fi .revisionId ))
231+ }
232+ return nil
233+ }
233234
234- if err := c .orchCreateSnapshot (ctx , fi ); err != nil {
235- return err
235+ if removeContainerSnaps != nil {
236+ for _ , cleanupSnapId := range * removeContainerSnaps {
237+ if err := c .orch .CleanupRevisionSnapshot (ctx , cleanupSnapId ); err != nil {
238+ return errors .Wrap (err , "removing devmapper revision snapshot" )
239+ }
240+ }
236241 }
237242
238- ctxTimeout , cancel := context .WithTimeout (ctx , time .Second * 10 )
243+ ctxTimeout , cancel := context .WithTimeout (ctx , time .Second * 60 )
239244 defer cancel ()
240245
241- if err := c .orch .Offload (ctxTimeout , fi .VmID ); err != nil {
242- fi .Logger .WithError (err ).Error ("failed to offload instance" )
246+ logger .Debug ("creating instance snapshot before stopping" )
247+
248+ err = c .orch .PauseVM (ctxTimeout , fi .vmID )
249+ if err != nil {
250+ logger .WithError (err ).Error ("failed to pause VM" )
251+ return nil
252+ }
253+
254+ err = c .orch .CreateSnapshot (ctxTimeout , fi .vmID , snap )
255+ if err != nil {
256+ fi .logger .WithError (err ).Error ("failed to create snapshot" )
257+ return nil
243258 }
244259
245- c .setIdleInstance (fi )
260+ if err := c .snapshotManager .CommitSnapshot (fi .revisionId ); err != nil {
261+ fi .logger .WithError (err ).Error ("failed to commit snapshot" )
262+ return err
263+ }
246264
247265 return nil
248266}
249267
250- func (c * coordinator ) orchStopVM (ctx context.Context , fi * funcInstance ) error {
268+ func (c * coordinator ) orchStopVM (ctx context.Context , fi * FuncInstance ) error {
251269 if c .withoutOrchestrator {
252270 return nil
253271 }
254272
255- if err := c .orch .StopSingleVM (ctx , fi .VmID ); err != nil {
256- fi .Logger .WithError (err ).Error ("failed to stop VM for instance" )
273+ if err := c .orch .StopSingleVM (ctx , fi .vmID ); err != nil {
274+ fi .logger .WithError (err ).Error ("failed to stop VM for instance" )
257275 return err
258276 }
259277
0 commit comments