Commit 9870928

Merge pull request #389 from kzys/stopvm-sync-2

Make StopVM() synchronous

2 parents ca14a1e + 90062f9

8 files changed: +225 -116

go.mod

Lines changed: 1 addition & 1 deletion

@@ -17,7 +17,7 @@ require (
 	github.com/docker/distribution v2.7.1+incompatible // indirect
 	github.com/docker/go-events v0.0.0-20170721190031-9461782956ad // indirect
 	github.com/docker/go-metrics v0.0.0-20181218153428-b84716841b82 // indirect
-	github.com/firecracker-microvm/firecracker-go-sdk v0.19.1-0.20191114205152-9e2ff62839b2
+	github.com/firecracker-microvm/firecracker-go-sdk v0.20.1-0.20200204230548-c56932849923
 	github.com/go-ole/go-ole v1.2.4 // indirect
 	github.com/godbus/dbus v0.0.0-20181025153459-66d97aec3384 // indirect
 	github.com/gofrs/uuid v3.2.0+incompatible

go.sum

Lines changed: 2 additions & 2 deletions

@@ -59,8 +59,8 @@ github.com/docker/go-metrics v0.0.0-20181218153428-b84716841b82 h1:X0fj836zx99zF
 github.com/docker/go-metrics v0.0.0-20181218153428-b84716841b82/go.mod h1:/u0gXw0Gay3ceNrsHubL3BtdOL2fHf93USgMTe0W5dI=
 github.com/docker/go-units v0.3.3 h1:Xk8S3Xj5sLGlG5g67hJmYMmUgXv5N4PhkjJHHqrwnTk=
 github.com/docker/go-units v0.3.3/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
-github.com/firecracker-microvm/firecracker-go-sdk v0.19.1-0.20191114205152-9e2ff62839b2 h1:Ab52E0UlBOmMIAK/igRycsxSAJVDBDMYq+Wr/n7z2E0=
-github.com/firecracker-microvm/firecracker-go-sdk v0.19.1-0.20191114205152-9e2ff62839b2/go.mod h1:kW0gxvPpPvMukUxxTO9DrpSlScrtrTDGY3VgjAj/Qwc=
+github.com/firecracker-microvm/firecracker-go-sdk v0.20.1-0.20200204230548-c56932849923 h1:tMx0TVfjMRjHrB2L5Y3qMEfAwzFtwv+CnNUQA/i0838=
+github.com/firecracker-microvm/firecracker-go-sdk v0.20.1-0.20200204230548-c56932849923/go.mod h1:kW0gxvPpPvMukUxxTO9DrpSlScrtrTDGY3VgjAj/Qwc=
 github.com/globalsign/mgo v0.0.0-20180905125535-1ca0a4f7cbcb h1:D4uzjWwKYQ5XnAvUbuvHW93esHg7F8N/OYeBBcJoTr0=
 github.com/globalsign/mgo v0.0.0-20180905125535-1ca0a4f7cbcb/go.mod h1:xkRDCp4j0OGD1HRkm4kmhM+pmpv3AKq5SU7GMg4oO/Q=
 github.com/go-ole/go-ole v1.2.4 h1:nNBDSCOigTSiarFpYE9J/KtEA1IOW4CNeqT9TQDqCxI=

runtime/drive_handler_test.go

Lines changed: 2 additions & 2 deletions

@@ -66,7 +66,7 @@ func TestContainerStubs(t *testing.T) {
 	err = os.MkdirAll(patchedSrcDir, 0700)
 	require.NoError(t, err, "failed to create patched src dir")
 
-	noopJailer := noopJailer{
+	noopJailer := &noopJailer{
 		shimDir: vm.Dir(stubDir),
 		ctx:     ctx,
 		logger:  logger,
@@ -139,7 +139,7 @@ func TestDriveMountStubs(t *testing.T) {
 	err = os.MkdirAll(patchedSrcDir, 0700)
 	require.NoError(t, err, "failed to create patched src dir")
 
-	noopJailer := noopJailer{
+	noopJailer := &noopJailer{
 		shimDir: vm.Dir(stubDir),
 		ctx:     ctx,
 		logger:  logger,

runtime/jailer.go

Lines changed: 4 additions & 0 deletions

@@ -55,6 +55,10 @@ type jailer interface {
 	// StubDrivesOptions will return a set of options used to create a new stub
 	// drive file
 	StubDrivesOptions() []FileOpt
+
+	// Stop shuts down the jailer in a way that is visible to the user-level process (e.g. SIGTERM).
+	Stop() error
+
 	// Close will do any necessary cleanup that the jailer has accrued.
 	Close() error
 }
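The new method makes the jailer's lifecycle two-phase: Stop() delivers a user-visible signal (e.g. SIGTERM) to the jailed VMM, while Close() stays the unconditional cleanup. A minimal sketch of how a caller might combine the two; fakeJailer and stopThenClose are illustrative names, not code from this PR:

package main

import (
	"fmt"
	"log"
)

// jailer mirrors the interface in runtime/jailer.go after this change.
type jailer interface {
	Stop() error  // user-visible stop, e.g. SIGTERM to the VMM
	Close() error // cleanup the jailer has accrued
}

// fakeJailer is a stand-in implementation for the sketch.
type fakeJailer struct{ stopped bool }

func (f *fakeJailer) Stop() error  { f.stopped = true; return nil }
func (f *fakeJailer) Close() error { return nil }

// stopThenClose attempts the graceful, user-visible stop first, and runs
// cleanup regardless, logging a Stop failure instead of masking it.
func stopThenClose(j jailer) error {
	if err := j.Stop(); err != nil {
		log.Printf("jailer stop failed: %v", err)
	}
	return j.Close()
}

func main() {
	j := &fakeJailer{}
	if err := stopThenClose(j); err != nil {
		log.Fatal(err)
	}
	fmt.Println("stopped:", j.stopped)
}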

runtime/noop_jailer.go

Lines changed: 45 additions & 7 deletions

@@ -15,8 +15,11 @@ package main
 
 import (
 	"context"
+	"os"
+	"syscall"
 
 	"github.com/firecracker-microvm/firecracker-go-sdk"
+	"github.com/pkg/errors"
 	"github.com/sirupsen/logrus"
 
 	"github.com/firecracker-microvm/firecracker-containerd/config"
@@ -29,17 +32,19 @@ type noopJailer struct {
 	logger  *logrus.Entry
 	shimDir vm.Dir
 	ctx     context.Context
+	pid     int
 }
 
-func newNoopJailer(ctx context.Context, logger *logrus.Entry, shimDir vm.Dir) noopJailer {
-	return noopJailer{
+func newNoopJailer(ctx context.Context, logger *logrus.Entry, shimDir vm.Dir) *noopJailer {
+	return &noopJailer{
 		logger:  logger,
 		shimDir: shimDir,
 		ctx:     ctx,
+		pid:     0,
 	}
 }
 
-func (j noopJailer) BuildJailedMachine(cfg *config.Config, machineConfig *firecracker.Config, vmID string) ([]firecracker.Opt, error) {
+func (j *noopJailer) BuildJailedMachine(cfg *config.Config, machineConfig *firecracker.Config, vmID string) ([]firecracker.Opt, error) {
 	if len(cfg.FirecrackerBinaryPath) == 0 {
 		return []firecracker.Opt{}, nil
 	}
@@ -59,25 +64,58 @@ func (j noopJailer) BuildJailedMachine(cfg *config.Config, machineConfig *firecr
 		cmd.Stderr = j.logger.WithField("vmm_stream", "stderr").WriterLevel(logrus.DebugLevel)
 	}
 
+	pidHandler := firecracker.Handler{
+		Name: "firecracker-containerd-jail-pid-handler",
+		Fn: func(ctx context.Context, m *firecracker.Machine) error {
+			pid, err := m.PID()
+			if err != nil {
+				return err
+			}
+			j.pid = pid
+			return nil
+		},
+	}
+
 	j.logger.Debug("noop operation for BuildJailedMachine")
 	return []firecracker.Opt{
 		firecracker.WithProcessRunner(cmd),
+		func(m *firecracker.Machine) {
+			m.Handlers.FcInit = m.Handlers.FcInit.Append(pidHandler)
+		},
 	}, nil
 }
 
-func (j noopJailer) JailPath() vm.Dir {
+func (j *noopJailer) JailPath() vm.Dir {
 	j.logger.Debug("noop operation returning shim dir for JailPath")
 	return j.shimDir
}
 
-func (j noopJailer) ExposeFileToJail(path string) error {
+func (j *noopJailer) ExposeFileToJail(path string) error {
 	j.logger.Debug("noop operation for ExposeFileToJail")
 	return nil
 }
 
-func (j noopJailer) StubDrivesOptions() []FileOpt {
+func (j *noopJailer) StubDrivesOptions() []FileOpt {
 	j.logger.Debug("noop operation for StubDrivesOptions")
 	return []FileOpt{}
 }
 
-func (j noopJailer) Close() error { return nil }
+func (j *noopJailer) Stop() error {
+	if j.pid == 0 {
+		return errors.New("the machine hasn't been started")
+	}
+
+	j.logger.Debugf("sending SIGTERM to %d", j.pid)
+	p, err := os.FindProcess(j.pid)
+	if err != nil {
+		return err
+	}
+
+	err = p.Signal(syscall.SIGTERM)
+	if err == nil || err.Error() == "os: process already finished" {
+		return nil
+	}
+	return err
+}
+
+func (j *noopJailer) Close() error { return nil }
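One detail worth calling out in Stop() above: os.FindProcess always succeeds on Unix, so the real liveness check happens at Signal time, and the string comparison against "os: process already finished" treats a process that exited before the signal landed as a successful stop (the sentinel os.ErrProcessDone only arrived later, in Go 1.16). A standalone sketch of the same idiom, signaling a short-lived child instead of a VMM; signalTerm is an illustrative name:

package main

import (
	"log"
	"os"
	"os/exec"
	"syscall"
)

// signalTerm sends SIGTERM to pid. If the process exited between our check
// and the signal, the stop has already happened, so that race is reported
// as success rather than as an error.
func signalTerm(pid int) error {
	p, err := os.FindProcess(pid) // always succeeds on Unix
	if err != nil {
		return err
	}
	err = p.Signal(syscall.SIGTERM)
	if err == nil || err.Error() == "os: process already finished" {
		return nil
	}
	return err
}

func main() {
	// Illustration only (Linux): a sleeping child stands in for the VMM.
	cmd := exec.Command("sleep", "60")
	if err := cmd.Start(); err != nil {
		log.Fatal(err)
	}
	if err := signalTerm(cmd.Process.Pid); err != nil {
		log.Fatal(err)
	}
	cmd.Wait() // reap the terminated child
}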

runtime/runc_jailer.go

Lines changed: 4 additions & 0 deletions

@@ -501,3 +501,7 @@ func getNetNS(spec specs.Spec) string {
 
 	return ""
 }
+
+func (j runcJailer) Stop() error {
+	return j.runcClient.Kill(j.ctx, j.vmID, int(syscall.SIGTERM), &runc.KillOpts{All: true})
+}

runtime/service.go

Lines changed: 63 additions & 36 deletions

@@ -74,6 +74,7 @@ const (
 	firecrackerStartTimeout = 5 * time.Second
 	defaultStopVMTimeout    = 5 * time.Second
 	defaultShutdownTimeout  = 5 * time.Second
+	jailerStopTimeout       = 3 * time.Second
 
 	// StartEventName is the topic published to when a VM starts
 	StartEventName = "/firecracker-vm/start"
@@ -1171,54 +1172,80 @@ func (s *service) Shutdown(requestCtx context.Context, req *taskAPI.ShutdownRequ
 	return &ptypes.Empty{}, nil
 }
 
-// shutdown will stop the VMM within the provided timeout. It attempts to shutdown gracefully by having
-// agent stop (which is presumed to cause the VM to begin a reboot) and then waiting for the VMM process
-// to exit (via the s.shimCtx.Done() channel). If that fails, StopVMM will be called to force a shutdown (currently
-// via sending SIGTERM). If that fails, the VMM will still be killed via SIGKILL when the shimCtx is canceled.
 func (s *service) shutdown(
 	requestCtx context.Context,
 	timeout time.Duration,
 	req *taskAPI.ShutdownRequest,
 ) error {
-	shutdownCtx, cancel := context.WithTimeout(requestCtx, timeout)
-	defer cancel()
+	s.logger.Info("stopping the VM")
+
 	go func() {
-		// Once the shutdown procedure is done, the shim needs to shutdown too.
-		// This also ensures that if the VMM is still alive, it will receive a
-		// SIGKILL via exec.CommandContext
-		<-shutdownCtx.Done()
-		s.shimCancel()
+		s.shutdownLoop(requestCtx, timeout, req)
 	}()
 
-	s.logger.Info("stopping the VM")
-
-	// Try to tell agent to exit, causing the VM to begin a reboot. If that
-	// fails, try to forcibly stop the VMM. If that too fails, just cancel
-	// the shutdownCtx to fast-path to the VMM getting SIGKILL.
-	_, shutdownErr := s.agentClient.Shutdown(shutdownCtx, req)
-	if shutdownErr != nil {
-		s.logger.WithError(shutdownErr).Error("failed to shutdown VM agent")
-		stopVMMErr := s.machine.StopVMM()
-		if stopVMMErr != nil {
-			s.logger.WithError(stopVMMErr).Error("failed to forcibly stop VMM")
-			cancel()
-		}
+	err := s.machine.Wait(context.Background())
+	if err == nil {
+		return nil
 	}
+	return status.Error(codes.Internal, fmt.Sprintf("the VMM was killed forcibly: %v", err))
+}
 
-	// wait for the shimCtx to be done, which means the VM has exited and we're ready
-	// to shutdown
-	<-s.shimCtx.Done()
-	if shutdownCtx.Err() == context.DeadlineExceeded {
-		return status.Error(codes.DeadlineExceeded,
-			"timed out waiting for VM shutdown, VMM was sent SIGKILL")
-	}
+// shutdownLoop sends several different shutdown requests to stop the VMM:
+// 1) send a request to the in-VM agent, which is presumed to cause the VM to begin a reboot.
+// 2) stop the VM through jailer#Stop(). The signal is visible to the VMM (e.g. SIGTERM).
+// 3) stop the VM by canceling the associated context. The signal is not visible to the VMM (e.g. SIGKILL).
+func (s *service) shutdownLoop(
+	requestCtx context.Context,
+	timeout time.Duration,
+	req *taskAPI.ShutdownRequest,
+) {
+	actions := []struct {
+		name     string
+		shutdown func() error
+		timeout  time.Duration
+	}{
+		{
+			name: "send a request to the in-VM agent",
+			shutdown: func() error {
+				_, err := s.agentClient.Shutdown(requestCtx, req)
+				if err != nil {
+					return err
+				}
+				return nil
+			},
+			timeout: timeout,
+		},
+		{
+			name: "stop the jailer",
+			shutdown: func() error {
+				return s.jailer.Stop()
+			},
+			timeout: jailerStopTimeout,
+		},
+		{
+			name: "cancel the context",
+			shutdown: func() error {
+				s.shimCancel()
+				return nil
+			},
+		},
+	}
 
-	if s.vmExitErr != nil {
-		return status.Error(codes.Internal,
-			fmt.Sprintf("VMM exit errors: %v", s.vmExitErr))
+	for _, action := range actions {
+		pid, err := s.machine.PID()
+		if pid == 0 && err != nil {
+			break // we have nothing to kill
+		}
+
+		s.logger.Debug(action.name)
+		err = action.shutdown()
+		if err != nil {
+			// if a request doesn't go through, don't wait; carry on to the next action.
+			s.logger.WithError(err).Errorf("failed to %s", action.name)
+		} else {
+			time.Sleep(action.timeout)
+		}
 	}
-
-	return nil
 }
 
 func (s *service) Stats(requestCtx context.Context, req *taskAPI.StatsRequest) (*taskAPI.StatsResponse, error) {
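The loop above is an escalation ladder: each rung is tried in order, a failed attempt is logged and skipped past immediately, and a successful one is given its timeout to take effect before the next, harsher mechanism fires; the PID check at the top of each iteration stops the escalation once the VMM is gone. A distilled, self-contained sketch of the pattern; every name in it is illustrative, not the PR's code:

package main

import (
	"fmt"
	"time"
)

// action is one rung of the escalation ladder: a name for logging, an
// attempt to shut the target down, and how long to wait for it to work.
type action struct {
	name     string
	shutdown func() error
	timeout  time.Duration
}

// escalate walks the ladder in order, stopping early once the target is
// gone. A failed attempt moves straight to the next, harsher mechanism;
// a successful one is given its timeout to take effect first.
func escalate(actions []action, stillAlive func() bool) {
	for _, a := range actions {
		if !stillAlive() {
			return // nothing left to kill
		}
		fmt.Println("trying:", a.name)
		if err := a.shutdown(); err != nil {
			fmt.Printf("failed to %s: %v\n", a.name, err)
			continue // don't wait on a request that never landed
		}
		time.Sleep(a.timeout)
	}
}

func main() {
	alive := true
	escalate([]action{
		{"ask the guest to reboot", func() error { return fmt.Errorf("agent unreachable") }, 50 * time.Millisecond},
		{"send SIGTERM via the jailer", func() error { alive = false; return nil }, 50 * time.Millisecond},
		{"cancel the context (SIGKILL)", func() error { return nil }, 0},
	}, func() bool { return alive })
}

In the PR, stillAlive's role is played by s.machine.PID(), and the synchronous behavior named in the commit title comes from shutdown() blocking on s.machine.Wait, so StopVM() returns only after the VMM has actually exited.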
