Commit 7a4b897

Even if StopVM() times out, the Firecracker process must be killed

Before this change, s.shimCancel() was only called by the goroutine, so StopVM() could not guarantee that the VMM was killed by the end of the request.

Signed-off-by: Kazuyoshi Kato <[email protected]>

1 parent: a130549
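
The essence of the fix: the request handler now blocks on the VMM process itself, rather than returning once a context is cancelled. Below is a minimal, self-contained sketch of that pattern, illustrative only; it uses plain os/exec where the commit uses the firecracker-go-sdk Machine:

```go
package main

import (
	"context"
	"fmt"
	"os/exec"
	"time"
)

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	cmd := exec.CommandContext(ctx, "sleep", "60") // stands in for the VMM
	if err := cmd.Start(); err != nil {
		panic(err)
	}

	// Background escalation: cancelling the context makes exec.CommandContext
	// deliver SIGKILL, which is the shim's last resort (s.shimCancel below).
	go func() {
		time.Sleep(time.Second)
		cancel()
	}()

	// Like the new shutdown(), return only after the process has exited.
	fmt.Println("process exited:", cmd.Wait()) // here: "signal: killed"
}
```

Because the wait is on the request path, the RPC response itself becomes the proof that the process is gone.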

3 files changed: +89 -51 lines

runtime/jailer.go

1 addition, 1 deletion

```diff
@@ -56,7 +56,7 @@ type jailer interface {
     // drive file
     StubDrivesOptions() []FileOpt
 
-    // Stop the jailer as a way that the process can interpreted (e.g. SIGTERM).
+    // Stop the jailer in a way that is visible to the user-level process (e.g. SIGTERM).
     Stop() error
 
     // Close will do any necessary cleanup that the jailer has accrued.
```
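
For illustration, a Stop() in the spirit of the revised comment might look like the sketch below. The exampleJailer type is hypothetical; the repository's real jailers differ. The point of the contract is that Stop() delivers a catchable signal, unlike the SIGKILL produced by context cancellation:

```go
package main

import (
	"os/exec"
	"syscall"
)

// exampleJailer is hypothetical and exists only to illustrate the contract.
type exampleJailer struct{ cmd *exec.Cmd }

// Stop delivers SIGTERM, which the jailed process can observe and handle,
// in contrast to SIGKILL, which it never sees.
func (j *exampleJailer) Stop() error {
	return j.cmd.Process.Signal(syscall.SIGTERM)
}

func main() {
	cmd := exec.Command("sleep", "60")
	if err := cmd.Start(); err != nil {
		panic(err)
	}
	j := &exampleJailer{cmd: cmd}
	_ = j.Stop()   // ask the process to terminate
	_ = cmd.Wait() // reap it; the error would be "signal: terminated"
}
```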

runtime/service.go

63 additions, 36 deletions

```diff
@@ -74,6 +74,7 @@ const (
     firecrackerStartTimeout = 5 * time.Second
     defaultStopVMTimeout    = 5 * time.Second
     defaultShutdownTimeout  = 5 * time.Second
+    jailerStopTimeout       = 3 * time.Second
 
     // StartEventName is the topic published to when a VM starts
     StartEventName = "/firecracker-vm/start"
@@ -1171,54 +1172,80 @@ func (s *service) Shutdown(requestCtx context.Context, req *taskAPI.ShutdownRequ
     return &ptypes.Empty{}, nil
 }
 
-// shutdown will stop the VMM within the provided timeout. It attempts to shutdown gracefully by having
-// agent stop (which is presumed to cause the VM to begin a reboot) and then waiting for the VMM process
-// to exit (via the s.shimCtx.Done() channel). If that fails, StopVMM will be called to force a shutdown (currently
-// via sending SIGTERM). If that fails, the VMM will still be killed via SIGKILL when the shimCtx is canceled.
 func (s *service) shutdown(
     requestCtx context.Context,
     timeout time.Duration,
     req *taskAPI.ShutdownRequest,
 ) error {
-    shutdownCtx, cancel := context.WithTimeout(requestCtx, timeout)
-    defer cancel()
+    s.logger.Info("stopping the VM")
+
     go func() {
-        // Once the shutdown procedure is done, the shim needs to shutdown too.
-        // This also ensures that if the VMM is still alive, it will receive a
-        // SIGKILL via exec.CommandContext
-        <-shutdownCtx.Done()
-        s.shimCancel()
+        s.shutdownLoop(requestCtx, timeout, req)
     }()
 
-    s.logger.Info("stopping the VM")
-
-    // Try to tell agent to exit, causing the VM to begin a reboot. If that
-    // fails, try to forcibly stop the VMM. If that too fails, just cancel
-    // the shutdownCtx to fast-path to the VMM getting SIGKILL.
-    _, shutdownErr := s.agentClient.Shutdown(shutdownCtx, req)
-    if shutdownErr != nil {
-        s.logger.WithError(shutdownErr).Error("failed to shutdown VM agent")
-        stopVMMErr := s.machine.StopVMM()
-        if stopVMMErr != nil {
-            s.logger.WithError(stopVMMErr).Error("failed to forcibly stop VMM")
-            cancel()
-        }
+    err := s.machine.Wait(context.Background())
+    if err == nil {
+        return nil
     }
+    return status.Error(codes.DeadlineExceeded, fmt.Sprintf("the VMM was killed forcibly: %v", err))
+}
 
-    // wait for the shimCtx to be done, which means the VM has exited and we're ready
-    // to shutdown
-    <-s.shimCtx.Done()
-    if shutdownCtx.Err() == context.DeadlineExceeded {
-        return status.Error(codes.DeadlineExceeded,
-            "timed out waiting for VM shutdown, VMM was sent SIGKILL")
-    }
+// shutdownLoop sends multiple different shutdown requests to stop the VMM.
+// 1) send a request to the in-VM agent, which is presumed to cause the VM to begin a reboot.
+// 2) stop the VM through jailer#Stop(). The signal should be visible from the VMM (e.g. SIGTERM).
+// 3) stop the VM by cancelling the associated context. The signal would not be visible from the VMM (e.g. SIGKILL).
+func (s *service) shutdownLoop(
+    requestCtx context.Context,
+    timeout time.Duration,
+    req *taskAPI.ShutdownRequest,
+) {
+    actions := []struct {
+        name     string
+        shutdown func() error
+        timeout  time.Duration
+    }{
+        {
+            name: "send a request to the in-VM agent",
+            shutdown: func() error {
+                _, err := s.agentClient.Shutdown(requestCtx, req)
+                if err != nil {
+                    return err
+                }
+                return nil
+            },
+            timeout: timeout,
+        },
+        {
+            name: "stop the jailer",
+            shutdown: func() error {
+                return s.jailer.Stop()
+            },
+            timeout: jailerStopTimeout,
+        },
+        {
+            name: "cancel the context",
+            shutdown: func() error {
+                s.shimCancel()
+                return nil
+            },
+        },
+    }
+
+    for _, action := range actions {
+        pid, err := s.machine.PID()
+        if pid == 0 && err != nil {
+            break // we have nothing to kill
+        }
 
-    if s.vmExitErr != nil {
-        return status.Error(codes.Internal,
-            fmt.Sprintf("VMM exit errors: %v", s.vmExitErr))
+        s.logger.Debug(action.name)
+        err = action.shutdown()
+        if err != nil {
+            // if sending a request doesn't succeed, don't wait and carry on
+            s.logger.WithError(err).Errorf("failed to %s", action.name)
+        } else {
+            time.Sleep(action.timeout)
+        }
     }
-
-    return nil
 }
 
 func (s *service) Stats(requestCtx context.Context, req *taskAPI.StatsRequest) (*taskAPI.StatsResponse, error) {
```
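
shutdownLoop is an escalation table: each action is tried in order, given a grace period, and the loop stops as soon as there is nothing left to kill. Note also that shutdown() waits with context.Background(), presumably so the wait outlives the request context; otherwise a cancelled request could once again return before the VMM dies. Below is a self-contained rendition of the same shape, with hypothetical names and plain os/exec in place of the SDK's Machine:

```go
package main

import (
	"fmt"
	"os/exec"
	"syscall"
	"time"
)

func main() {
	cmd := exec.Command("sleep", "60") // stands in for the VMM
	if err := cmd.Start(); err != nil {
		panic(err)
	}
	done := make(chan struct{})
	go func() { _ = cmd.Wait(); close(done) }()

	// Signal 0 only checks existence, playing the role of s.machine.PID().
	alive := func() bool {
		return cmd.Process.Signal(syscall.Signal(0)) == nil
	}

	actions := []struct {
		name    string
		run     func() error
		timeout time.Duration
	}{
		{"ask politely (in-VM agent stand-in)", func() error {
			return cmd.Process.Signal(syscall.SIGTERM)
		}, 3 * time.Second},
		{"force kill (context-cancel stand-in)", func() error {
			return cmd.Process.Kill()
		}, time.Second},
	}

	for _, a := range actions {
		if !alive() {
			break // nothing left to kill
		}
		fmt.Println(a.name)
		if err := a.run(); err != nil {
			continue // the action failed; try the next one without waiting
		}
		time.Sleep(a.timeout)
	}
	<-done // like shutdown(), do not return until the process is gone
}
```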

runtime/service_integ_test.go

25 additions, 14 deletions

```diff
@@ -65,7 +65,6 @@ const (
     firecrackerRuntime     = "aws.firecracker"
     shimProcessName        = "containerd-shim-aws-firecracker"
     firecrackerProcessName = "firecracker"
-    jailerProcessName      = "runc"
 
     defaultVMRootfsPath = "/var/lib/firecracker-containerd/runtime/default-rootfs.img"
     defaultVMNetDevName = "eth0"
@@ -1365,16 +1364,16 @@ func TestStopVM_Isolated(t *testing.T) {
     tests := []struct {
         name            string
         createVMRequest proto.CreateVMRequest
-        stopFunc        func(ctx context.Context, fcClient fccontrol.FirecrackerService, vmID string)
+        stopFunc        func(ctx context.Context, fcClient fccontrol.FirecrackerService, req proto.CreateVMRequest)
         withStopVM      bool
     }{
         {
             name:       "Successful",
             withStopVM: true,
 
             createVMRequest: proto.CreateVMRequest{},
-            stopFunc: func(ctx context.Context, fcClient fccontrol.FirecrackerService, vmID string) {
-                _, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: vmID})
+            stopFunc: func(ctx context.Context, fcClient fccontrol.FirecrackerService, req proto.CreateVMRequest) {
+                _, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: req.VMID})
                 require.Equal(status.Code(err), codes.OK)
             },
         },
@@ -1390,11 +1389,17 @@ func TestStopVM_Isolated(t *testing.T) {
                     HostPath: "/var/lib/firecracker-containerd/runtime/rootfs-slow-reboot.img",
                 },
             },
-            stopFunc: func(ctx context.Context, fcClient fccontrol.FirecrackerService, vmID string) {
-                _, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: vmID})
+            stopFunc: func(ctx context.Context, fcClient fccontrol.FirecrackerService, req proto.CreateVMRequest) {
+                _, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: req.VMID})
                 errCode := status.Code(err)
-                assert.NotEqual(codes.Unknown, errCode, "the error code must not be Unknown")
                 assert.Equal(codes.DeadlineExceeded, errCode, "the error code must be DeadlineExceeded")
+
+                if req.JailerConfig != nil {
+                    // No "signal: ..." error with runc since it traps the signal
+                    assert.Contains(err.Error(), "exit status 1")
+                } else {
+                    assert.Contains(err.Error(), "signal: terminated", "must be 'terminated', not 'killed'")
+                }
             },
         },
```
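The assertions above lean on how gRPC status errors behave: the code set by status.Error is recovered by status.Code, and the description is embedded in err.Error(), which is what assert.Contains matches. A small standalone check (hypothetical, not part of the commit; it assumes the google.golang.org/grpc module):

```go
package main

import (
	"fmt"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

func main() {
	// Mirrors what shutdown() returns when the VMM had to be killed.
	err := status.Error(codes.DeadlineExceeded, "the VMM was killed forcibly: signal: terminated")

	fmt.Println(status.Code(err)) // DeadlineExceeded
	fmt.Println(err.Error())      // "rpc error: code = DeadlineExceeded desc = the VMM was killed forcibly: ..."
}
```

The code and description travel with the RPC response, so the same matching should work on the client side of fcClient.StopVM.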

```diff
@@ -1404,7 +1409,7 @@ func TestStopVM_Isolated(t *testing.T) {
             withStopVM: false,
 
             createVMRequest: proto.CreateVMRequest{},
-            stopFunc: func(ctx context.Context, _ fccontrol.FirecrackerService, _ string) {
+            stopFunc: func(ctx context.Context, _ fccontrol.FirecrackerService, _ proto.CreateVMRequest) {
                 firecrackerProcesses, err := findProcess(ctx, findFirecracker)
                 require.NoError(err, "failed waiting for expected firecracker process %q to come up", firecrackerProcessName)
                 require.Len(firecrackerProcesses, 1, "expected only one firecracker process to exist")
@@ -1426,7 +1431,7 @@ func TestStopVM_Isolated(t *testing.T) {
                     HostPath: "/var/lib/firecracker-containerd/runtime/rootfs-slow-reboot.img",
                 },
             },
-            stopFunc: func(ctx context.Context, fcClient fccontrol.FirecrackerService, vmID string) {
+            stopFunc: func(ctx context.Context, fcClient fccontrol.FirecrackerService, req proto.CreateVMRequest) {
                 firecrackerProcesses, err := findProcess(ctx, findFirecracker)
                 require.NoError(err, "failed waiting for expected firecracker process %q to come up", firecrackerProcessName)
                 require.Len(firecrackerProcesses, 1, "expected only one firecracker process to exist")
@@ -1439,10 +1444,16 @@ func TestStopVM_Isolated(t *testing.T) {
                 }()
 
                 _, err = fcClient.StopVM(ctx, &proto.StopVMRequest{
-                    VMID:           vmID,
+                    VMID:           req.VMID,
                     TimeoutSeconds: 10,
                 })
-                require.Contains(err.Error(), "signal: killed", "unexpected error from StopVM")
+
+                if req.JailerConfig != nil {
+                    // No "signal: ..." error with runc since it traps the signal
+                    assert.Contains(err.Error(), "exit status 137")
+                } else {
+                    assert.Contains(err.Error(), "signal: killed")
+                }
             },
         },
     }
```
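The 137 above is the conventional 128 + 9 encoding of death by SIGKILL: when an intermediary such as runc outlives the killed process, the caller sees a normal exit carrying that status rather than the signal itself. A hypothetical demonstration, with sh standing in for the intermediary:

```go
package main

import (
	"fmt"
	"os/exec"
)

func main() {
	// Killed directly: Go reports the signal itself.
	direct := exec.Command("sleep", "60")
	if err := direct.Start(); err != nil {
		panic(err)
	}
	_ = direct.Process.Kill()             // SIGKILL
	fmt.Println("direct:", direct.Wait()) // "signal: killed"

	// Killed behind an intermediary that survives: the parent re-encodes the
	// child's death as exit status 128+9 = 137, similar to what the test
	// observes when runc sits between the shim and the VMM.
	indirect := exec.Command("sh", "-c", "sleep 60 & kill -9 $!; wait $!")
	fmt.Println("indirect:", indirect.Run()) // "exit status 137"
}
```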
```diff
@@ -1481,10 +1492,10 @@ func TestStopVM_Isolated(t *testing.T) {
             require.Len(fcProcesses, 1, "expected only one firecracker process to exist")
             fcProcess := fcProcesses[0]
 
-            test.stopFunc(ctx, fcClient, vmID)
+            test.stopFunc(ctx, fcClient, createVMRequest)
 
             // If the function above uses StopVMRequest,
-            // shim gurantees that the underlying Firecracker process is dead
+            // shim guarantees that the underlying Firecracker process is dead
             // (either gracefully or forcibly) at the end of the request.
             if test.withStopVM {
                 fcExists, err := process.PidExists(fcProcess.Pid)
@@ -1493,7 +1504,7 @@ func TestStopVM_Isolated(t *testing.T) {
             }
 
             // Since the shim is the one which is writing the response,
-            // it cannot gurantee that the itself is dead at the end of the request.
+            // it cannot guarantee that the shim itself is dead at the end of the request.
             // But it should be dead eventually.
             err = internal.WaitForPidToExit(ctx, time.Second, shimProcess.Pid)
             require.NoError(err, "shim hasn't been terminated")
```
