Skip to content

Commit d1c2a0a

Browse files
authored
Merge pull request #577 from kzys/fix-stopvm
Kill Firecracker forcefully if StopVM is hitting its context's timeout
2 parents 3de37ab + 576514d commit d1c2a0a

File tree

2 files changed

+107
-114
lines changed

2 files changed

+107
-114
lines changed

runtime/service.go

Lines changed: 60 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -647,26 +647,10 @@ func (s *service) StopVM(requestCtx context.Context, request *proto.StopVMReques
647647
timeout = time.Duration(request.TimeoutSeconds) * time.Second
648648
}
649649

650-
info, err := s.machine.DescribeInstanceInfo(requestCtx)
651-
if err != nil {
652-
return nil, errors.Wrapf(err, "failed to get instance info %v", info)
653-
}
654-
655-
if *info.State == models.InstanceInfoStatePaused {
656-
s.logger.Debug("Instance is in Paused state, force shutdown in progress")
657-
err = s.jailer.Stop(true)
658-
if err != nil {
659-
return nil, errors.Wrap(err, "failed to stop VM in paused State")
660-
}
661-
return &empty.Empty{}, nil
662-
}
663-
664-
err = s.waitVMReady()
665-
if err != nil {
666-
return nil, err
667-
}
650+
ctx, cancel := context.WithTimeout(requestCtx, timeout)
651+
defer cancel()
668652

669-
if err = s.shutdown(requestCtx, timeout, &taskAPI.ShutdownRequest{Now: true}); err != nil {
653+
if err = s.terminate(ctx); err != nil {
670654
return nil, err
671655
}
672656
return &empty.Empty{}, nil
@@ -1563,94 +1547,80 @@ func (s *service) Shutdown(requestCtx context.Context, req *taskAPI.ShutdownRequ
15631547
return &ptypes.Empty{}, nil
15641548
}
15651549

1566-
if err := s.shutdown(requestCtx, defaultShutdownTimeout, req); err != nil {
1550+
ctx, cancel := context.WithTimeout(requestCtx, defaultShutdownTimeout)
1551+
defer cancel()
1552+
1553+
if err := s.terminate(ctx); err != nil {
15671554
return &ptypes.Empty{}, err
15681555
}
15691556

15701557
return &ptypes.Empty{}, nil
15711558
}
15721559

1573-
func (s *service) shutdown(
1574-
requestCtx context.Context,
1575-
timeout time.Duration,
1576-
req *taskAPI.ShutdownRequest,
1577-
) error {
1578-
s.logger.Info("stopping the VM")
1560+
func (s *service) isPaused(ctx context.Context) (bool, error) {
1561+
info, err := s.machine.DescribeInstanceInfo(ctx)
1562+
if err != nil {
1563+
return false, errors.Wrapf(err, "failed to get instance info %v", info)
1564+
}
1565+
return *info.State == models.InstanceInfoStatePaused, nil
1566+
}
15791567

1580-
go func() {
1581-
s.shutdownLoop(requestCtx, timeout, req)
1582-
}()
1568+
func (s *service) forceTerminate(ctx context.Context) error {
1569+
s.logger.Errorf("forcefully terminate VM %s", s.vmID)
15831570

1584-
var result *multierror.Error
1585-
if err := s.machine.Wait(context.Background()); err != nil {
1586-
result = multierror.Append(result, err)
1587-
}
1588-
if err := s.cleanup(); err != nil {
1589-
result = multierror.Append(result, err)
1571+
err := s.jailer.Stop(true)
1572+
if err != nil {
1573+
s.logger.WithError(err).Error("failed to stop")
15901574
}
15911575

1592-
if err := result.ErrorOrNil(); err != nil {
1593-
return status.Error(codes.Internal, fmt.Sprintf("the VMM was killed forcibly: %v", err))
1576+
err = s.cleanup()
1577+
if err != nil {
1578+
s.logger.WithError(err).Error("failed to cleanup")
15941579
}
1595-
return nil
1580+
1581+
return status.Errorf(codes.Internal, "forcefully terminated VM %s", s.vmID)
15961582
}
15971583

1598-
// shutdownLoop sends multiple different shutdown requests to stop the VMM.
1599-
// 1) send a request to the in-VM agent, which is presumed to cause the VM to begin a reboot.
1600-
// 2) stop the VM through jailer#Stop(). The signal should be visible from the VMM (e.g. SIGTERM)
1601-
// 3) stop the VM through cancelling the associated context. The signal would not be visible from the VMM (e.g. SIGKILL)
1602-
func (s *service) shutdownLoop(
1603-
requestCtx context.Context,
1604-
timeout time.Duration,
1605-
req *taskAPI.ShutdownRequest,
1606-
) {
1607-
actions := []struct {
1608-
name string
1609-
shutdown func() error
1610-
timeout time.Duration
1611-
}{
1612-
{
1613-
name: "send a request to the in-VM agent",
1614-
shutdown: func() error {
1615-
_, err := s.agentClient.Shutdown(requestCtx, req)
1616-
if err != nil {
1617-
return err
1618-
}
1619-
return nil
1620-
},
1621-
timeout: timeout,
1622-
},
1623-
{
1624-
name: "stop the jailer by SIGTERM",
1625-
shutdown: func() error {
1626-
return s.jailer.Stop(false)
1627-
},
1628-
timeout: jailerStopTimeout,
1629-
},
1630-
{
1631-
name: "stop the jailer by SIGKILL",
1632-
shutdown: func() error {
1633-
return s.jailer.Stop(true)
1634-
},
1635-
timeout: jailerStopTimeout,
1636-
},
1584+
func (s *service) terminate(ctx context.Context) (retErr error) {
1585+
var success bool
1586+
defer func() {
1587+
if !success {
1588+
retErr = s.forceTerminate(ctx)
1589+
}
1590+
}()
1591+
1592+
err := s.waitVMReady()
1593+
if err != nil {
1594+
s.logger.WithError(err).Error("failed to wait VM")
1595+
return
16371596
}
16381597

1639-
for _, action := range actions {
1640-
pid, err := s.machine.PID()
1641-
if pid == 0 && err != nil {
1642-
break // we have nothing to kill
1643-
}
1598+
paused, err := s.isPaused(ctx)
1599+
if err != nil {
1600+
s.logger.WithError(err).Error("failed to check VM")
1601+
return
1602+
}
16441603

1645-
s.logger.Debug(action.name)
1646-
err = action.shutdown()
1647-
if err != nil {
1648-
// if sending an request doesn't succeed, don't wait and carry on.
1649-
s.logger.WithError(err).Errorf("failed to %s", action.name)
1650-
} else {
1651-
time.Sleep(action.timeout)
1652-
}
1604+
if paused {
1605+
s.logger.Error("VM is paused and cannot take requests")
1606+
return
16531607
}
1608+
1609+
s.logger.Info("gracefully shutdown VM")
1610+
_, err = s.agentClient.Shutdown(ctx, &taskAPI.ShutdownRequest{ID: s.vmID, Now: true})
1611+
if err != nil {
1612+
s.logger.WithError(err).Error("failed to call in-VM agent")
1613+
return
1614+
}
1615+
1616+
err = s.machine.Wait(ctx)
1617+
if err != nil {
1618+
s.logger.WithError(err).Error("failed to wait VM")
1619+
return
1620+
}
1621+
1622+
success = true
1623+
return
16541624
}
16551625

16561626
func (s *service) Stats(requestCtx context.Context, req *taskAPI.StatsRequest) (*taskAPI.StatsResponse, error) {

runtime/service_integ_test.go

Lines changed: 47 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1599,6 +1599,10 @@ func findProcess(
15991599
return matches, nil
16001600
}
16011601

1602+
func pidExists(pid int) bool {
1603+
return syscall.ESRCH.Is(syscall.Kill(pid, 0))
1604+
}
1605+
16021606
func TestStopVM_Isolated(t *testing.T) {
16031607
prepareIntegTest(t)
16041608
require := require.New(t)
@@ -1644,15 +1648,9 @@ func TestStopVM_Isolated(t *testing.T) {
16441648
},
16451649
stopFunc: func(ctx context.Context, fcClient fccontrol.FirecrackerService, req proto.CreateVMRequest) {
16461650
_, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: req.VMID})
1647-
errCode := status.Code(err)
1648-
assert.Equal(codes.Internal, errCode, "the error code must be Internal")
1649-
1650-
if req.JailerConfig != nil {
1651-
// No "signal: ..." error with runc since it traps the signal
1652-
assert.Contains(err.Error(), "exit status 1")
1653-
} else {
1654-
assert.Contains(err.Error(), "signal: terminated", "must be 'terminated', not 'killed'")
1655-
}
1651+
require.Error(err)
1652+
assert.Equal(codes.Internal, status.Code(err))
1653+
assert.Contains(err.Error(), "forcefully terminated")
16561654
},
16571655
},
16581656

@@ -1701,12 +1699,10 @@ func TestStopVM_Isolated(t *testing.T) {
17011699
TimeoutSeconds: 10,
17021700
})
17031701

1704-
if req.JailerConfig != nil {
1705-
// No "signal: ..." error with runc since it traps the signal
1706-
assert.Contains(err.Error(), "exit status 137")
1707-
} else {
1708-
assert.Contains(err.Error(), "signal: killed")
1709-
}
1702+
require.Error(err)
1703+
assert.Equal(codes.Internal, status.Code(err))
1704+
// This is technically not accurate (the test is terminating the VM) though.
1705+
assert.Contains(err.Error(), "forcefully terminated")
17101706
},
17111707
},
17121708

@@ -1722,7 +1718,30 @@ func TestStopVM_Isolated(t *testing.T) {
17221718
require.Equal(status.Code(err), codes.OK)
17231719

17241720
_, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: req.VMID})
1725-
require.NoError(err)
1721+
require.Error(err)
1722+
assert.Equal(codes.Internal, status.Code(err))
1723+
assert.Contains(err.Error(), "forcefully terminated")
1724+
},
1725+
},
1726+
1727+
{
1728+
name: "Suspend",
1729+
withStopVM: true,
1730+
1731+
createVMRequest: proto.CreateVMRequest{},
1732+
stopFunc: func(ctx context.Context, fcClient fccontrol.FirecrackerService, req proto.CreateVMRequest) {
1733+
firecrackerProcesses, err := findProcess(ctx, findFirecracker)
1734+
require.NoError(err, "failed waiting for expected firecracker process %q to come up", firecrackerProcessName)
1735+
require.Len(firecrackerProcesses, 1, "expected only one firecracker process to exist")
1736+
firecrackerProcess := firecrackerProcesses[0]
1737+
1738+
err = firecrackerProcess.Suspend()
1739+
require.NoError(err, "failed to suspend Firecracker")
1740+
1741+
_, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: req.VMID})
1742+
require.Error(err)
1743+
assert.Equal(codes.Internal, status.Code(err))
1744+
assert.Contains(err.Error(), "forcefully terminated")
17261745
},
17271746
},
17281747
}
@@ -1768,13 +1787,13 @@ func TestStopVM_Isolated(t *testing.T) {
17681787
// If the function above uses StopVMRequest, all underlying processes must be dead
17691788
// (either gracefully or forcibly) at the end of the request.
17701789
if test.withStopVM {
1771-
fcExists, err := process.PidExists(fcProcess.Pid)
1772-
require.NoError(err, "failed to find firecracker")
1773-
require.False(fcExists, "firecracker %s is still there", vmID)
1790+
fcExists := pidExists(int(fcProcess.Pid))
1791+
assert.NoError(err, "failed to find firecracker")
1792+
assert.False(fcExists, "firecracker %s (pid=%d) is still there", vmID, fcProcess.Pid)
17741793

1775-
shimExists, err := process.PidExists(shimProcess.Pid)
1776-
require.NoError(err, "failed to find shim")
1777-
require.False(shimExists, "shim %s is still there", vmID)
1794+
shimExists := pidExists(int(shimProcess.Pid))
1795+
assert.NoError(err, "failed to find shim")
1796+
assert.False(shimExists, "shim %s (pid=%d) is still there", vmID, shimProcess.Pid)
17781797
}
17791798

17801799
err = internal.WaitForPidToExit(ctx, time.Second, shimProcess.Pid)
@@ -2327,8 +2346,12 @@ func TestPauseResume_Isolated(t *testing.T) {
23272346
// Ensure the response fields are populated correctly
23282347
assert.Equal(t, request.VMID, resp.VMID)
23292348

2330-
_, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: request.VMID})
2331-
require.Equal(t, status.Code(err), codes.OK)
2349+
// Resume the VM since state() may pause the VM.
2350+
_, err = fcClient.ResumeVM(ctx, &proto.ResumeVMRequest{VMID: vmID})
2351+
require.NoError(t, err)
2352+
2353+
_, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: vmID})
2354+
require.NoError(t, err)
23322355
}
23332356

23342357
for _, subtest := range subtests {

0 commit comments

Comments
 (0)