Skip to content

Commit 576514d

Browse files
committed
Kill Firecracker forcefully if StopVM is hitting its context's timeout
This commit refactors StopVM's internal to make sure Firecracker is killed even it is not responding. Signed-off-by: Kazuyoshi Kato <[email protected]>
1 parent e563e5b commit 576514d

File tree

2 files changed

+79
-109
lines changed

2 files changed

+79
-109
lines changed

runtime/service.go

Lines changed: 60 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -647,26 +647,10 @@ func (s *service) StopVM(requestCtx context.Context, request *proto.StopVMReques
647647
timeout = time.Duration(request.TimeoutSeconds) * time.Second
648648
}
649649

650-
info, err := s.machine.DescribeInstanceInfo(requestCtx)
651-
if err != nil {
652-
return nil, errors.Wrapf(err, "failed to get instance info %v", info)
653-
}
654-
655-
if *info.State == models.InstanceInfoStatePaused {
656-
s.logger.Debug("Instance is in Paused state, force shutdown in progress")
657-
err = s.jailer.Stop(true)
658-
if err != nil {
659-
return nil, errors.Wrap(err, "failed to stop VM in paused State")
660-
}
661-
return &empty.Empty{}, nil
662-
}
663-
664-
err = s.waitVMReady()
665-
if err != nil {
666-
return nil, err
667-
}
650+
ctx, cancel := context.WithTimeout(requestCtx, timeout)
651+
defer cancel()
668652

669-
if err = s.shutdown(requestCtx, timeout, &taskAPI.ShutdownRequest{Now: true}); err != nil {
653+
if err = s.terminate(ctx); err != nil {
670654
return nil, err
671655
}
672656
return &empty.Empty{}, nil
@@ -1563,94 +1547,80 @@ func (s *service) Shutdown(requestCtx context.Context, req *taskAPI.ShutdownRequ
15631547
return &ptypes.Empty{}, nil
15641548
}
15651549

1566-
if err := s.shutdown(requestCtx, defaultShutdownTimeout, req); err != nil {
1550+
ctx, cancel := context.WithTimeout(requestCtx, defaultShutdownTimeout)
1551+
defer cancel()
1552+
1553+
if err := s.terminate(ctx); err != nil {
15671554
return &ptypes.Empty{}, err
15681555
}
15691556

15701557
return &ptypes.Empty{}, nil
15711558
}
15721559

1573-
func (s *service) shutdown(
1574-
requestCtx context.Context,
1575-
timeout time.Duration,
1576-
req *taskAPI.ShutdownRequest,
1577-
) error {
1578-
s.logger.Info("stopping the VM")
1560+
func (s *service) isPaused(ctx context.Context) (bool, error) {
1561+
info, err := s.machine.DescribeInstanceInfo(ctx)
1562+
if err != nil {
1563+
return false, errors.Wrapf(err, "failed to get instance info %v", info)
1564+
}
1565+
return *info.State == models.InstanceInfoStatePaused, nil
1566+
}
15791567

1580-
go func() {
1581-
s.shutdownLoop(requestCtx, timeout, req)
1582-
}()
1568+
func (s *service) forceTerminate(ctx context.Context) error {
1569+
s.logger.Errorf("forcefully terminate VM %s", s.vmID)
15831570

1584-
var result *multierror.Error
1585-
if err := s.machine.Wait(context.Background()); err != nil {
1586-
result = multierror.Append(result, err)
1587-
}
1588-
if err := s.cleanup(); err != nil {
1589-
result = multierror.Append(result, err)
1571+
err := s.jailer.Stop(true)
1572+
if err != nil {
1573+
s.logger.WithError(err).Error("failed to stop")
15901574
}
15911575

1592-
if err := result.ErrorOrNil(); err != nil {
1593-
return status.Error(codes.Internal, fmt.Sprintf("the VMM was killed forcibly: %v", err))
1576+
err = s.cleanup()
1577+
if err != nil {
1578+
s.logger.WithError(err).Error("failed to cleanup")
15941579
}
1595-
return nil
1580+
1581+
return status.Errorf(codes.Internal, "forcefully terminated VM %s", s.vmID)
15961582
}
15971583

1598-
// shutdownLoop sends multiple different shutdown requests to stop the VMM.
1599-
// 1) send a request to the in-VM agent, which is presumed to cause the VM to begin a reboot.
1600-
// 2) stop the VM through jailer#Stop(). The signal should be visible from the VMM (e.g. SIGTERM)
1601-
// 3) stop the VM through cancelling the associated context. The signal would not be visible from the VMM (e.g. SIGKILL)
1602-
func (s *service) shutdownLoop(
1603-
requestCtx context.Context,
1604-
timeout time.Duration,
1605-
req *taskAPI.ShutdownRequest,
1606-
) {
1607-
actions := []struct {
1608-
name string
1609-
shutdown func() error
1610-
timeout time.Duration
1611-
}{
1612-
{
1613-
name: "send a request to the in-VM agent",
1614-
shutdown: func() error {
1615-
_, err := s.agentClient.Shutdown(requestCtx, req)
1616-
if err != nil {
1617-
return err
1618-
}
1619-
return nil
1620-
},
1621-
timeout: timeout,
1622-
},
1623-
{
1624-
name: "stop the jailer by SIGTERM",
1625-
shutdown: func() error {
1626-
return s.jailer.Stop(false)
1627-
},
1628-
timeout: jailerStopTimeout,
1629-
},
1630-
{
1631-
name: "stop the jailer by SIGKILL",
1632-
shutdown: func() error {
1633-
return s.jailer.Stop(true)
1634-
},
1635-
timeout: jailerStopTimeout,
1636-
},
1584+
func (s *service) terminate(ctx context.Context) (retErr error) {
1585+
var success bool
1586+
defer func() {
1587+
if !success {
1588+
retErr = s.forceTerminate(ctx)
1589+
}
1590+
}()
1591+
1592+
err := s.waitVMReady()
1593+
if err != nil {
1594+
s.logger.WithError(err).Error("failed to wait VM")
1595+
return
16371596
}
16381597

1639-
for _, action := range actions {
1640-
pid, err := s.machine.PID()
1641-
if pid == 0 && err != nil {
1642-
break // we have nothing to kill
1643-
}
1598+
paused, err := s.isPaused(ctx)
1599+
if err != nil {
1600+
s.logger.WithError(err).Error("failed to check VM")
1601+
return
1602+
}
16441603

1645-
s.logger.Debug(action.name)
1646-
err = action.shutdown()
1647-
if err != nil {
1648-
// if sending an request doesn't succeed, don't wait and carry on.
1649-
s.logger.WithError(err).Errorf("failed to %s", action.name)
1650-
} else {
1651-
time.Sleep(action.timeout)
1652-
}
1604+
if paused {
1605+
s.logger.Error("VM is paused and cannot take requests")
1606+
return
16531607
}
1608+
1609+
s.logger.Info("gracefully shutdown VM")
1610+
_, err = s.agentClient.Shutdown(ctx, &taskAPI.ShutdownRequest{ID: s.vmID, Now: true})
1611+
if err != nil {
1612+
s.logger.WithError(err).Error("failed to call in-VM agent")
1613+
return
1614+
}
1615+
1616+
err = s.machine.Wait(ctx)
1617+
if err != nil {
1618+
s.logger.WithError(err).Error("failed to wait VM")
1619+
return
1620+
}
1621+
1622+
success = true
1623+
return
16541624
}
16551625

16561626
func (s *service) Stats(requestCtx context.Context, req *taskAPI.StatsRequest) (*taskAPI.StatsResponse, error) {

runtime/service_integ_test.go

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1648,15 +1648,9 @@ func TestStopVM_Isolated(t *testing.T) {
16481648
},
16491649
stopFunc: func(ctx context.Context, fcClient fccontrol.FirecrackerService, req proto.CreateVMRequest) {
16501650
_, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: req.VMID})
1651-
errCode := status.Code(err)
1652-
assert.Equal(codes.Internal, errCode, "the error code must be Internal")
1653-
1654-
if req.JailerConfig != nil {
1655-
// No "signal: ..." error with runc since it traps the signal
1656-
assert.Contains(err.Error(), "exit status 1")
1657-
} else {
1658-
assert.Contains(err.Error(), "signal: terminated", "must be 'terminated', not 'killed'")
1659-
}
1651+
require.Error(err)
1652+
assert.Equal(codes.Internal, status.Code(err))
1653+
assert.Contains(err.Error(), "forcefully terminated")
16601654
},
16611655
},
16621656

@@ -1705,12 +1699,10 @@ func TestStopVM_Isolated(t *testing.T) {
17051699
TimeoutSeconds: 10,
17061700
})
17071701

1708-
if req.JailerConfig != nil {
1709-
// No "signal: ..." error with runc since it traps the signal
1710-
assert.Contains(err.Error(), "exit status 137")
1711-
} else {
1712-
assert.Contains(err.Error(), "signal: killed")
1713-
}
1702+
require.Error(err)
1703+
assert.Equal(codes.Internal, status.Code(err))
1704+
// This is technically not accurate (the test is terminating the VM) though.
1705+
assert.Contains(err.Error(), "forcefully terminated")
17141706
},
17151707
},
17161708

@@ -1726,7 +1718,9 @@ func TestStopVM_Isolated(t *testing.T) {
17261718
require.Equal(status.Code(err), codes.OK)
17271719

17281720
_, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: req.VMID})
1729-
require.NoError(err)
1721+
require.Error(err)
1722+
assert.Equal(codes.Internal, status.Code(err))
1723+
assert.Contains(err.Error(), "forcefully terminated")
17301724
},
17311725
},
17321726

@@ -1745,7 +1739,9 @@ func TestStopVM_Isolated(t *testing.T) {
17451739
require.NoError(err, "failed to suspend Firecracker")
17461740

17471741
_, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: req.VMID})
1748-
assert.NotNil(err)
1742+
require.Error(err)
1743+
assert.Equal(codes.Internal, status.Code(err))
1744+
assert.Contains(err.Error(), "forcefully terminated")
17491745
},
17501746
},
17511747
}
@@ -2350,8 +2346,12 @@ func TestPauseResume_Isolated(t *testing.T) {
23502346
// Ensure the response fields are populated correctly
23512347
assert.Equal(t, request.VMID, resp.VMID)
23522348

2353-
_, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: request.VMID})
2354-
require.Equal(t, status.Code(err), codes.OK)
2349+
// Resume the VM since state() may pause the VM.
2350+
_, err = fcClient.ResumeVM(ctx, &proto.ResumeVMRequest{VMID: vmID})
2351+
require.NoError(t, err)
2352+
2353+
_, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: vmID})
2354+
require.NoError(t, err)
23552355
}
23562356

23572357
for _, subtest := range subtests {

0 commit comments

Comments
 (0)